From c400f5c17af4996eb2ecf0597e17eb25c17857d8 Mon Sep 17 00:00:00 2001 From: dartraiden Date: Thu, 14 Sep 2023 22:53:45 +0300 Subject: libsodium: update to 1.0.19 --- .../chacha20/dolbeau/chacha20_dolbeau-avx2.c | 354 ++-- .../chacha20/dolbeau/chacha20_dolbeau-avx2.h | 16 +- .../chacha20/dolbeau/chacha20_dolbeau-ssse3.c | 342 ++-- .../chacha20/dolbeau/chacha20_dolbeau-ssse3.h | 16 +- .../src/crypto_stream/chacha20/dolbeau/u0.h | 172 +- .../src/crypto_stream/chacha20/dolbeau/u1.h | 196 +- .../src/crypto_stream/chacha20/dolbeau/u4.h | 352 ++-- .../src/crypto_stream/chacha20/dolbeau/u8.h | 714 ++++---- .../src/crypto_stream/chacha20/ref/chacha20_ref.c | 624 +++---- .../src/crypto_stream/chacha20/ref/chacha20_ref.h | 16 +- .../src/crypto_stream/chacha20/stream_chacha20.c | 367 ++-- .../src/crypto_stream/chacha20/stream_chacha20.h | 44 +- libs/libsodium/src/crypto_stream/crypto_stream.c | 98 +- .../src/crypto_stream/salsa20/ref/salsa20_ref.c | 240 +-- .../src/crypto_stream/salsa20/ref/salsa20_ref.h | 16 +- .../src/crypto_stream/salsa20/stream_salsa20.c | 200 +- .../src/crypto_stream/salsa20/stream_salsa20.h | 32 +- .../crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S | 1920 ++++++++++---------- .../src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c | 62 +- .../src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h | 16 +- .../salsa20/xmm6int/salsa20_xmm6int-avx2.c | 262 +-- .../salsa20/xmm6int/salsa20_xmm6int-avx2.h | 16 +- .../salsa20/xmm6int/salsa20_xmm6int-sse2.c | 244 +-- .../salsa20/xmm6int/salsa20_xmm6int-sse2.h | 16 +- .../src/crypto_stream/salsa20/xmm6int/u0.h | 390 ++-- .../src/crypto_stream/salsa20/xmm6int/u1.h | 414 ++--- .../src/crypto_stream/salsa20/xmm6int/u4.h | 1094 +++++------ .../src/crypto_stream/salsa20/xmm6int/u8.h | 953 +++++----- .../salsa2012/ref/stream_salsa2012_ref.c | 212 +-- .../src/crypto_stream/salsa2012/stream_salsa2012.c | 52 +- .../salsa208/ref/stream_salsa208_ref.c | 212 +-- .../src/crypto_stream/salsa208/stream_salsa208.c | 52 +- .../src/crypto_stream/xchacha20/stream_xchacha20.c | 138 +- .../src/crypto_stream/xsalsa20/stream_xsalsa20.c | 132 +- 34 files changed, 4994 insertions(+), 4990 deletions(-) (limited to 'libs/libsodium/src/crypto_stream') diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c index f63e055265..1e2cdf266c 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c @@ -1,177 +1,177 @@ - -#include -#include -#include - -#include "core.h" -#include "crypto_stream_chacha20.h" -#include "private/common.h" -#include "private/sse2_64_32.h" -#include "utils.h" - -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) - -# ifdef __GNUC__ -# pragma GCC target("sse2") -# pragma GCC target("ssse3") -# pragma GCC target("sse4.1") -# pragma GCC target("avx2") -# endif - -# include -# include -# include -# include - -# include "../stream_chacha20.h" -# include "chacha20_dolbeau-avx2.h" - -# define ROUNDS 20 - -typedef struct chacha_ctx { - uint32_t input[16]; -} chacha_ctx; - -static void -chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) -{ - ctx->input[0] = 0x61707865; - ctx->input[1] = 0x3320646e; - ctx->input[2] = 0x79622d32; - ctx->input[3] = 0x6b206574; - ctx->input[4] = LOAD32_LE(k + 0); - ctx->input[5] = LOAD32_LE(k + 4); - ctx->input[6] = LOAD32_LE(k + 8); - ctx->input[7] = 
LOAD32_LE(k + 12); - ctx->input[8] = LOAD32_LE(k + 16); - ctx->input[9] = LOAD32_LE(k + 20); - ctx->input[10] = LOAD32_LE(k + 24); - ctx->input[11] = LOAD32_LE(k + 28); -} - -static void -chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); - ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); - ctx->input[14] = LOAD32_LE(iv + 0); - ctx->input[15] = LOAD32_LE(iv + 4); -} - -static void -chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); - ctx->input[13] = LOAD32_LE(iv + 0); - ctx->input[14] = LOAD32_LE(iv + 4); - ctx->input[15] = LOAD32_LE(iv + 8); -} - -static void -chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t * const x = &ctx->input[0]; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } -# include "u8.h" -# include "u4.h" -# include "u1.h" -# include "u0.h" -} - -static int -stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = (uint32_t) (ic >> 32); - ic_low = (uint32_t) ic; - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint32_t ic, const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[4]; - - if (!mlen) { - return 0; - } - STORE32_LE(ic_bytes, ic); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_dolbeau_avx2_implementation = { - SODIUM_C99(.stream =) stream_ref, - SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, - SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, - SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic - }; - -#endif + +#include +#include +#include + +#include "core.h" +#include "crypto_stream_chacha20.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && 
defined(HAVE_SMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# pragma GCC target("sse4.1") +# pragma GCC target("avx2") +# endif + +# include +# include +# include +# include + +# include "../stream_chacha20.h" +# include "chacha20_dolbeau-avx2.h" + +# define ROUNDS 20 + +typedef struct chacha_ctx { + uint32_t input[16]; +} chacha_ctx; + +static void +chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) +{ + ctx->input[0] = 0x61707865; + ctx->input[1] = 0x3320646e; + ctx->input[2] = 0x79622d32; + ctx->input[3] = 0x6b206574; + ctx->input[4] = LOAD32_LE(k + 0); + ctx->input[5] = LOAD32_LE(k + 4); + ctx->input[6] = LOAD32_LE(k + 8); + ctx->input[7] = LOAD32_LE(k + 12); + ctx->input[8] = LOAD32_LE(k + 16); + ctx->input[9] = LOAD32_LE(k + 20); + ctx->input[10] = LOAD32_LE(k + 24); + ctx->input[11] = LOAD32_LE(k + 28); +} + +static void +chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); + ctx->input[14] = LOAD32_LE(iv + 0); + ctx->input[15] = LOAD32_LE(iv + 4); +} + +static void +chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); + ctx->input[13] = LOAD32_LE(iv + 0); + ctx->input[14] = LOAD32_LE(iv + 4); + ctx->input[15] = LOAD32_LE(iv + 8); +} + +static void +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } +# include "u8.h" +# include "u4.h" +# include "u1.h" +# include "u0.h" +} + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) ic; + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint32_t ic, const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[4]; + + if (!mlen) { + return 0; + } + STORE32_LE(ic_bytes, ic); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, ic_bytes); + 
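/* The two ivsetup variants above differ only in how state words 12..15 are
 * split between block counter and nonce. A minimal reference sketch of the
 * layouts, with demo-only helper names (not taken from the patch): */
#include <stdint.h>

static uint32_t
demo_load32_le(const uint8_t *p)
{
    return (uint32_t) p[0] | ((uint32_t) p[1] << 8) |
           ((uint32_t) p[2] << 16) | ((uint32_t) p[3] << 24);
}

/* original variant: 64-bit counter in words 12..13, 64-bit nonce in 14..15 */
static void
demo_ivsetup_original(uint32_t st[16], const uint8_t n[8], uint64_t ic)
{
    st[12] = (uint32_t) ic;         /* low counter word, little-endian */
    st[13] = (uint32_t) (ic >> 32); /* high counter word */
    st[14] = demo_load32_le(n + 0);
    st[15] = demo_load32_le(n + 4);
}

/* IETF variant: 32-bit counter in word 12, 96-bit nonce in words 13..15 */
static void
demo_ivsetup_ietf(uint32_t st[16], const uint8_t n[12], uint32_t ic)
{
    st[12] = ic;
    st[13] = demo_load32_le(n + 0);
    st[14] = demo_load32_le(n + 4);
    st[15] = demo_load32_le(n + 8);
}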
chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_avx2_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h index 45eb98d797..a29fef9136 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_chacha20.h" -#include "crypto_stream_chacha20.h" - -extern struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_dolbeau_avx2_implementation; + +#include + +#include "../stream_chacha20.h" +#include "crypto_stream_chacha20.h" + +extern struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_avx2_implementation; diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c index 6f5d3851c3..ae5df1cc28 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c @@ -1,171 +1,171 @@ - -#include -#include -#include - -#include "core.h" -#include "crypto_stream_chacha20.h" -#include "private/common.h" -#include "private/sse2_64_32.h" -#include "utils.h" - -#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) - -# ifdef __GNUC__ -# pragma GCC target("sse2") -# pragma GCC target("ssse3") -# endif - -# include -# include - -# include "../stream_chacha20.h" -# include "chacha20_dolbeau-ssse3.h" - -# define ROUNDS 20 - -typedef struct chacha_ctx { - uint32_t input[16]; -} chacha_ctx; - -static void -chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) -{ - ctx->input[0] = 0x61707865; - ctx->input[1] = 0x3320646e; - ctx->input[2] = 0x79622d32; - ctx->input[3] = 0x6b206574; - ctx->input[4] = LOAD32_LE(k + 0); - ctx->input[5] = LOAD32_LE(k + 4); - ctx->input[6] = LOAD32_LE(k + 8); - ctx->input[7] = LOAD32_LE(k + 12); - ctx->input[8] = LOAD32_LE(k + 16); - ctx->input[9] = LOAD32_LE(k + 20); - ctx->input[10] = LOAD32_LE(k + 24); - ctx->input[11] = LOAD32_LE(k + 28); -} - -static void -chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); - ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); - ctx->input[14] = LOAD32_LE(iv + 0); - ctx->input[15] = LOAD32_LE(iv + 4); -} - -static void -chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 
0 : LOAD32_LE(counter); - ctx->input[13] = LOAD32_LE(iv + 0); - ctx->input[14] = LOAD32_LE(iv + 4); - ctx->input[15] = LOAD32_LE(iv + 8); -} - -static void -chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t * const x = &ctx->input[0]; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } -# include "u4.h" -# include "u1.h" -# include "u0.h" -} - -static int -stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = (uint32_t) (ic >> 32); - ic_low = (uint32_t) ic; - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint32_t ic, const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[4]; - - if (!mlen) { - return 0; - } - STORE32_LE(ic_bytes, ic); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_dolbeau_ssse3_implementation = { - SODIUM_C99(.stream =) stream_ref, - SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, - SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, - SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic - }; - -#endif + +#include +#include +#include + +#include "core.h" +#include "crypto_stream_chacha20.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# endif + +# include +# include + +# include "../stream_chacha20.h" +# include "chacha20_dolbeau-ssse3.h" + +# define ROUNDS 20 + +typedef struct chacha_ctx { + uint32_t input[16]; +} chacha_ctx; + +static void +chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) +{ + ctx->input[0] = 0x61707865; + ctx->input[1] = 0x3320646e; + ctx->input[2] = 0x79622d32; + ctx->input[3] = 0x6b206574; + ctx->input[4] = LOAD32_LE(k + 0); + ctx->input[5] = LOAD32_LE(k + 4); + ctx->input[6] = LOAD32_LE(k + 8); + ctx->input[7] = LOAD32_LE(k + 12); + ctx->input[8] = LOAD32_LE(k + 16); + ctx->input[9] = LOAD32_LE(k + 20); + 
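/* The SIMD rounds in u0.h/u1.h/u4.h vectorize the standard ChaCha20 quarter
 * round; the byte-shuffle constants (rot16/rot8) implement the 16- and 8-bit
 * rotations, while 12 and 7 keep the shift/shift/or form. For reference, a
 * plain-C double round -- a sketch with demo-only names: */
#include <stdint.h>

#define DEMO_ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

#define DEMO_QUARTERROUND(a, b, c, d)          \
    a += b; d ^= a; d = DEMO_ROTL32(d, 16);    \
    c += d; b ^= c; b = DEMO_ROTL32(b, 12);    \
    a += b; d ^= a; d = DEMO_ROTL32(d, 8);     \
    c += d; b ^= c; b = DEMO_ROTL32(b, 7)

/* one double round: four column rounds then four diagonal rounds, the same
 * ordering as the VEC4_QUARTERROUND calls in u4.h */
static void
demo_doubleround(uint32_t x[16])
{
    DEMO_QUARTERROUND(x[0], x[4], x[8],  x[12]);
    DEMO_QUARTERROUND(x[1], x[5], x[9],  x[13]);
    DEMO_QUARTERROUND(x[2], x[6], x[10], x[14]);
    DEMO_QUARTERROUND(x[3], x[7], x[11], x[15]);
    DEMO_QUARTERROUND(x[0], x[5], x[10], x[15]);
    DEMO_QUARTERROUND(x[1], x[6], x[11], x[12]);
    DEMO_QUARTERROUND(x[2], x[7], x[8],  x[13]);
    DEMO_QUARTERROUND(x[3], x[4], x[9],  x[14]);
}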
ctx->input[10] = LOAD32_LE(k + 24); + ctx->input[11] = LOAD32_LE(k + 28); +} + +static void +chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); + ctx->input[14] = LOAD32_LE(iv + 0); + ctx->input[15] = LOAD32_LE(iv + 4); +} + +static void +chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); + ctx->input[13] = LOAD32_LE(iv + 0); + ctx->input[14] = LOAD32_LE(iv + 4); + ctx->input[15] = LOAD32_LE(iv + 8); +} + +static void +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } +# include "u4.h" +# include "u1.h" +# include "u0.h" +} + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) ic; + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint32_t ic, const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[4]; + + if (!mlen) { + return 0; + } + STORE32_LE(ic_bytes, ic); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_ssse3_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h index d67630f6a9..520761ab5e 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h @@ -1,8 +1,8 @@ - -#include - -#include 
"../stream_chacha20.h" -#include "crypto_stream_chacha20.h" - -extern struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_dolbeau_ssse3_implementation; + +#include + +#include "../stream_chacha20.h" +#include "crypto_stream_chacha20.h" + +extern struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_ssse3_implementation; diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u0.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u0.h index 17c3ff8e08..f790a8625f 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u0.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u0.h @@ -1,86 +1,86 @@ -if (bytes > 0) { - __m128i x_0, x_1, x_2, x_3; - __m128i t_1; - const __m128i rot16 = - _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - const __m128i rot8 = - _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); - uint8_t partialblock[64]; - - unsigned int i; - - x_0 = _mm_loadu_si128((__m128i*) (x + 0)); - x_1 = _mm_loadu_si128((__m128i*) (x + 4)); - x_2 = _mm_loadu_si128((__m128i*) (x + 8)); - x_3 = _mm_loadu_si128((__m128i*) (x + 12)); - - for (i = 0; i < ROUNDS; i += 2) { - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_3 = _mm_shuffle_epi8(x_3, rot16); - - x_2 = _mm_add_epi32(x_2, x_3); - x_1 = _mm_xor_si128(x_1, x_2); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 12); - t_1 = _mm_srli_epi32(t_1, 20); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_0 = _mm_shuffle_epi32(x_0, 0x93); - x_3 = _mm_shuffle_epi8(x_3, rot8); - - x_2 = _mm_add_epi32(x_2, x_3); - x_3 = _mm_shuffle_epi32(x_3, 0x4e); - x_1 = _mm_xor_si128(x_1, x_2); - x_2 = _mm_shuffle_epi32(x_2, 0x39); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 7); - t_1 = _mm_srli_epi32(t_1, 25); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_3 = _mm_shuffle_epi8(x_3, rot16); - - x_2 = _mm_add_epi32(x_2, x_3); - x_1 = _mm_xor_si128(x_1, x_2); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 12); - t_1 = _mm_srli_epi32(t_1, 20); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_0 = _mm_shuffle_epi32(x_0, 0x39); - x_3 = _mm_shuffle_epi8(x_3, rot8); - - x_2 = _mm_add_epi32(x_2, x_3); - x_3 = _mm_shuffle_epi32(x_3, 0x4e); - x_1 = _mm_xor_si128(x_1, x_2); - x_2 = _mm_shuffle_epi32(x_2, 0x93); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 7); - t_1 = _mm_srli_epi32(t_1, 25); - x_1 = _mm_xor_si128(x_1, t_1); - } - x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0))); - x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4))); - x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8))); - x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12))); - _mm_storeu_si128((__m128i*) (partialblock + 0), x_0); - _mm_storeu_si128((__m128i*) (partialblock + 16), x_1); - _mm_storeu_si128((__m128i*) (partialblock + 32), x_2); - _mm_storeu_si128((__m128i*) (partialblock + 48), x_3); - - for (i = 0; i < bytes; i++) { - c[i] = m[i] ^ partialblock[i]; - } - - sodium_memzero(partialblock, sizeof partialblock); -} +if (bytes > 0) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + const __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + uint8_t partialblock[64]; + + unsigned int i; + + x_0 = _mm_loadu_si128((const __m128i*) (x + 0)); + x_1 = 
_mm_loadu_si128((const __m128i*) (x + 4)); + x_2 = _mm_loadu_si128((const __m128i*) (x + 8)); + x_3 = _mm_loadu_si128((const __m128i*) (x + 12)); + + for (i = 0; i < ROUNDS; i += 2) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i*) (x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i*) (x + 4))); + x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i*) (x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i*) (x + 12))); + _mm_storeu_si128((__m128i*) (partialblock + 0), x_0); + _mm_storeu_si128((__m128i*) (partialblock + 16), x_1); + _mm_storeu_si128((__m128i*) (partialblock + 32), x_2); + _mm_storeu_si128((__m128i*) (partialblock + 48), x_3); + + for (i = 0; i < bytes; i++) { + c[i] = m[i] ^ partialblock[i]; + } + + sodium_memzero(partialblock, sizeof partialblock); +} diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u1.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u1.h index 867b44bcf2..893ec67371 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u1.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u1.h @@ -1,98 +1,98 @@ -while (bytes >= 64) { - __m128i x_0, x_1, x_2, x_3; - __m128i t_1; - const __m128i rot16 = - _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - const __m128i rot8 = - _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); - - uint32_t in12; - uint32_t in13; - int i; - - x_0 = _mm_loadu_si128((__m128i*) (x + 0)); - x_1 = _mm_loadu_si128((__m128i*) (x + 4)); - x_2 = _mm_loadu_si128((__m128i*) (x + 8)); - x_3 = _mm_loadu_si128((__m128i*) (x + 12)); - - for (i = 0; i < ROUNDS; i += 2) { - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_3 = _mm_shuffle_epi8(x_3, rot16); - - x_2 = _mm_add_epi32(x_2, x_3); - x_1 = _mm_xor_si128(x_1, x_2); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 12); - t_1 = _mm_srli_epi32(t_1, 20); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_0 = _mm_shuffle_epi32(x_0, 0x93); - x_3 = _mm_shuffle_epi8(x_3, rot8); - - x_2 = _mm_add_epi32(x_2, x_3); - x_3 = _mm_shuffle_epi32(x_3, 0x4e); - x_1 = _mm_xor_si128(x_1, 
x_2); - x_2 = _mm_shuffle_epi32(x_2, 0x39); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 7); - t_1 = _mm_srli_epi32(t_1, 25); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_3 = _mm_shuffle_epi8(x_3, rot16); - - x_2 = _mm_add_epi32(x_2, x_3); - x_1 = _mm_xor_si128(x_1, x_2); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 12); - t_1 = _mm_srli_epi32(t_1, 20); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_0 = _mm_shuffle_epi32(x_0, 0x39); - x_3 = _mm_shuffle_epi8(x_3, rot8); - - x_2 = _mm_add_epi32(x_2, x_3); - x_3 = _mm_shuffle_epi32(x_3, 0x4e); - x_1 = _mm_xor_si128(x_1, x_2); - x_2 = _mm_shuffle_epi32(x_2, 0x93); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 7); - t_1 = _mm_srli_epi32(t_1, 25); - x_1 = _mm_xor_si128(x_1, t_1); - } - x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0))); - x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4))); - x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8))); - x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12))); - x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*) (m + 0))); - x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*) (m + 16))); - x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*) (m + 32))); - x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*) (m + 48))); - _mm_storeu_si128((__m128i*) (c + 0), x_0); - _mm_storeu_si128((__m128i*) (c + 16), x_1); - _mm_storeu_si128((__m128i*) (c + 32), x_2); - _mm_storeu_si128((__m128i*) (c + 48), x_3); - - in12 = x[12]; - in13 = x[13]; - in12++; - if (in12 == 0) { - in13++; - } - x[12] = in12; - x[13] = in13; - - bytes -= 64; - c += 64; - m += 64; -} +while (bytes >= 64) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + const __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + + uint32_t in12; + uint32_t in13; + int i; + + x_0 = _mm_loadu_si128((const __m128i*) (x + 0)); + x_1 = _mm_loadu_si128((const __m128i*) (x + 4)); + x_2 = _mm_loadu_si128((const __m128i*) (x + 8)); + x_3 = _mm_loadu_si128((const __m128i*) (x + 12)); + + for (i = 0; i < ROUNDS; i += 2) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = 
_mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i*) (x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i*) (x + 4))); + x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i*) (x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i*) (x + 12))); + x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((const __m128i*) (m + 0))); + x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((const __m128i*) (m + 16))); + x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((const __m128i*) (m + 32))); + x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((const __m128i*) (m + 48))); + _mm_storeu_si128((__m128i*) (c + 0), x_0); + _mm_storeu_si128((__m128i*) (c + 16), x_1); + _mm_storeu_si128((__m128i*) (c + 32), x_2); + _mm_storeu_si128((__m128i*) (c + 48), x_3); + + in12 = x[12]; + in13 = x[13]; + in12++; + if (in12 == 0) { + in13++; + } + x[12] = in12; + x[13] = in13; + + bytes -= 64; + c += 64; + m += 64; +} diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u4.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u4.h index 3ff8342609..b88a5fc960 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u4.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u4.h @@ -1,175 +1,177 @@ - -#define VEC4_ROT(A, IMM) \ - _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM))) - -/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & - * 16) (better) */ -#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \ - x_##A = _mm_add_epi32(x_##A, x_##B); \ - t_##A = _mm_xor_si128(x_##D, x_##A); \ - x_##D = _mm_shuffle_epi8(t_##A, rot16); \ - x_##C = _mm_add_epi32(x_##C, x_##D); \ - t_##C = _mm_xor_si128(x_##B, x_##C); \ - x_##B = VEC4_ROT(t_##C, 12); \ - x_##A = _mm_add_epi32(x_##A, x_##B); \ - t_##A = _mm_xor_si128(x_##D, x_##A); \ - x_##D = _mm_shuffle_epi8(t_##A, rot8); \ - x_##C = _mm_add_epi32(x_##C, x_##D); \ - t_##C = _mm_xor_si128(x_##B, x_##C); \ - x_##B = VEC4_ROT(t_##C, 7) - -#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) - -if (bytes >= 256) { - /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ - __m128i rot16 = - _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - __m128i rot8 = - _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); - - __m128i x_0 = _mm_set1_epi32(x[0]); - __m128i x_1 = _mm_set1_epi32(x[1]); - __m128i x_2 = _mm_set1_epi32(x[2]); - __m128i x_3 = _mm_set1_epi32(x[3]); - __m128i x_4 = _mm_set1_epi32(x[4]); - __m128i x_5 = _mm_set1_epi32(x[5]); - __m128i x_6 = _mm_set1_epi32(x[6]); - __m128i x_7 = _mm_set1_epi32(x[7]); - __m128i x_8 = _mm_set1_epi32(x[8]); - __m128i x_9 = _mm_set1_epi32(x[9]); - __m128i x_10 = _mm_set1_epi32(x[10]); - __m128i x_11 = _mm_set1_epi32(x[11]); - __m128i x_12; - __m128i x_13; - __m128i x_14 = _mm_set1_epi32(x[14]); - __m128i x_15 = _mm_set1_epi32(x[15]); - __m128i orig0 = x_0; - __m128i orig1 = x_1; - __m128i orig2 = x_2; - __m128i orig3 = x_3; - __m128i orig4 = x_4; - __m128i orig5 = x_5; - __m128i orig6 = x_6; - __m128i orig7 = x_7; - __m128i orig8 = x_8; - __m128i orig9 = x_9; - __m128i orig10 = x_10; - __m128i orig11 = x_11; - __m128i orig12; - __m128i orig13; - __m128i orig14 = x_14; - __m128i orig15 = x_15; - __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, - t_13, t_14, t_15; - - uint32_t in12, in13; - int i; - - while (bytes >= 256) { - const __m128i addv12 = _mm_set_epi64x(1, 0); - const __m128i addv13 = 
_mm_set_epi64x(3, 2); - __m128i t12, t13; - uint64_t in1213; - - x_0 = orig0; - x_1 = orig1; - x_2 = orig2; - x_3 = orig3; - x_4 = orig4; - x_5 = orig5; - x_6 = orig6; - x_7 = orig7; - x_8 = orig8; - x_9 = orig9; - x_10 = orig10; - x_11 = orig11; - x_14 = orig14; - x_15 = orig15; - - in12 = x[12]; - in13 = x[13]; - in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); - t12 = _mm_set1_epi64x(in1213); - t13 = _mm_set1_epi64x(in1213); - - x_12 = _mm_add_epi64(addv12, t12); - x_13 = _mm_add_epi64(addv13, t13); - - t12 = _mm_unpacklo_epi32(x_12, x_13); - t13 = _mm_unpackhi_epi32(x_12, x_13); - - x_12 = _mm_unpacklo_epi32(t12, t13); - x_13 = _mm_unpackhi_epi32(t12, t13); - - orig12 = x_12; - orig13 = x_13; - - in1213 += 4; - - x[12] = in1213 & 0xFFFFFFFF; - x[13] = (in1213 >> 32) & 0xFFFFFFFF; - - for (i = 0; i < ROUNDS; i += 2) { - VEC4_QUARTERROUND(0, 4, 8, 12); - VEC4_QUARTERROUND(1, 5, 9, 13); - VEC4_QUARTERROUND(2, 6, 10, 14); - VEC4_QUARTERROUND(3, 7, 11, 15); - VEC4_QUARTERROUND(0, 5, 10, 15); - VEC4_QUARTERROUND(1, 6, 11, 12); - VEC4_QUARTERROUND(2, 7, 8, 13); - VEC4_QUARTERROUND(3, 4, 9, 14); - } - -#define ONEQUAD_TRANSPOSE(A, B, C, D) \ - { \ - __m128i t0, t1, t2, t3; \ - \ - x_##A = _mm_add_epi32(x_##A, orig##A); \ - x_##B = _mm_add_epi32(x_##B, orig##B); \ - x_##C = _mm_add_epi32(x_##C, orig##C); \ - x_##D = _mm_add_epi32(x_##D, orig##D); \ - t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \ - t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \ - t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \ - t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \ - x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \ - x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \ - x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \ - x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \ - \ - t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*) (m + 0))); \ - _mm_storeu_si128((__m128i*) (c + 0), t0); \ - t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*) (m + 64))); \ - _mm_storeu_si128((__m128i*) (c + 64), t1); \ - t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*) (m + 128))); \ - _mm_storeu_si128((__m128i*) (c + 128), t2); \ - t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*) (m + 192))); \ - _mm_storeu_si128((__m128i*) (c + 192), t3); \ - } - -#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) - - ONEQUAD(0, 1, 2, 3); - m += 16; - c += 16; - ONEQUAD(4, 5, 6, 7); - m += 16; - c += 16; - ONEQUAD(8, 9, 10, 11); - m += 16; - c += 16; - ONEQUAD(12, 13, 14, 15); - m -= 48; - c -= 48; - -#undef ONEQUAD -#undef ONEQUAD_TRANSPOSE - - bytes -= 256; - c += 256; - m += 256; - } -} -#undef VEC4_ROT -#undef VEC4_QUARTERROUND -#undef VEC4_QUARTERROUND_SHUFFLE + +#define VEC4_ROT(A, IMM) \ + _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM))) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & + * 16) (better) */ +#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \ + x_##A = _mm_add_epi32(x_##A, x_##B); \ + t_##A = _mm_xor_si128(x_##D, x_##A); \ + x_##D = _mm_shuffle_epi8(t_##A, rot16); \ + x_##C = _mm_add_epi32(x_##C, x_##D); \ + t_##C = _mm_xor_si128(x_##B, x_##C); \ + x_##B = VEC4_ROT(t_##C, 12); \ + x_##A = _mm_add_epi32(x_##A, x_##B); \ + t_##A = _mm_xor_si128(x_##D, x_##A); \ + x_##D = _mm_shuffle_epi8(t_##A, rot8); \ + x_##C = _mm_add_epi32(x_##C, x_##D); \ + t_##C = _mm_xor_si128(x_##B, x_##C); \ + x_##B = VEC4_ROT(t_##C, 7) + +#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) + +if (bytes >= 256) { + /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ + __m128i rot16 = + 
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + + __m128i x_0 = _mm_set1_epi32(x[0]); + __m128i x_1 = _mm_set1_epi32(x[1]); + __m128i x_2 = _mm_set1_epi32(x[2]); + __m128i x_3 = _mm_set1_epi32(x[3]); + __m128i x_4 = _mm_set1_epi32(x[4]); + __m128i x_5 = _mm_set1_epi32(x[5]); + __m128i x_6 = _mm_set1_epi32(x[6]); + __m128i x_7 = _mm_set1_epi32(x[7]); + __m128i x_8 = _mm_set1_epi32(x[8]); + __m128i x_9 = _mm_set1_epi32(x[9]); + __m128i x_10 = _mm_set1_epi32(x[10]); + __m128i x_11 = _mm_set1_epi32(x[11]); + __m128i x_12; + __m128i x_13; + __m128i x_14 = _mm_set1_epi32(x[14]); + __m128i x_15 = _mm_set1_epi32(x[15]); + __m128i orig0 = x_0; + __m128i orig1 = x_1; + __m128i orig2 = x_2; + __m128i orig3 = x_3; + __m128i orig4 = x_4; + __m128i orig5 = x_5; + __m128i orig6 = x_6; + __m128i orig7 = x_7; + __m128i orig8 = x_8; + __m128i orig9 = x_9; + __m128i orig10 = x_10; + __m128i orig11 = x_11; + __m128i orig12; + __m128i orig13; + __m128i orig14 = x_14; + __m128i orig15 = x_15; + __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, + t_13, t_14, t_15; + + uint32_t in12, in13; + int i; + + while (bytes >= 256) { + const __m128i addv12 = _mm_set_epi64x(1, 0); + const __m128i addv13 = _mm_set_epi64x(3, 2); + __m128i t12, t13; + uint64_t in1213; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); + t12 = _mm_set1_epi64x(in1213); + t13 = _mm_set1_epi64x(in1213); + + x_12 = _mm_add_epi64(addv12, t12); + x_13 = _mm_add_epi64(addv13, t13); + + t12 = _mm_unpacklo_epi32(x_12, x_13); + t13 = _mm_unpackhi_epi32(x_12, x_13); + + x_12 = _mm_unpacklo_epi32(t12, t13); + x_13 = _mm_unpackhi_epi32(t12, t13); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 4; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213 >> 32) & 0xFFFFFFFF; + + for (i = 0; i < ROUNDS; i += 2) { + VEC4_QUARTERROUND(0, 4, 8, 12); + VEC4_QUARTERROUND(1, 5, 9, 13); + VEC4_QUARTERROUND(2, 6, 10, 14); + VEC4_QUARTERROUND(3, 7, 11, 15); + VEC4_QUARTERROUND(0, 5, 10, 15); + VEC4_QUARTERROUND(1, 6, 11, 12); + VEC4_QUARTERROUND(2, 7, 8, 13); + VEC4_QUARTERROUND(3, 4, 9, 14); + } + +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + \ + x_##A = _mm_add_epi32(x_##A, orig##A); \ + x_##B = _mm_add_epi32(x_##B, orig##B); \ + x_##C = _mm_add_epi32(x_##C, orig##C); \ + x_##D = _mm_add_epi32(x_##D, orig##D); \ + t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \ + \ + t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = \ + _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = \ + _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c 
+ 192), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + + ONEQUAD(0, 1, 2, 3); + m += 16; + c += 16; + ONEQUAD(4, 5, 6, 7); + m += 16; + c += 16; + ONEQUAD(8, 9, 10, 11); + m += 16; + c += 16; + ONEQUAD(12, 13, 14, 15); + m -= 48; + c -= 48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE + + bytes -= 256; + c += 256; + m += 256; + } +} +#undef VEC4_ROT +#undef VEC4_QUARTERROUND +#undef VEC4_QUARTERROUND_SHUFFLE diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u8.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u8.h index 22bf9fcfa1..c92fbd3514 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u8.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u8.h @@ -1,357 +1,357 @@ - -#define VEC8_ROT(A, IMM) \ - _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM))) - -/* implements a vector quarter round by-the-book (naive!) */ -#define VEC8_QUARTERROUND_NAIVE(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = VEC8_ROT(t_##A, 16); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 12); \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = VEC8_ROT(t_##A, 8); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 7) - -/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & - * 16) (better) */ -#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = _mm256_shuffle_epi8(t_##A, rot16); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 12); \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 7) - -/* same, but replace 2 of the shift/shift/or "rotation" by byte & word shuffles - * (8 & 16) (not as good as previous) */ -#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 12); \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 7) - -#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) - -#define VEC8_LINE1(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16) -#define VEC8_LINE2(A, B, C, D) \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12) -#define VEC8_LINE3(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8) -#define VEC8_LINE4(A, B, C, D) \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7) - -#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \ - C4, D4) \ - 
VEC8_LINE1(A1, B1, C1, D1); \ - VEC8_LINE1(A2, B2, C2, D2); \ - VEC8_LINE1(A3, B3, C3, D3); \ - VEC8_LINE1(A4, B4, C4, D4); \ - VEC8_LINE2(A1, B1, C1, D1); \ - VEC8_LINE2(A2, B2, C2, D2); \ - VEC8_LINE2(A3, B3, C3, D3); \ - VEC8_LINE2(A4, B4, C4, D4); \ - VEC8_LINE3(A1, B1, C1, D1); \ - VEC8_LINE3(A2, B2, C2, D2); \ - VEC8_LINE3(A3, B3, C3, D3); \ - VEC8_LINE3(A4, B4, C4, D4); \ - VEC8_LINE4(A1, B1, C1, D1); \ - VEC8_LINE4(A2, B2, C2, D2); \ - VEC8_LINE4(A3, B3, C3, D3); \ - VEC8_LINE4(A4, B4, C4, D4) - -#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \ - B4, C4, D4) \ - VEC8_LINE1(A1, B1, C1, D1); \ - VEC8_LINE1(A2, B2, C2, D2); \ - VEC8_LINE2(A1, B1, C1, D1); \ - VEC8_LINE2(A2, B2, C2, D2); \ - VEC8_LINE3(A1, B1, C1, D1); \ - VEC8_LINE3(A2, B2, C2, D2); \ - VEC8_LINE4(A1, B1, C1, D1); \ - VEC8_LINE4(A2, B2, C2, D2); \ - VEC8_LINE1(A3, B3, C3, D3); \ - VEC8_LINE1(A4, B4, C4, D4); \ - VEC8_LINE2(A3, B3, C3, D3); \ - VEC8_LINE2(A4, B4, C4, D4); \ - VEC8_LINE3(A3, B3, C3, D3); \ - VEC8_LINE3(A4, B4, C4, D4); \ - VEC8_LINE4(A3, B3, C3, D3); \ - VEC8_LINE4(A4, B4, C4, D4) - -#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \ - A4, B4, C4, D4) \ - VEC8_LINE1(A1, B1, C1, D1); \ - VEC8_LINE1(A2, B2, C2, D2); \ - VEC8_LINE2(A1, B1, C1, D1); \ - VEC8_LINE2(A2, B2, C2, D2); \ - VEC8_LINE1(A3, B3, C3, D3); \ - VEC8_LINE1(A4, B4, C4, D4); \ - VEC8_LINE2(A3, B3, C3, D3); \ - VEC8_LINE2(A4, B4, C4, D4); \ - VEC8_LINE3(A1, B1, C1, D1); \ - VEC8_LINE3(A2, B2, C2, D2); \ - VEC8_LINE4(A1, B1, C1, D1); \ - VEC8_LINE4(A2, B2, C2, D2); \ - VEC8_LINE3(A3, B3, C3, D3); \ - VEC8_LINE3(A4, B4, C4, D4); \ - VEC8_LINE4(A3, B3, C3, D3); \ - VEC8_LINE4(A4, B4, C4, D4) - -#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ - D4) \ - VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ - D4) - -if (bytes >= 512) { - /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ - __m256i rot16 = - _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, - 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - __m256i rot8 = - _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, - 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); - uint32_t in12, in13; - - /* the naive way seems as fast (if not a bit faster) than the vector way */ - __m256i x_0 = _mm256_set1_epi32(x[0]); - __m256i x_1 = _mm256_set1_epi32(x[1]); - __m256i x_2 = _mm256_set1_epi32(x[2]); - __m256i x_3 = _mm256_set1_epi32(x[3]); - __m256i x_4 = _mm256_set1_epi32(x[4]); - __m256i x_5 = _mm256_set1_epi32(x[5]); - __m256i x_6 = _mm256_set1_epi32(x[6]); - __m256i x_7 = _mm256_set1_epi32(x[7]); - __m256i x_8 = _mm256_set1_epi32(x[8]); - __m256i x_9 = _mm256_set1_epi32(x[9]); - __m256i x_10 = _mm256_set1_epi32(x[10]); - __m256i x_11 = _mm256_set1_epi32(x[11]); - __m256i x_12; - __m256i x_13; - __m256i x_14 = _mm256_set1_epi32(x[14]); - __m256i x_15 = _mm256_set1_epi32(x[15]); - - __m256i orig0 = x_0; - __m256i orig1 = x_1; - __m256i orig2 = x_2; - __m256i orig3 = x_3; - __m256i orig4 = x_4; - __m256i orig5 = x_5; - __m256i orig6 = x_6; - __m256i orig7 = x_7; - __m256i orig8 = x_8; - __m256i orig9 = x_9; - __m256i orig10 = x_10; - __m256i orig11 = x_11; - __m256i orig12; - __m256i orig13; - __m256i orig14 = x_14; - __m256i orig15 = x_15; - __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, - t_13, t_14, t_15; - - while (bytes >= 512) { - const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 
0); - const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4); - const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - __m256i t12, t13; - - uint64_t in1213; - int i; - - x_0 = orig0; - x_1 = orig1; - x_2 = orig2; - x_3 = orig3; - x_4 = orig4; - x_5 = orig5; - x_6 = orig6; - x_7 = orig7; - x_8 = orig8; - x_9 = orig9; - x_10 = orig10; - x_11 = orig11; - x_14 = orig14; - x_15 = orig15; - - in12 = x[12]; - in13 = x[13]; - in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); - x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213)); - - t12 = _mm256_add_epi64(addv12, x_12); - t13 = _mm256_add_epi64(addv13, x_13); - - x_12 = _mm256_unpacklo_epi32(t12, t13); - x_13 = _mm256_unpackhi_epi32(t12, t13); - - t12 = _mm256_unpacklo_epi32(x_12, x_13); - t13 = _mm256_unpackhi_epi32(x_12, x_13); - - /* required because unpack* are intra-lane */ - x_12 = _mm256_permutevar8x32_epi32(t12, permute); - x_13 = _mm256_permutevar8x32_epi32(t13, permute); - - orig12 = x_12; - orig13 = x_13; - - in1213 += 8; - - x[12] = in1213 & 0xFFFFFFFF; - x[13] = (in1213 >> 32) & 0xFFFFFFFF; - - for (i = 0; i < ROUNDS; i += 2) { - VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14); - } - -#define ONEQUAD_TRANSPOSE(A, B, C, D) \ - { \ - __m128i t0, t1, t2, t3; \ - x_##A = _mm256_add_epi32(x_##A, orig##A); \ - x_##B = _mm256_add_epi32(x_##B, orig##B); \ - x_##C = _mm256_add_epi32(x_##C, orig##C); \ - x_##D = _mm256_add_epi32(x_##D, orig##D); \ - t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ - t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ - t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ - t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ - x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ - x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ - x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ - x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ - t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \ - _mm_loadu_si128((__m128i*) (m + 0))); \ - _mm_storeu_si128((__m128i*) (c + 0), t0); \ - t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \ - _mm_loadu_si128((__m128i*) (m + 64))); \ - _mm_storeu_si128((__m128i*) (c + 64), t1); \ - t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \ - _mm_loadu_si128((__m128i*) (m + 128))); \ - _mm_storeu_si128((__m128i*) (c + 128), t2); \ - t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \ - _mm_loadu_si128((__m128i*) (m + 192))); \ - _mm_storeu_si128((__m128i*) (c + 192), t3); \ - t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \ - _mm_loadu_si128((__m128i*) (m + 256))); \ - _mm_storeu_si128((__m128i*) (c + 256), t0); \ - t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \ - _mm_loadu_si128((__m128i*) (m + 320))); \ - _mm_storeu_si128((__m128i*) (c + 320), t1); \ - t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \ - _mm_loadu_si128((__m128i*) (m + 384))); \ - _mm_storeu_si128((__m128i*) (c + 384), t2); \ - t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \ - _mm_loadu_si128((__m128i*) (m + 448))); \ - _mm_storeu_si128((__m128i*) (c + 448), t3); \ - } - -#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) - -#define ONEQUAD_UNPCK(A, B, C, D) \ - { \ - x_##A = _mm256_add_epi32(x_##A, orig##A); \ - x_##B = _mm256_add_epi32(x_##B, orig##B); \ - x_##C = _mm256_add_epi32(x_##C, orig##C); \ - x_##D = _mm256_add_epi32(x_##D, orig##D); \ - t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ - t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ - t_##C = 
_mm256_unpackhi_epi32(x_##A, x_##B); \ - t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ - x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ - x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ - x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ - x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ - } - -#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ - { \ - ONEQUAD_UNPCK(A, B, C, D); \ - ONEQUAD_UNPCK(A2, B2, C2, D2); \ - t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \ - t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \ - t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \ - t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \ - t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \ - t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \ - t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \ - t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \ - t_##A = \ - _mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*) (m + 0))); \ - t_##B = \ - _mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*) (m + 64))); \ - t_##C = \ - _mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*) (m + 128))); \ - t_##D = \ - _mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*) (m + 192))); \ - t_##A2 = _mm256_xor_si256(t_##A2, \ - _mm256_loadu_si256((__m256i*) (m + 256))); \ - t_##B2 = _mm256_xor_si256(t_##B2, \ - _mm256_loadu_si256((__m256i*) (m + 320))); \ - t_##C2 = _mm256_xor_si256(t_##C2, \ - _mm256_loadu_si256((__m256i*) (m + 384))); \ - t_##D2 = _mm256_xor_si256(t_##D2, \ - _mm256_loadu_si256((__m256i*) (m + 448))); \ - _mm256_storeu_si256((__m256i*) (c + 0), t_##A); \ - _mm256_storeu_si256((__m256i*) (c + 64), t_##B); \ - _mm256_storeu_si256((__m256i*) (c + 128), t_##C); \ - _mm256_storeu_si256((__m256i*) (c + 192), t_##D); \ - _mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \ - _mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \ - _mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \ - _mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \ - } - - ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); - m += 32; - c += 32; - ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); - m -= 32; - c -= 32; - -#undef ONEQUAD -#undef ONEQUAD_TRANSPOSE -#undef ONEQUAD_UNPCK -#undef ONEOCTO - - bytes -= 512; - c += 512; - m += 512; - } -} -#undef VEC8_ROT -#undef VEC8_QUARTERROUND -#undef VEC8_QUARTERROUND_NAIVE -#undef VEC8_QUARTERROUND_SHUFFLE -#undef VEC8_QUARTERROUND_SHUFFLE2 -#undef VEC8_LINE1 -#undef VEC8_LINE2 -#undef VEC8_LINE3 -#undef VEC8_LINE4 -#undef VEC8_ROUND -#undef VEC8_ROUND_SEQ -#undef VEC8_ROUND_HALF -#undef VEC8_ROUND_HALFANDHALF + +#define VEC8_ROT(A, IMM) \ + _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM))) + +/* implements a vector quarter round by-the-book (naive!) 
*/ +#define VEC8_QUARTERROUND_NAIVE(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = VEC8_ROT(t_##A, 16); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 12); \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = VEC8_ROT(t_##A, 8); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 7) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & + * 16) (better) */ +#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot16); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 12); \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 7) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte & word shuffles + * (8 & 16) (not as good as previous) */ +#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 12); \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 7) + +#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) + +#define VEC8_LINE1(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16) +#define VEC8_LINE2(A, B, C, D) \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12) +#define VEC8_LINE3(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8) +#define VEC8_LINE4(A, B, C, D) \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7) + +#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \ + C4, D4) \ + VEC8_LINE1(A1, B1, C1, D1); \ + VEC8_LINE1(A2, B2, C2, D2); \ + VEC8_LINE1(A3, B3, C3, D3); \ + VEC8_LINE1(A4, B4, C4, D4); \ + VEC8_LINE2(A1, B1, C1, D1); \ + VEC8_LINE2(A2, B2, C2, D2); \ + VEC8_LINE2(A3, B3, C3, D3); \ + VEC8_LINE2(A4, B4, C4, D4); \ + VEC8_LINE3(A1, B1, C1, D1); \ + VEC8_LINE3(A2, B2, C2, D2); \ + VEC8_LINE3(A3, B3, C3, D3); \ + VEC8_LINE3(A4, B4, C4, D4); \ + VEC8_LINE4(A1, B1, C1, D1); \ + VEC8_LINE4(A2, B2, C2, D2); \ + VEC8_LINE4(A3, B3, C3, D3); \ + VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \ + B4, C4, D4) \ + VEC8_LINE1(A1, B1, C1, D1); \ + VEC8_LINE1(A2, B2, C2, D2); \ + VEC8_LINE2(A1, B1, C1, D1); \ + VEC8_LINE2(A2, B2, C2, D2); \ + VEC8_LINE3(A1, B1, C1, D1); \ + VEC8_LINE3(A2, B2, C2, D2); \ + VEC8_LINE4(A1, B1, C1, D1); \ + VEC8_LINE4(A2, B2, C2, D2); \ + VEC8_LINE1(A3, B3, C3, D3); \ + VEC8_LINE1(A4, B4, C4, 
D4); \ + VEC8_LINE2(A3, B3, C3, D3); \ + VEC8_LINE2(A4, B4, C4, D4); \ + VEC8_LINE3(A3, B3, C3, D3); \ + VEC8_LINE3(A4, B4, C4, D4); \ + VEC8_LINE4(A3, B3, C3, D3); \ + VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \ + A4, B4, C4, D4) \ + VEC8_LINE1(A1, B1, C1, D1); \ + VEC8_LINE1(A2, B2, C2, D2); \ + VEC8_LINE2(A1, B1, C1, D1); \ + VEC8_LINE2(A2, B2, C2, D2); \ + VEC8_LINE1(A3, B3, C3, D3); \ + VEC8_LINE1(A4, B4, C4, D4); \ + VEC8_LINE2(A3, B3, C3, D3); \ + VEC8_LINE2(A4, B4, C4, D4); \ + VEC8_LINE3(A1, B1, C1, D1); \ + VEC8_LINE3(A2, B2, C2, D2); \ + VEC8_LINE4(A1, B1, C1, D1); \ + VEC8_LINE4(A2, B2, C2, D2); \ + VEC8_LINE3(A3, B3, C3, D3); \ + VEC8_LINE3(A4, B4, C4, D4); \ + VEC8_LINE4(A3, B3, C3, D3); \ + VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ + D4) \ + VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ + D4) + +if (bytes >= 512) { + /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ + __m256i rot16 = + _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + __m256i rot8 = + _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, + 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + uint32_t in12, in13; + + /* the naive way seems as fast (if not a bit faster) than the vector way */ + __m256i x_0 = _mm256_set1_epi32(x[0]); + __m256i x_1 = _mm256_set1_epi32(x[1]); + __m256i x_2 = _mm256_set1_epi32(x[2]); + __m256i x_3 = _mm256_set1_epi32(x[3]); + __m256i x_4 = _mm256_set1_epi32(x[4]); + __m256i x_5 = _mm256_set1_epi32(x[5]); + __m256i x_6 = _mm256_set1_epi32(x[6]); + __m256i x_7 = _mm256_set1_epi32(x[7]); + __m256i x_8 = _mm256_set1_epi32(x[8]); + __m256i x_9 = _mm256_set1_epi32(x[9]); + __m256i x_10 = _mm256_set1_epi32(x[10]); + __m256i x_11 = _mm256_set1_epi32(x[11]); + __m256i x_12; + __m256i x_13; + __m256i x_14 = _mm256_set1_epi32(x[14]); + __m256i x_15 = _mm256_set1_epi32(x[15]); + + __m256i orig0 = x_0; + __m256i orig1 = x_1; + __m256i orig2 = x_2; + __m256i orig3 = x_3; + __m256i orig4 = x_4; + __m256i orig5 = x_5; + __m256i orig6 = x_6; + __m256i orig7 = x_7; + __m256i orig8 = x_8; + __m256i orig9 = x_9; + __m256i orig10 = x_10; + __m256i orig11 = x_11; + __m256i orig12; + __m256i orig13; + __m256i orig14 = x_14; + __m256i orig15 = x_15; + __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, + t_13, t_14, t_15; + + while (bytes >= 512) { + const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0); + const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4); + const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + __m256i t12, t13; + + uint64_t in1213; + int i; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); + x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213)); + + t12 = _mm256_add_epi64(addv12, x_12); + t13 = _mm256_add_epi64(addv13, x_13); + + x_12 = _mm256_unpacklo_epi32(t12, t13); + x_13 = _mm256_unpackhi_epi32(t12, t13); + + t12 = _mm256_unpacklo_epi32(x_12, x_13); + t13 = _mm256_unpackhi_epi32(x_12, x_13); + + /* required because unpack* are intra-lane */ + x_12 = _mm256_permutevar8x32_epi32(t12, permute); + 
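/* counter setup, completed just below: the 64-bit block counter was
   broadcast to every lane, addv12/addv13 added the offsets 0..3 and 4..7,
   and the unpacklo/unpackhi pairs plus the two cross-lane permutes
   transpose the results so that x_12 holds the low 32-bit halves and
   x_13 the high 32-bit halves of eight consecutive block counters. */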
x_13 = _mm256_permutevar8x32_epi32(t13, permute); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 8; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213 >> 32) & 0xFFFFFFFF; + + for (i = 0; i < ROUNDS; i += 2) { + VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14); + } + +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + x_##A = _mm256_add_epi32(x_##A, orig##A); \ + x_##B = _mm256_add_epi32(x_##B, orig##B); \ + x_##C = _mm256_add_epi32(x_##C, orig##C); \ + x_##D = _mm256_add_epi32(x_##D, orig##D); \ + t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \ + _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \ + _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \ + _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \ + _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \ + _mm_loadu_si128((const __m128i*) (m + 256))); \ + _mm_storeu_si128((__m128i*) (c + 256), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \ + _mm_loadu_si128((const __m128i*) (m + 320))); \ + _mm_storeu_si128((__m128i*) (c + 320), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \ + _mm_loadu_si128((const __m128i*) (m + 384))); \ + _mm_storeu_si128((__m128i*) (c + 384), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \ + _mm_loadu_si128((const __m128i*) (m + 448))); \ + _mm_storeu_si128((__m128i*) (c + 448), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + +#define ONEQUAD_UNPCK(A, B, C, D) \ + { \ + x_##A = _mm256_add_epi32(x_##A, orig##A); \ + x_##B = _mm256_add_epi32(x_##B, orig##B); \ + x_##C = _mm256_add_epi32(x_##C, orig##C); \ + x_##D = _mm256_add_epi32(x_##D, orig##D); \ + t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ + } + +#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ + { \ + ONEQUAD_UNPCK(A, B, C, D); \ + ONEQUAD_UNPCK(A2, B2, C2, D2); \ + t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \ + t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \ + t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \ + t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \ + t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \ + t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \ + t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \ + t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \ + 
t_##A = _mm256_xor_si256( \ + t_##A, _mm256_loadu_si256((const __m256i*) (m + 0))); \ + t_##B = _mm256_xor_si256( \ + t_##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \ + t_##C = _mm256_xor_si256( \ + t_##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \ + t_##D = _mm256_xor_si256( \ + t_##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \ + t_##A2 = _mm256_xor_si256( \ + t_##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \ + t_##B2 = _mm256_xor_si256( \ + t_##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \ + t_##C2 = _mm256_xor_si256( \ + t_##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \ + t_##D2 = _mm256_xor_si256( \ + t_##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \ + _mm256_storeu_si256((__m256i*) (c + 0), t_##A); \ + _mm256_storeu_si256((__m256i*) (c + 64), t_##B); \ + _mm256_storeu_si256((__m256i*) (c + 128), t_##C); \ + _mm256_storeu_si256((__m256i*) (c + 192), t_##D); \ + _mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \ + _mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \ + _mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \ + _mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \ + } + + ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); + m += 32; + c += 32; + ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); + m -= 32; + c -= 32; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_UNPCK +#undef ONEOCTO + + bytes -= 512; + c += 512; + m += 512; + } +} +#undef VEC8_ROT +#undef VEC8_QUARTERROUND +#undef VEC8_QUARTERROUND_NAIVE +#undef VEC8_QUARTERROUND_SHUFFLE +#undef VEC8_QUARTERROUND_SHUFFLE2 +#undef VEC8_LINE1 +#undef VEC8_LINE2 +#undef VEC8_LINE3 +#undef VEC8_LINE4 +#undef VEC8_ROUND +#undef VEC8_ROUND_SEQ +#undef VEC8_ROUND_HALF +#undef VEC8_ROUND_HALFANDHALF diff --git a/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.c b/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.c index 40cccbf8f8..fb1e3a4b5c 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.c +++ b/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.c @@ -1,312 +1,312 @@ - -/* - chacha-merged.c version 20080118 - D. J. Bernstein - Public domain. - */ - -#include -#include -#include - -#include "core.h" -#include "crypto_stream_chacha20.h" -#include "private/common.h" -#include "utils.h" - -#include "../stream_chacha20.h" -#include "chacha20_ref.h" - -struct chacha_ctx { - uint32_t input[16]; -}; - -typedef struct chacha_ctx chacha_ctx; - -#define U32C(v) (v##U) - -#define U32V(v) ((uint32_t)(v) &U32C(0xFFFFFFFF)) - -#define ROTATE(v, c) (ROTL32(v, c)) -#define XOR(v, w) ((v) ^ (w)) -#define PLUS(v, w) (U32V((v) + (w))) -#define PLUSONE(v) (PLUS((v), 1)) - -#define QUARTERROUND(a, b, c, d) \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 16); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 12); \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 8); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 7); - -static void -chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) -{ - ctx->input[0] = U32C(0x61707865); - ctx->input[1] = U32C(0x3320646e); - ctx->input[2] = U32C(0x79622d32); - ctx->input[3] = U32C(0x6b206574); - ctx->input[4] = LOAD32_LE(k + 0); - ctx->input[5] = LOAD32_LE(k + 4); - ctx->input[6] = LOAD32_LE(k + 8); - ctx->input[7] = LOAD32_LE(k + 12); - ctx->input[8] = LOAD32_LE(k + 16); - ctx->input[9] = LOAD32_LE(k + 20); - ctx->input[10] = LOAD32_LE(k + 24); - ctx->input[11] = LOAD32_LE(k + 28); -} - -static void -chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 
0 : LOAD32_LE(counter + 0); - ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); - ctx->input[14] = LOAD32_LE(iv + 0); - ctx->input[15] = LOAD32_LE(iv + 4); -} - -static void -chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); - ctx->input[13] = LOAD32_LE(iv + 0); - ctx->input[14] = LOAD32_LE(iv + 4); - ctx->input[15] = LOAD32_LE(iv + 8); -} - -static void -chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, - x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, - j15; - uint8_t *ctarget = NULL; - uint8_t tmp[64]; - unsigned int i; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } - j0 = ctx->input[0]; - j1 = ctx->input[1]; - j2 = ctx->input[2]; - j3 = ctx->input[3]; - j4 = ctx->input[4]; - j5 = ctx->input[5]; - j6 = ctx->input[6]; - j7 = ctx->input[7]; - j8 = ctx->input[8]; - j9 = ctx->input[9]; - j10 = ctx->input[10]; - j11 = ctx->input[11]; - j12 = ctx->input[12]; - j13 = ctx->input[13]; - j14 = ctx->input[14]; - j15 = ctx->input[15]; - - for (;;) { - if (bytes < 64) { - memset(tmp, 0, 64); - for (i = 0; i < bytes; ++i) { - tmp[i] = m[i]; - } - m = tmp; - ctarget = c; - c = tmp; - } - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 20; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12) - QUARTERROUND(x1, x5, x9, x13) - QUARTERROUND(x2, x6, x10, x14) - QUARTERROUND(x3, x7, x11, x15) - QUARTERROUND(x0, x5, x10, x15) - QUARTERROUND(x1, x6, x11, x12) - QUARTERROUND(x2, x7, x8, x13) - QUARTERROUND(x3, x4, x9, x14) - } - x0 = PLUS(x0, j0); - x1 = PLUS(x1, j1); - x2 = PLUS(x2, j2); - x3 = PLUS(x3, j3); - x4 = PLUS(x4, j4); - x5 = PLUS(x5, j5); - x6 = PLUS(x6, j6); - x7 = PLUS(x7, j7); - x8 = PLUS(x8, j8); - x9 = PLUS(x9, j9); - x10 = PLUS(x10, j10); - x11 = PLUS(x11, j11); - x12 = PLUS(x12, j12); - x13 = PLUS(x13, j13); - x14 = PLUS(x14, j14); - x15 = PLUS(x15, j15); - - x0 = XOR(x0, LOAD32_LE(m + 0)); - x1 = XOR(x1, LOAD32_LE(m + 4)); - x2 = XOR(x2, LOAD32_LE(m + 8)); - x3 = XOR(x3, LOAD32_LE(m + 12)); - x4 = XOR(x4, LOAD32_LE(m + 16)); - x5 = XOR(x5, LOAD32_LE(m + 20)); - x6 = XOR(x6, LOAD32_LE(m + 24)); - x7 = XOR(x7, LOAD32_LE(m + 28)); - x8 = XOR(x8, LOAD32_LE(m + 32)); - x9 = XOR(x9, LOAD32_LE(m + 36)); - x10 = XOR(x10, LOAD32_LE(m + 40)); - x11 = XOR(x11, LOAD32_LE(m + 44)); - x12 = XOR(x12, LOAD32_LE(m + 48)); - x13 = XOR(x13, LOAD32_LE(m + 52)); - x14 = XOR(x14, LOAD32_LE(m + 56)); - x15 = XOR(x15, LOAD32_LE(m + 60)); - - j12 = PLUSONE(j12); - /* LCOV_EXCL_START */ - if (!j12) { - j13 = PLUSONE(j13); - } - /* LCOV_EXCL_STOP */ - - STORE32_LE(c + 0, x0); - STORE32_LE(c + 4, x1); - STORE32_LE(c + 8, x2); - STORE32_LE(c + 12, x3); - STORE32_LE(c + 16, x4); - STORE32_LE(c + 20, x5); - STORE32_LE(c + 24, x6); - STORE32_LE(c + 28, x7); - STORE32_LE(c + 32, x8); - STORE32_LE(c + 36, x9); - STORE32_LE(c + 40, x10); - STORE32_LE(c + 44, x11); - STORE32_LE(c + 48, x12); - STORE32_LE(c + 52, x13); - STORE32_LE(c + 56, x14); - STORE32_LE(c + 60, x15); - - if (bytes <= 64) { - if (bytes < 64) { - for (i = 0; i < (unsigned int) bytes; ++i) { - ctarget[i] = c[i]; /* ctarget cannot be NULL */ - } - } - ctx->input[12] = j12; - ctx->input[13] = j13; - - return; - } - bytes -= 64; - c += 64; - m += 64; 
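/* note on the tail handling above: a final block shorter than 64 bytes is
   first copied into the zero-padded tmp buffer, the keystream is applied
   there, and the result is copied back out through ctarget. */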
- } -} - -static int -stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = U32V(ic >> 32); - ic_low = U32V(ic); - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint32_t ic, const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[4]; - - if (!mlen) { - return 0; - } - STORE32_LE(ic_bytes, ic); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_ref_implementation = { - SODIUM_C99(.stream =) stream_ref, - SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, - SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, - SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic - }; + +/* + chacha-merged.c version 20080118 + D. J. Bernstein + Public domain. 
+ */ + +#include +#include +#include + +#include "core.h" +#include "crypto_stream_chacha20.h" +#include "private/common.h" +#include "utils.h" + +#include "../stream_chacha20.h" +#include "chacha20_ref.h" + +struct chacha_ctx { + uint32_t input[16]; +}; + +typedef struct chacha_ctx chacha_ctx; + +#define U32C(v) (v##U) + +#define U32V(v) ((uint32_t)(v) &U32C(0xFFFFFFFF)) + +#define ROTATE(v, c) (ROTL32(v, c)) +#define XOR(v, w) ((v) ^ (w)) +#define PLUS(v, w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v), 1)) + +#define QUARTERROUND(a, b, c, d) \ + a = PLUS(a, b); \ + d = ROTATE(XOR(d, a), 16); \ + c = PLUS(c, d); \ + b = ROTATE(XOR(b, c), 12); \ + a = PLUS(a, b); \ + d = ROTATE(XOR(d, a), 8); \ + c = PLUS(c, d); \ + b = ROTATE(XOR(b, c), 7); + +static void +chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) +{ + ctx->input[0] = U32C(0x61707865); + ctx->input[1] = U32C(0x3320646e); + ctx->input[2] = U32C(0x79622d32); + ctx->input[3] = U32C(0x6b206574); + ctx->input[4] = LOAD32_LE(k + 0); + ctx->input[5] = LOAD32_LE(k + 4); + ctx->input[6] = LOAD32_LE(k + 8); + ctx->input[7] = LOAD32_LE(k + 12); + ctx->input[8] = LOAD32_LE(k + 16); + ctx->input[9] = LOAD32_LE(k + 20); + ctx->input[10] = LOAD32_LE(k + 24); + ctx->input[11] = LOAD32_LE(k + 28); +} + +static void +chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); + ctx->input[14] = LOAD32_LE(iv + 0); + ctx->input[15] = LOAD32_LE(iv + 4); +} + +static void +chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); + ctx->input[13] = LOAD32_LE(iv + 0); + ctx->input[14] = LOAD32_LE(iv + 4); + ctx->input[15] = LOAD32_LE(iv + 8); +} + +static void +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15; + uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, + j15; + uint8_t *ctarget = NULL; + uint8_t tmp[64]; + unsigned int i; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + j0 = ctx->input[0]; + j1 = ctx->input[1]; + j2 = ctx->input[2]; + j3 = ctx->input[3]; + j4 = ctx->input[4]; + j5 = ctx->input[5]; + j6 = ctx->input[6]; + j7 = ctx->input[7]; + j8 = ctx->input[8]; + j9 = ctx->input[9]; + j10 = ctx->input[10]; + j11 = ctx->input[11]; + j12 = ctx->input[12]; + j13 = ctx->input[13]; + j14 = ctx->input[14]; + j15 = ctx->input[15]; + + for (;;) { + if (bytes < 64) { + memset(tmp, 0, 64); + for (i = 0; i < bytes; ++i) { + tmp[i] = m[i]; + } + m = tmp; + ctarget = c; + c = tmp; + } + x0 = j0; + x1 = j1; + x2 = j2; + x3 = j3; + x4 = j4; + x5 = j5; + x6 = j6; + x7 = j7; + x8 = j8; + x9 = j9; + x10 = j10; + x11 = j11; + x12 = j12; + x13 = j13; + x14 = j14; + x15 = j15; + for (i = 20; i > 0; i -= 2) { + QUARTERROUND(x0, x4, x8, x12) + QUARTERROUND(x1, x5, x9, x13) + QUARTERROUND(x2, x6, x10, x14) + QUARTERROUND(x3, x7, x11, x15) + QUARTERROUND(x0, x5, x10, x15) + QUARTERROUND(x1, x6, x11, x12) + QUARTERROUND(x2, x7, x8, x13) + QUARTERROUND(x3, x4, x9, x14) + } + x0 = PLUS(x0, j0); + x1 = PLUS(x1, j1); + x2 = PLUS(x2, j2); + x3 = PLUS(x3, j3); + x4 = PLUS(x4, j4); + x5 = PLUS(x5, j5); + x6 = PLUS(x6, j6); + x7 = PLUS(x7, j7); + x8 = PLUS(x8, j8); + x9 = PLUS(x9, j9); + x10 = PLUS(x10, j10); + x11 = PLUS(x11, j11); + x12 = PLUS(x12, j12); + x13 = PLUS(x13, j13); + 
x14 = PLUS(x14, j14); + x15 = PLUS(x15, j15); + + x0 = XOR(x0, LOAD32_LE(m + 0)); + x1 = XOR(x1, LOAD32_LE(m + 4)); + x2 = XOR(x2, LOAD32_LE(m + 8)); + x3 = XOR(x3, LOAD32_LE(m + 12)); + x4 = XOR(x4, LOAD32_LE(m + 16)); + x5 = XOR(x5, LOAD32_LE(m + 20)); + x6 = XOR(x6, LOAD32_LE(m + 24)); + x7 = XOR(x7, LOAD32_LE(m + 28)); + x8 = XOR(x8, LOAD32_LE(m + 32)); + x9 = XOR(x9, LOAD32_LE(m + 36)); + x10 = XOR(x10, LOAD32_LE(m + 40)); + x11 = XOR(x11, LOAD32_LE(m + 44)); + x12 = XOR(x12, LOAD32_LE(m + 48)); + x13 = XOR(x13, LOAD32_LE(m + 52)); + x14 = XOR(x14, LOAD32_LE(m + 56)); + x15 = XOR(x15, LOAD32_LE(m + 60)); + + j12 = PLUSONE(j12); + /* LCOV_EXCL_START */ + if (!j12) { + j13 = PLUSONE(j13); + } + /* LCOV_EXCL_STOP */ + + STORE32_LE(c + 0, x0); + STORE32_LE(c + 4, x1); + STORE32_LE(c + 8, x2); + STORE32_LE(c + 12, x3); + STORE32_LE(c + 16, x4); + STORE32_LE(c + 20, x5); + STORE32_LE(c + 24, x6); + STORE32_LE(c + 28, x7); + STORE32_LE(c + 32, x8); + STORE32_LE(c + 36, x9); + STORE32_LE(c + 40, x10); + STORE32_LE(c + 44, x11); + STORE32_LE(c + 48, x12); + STORE32_LE(c + 52, x13); + STORE32_LE(c + 56, x14); + STORE32_LE(c + 60, x15); + + if (bytes <= 64) { + if (bytes < 64) { + for (i = 0; i < (unsigned int) bytes; ++i) { + ctarget[i] = c[i]; /* ctarget cannot be NULL */ + } + } + ctx->input[12] = j12; + ctx->input[13] = j13; + + return; + } + bytes -= 64; + c += 64; + m += 64; + } +} + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = U32V(ic >> 32); + ic_low = U32V(ic); + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint32_t ic, const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[4]; + + if (!mlen) { + return 0; + } + STORE32_LE(ic_bytes, ic); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_ref_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + 
SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic + }; diff --git a/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.h b/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.h index 6ac4807554..66c2e830e7 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.h +++ b/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_chacha20.h" -#include "crypto_stream_chacha20.h" - -extern struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_ref_implementation; + +#include + +#include "../stream_chacha20.h" +#include "crypto_stream_chacha20.h" + +extern struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_ref_implementation; diff --git a/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.c b/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.c index c98d60907f..b88f9a50e9 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.c +++ b/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.c @@ -1,183 +1,184 @@ -#include "crypto_stream_chacha20.h" -#include "core.h" -#include "private/common.h" -#include "private/implementations.h" -#include "randombytes.h" -#include "runtime.h" -#include "stream_chacha20.h" - -#include "ref/chacha20_ref.h" -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) -# include "dolbeau/chacha20_dolbeau-avx2.h" -#endif -#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) -# include "dolbeau/chacha20_dolbeau-ssse3.h" -#endif - -static const crypto_stream_chacha20_implementation *implementation = - &crypto_stream_chacha20_ref_implementation; - -size_t -crypto_stream_chacha20_keybytes(void) { - return crypto_stream_chacha20_KEYBYTES; -} - -size_t -crypto_stream_chacha20_noncebytes(void) { - return crypto_stream_chacha20_NONCEBYTES; -} - -size_t -crypto_stream_chacha20_messagebytes_max(void) -{ - return crypto_stream_chacha20_MESSAGEBYTES_MAX; -} - -size_t -crypto_stream_chacha20_ietf_keybytes(void) { - return crypto_stream_chacha20_ietf_KEYBYTES; -} - -size_t -crypto_stream_chacha20_ietf_noncebytes(void) { - return crypto_stream_chacha20_ietf_NONCEBYTES; -} - -size_t -crypto_stream_chacha20_ietf_messagebytes_max(void) -{ - return crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX; -} - -int -crypto_stream_chacha20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream(c, clen, n, k); -} - -int -crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_xor_ic(c, m, mlen, n, ic, k); -} - -int -crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); -} - -int -crypto_stream_chacha20_ietf_ext(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_ietf_ext(c, clen, n, k); -} - -int 
-crypto_stream_chacha20_ietf_ext_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint32_t ic, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_ietf_ext_xor_ic(c, m, mlen, n, ic, k); -} - -static int -crypto_stream_chacha20_ietf_ext_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_ietf_ext_xor_ic(c, m, mlen, n, 0U, k); -} - -int -crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - if (clen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return crypto_stream_chacha20_ietf_ext(c, clen, n, k); -} - -int -crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint32_t ic, - const unsigned char *k) -{ - if ((unsigned long long) ic > - (64ULL * (1ULL << 32)) / 64ULL - (mlen + 63ULL) / 64ULL) { - sodium_misuse(); - } - return crypto_stream_chacha20_ietf_ext_xor_ic(c, m, mlen, n, ic, k); -} - -int -crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return crypto_stream_chacha20_ietf_ext_xor(c, m, mlen, n, k); -} - -void -crypto_stream_chacha20_ietf_keygen(unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_chacha20_ietf_KEYBYTES); -} - -void -crypto_stream_chacha20_keygen(unsigned char k[crypto_stream_chacha20_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_chacha20_KEYBYTES); -} - -int -_crypto_stream_chacha20_pick_best_implementation(void) -{ - implementation = &crypto_stream_chacha20_ref_implementation; -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) - if (sodium_runtime_has_avx2()) { - implementation = &crypto_stream_chacha20_dolbeau_avx2_implementation; - return 0; - } -#endif -#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) - if (sodium_runtime_has_ssse3()) { - implementation = &crypto_stream_chacha20_dolbeau_ssse3_implementation; - return 0; - } -#endif - return 0; -} +#include "crypto_stream_chacha20.h" +#include "core.h" +#include "private/chacha20_ietf_ext.h" +#include "private/common.h" +#include "private/implementations.h" +#include "randombytes.h" +#include "runtime.h" +#include "stream_chacha20.h" + +#include "ref/chacha20_ref.h" +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) +# include "dolbeau/chacha20_dolbeau-avx2.h" +#endif +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) +# include "dolbeau/chacha20_dolbeau-ssse3.h" +#endif + +static const crypto_stream_chacha20_implementation *implementation = + &crypto_stream_chacha20_ref_implementation; + +size_t +crypto_stream_chacha20_keybytes(void) { + return crypto_stream_chacha20_KEYBYTES; +} + +size_t +crypto_stream_chacha20_noncebytes(void) { + return crypto_stream_chacha20_NONCEBYTES; +} + +size_t +crypto_stream_chacha20_messagebytes_max(void) +{ + return crypto_stream_chacha20_MESSAGEBYTES_MAX; +} + +size_t 
+crypto_stream_chacha20_ietf_keybytes(void) { + return crypto_stream_chacha20_ietf_KEYBYTES; +} + +size_t +crypto_stream_chacha20_ietf_noncebytes(void) { + return crypto_stream_chacha20_ietf_NONCEBYTES; +} + +size_t +crypto_stream_chacha20_ietf_messagebytes_max(void) +{ + return crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX; +} + +int +crypto_stream_chacha20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream(c, clen, n, k); +} + +int +crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_xor_ic(c, m, mlen, n, ic, k); +} + +int +crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); +} + +int +crypto_stream_chacha20_ietf_ext(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_ietf_ext(c, clen, n, k); +} + +int +crypto_stream_chacha20_ietf_ext_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint32_t ic, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_ietf_ext_xor_ic(c, m, mlen, n, ic, k); +} + +static int +crypto_stream_chacha20_ietf_ext_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_ietf_ext_xor_ic(c, m, mlen, n, 0U, k); +} + +int +crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + if (clen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return crypto_stream_chacha20_ietf_ext(c, clen, n, k); +} + +int +crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint32_t ic, + const unsigned char *k) +{ + if ((unsigned long long) ic > + (64ULL * (1ULL << 32)) / 64ULL - (mlen + 63ULL) / 64ULL) { + sodium_misuse(); + } + return crypto_stream_chacha20_ietf_ext_xor_ic(c, m, mlen, n, ic, k); +} + +int +crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return crypto_stream_chacha20_ietf_ext_xor(c, m, mlen, n, k); +} + +void +crypto_stream_chacha20_ietf_keygen(unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_chacha20_ietf_KEYBYTES); +} + +void +crypto_stream_chacha20_keygen(unsigned char k[crypto_stream_chacha20_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_chacha20_KEYBYTES); +} + +int +_crypto_stream_chacha20_pick_best_implementation(void) +{ + implementation = &crypto_stream_chacha20_ref_implementation; +#if 
defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) + if (sodium_runtime_has_avx2()) { + implementation = &crypto_stream_chacha20_dolbeau_avx2_implementation; + return 0; + } +#endif +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) + if (sodium_runtime_has_ssse3()) { + implementation = &crypto_stream_chacha20_dolbeau_ssse3_implementation; + return 0; + } +#endif + return 0; +} diff --git a/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.h b/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.h index 40f782f418..0233a4dbef 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.h +++ b/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.h @@ -1,22 +1,22 @@ - -#ifndef stream_chacha20_H -#define stream_chacha20_H - -#include - -typedef struct crypto_stream_chacha20_implementation { - int (*stream)(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k); - int (*stream_ietf_ext)(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k); - int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint64_t ic, - const unsigned char *k); - int (*stream_ietf_ext_xor_ic)(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint32_t ic, - const unsigned char *k); -} crypto_stream_chacha20_implementation; - -#endif + +#ifndef stream_chacha20_H +#define stream_chacha20_H + +#include + +typedef struct crypto_stream_chacha20_implementation { + int (*stream)(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + int (*stream_ietf_ext)(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k); + int (*stream_ietf_ext_xor_ic)(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint32_t ic, + const unsigned char *k); +} crypto_stream_chacha20_implementation; + +#endif diff --git a/libs/libsodium/src/crypto_stream/crypto_stream.c b/libs/libsodium/src/crypto_stream/crypto_stream.c index 58d25381ab..6eaac0f747 100644 --- a/libs/libsodium/src/crypto_stream/crypto_stream.c +++ b/libs/libsodium/src/crypto_stream/crypto_stream.c @@ -1,49 +1,49 @@ - -#include "crypto_stream.h" -#include "randombytes.h" - -size_t -crypto_stream_keybytes(void) -{ - return crypto_stream_KEYBYTES; -} - -size_t -crypto_stream_noncebytes(void) -{ - return crypto_stream_NONCEBYTES; -} - -size_t -crypto_stream_messagebytes_max(void) -{ - return crypto_stream_MESSAGEBYTES_MAX; -} - -const char * -crypto_stream_primitive(void) -{ - return crypto_stream_PRIMITIVE; -} - -int -crypto_stream(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - return crypto_stream_xsalsa20(c, clen, n, k); -} - - -int -crypto_stream_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - return crypto_stream_xsalsa20_xor(c, m, mlen, n, k); -} - -void -crypto_stream_keygen(unsigned char k[crypto_stream_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_KEYBYTES); -} + +#include "crypto_stream.h" +#include "randombytes.h" + +size_t +crypto_stream_keybytes(void) +{ + return crypto_stream_KEYBYTES; +} 
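/* A minimal usage sketch (not part of the upstream file): the generic
   crypto_stream API is a thin wrapper over xsalsa20, so encrypting a
   buffer that already contains the keystream zeroes it out:

       unsigned char k[crypto_stream_KEYBYTES];
       unsigned char n[crypto_stream_NONCEBYTES] = { 0 };
       unsigned char buf[64];

       crypto_stream_keygen(k);
       crypto_stream(buf, sizeof buf, n, k);           // raw keystream
       crypto_stream_xor(buf, buf, sizeof buf, n, k);  // buf -> all zeros
*/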
+ +size_t +crypto_stream_noncebytes(void) +{ + return crypto_stream_NONCEBYTES; +} + +size_t +crypto_stream_messagebytes_max(void) +{ + return crypto_stream_MESSAGEBYTES_MAX; +} + +const char * +crypto_stream_primitive(void) +{ + return crypto_stream_PRIMITIVE; +} + +int +crypto_stream(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + return crypto_stream_xsalsa20(c, clen, n, k); +} + + +int +crypto_stream_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + return crypto_stream_xsalsa20_xor(c, m, mlen, n, k); +} + +void +crypto_stream_keygen(unsigned char k[crypto_stream_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c index f0854ebf7e..81522f0065 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c +++ b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c @@ -1,120 +1,120 @@ -/* -version 20140420 -D. J. Bernstein -Public domain. -*/ - -#include - -#include "crypto_core_salsa20.h" -#include "crypto_stream_salsa20.h" -#include "utils.h" - -#include "../stream_salsa20.h" -#include "salsa20_ref.h" - -#ifndef HAVE_AMD64_ASM - -static int -stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!clen) { - return 0; - } - for (i = 0; i < 32; i++) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; i++) { - in[i] = n[i]; - } - for (i = 8; i < 16; i++) { - in[i] = 0; - } - while (clen >= 64) { - crypto_core_salsa20(c, in, kcopy, NULL); - u = 1; - for (i = 8; i < 16; i++) { - u += (unsigned int) in[i]; - in[i] = u; - u >>= 8; - } - clen -= 64; - c += 64; - } - if (clen) { - crypto_core_salsa20(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int) clen; i++) { - c[i] = block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -static int -stream_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!mlen) { - return 0; - } - for (i = 0; i < 32; i++) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; i++) { - in[i] = n[i]; - } - for (i = 8; i < 16; i++) { - in[i] = (unsigned char) (ic & 0xff); - ic >>= 8; - } - while (mlen >= 64) { - crypto_core_salsa20(block, in, kcopy, NULL); - for (i = 0; i < 64; i++) { - c[i] = m[i] ^ block[i]; - } - u = 1; - for (i = 8; i < 16; i++) { - u += (unsigned int) in[i]; - in[i] = u; - u >>= 8; - } - mlen -= 64; - c += 64; - m += 64; - } - if (mlen) { - crypto_core_salsa20(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int) mlen; i++) { - c[i] = m[i] ^ block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_ref_implementation = { - SODIUM_C99(.stream =) stream_ref, - SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, - }; - -#endif +/* +version 20140420 +D. J. Bernstein +Public domain. 
+*/ + +#include + +#include "crypto_core_salsa20.h" +#include "crypto_stream_salsa20.h" +#include "utils.h" + +#include "../stream_salsa20.h" +#include "salsa20_ref.h" + +#ifndef HAVE_AMD64_ASM + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!clen) { + return 0; + } + for (i = 0; i < 32; i++) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; i++) { + in[i] = n[i]; + } + for (i = 8; i < 16; i++) { + in[i] = 0; + } + while (clen >= 64) { + crypto_core_salsa20(c, in, kcopy, NULL); + u = 1; + for (i = 8; i < 16; i++) { + u += (unsigned int) in[i]; + in[i] = u; + u >>= 8; + } + clen -= 64; + c += 64; + } + if (clen) { + crypto_core_salsa20(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int) clen; i++) { + c[i] = block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; i++) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; i++) { + in[i] = n[i]; + } + for (i = 8; i < 16; i++) { + in[i] = (unsigned char) (ic & 0xff); + ic >>= 8; + } + while (mlen >= 64) { + crypto_core_salsa20(block, in, kcopy, NULL); + for (i = 0; i < 64; i++) { + c[i] = m[i] ^ block[i]; + } + u = 1; + for (i = 8; i < 16; i++) { + u += (unsigned int) in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { + crypto_core_salsa20(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int) mlen; i++) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_ref_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h index 8716cb4048..9976cc7f3a 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h +++ b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_salsa20.h" -#include "crypto_stream_salsa20.h" - -extern struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_ref_implementation; + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_ref_implementation; diff --git a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c index 4529850136..cf06e6460d 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c +++ b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c @@ -1,100 +1,100 @@ -#include "crypto_stream_salsa20.h" -#include "private/common.h" -#include "private/implementations.h" -#include "randombytes.h" -#include "runtime.h" -#include "stream_salsa20.h" - -#ifdef HAVE_AMD64_ASM -# include "xmm6/salsa20_xmm6.h" -#else -# include "ref/salsa20_ref.h" -#endif -#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H) 
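/* i.e. the intrinsics-based SSE2 path is only built when the amd64
   assembly implementation is not available */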
-# include "xmm6int/salsa20_xmm6int-sse2.h" -#endif -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) -# include "xmm6int/salsa20_xmm6int-avx2.h" -#endif - -#if HAVE_AMD64_ASM -static const crypto_stream_salsa20_implementation *implementation = - &crypto_stream_salsa20_xmm6_implementation; -#else -static const crypto_stream_salsa20_implementation *implementation = - &crypto_stream_salsa20_ref_implementation; -#endif - -size_t -crypto_stream_salsa20_keybytes(void) -{ - return crypto_stream_salsa20_KEYBYTES; -} - -size_t -crypto_stream_salsa20_noncebytes(void) -{ - return crypto_stream_salsa20_NONCEBYTES; -} - -size_t -crypto_stream_salsa20_messagebytes_max(void) -{ - return crypto_stream_salsa20_MESSAGEBYTES_MAX; -} - -int -crypto_stream_salsa20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - return implementation->stream(c, clen, n, k); -} - -int -crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - return implementation->stream_xor_ic(c, m, mlen, n, ic, k); -} - -int -crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); -} - -void -crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_salsa20_KEYBYTES); -} - -int -_crypto_stream_salsa20_pick_best_implementation(void) -{ -#ifdef HAVE_AMD64_ASM - implementation = &crypto_stream_salsa20_xmm6_implementation; -#else - implementation = &crypto_stream_salsa20_ref_implementation; -#endif - -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) - if (sodium_runtime_has_avx2()) { - implementation = &crypto_stream_salsa20_xmm6int_avx2_implementation; - return 0; - } -#endif -#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H) - if (sodium_runtime_has_sse2()) { - implementation = &crypto_stream_salsa20_xmm6int_sse2_implementation; - return 0; - } -#endif - return 0; /* LCOV_EXCL_LINE */ -} +#include "crypto_stream_salsa20.h" +#include "private/common.h" +#include "private/implementations.h" +#include "randombytes.h" +#include "runtime.h" +#include "stream_salsa20.h" + +#ifdef HAVE_AMD64_ASM +# include "xmm6/salsa20_xmm6.h" +#else +# include "ref/salsa20_ref.h" +#endif +#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H) +# include "xmm6int/salsa20_xmm6int-sse2.h" +#endif +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) +# include "xmm6int/salsa20_xmm6int-avx2.h" +#endif + +#if HAVE_AMD64_ASM +static const crypto_stream_salsa20_implementation *implementation = + &crypto_stream_salsa20_xmm6_implementation; +#else +static const crypto_stream_salsa20_implementation *implementation = + &crypto_stream_salsa20_ref_implementation; +#endif + +size_t +crypto_stream_salsa20_keybytes(void) +{ + return crypto_stream_salsa20_KEYBYTES; +} + +size_t +crypto_stream_salsa20_noncebytes(void) +{ + return crypto_stream_salsa20_NONCEBYTES; +} + +size_t +crypto_stream_salsa20_messagebytes_max(void) +{ + return crypto_stream_salsa20_MESSAGEBYTES_MAX; +} + +int +crypto_stream_salsa20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) 
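/* forwards to the implementation picked at runtime: the amd64 assembly
   or the portable reference code by default, SSE2/AVX2 when the CPU and
   build support them */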
+{ + return implementation->stream(c, clen, n, k); +} + +int +crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + return implementation->stream_xor_ic(c, m, mlen, n, ic, k); +} + +int +crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); +} + +void +crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_salsa20_KEYBYTES); +} + +int +_crypto_stream_salsa20_pick_best_implementation(void) +{ +#ifdef HAVE_AMD64_ASM + implementation = &crypto_stream_salsa20_xmm6_implementation; +#else + implementation = &crypto_stream_salsa20_ref_implementation; +#endif + +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) + if (sodium_runtime_has_avx2()) { + implementation = &crypto_stream_salsa20_xmm6int_avx2_implementation; + return 0; + } +#endif +#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H) + if (sodium_runtime_has_sse2()) { + implementation = &crypto_stream_salsa20_xmm6int_sse2_implementation; + return 0; + } +#endif + return 0; /* LCOV_EXCL_LINE */ +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h index 1949d38113..0b5971ca48 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h +++ b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h @@ -1,16 +1,16 @@ - -#ifndef stream_salsa20_H -#define stream_salsa20_H - -#include - -typedef struct crypto_stream_salsa20_implementation { - int (*stream)(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k); - int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint64_t ic, - const unsigned char *k); -} crypto_stream_salsa20_implementation; - -#endif + +#ifndef stream_salsa20_H +#define stream_salsa20_H + +#include + +typedef struct crypto_stream_salsa20_implementation { + int (*stream)(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k); +} crypto_stream_salsa20_implementation; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S index 6d9f354e10..9ecea1b088 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S @@ -1,960 +1,960 @@ -#ifdef HAVE_AMD64_ASM - -.text -.p2align 5 - -#ifdef ASM_HIDE_SYMBOL -ASM_HIDE_SYMBOL stream_salsa20_xmm6 -ASM_HIDE_SYMBOL _stream_salsa20_xmm6 -#endif -.globl stream_salsa20_xmm6 -.globl _stream_salsa20_xmm6 -#ifdef __ELF__ -.type stream_salsa20_xmm6, @function -.type _stream_salsa20_xmm6, @function -#endif -stream_salsa20_xmm6: -_stream_salsa20_xmm6: -mov %rsp,%r11 -and $31,%r11 -add $512,%r11 -sub %r11,%rsp -movq %r11,416(%rsp) -movq %r12,424(%rsp) -movq %r13,432(%rsp) -movq %r14,440(%rsp) -movq %r15,448(%rsp) -movq %rbx,456(%rsp) -movq %rbp,464(%rsp) -mov %rsi,%r9 -mov %rdi,%rdi -mov %rdi,%rsi -mov %rdx,%rdx -mov %rcx,%r10 -cmp 
$0,%r9 -jbe ._done -mov $0,%rax -mov %r9,%rcx -rep stosb -sub %r9,%rdi -movq $0,472(%rsp) -jmp ._start - -.text -.p2align 5 - -#ifdef ASM_HIDE_SYMBOL -ASM_HIDE_SYMBOL stream_salsa20_xmm6_xor_ic -ASM_HIDE_SYMBOL _stream_salsa20_xmm6_xor_ic -#endif -.globl stream_salsa20_xmm6_xor_ic -.globl _stream_salsa20_xmm6_xor_ic -#ifdef __ELF__ -.type stream_salsa20_xmm6_xor_ic, @function -.type _stream_salsa20_xmm6_xor_ic, @function -#endif -stream_salsa20_xmm6_xor_ic: -_stream_salsa20_xmm6_xor_ic: - -mov %rsp,%r11 -and $31,%r11 -add $512,%r11 -sub %r11,%rsp -movq %r11,416(%rsp) -movq %r12,424(%rsp) -movq %r13,432(%rsp) -movq %r14,440(%rsp) -movq %r15,448(%rsp) -movq %rbx,456(%rsp) -movq %rbp,464(%rsp) -mov %rdi,%rdi -mov %rsi,%rsi -mov %r9,%r10 -movq %r8,472(%rsp) -mov %rdx,%r9 -mov %rcx,%rdx -cmp $0,%r9 -jbe ._done - -._start: -movl 20(%r10),%ecx -movl 0(%r10),%r8d -movl 0(%rdx),%eax -movl 16(%r10),%r11d -movl %ecx,64(%rsp) -movl %r8d,4+64(%rsp) -movl %eax,8+64(%rsp) -movl %r11d,12+64(%rsp) -movl 24(%r10),%r8d -movl 4(%r10),%eax -movl 4(%rdx),%edx -movq 472(%rsp),%rcx -movl %ecx,80(%rsp) -movl %r8d,4+80(%rsp) -movl %eax,8+80(%rsp) -movl %edx,12+80(%rsp) -movl 12(%r10),%edx -shr $32,%rcx -movl 28(%r10),%r8d -movl 8(%r10),%eax -movl %edx,96(%rsp) -movl %ecx,4+96(%rsp) -movl %r8d,8+96(%rsp) -movl %eax,12+96(%rsp) -mov $1634760805,%rdx -mov $857760878,%rcx -mov $2036477234,%r8 -mov $1797285236,%rax -movl %edx,112(%rsp) -movl %ecx,4+112(%rsp) -movl %r8d,8+112(%rsp) -movl %eax,12+112(%rsp) -cmp $256,%r9 -jb ._bytesbetween1and255 -movdqa 112(%rsp),%xmm0 -pshufd $0x55,%xmm0,%xmm1 -pshufd $0xaa,%xmm0,%xmm2 -pshufd $0xff,%xmm0,%xmm3 -pshufd $0x00,%xmm0,%xmm0 -movdqa %xmm1,128(%rsp) -movdqa %xmm2,144(%rsp) -movdqa %xmm3,160(%rsp) -movdqa %xmm0,176(%rsp) -movdqa 64(%rsp),%xmm0 -pshufd $0xaa,%xmm0,%xmm1 -pshufd $0xff,%xmm0,%xmm2 -pshufd $0x00,%xmm0,%xmm3 -pshufd $0x55,%xmm0,%xmm0 -movdqa %xmm1,192(%rsp) -movdqa %xmm2,208(%rsp) -movdqa %xmm3,224(%rsp) -movdqa %xmm0,240(%rsp) -movdqa 80(%rsp),%xmm0 -pshufd $0xff,%xmm0,%xmm1 -pshufd $0x55,%xmm0,%xmm2 -pshufd $0xaa,%xmm0,%xmm0 -movdqa %xmm1,256(%rsp) -movdqa %xmm2,272(%rsp) -movdqa %xmm0,288(%rsp) -movdqa 96(%rsp),%xmm0 -pshufd $0x00,%xmm0,%xmm1 -pshufd $0xaa,%xmm0,%xmm2 -pshufd $0xff,%xmm0,%xmm0 -movdqa %xmm1,304(%rsp) -movdqa %xmm2,320(%rsp) -movdqa %xmm0,336(%rsp) - -.p2align 4 -._bytesatleast256: -movq 472(%rsp),%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,352(%rsp) -movl %ecx,368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,4+352(%rsp) -movl %ecx,4+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,8+352(%rsp) -movl %ecx,8+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,12+352(%rsp) -movl %ecx,12+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,80(%rsp) -movl %ecx,4+96(%rsp) -movq %rdx,472(%rsp) -movq %r9,480(%rsp) -mov $20,%rdx -movdqa 128(%rsp),%xmm0 -movdqa 144(%rsp),%xmm1 -movdqa 160(%rsp),%xmm2 -movdqa 320(%rsp),%xmm3 -movdqa 336(%rsp),%xmm4 -movdqa 192(%rsp),%xmm5 -movdqa 208(%rsp),%xmm6 -movdqa 240(%rsp),%xmm7 -movdqa 256(%rsp),%xmm8 -movdqa 272(%rsp),%xmm9 -movdqa 288(%rsp),%xmm10 -movdqa 368(%rsp),%xmm11 -movdqa 176(%rsp),%xmm12 -movdqa 224(%rsp),%xmm13 -movdqa 304(%rsp),%xmm14 -movdqa 352(%rsp),%xmm15 - -.p2align 4 -._mainloop1: -movdqa %xmm1,384(%rsp) -movdqa %xmm2,400(%rsp) -movdqa %xmm13,%xmm1 -paddd %xmm12,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm14 -psrld $25,%xmm2 -pxor %xmm2,%xmm14 -movdqa %xmm7,%xmm1 -paddd %xmm0,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm11 
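/* rotation pattern used throughout this loop: ROTL32(x, 7) is emulated
   as (x << 7) ^ (x >> 25) -- pslld and psrld compute the two halves on
   separate copies, and the pair of pxor instructions folds both into the
   destination register (the halves have disjoint bits, so XOR acts as OR) */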
-psrld $25,%xmm2 -pxor %xmm2,%xmm11 -movdqa %xmm12,%xmm1 -paddd %xmm14,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm15 -psrld $23,%xmm2 -pxor %xmm2,%xmm15 -movdqa %xmm0,%xmm1 -paddd %xmm11,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm9 -psrld $23,%xmm2 -pxor %xmm2,%xmm9 -movdqa %xmm14,%xmm1 -paddd %xmm15,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm13 -psrld $19,%xmm2 -pxor %xmm2,%xmm13 -movdqa %xmm11,%xmm1 -paddd %xmm9,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm7 -psrld $19,%xmm2 -pxor %xmm2,%xmm7 -movdqa %xmm15,%xmm1 -paddd %xmm13,%xmm1 -movdqa %xmm1,%xmm2 -pslld $18,%xmm1 -pxor %xmm1,%xmm12 -psrld $14,%xmm2 -pxor %xmm2,%xmm12 -movdqa 384(%rsp),%xmm1 -movdqa %xmm12,384(%rsp) -movdqa %xmm9,%xmm2 -paddd %xmm7,%xmm2 -movdqa %xmm2,%xmm12 -pslld $18,%xmm2 -pxor %xmm2,%xmm0 -psrld $14,%xmm12 -pxor %xmm12,%xmm0 -movdqa %xmm5,%xmm2 -paddd %xmm1,%xmm2 -movdqa %xmm2,%xmm12 -pslld $7,%xmm2 -pxor %xmm2,%xmm3 -psrld $25,%xmm12 -pxor %xmm12,%xmm3 -movdqa 400(%rsp),%xmm2 -movdqa %xmm0,400(%rsp) -movdqa %xmm6,%xmm0 -paddd %xmm2,%xmm0 -movdqa %xmm0,%xmm12 -pslld $7,%xmm0 -pxor %xmm0,%xmm4 -psrld $25,%xmm12 -pxor %xmm12,%xmm4 -movdqa %xmm1,%xmm0 -paddd %xmm3,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm10 -psrld $23,%xmm12 -pxor %xmm12,%xmm10 -movdqa %xmm2,%xmm0 -paddd %xmm4,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm8 -psrld $23,%xmm12 -pxor %xmm12,%xmm8 -movdqa %xmm3,%xmm0 -paddd %xmm10,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm5 -psrld $19,%xmm12 -pxor %xmm12,%xmm5 -movdqa %xmm4,%xmm0 -paddd %xmm8,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm6 -psrld $19,%xmm12 -pxor %xmm12,%xmm6 -movdqa %xmm10,%xmm0 -paddd %xmm5,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm1 -psrld $14,%xmm12 -pxor %xmm12,%xmm1 -movdqa 384(%rsp),%xmm0 -movdqa %xmm1,384(%rsp) -movdqa %xmm4,%xmm1 -paddd %xmm0,%xmm1 -movdqa %xmm1,%xmm12 -pslld $7,%xmm1 -pxor %xmm1,%xmm7 -psrld $25,%xmm12 -pxor %xmm12,%xmm7 -movdqa %xmm8,%xmm1 -paddd %xmm6,%xmm1 -movdqa %xmm1,%xmm12 -pslld $18,%xmm1 -pxor %xmm1,%xmm2 -psrld $14,%xmm12 -pxor %xmm12,%xmm2 -movdqa 400(%rsp),%xmm12 -movdqa %xmm2,400(%rsp) -movdqa %xmm14,%xmm1 -paddd %xmm12,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm5 -psrld $25,%xmm2 -pxor %xmm2,%xmm5 -movdqa %xmm0,%xmm1 -paddd %xmm7,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm10 -psrld $23,%xmm2 -pxor %xmm2,%xmm10 -movdqa %xmm12,%xmm1 -paddd %xmm5,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm8 -psrld $23,%xmm2 -pxor %xmm2,%xmm8 -movdqa %xmm7,%xmm1 -paddd %xmm10,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm4 -psrld $19,%xmm2 -pxor %xmm2,%xmm4 -movdqa %xmm5,%xmm1 -paddd %xmm8,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm14 -psrld $19,%xmm2 -pxor %xmm2,%xmm14 -movdqa %xmm10,%xmm1 -paddd %xmm4,%xmm1 -movdqa %xmm1,%xmm2 -pslld $18,%xmm1 -pxor %xmm1,%xmm0 -psrld $14,%xmm2 -pxor %xmm2,%xmm0 -movdqa 384(%rsp),%xmm1 -movdqa %xmm0,384(%rsp) -movdqa %xmm8,%xmm0 -paddd %xmm14,%xmm0 -movdqa %xmm0,%xmm2 -pslld $18,%xmm0 -pxor %xmm0,%xmm12 -psrld $14,%xmm2 -pxor %xmm2,%xmm12 -movdqa %xmm11,%xmm0 -paddd %xmm1,%xmm0 -movdqa %xmm0,%xmm2 -pslld $7,%xmm0 -pxor %xmm0,%xmm6 -psrld $25,%xmm2 -pxor %xmm2,%xmm6 -movdqa 400(%rsp),%xmm2 -movdqa %xmm12,400(%rsp) -movdqa %xmm3,%xmm0 -paddd %xmm2,%xmm0 -movdqa %xmm0,%xmm12 -pslld $7,%xmm0 -pxor %xmm0,%xmm13 -psrld $25,%xmm12 -pxor %xmm12,%xmm13 -movdqa %xmm1,%xmm0 -paddd %xmm6,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 
-pxor %xmm0,%xmm15 -psrld $23,%xmm12 -pxor %xmm12,%xmm15 -movdqa %xmm2,%xmm0 -paddd %xmm13,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm9 -psrld $23,%xmm12 -pxor %xmm12,%xmm9 -movdqa %xmm6,%xmm0 -paddd %xmm15,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm11 -psrld $19,%xmm12 -pxor %xmm12,%xmm11 -movdqa %xmm13,%xmm0 -paddd %xmm9,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm3 -psrld $19,%xmm12 -pxor %xmm12,%xmm3 -movdqa %xmm15,%xmm0 -paddd %xmm11,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm1 -psrld $14,%xmm12 -pxor %xmm12,%xmm1 -movdqa %xmm9,%xmm0 -paddd %xmm3,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm2 -psrld $14,%xmm12 -pxor %xmm12,%xmm2 -movdqa 384(%rsp),%xmm12 -movdqa 400(%rsp),%xmm0 -sub $2,%rdx -ja ._mainloop1 - -paddd 176(%rsp),%xmm12 -paddd 240(%rsp),%xmm7 -paddd 288(%rsp),%xmm10 -paddd 336(%rsp),%xmm4 -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 0(%rsi),%edx -xorl 4(%rsi),%ecx -xorl 8(%rsi),%r8d -xorl 12(%rsi),%r9d -movl %edx,0(%rdi) -movl %ecx,4(%rdi) -movl %r8d,8(%rdi) -movl %r9d,12(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 64(%rsi),%edx -xorl 68(%rsi),%ecx -xorl 72(%rsi),%r8d -xorl 76(%rsi),%r9d -movl %edx,64(%rdi) -movl %ecx,68(%rdi) -movl %r8d,72(%rdi) -movl %r9d,76(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 128(%rsi),%edx -xorl 132(%rsi),%ecx -xorl 136(%rsi),%r8d -xorl 140(%rsi),%r9d -movl %edx,128(%rdi) -movl %ecx,132(%rdi) -movl %r8d,136(%rdi) -movl %r9d,140(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -xorl 192(%rsi),%edx -xorl 196(%rsi),%ecx -xorl 200(%rsi),%r8d -xorl 204(%rsi),%r9d -movl %edx,192(%rdi) -movl %ecx,196(%rdi) -movl %r8d,200(%rdi) -movl %r9d,204(%rdi) -paddd 304(%rsp),%xmm14 -paddd 128(%rsp),%xmm0 -paddd 192(%rsp),%xmm5 -paddd 256(%rsp),%xmm8 -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 -xorl 16(%rsi),%edx -xorl 20(%rsi),%ecx -xorl 24(%rsi),%r8d -xorl 28(%rsi),%r9d -movl %edx,16(%rdi) -movl %ecx,20(%rdi) -movl %r8d,24(%rdi) -movl %r9d,28(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 -xorl 80(%rsi),%edx -xorl 84(%rsi),%ecx -xorl 88(%rsi),%r8d -xorl 92(%rsi),%r9d -movl %edx,80(%rdi) -movl %ecx,84(%rdi) -movl %r8d,88(%rdi) -movl %r9d,92(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 -xorl 144(%rsi),%edx -xorl 148(%rsi),%ecx -xorl 152(%rsi),%r8d -xorl 156(%rsi),%r9d -movl %edx,144(%rdi) -movl %ecx,148(%rdi) -movl %r8d,152(%rdi) -movl %r9d,156(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -xorl 208(%rsi),%edx -xorl 212(%rsi),%ecx -xorl 216(%rsi),%r8d -xorl 220(%rsi),%r9d -movl %edx,208(%rdi) -movl %ecx,212(%rdi) -movl %r8d,216(%rdi) -movl %r9d,220(%rdi) -paddd 352(%rsp),%xmm15 -paddd 368(%rsp),%xmm11 -paddd 144(%rsp),%xmm1 
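
After the rounds, the code above adds the saved input words (kept at 128(%rsp) through 368(%rsp)) back into the state - the usual Salsa20 feedforward - and then peels each vector apart with movd plus pshufd $0x39, XORing word by word against the message at a 64-byte stride, because lane j of every state vector belongs to block j of the batch. A hedged intrinsics sketch of one such word store (xor_word_4blocks is a hypothetical helper, not in this source):

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

static void
xor_word_4blocks(__m128i v, __m128i initial, size_t w,
                 const unsigned char *m, unsigned char *c)
{
    uint32_t lane[4];
    size_t   j;

    v = _mm_add_epi32(v, initial);         /* feedforward */
    _mm_storeu_si128((__m128i *) (void *) lane, v);
    for (j = 0; j < 4; j++) {              /* lane j -> block j */
        uint32_t t;

        memcpy(&t, m + j * 64 + w * 4, 4); /* unaligned-safe load */
        t ^= lane[j];
        memcpy(c + j * 64 + w * 4, &t, 4);
    }
}
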
-paddd 208(%rsp),%xmm6 -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 32(%rsi),%edx -xorl 36(%rsi),%ecx -xorl 40(%rsi),%r8d -xorl 44(%rsi),%r9d -movl %edx,32(%rdi) -movl %ecx,36(%rdi) -movl %r8d,40(%rdi) -movl %r9d,44(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 96(%rsi),%edx -xorl 100(%rsi),%ecx -xorl 104(%rsi),%r8d -xorl 108(%rsi),%r9d -movl %edx,96(%rdi) -movl %ecx,100(%rdi) -movl %r8d,104(%rdi) -movl %r9d,108(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 160(%rsi),%edx -xorl 164(%rsi),%ecx -xorl 168(%rsi),%r8d -xorl 172(%rsi),%r9d -movl %edx,160(%rdi) -movl %ecx,164(%rdi) -movl %r8d,168(%rdi) -movl %r9d,172(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -xorl 224(%rsi),%edx -xorl 228(%rsi),%ecx -xorl 232(%rsi),%r8d -xorl 236(%rsi),%r9d -movl %edx,224(%rdi) -movl %ecx,228(%rdi) -movl %r8d,232(%rdi) -movl %r9d,236(%rdi) -paddd 224(%rsp),%xmm13 -paddd 272(%rsp),%xmm9 -paddd 320(%rsp),%xmm3 -paddd 160(%rsp),%xmm2 -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 -pshufd $0x39,%xmm2,%xmm2 -xorl 48(%rsi),%edx -xorl 52(%rsi),%ecx -xorl 56(%rsi),%r8d -xorl 60(%rsi),%r9d -movl %edx,48(%rdi) -movl %ecx,52(%rdi) -movl %r8d,56(%rdi) -movl %r9d,60(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 -pshufd $0x39,%xmm2,%xmm2 -xorl 112(%rsi),%edx -xorl 116(%rsi),%ecx -xorl 120(%rsi),%r8d -xorl 124(%rsi),%r9d -movl %edx,112(%rdi) -movl %ecx,116(%rdi) -movl %r8d,120(%rdi) -movl %r9d,124(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 -pshufd $0x39,%xmm2,%xmm2 -xorl 176(%rsi),%edx -xorl 180(%rsi),%ecx -xorl 184(%rsi),%r8d -xorl 188(%rsi),%r9d -movl %edx,176(%rdi) -movl %ecx,180(%rdi) -movl %r8d,184(%rdi) -movl %r9d,188(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -xorl 240(%rsi),%edx -xorl 244(%rsi),%ecx -xorl 248(%rsi),%r8d -xorl 252(%rsi),%r9d -movl %edx,240(%rdi) -movl %ecx,244(%rdi) -movl %r8d,248(%rdi) -movl %r9d,252(%rdi) -movq 480(%rsp),%r9 -sub $256,%r9 -add $256,%rsi -add $256,%rdi -cmp $256,%r9 -jae ._bytesatleast256 - -cmp $0,%r9 -jbe ._done - -._bytesbetween1and255: -cmp $64,%r9 -jae ._nocopy - -mov %rdi,%rdx -leaq 0(%rsp),%rdi -mov %r9,%rcx -rep movsb -leaq 0(%rsp),%rdi -leaq 0(%rsp),%rsi - -._nocopy: -movq %r9,480(%rsp) -movdqa 112(%rsp),%xmm0 -movdqa 64(%rsp),%xmm1 -movdqa 80(%rsp),%xmm2 -movdqa 96(%rsp),%xmm3 -movdqa %xmm1,%xmm4 -mov $20,%rcx - -.p2align 4 -._mainloop2: -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm3 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm3,%xmm3 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm1 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa 
%xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm1 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm1,%xmm1 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm3 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm3,%xmm3 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm3 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm3,%xmm3 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm1 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm1 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm1,%xmm1 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm3 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm3 -sub $4,%rcx -paddd %xmm3,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -pxor %xmm7,%xmm7 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm3,%xmm3 -pxor %xmm6,%xmm0 -ja ._mainloop2 - -paddd 112(%rsp),%xmm0 -paddd 64(%rsp),%xmm1 -paddd 80(%rsp),%xmm2 -paddd 96(%rsp),%xmm3 -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 0(%rsi),%ecx -xorl 48(%rsi),%r8d -xorl 32(%rsi),%r9d -xorl 16(%rsi),%eax -movl %ecx,0(%rdi) -movl %r8d,48(%rdi) -movl %r9d,32(%rdi) -movl %eax,16(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 20(%rsi),%ecx -xorl 4(%rsi),%r8d -xorl 52(%rsi),%r9d -xorl 36(%rsi),%eax -movl %ecx,20(%rdi) -movl %r8d,4(%rdi) -movl %r9d,52(%rdi) -movl %eax,36(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 40(%rsi),%ecx -xorl 24(%rsi),%r8d -xorl 8(%rsi),%r9d -xorl 56(%rsi),%eax -movl %ecx,40(%rdi) -movl %r8d,24(%rdi) -movl %r9d,8(%rdi) -movl %eax,56(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -xorl 60(%rsi),%ecx -xorl 44(%rsi),%r8d -xorl 28(%rsi),%r9d -xorl 12(%rsi),%eax -movl %ecx,60(%rdi) -movl %r8d,44(%rdi) -movl %r9d,28(%rdi) -movl %eax,12(%rdi) -movq 480(%rsp),%r9 -movq 472(%rsp),%rcx -add $1,%rcx -mov %rcx,%r8 -shr $32,%r8 -movl %ecx,80(%rsp) -movl %r8d,4+96(%rsp) -movq %rcx,472(%rsp) -cmp $64,%r9 -ja ._bytesatleast65 -jae ._bytesatleast64 - -mov %rdi,%rsi -mov %rdx,%rdi -mov %r9,%rcx -rep movsb - -._bytesatleast64: -._done: -movq 
416(%rsp),%r11 -movq 424(%rsp),%r12 -movq 432(%rsp),%r13 -movq 440(%rsp),%r14 -movq 448(%rsp),%r15 -movq 456(%rsp),%rbx -movq 464(%rsp),%rbp -add %r11,%rsp -xor %rax,%rax -mov %rsi,%rdx -ret - -._bytesatleast65: -sub $64,%r9 -add $64,%rdi -add $64,%rsi -jmp ._bytesbetween1and255 - -#endif - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif +#ifdef HAVE_AMD64_ASM + +.text +.p2align 5 + +#ifdef ASM_HIDE_SYMBOL +ASM_HIDE_SYMBOL stream_salsa20_xmm6 +ASM_HIDE_SYMBOL _stream_salsa20_xmm6 +#endif +.globl stream_salsa20_xmm6 +.globl _stream_salsa20_xmm6 +#ifdef __ELF__ +.type stream_salsa20_xmm6, @function +.type _stream_salsa20_xmm6, @function +#endif +stream_salsa20_xmm6: +_stream_salsa20_xmm6: +mov %rsp,%r11 +and $31,%r11 +add $512,%r11 +sub %r11,%rsp +movq %r11,416(%rsp) +movq %r12,424(%rsp) +movq %r13,432(%rsp) +movq %r14,440(%rsp) +movq %r15,448(%rsp) +movq %rbx,456(%rsp) +movq %rbp,464(%rsp) +mov %rsi,%r9 +mov %rdi,%rdi +mov %rdi,%rsi +mov %rdx,%rdx +mov %rcx,%r10 +cmp $0,%r9 +jbe ._done +mov $0,%rax +mov %r9,%rcx +rep stosb +sub %r9,%rdi +movq $0,472(%rsp) +jmp ._start + +.text +.p2align 5 + +#ifdef ASM_HIDE_SYMBOL +ASM_HIDE_SYMBOL stream_salsa20_xmm6_xor_ic +ASM_HIDE_SYMBOL _stream_salsa20_xmm6_xor_ic +#endif +.globl stream_salsa20_xmm6_xor_ic +.globl _stream_salsa20_xmm6_xor_ic +#ifdef __ELF__ +.type stream_salsa20_xmm6_xor_ic, @function +.type _stream_salsa20_xmm6_xor_ic, @function +#endif +stream_salsa20_xmm6_xor_ic: +_stream_salsa20_xmm6_xor_ic: + +mov %rsp,%r11 +and $31,%r11 +add $512,%r11 +sub %r11,%rsp +movq %r11,416(%rsp) +movq %r12,424(%rsp) +movq %r13,432(%rsp) +movq %r14,440(%rsp) +movq %r15,448(%rsp) +movq %rbx,456(%rsp) +movq %rbp,464(%rsp) +mov %rdi,%rdi +mov %rsi,%rsi +mov %r9,%r10 +movq %r8,472(%rsp) +mov %rdx,%r9 +mov %rcx,%rdx +cmp $0,%r9 +jbe ._done + +._start: +movl 20(%r10),%ecx +movl 0(%r10),%r8d +movl 0(%rdx),%eax +movl 16(%r10),%r11d +movl %ecx,64(%rsp) +movl %r8d,4+64(%rsp) +movl %eax,8+64(%rsp) +movl %r11d,12+64(%rsp) +movl 24(%r10),%r8d +movl 4(%r10),%eax +movl 4(%rdx),%edx +movq 472(%rsp),%rcx +movl %ecx,80(%rsp) +movl %r8d,4+80(%rsp) +movl %eax,8+80(%rsp) +movl %edx,12+80(%rsp) +movl 12(%r10),%edx +shr $32,%rcx +movl 28(%r10),%r8d +movl 8(%r10),%eax +movl %edx,96(%rsp) +movl %ecx,4+96(%rsp) +movl %r8d,8+96(%rsp) +movl %eax,12+96(%rsp) +mov $1634760805,%rdx +mov $857760878,%rcx +mov $2036477234,%r8 +mov $1797285236,%rax +movl %edx,112(%rsp) +movl %ecx,4+112(%rsp) +movl %r8d,8+112(%rsp) +movl %eax,12+112(%rsp) +cmp $256,%r9 +jb ._bytesbetween1and255 +movdqa 112(%rsp),%xmm0 +pshufd $0x55,%xmm0,%xmm1 +pshufd $0xaa,%xmm0,%xmm2 +pshufd $0xff,%xmm0,%xmm3 +pshufd $0x00,%xmm0,%xmm0 +movdqa %xmm1,128(%rsp) +movdqa %xmm2,144(%rsp) +movdqa %xmm3,160(%rsp) +movdqa %xmm0,176(%rsp) +movdqa 64(%rsp),%xmm0 +pshufd $0xaa,%xmm0,%xmm1 +pshufd $0xff,%xmm0,%xmm2 +pshufd $0x00,%xmm0,%xmm3 +pshufd $0x55,%xmm0,%xmm0 +movdqa %xmm1,192(%rsp) +movdqa %xmm2,208(%rsp) +movdqa %xmm3,224(%rsp) +movdqa %xmm0,240(%rsp) +movdqa 80(%rsp),%xmm0 +pshufd $0xff,%xmm0,%xmm1 +pshufd $0x55,%xmm0,%xmm2 +pshufd $0xaa,%xmm0,%xmm0 +movdqa %xmm1,256(%rsp) +movdqa %xmm2,272(%rsp) +movdqa %xmm0,288(%rsp) +movdqa 96(%rsp),%xmm0 +pshufd $0x00,%xmm0,%xmm1 +pshufd $0xaa,%xmm0,%xmm2 +pshufd $0xff,%xmm0,%xmm0 +movdqa %xmm1,304(%rsp) +movdqa %xmm2,320(%rsp) +movdqa %xmm0,336(%rsp) + +.p2align 4 +._bytesatleast256: +movq 472(%rsp),%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,352(%rsp) +movl %ecx,368(%rsp) +add $1,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,4+352(%rsp) +movl 
%ecx,4+368(%rsp) +add $1,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,8+352(%rsp) +movl %ecx,8+368(%rsp) +add $1,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,12+352(%rsp) +movl %ecx,12+368(%rsp) +add $1,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,80(%rsp) +movl %ecx,4+96(%rsp) +movq %rdx,472(%rsp) +movq %r9,480(%rsp) +mov $20,%rdx +movdqa 128(%rsp),%xmm0 +movdqa 144(%rsp),%xmm1 +movdqa 160(%rsp),%xmm2 +movdqa 320(%rsp),%xmm3 +movdqa 336(%rsp),%xmm4 +movdqa 192(%rsp),%xmm5 +movdqa 208(%rsp),%xmm6 +movdqa 240(%rsp),%xmm7 +movdqa 256(%rsp),%xmm8 +movdqa 272(%rsp),%xmm9 +movdqa 288(%rsp),%xmm10 +movdqa 368(%rsp),%xmm11 +movdqa 176(%rsp),%xmm12 +movdqa 224(%rsp),%xmm13 +movdqa 304(%rsp),%xmm14 +movdqa 352(%rsp),%xmm15 + +.p2align 4 +._mainloop1: +movdqa %xmm1,384(%rsp) +movdqa %xmm2,400(%rsp) +movdqa %xmm13,%xmm1 +paddd %xmm12,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm14 +psrld $25,%xmm2 +pxor %xmm2,%xmm14 +movdqa %xmm7,%xmm1 +paddd %xmm0,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm11 +psrld $25,%xmm2 +pxor %xmm2,%xmm11 +movdqa %xmm12,%xmm1 +paddd %xmm14,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm15 +psrld $23,%xmm2 +pxor %xmm2,%xmm15 +movdqa %xmm0,%xmm1 +paddd %xmm11,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm9 +psrld $23,%xmm2 +pxor %xmm2,%xmm9 +movdqa %xmm14,%xmm1 +paddd %xmm15,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm13 +psrld $19,%xmm2 +pxor %xmm2,%xmm13 +movdqa %xmm11,%xmm1 +paddd %xmm9,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm7 +psrld $19,%xmm2 +pxor %xmm2,%xmm7 +movdqa %xmm15,%xmm1 +paddd %xmm13,%xmm1 +movdqa %xmm1,%xmm2 +pslld $18,%xmm1 +pxor %xmm1,%xmm12 +psrld $14,%xmm2 +pxor %xmm2,%xmm12 +movdqa 384(%rsp),%xmm1 +movdqa %xmm12,384(%rsp) +movdqa %xmm9,%xmm2 +paddd %xmm7,%xmm2 +movdqa %xmm2,%xmm12 +pslld $18,%xmm2 +pxor %xmm2,%xmm0 +psrld $14,%xmm12 +pxor %xmm12,%xmm0 +movdqa %xmm5,%xmm2 +paddd %xmm1,%xmm2 +movdqa %xmm2,%xmm12 +pslld $7,%xmm2 +pxor %xmm2,%xmm3 +psrld $25,%xmm12 +pxor %xmm12,%xmm3 +movdqa 400(%rsp),%xmm2 +movdqa %xmm0,400(%rsp) +movdqa %xmm6,%xmm0 +paddd %xmm2,%xmm0 +movdqa %xmm0,%xmm12 +pslld $7,%xmm0 +pxor %xmm0,%xmm4 +psrld $25,%xmm12 +pxor %xmm12,%xmm4 +movdqa %xmm1,%xmm0 +paddd %xmm3,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm10 +psrld $23,%xmm12 +pxor %xmm12,%xmm10 +movdqa %xmm2,%xmm0 +paddd %xmm4,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm8 +psrld $23,%xmm12 +pxor %xmm12,%xmm8 +movdqa %xmm3,%xmm0 +paddd %xmm10,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm5 +psrld $19,%xmm12 +pxor %xmm12,%xmm5 +movdqa %xmm4,%xmm0 +paddd %xmm8,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm6 +psrld $19,%xmm12 +pxor %xmm12,%xmm6 +movdqa %xmm10,%xmm0 +paddd %xmm5,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm1 +psrld $14,%xmm12 +pxor %xmm12,%xmm1 +movdqa 384(%rsp),%xmm0 +movdqa %xmm1,384(%rsp) +movdqa %xmm4,%xmm1 +paddd %xmm0,%xmm1 +movdqa %xmm1,%xmm12 +pslld $7,%xmm1 +pxor %xmm1,%xmm7 +psrld $25,%xmm12 +pxor %xmm12,%xmm7 +movdqa %xmm8,%xmm1 +paddd %xmm6,%xmm1 +movdqa %xmm1,%xmm12 +pslld $18,%xmm1 +pxor %xmm1,%xmm2 +psrld $14,%xmm12 +pxor %xmm12,%xmm2 +movdqa 400(%rsp),%xmm12 +movdqa %xmm2,400(%rsp) +movdqa %xmm14,%xmm1 +paddd %xmm12,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm5 +psrld $25,%xmm2 +pxor %xmm2,%xmm5 +movdqa %xmm0,%xmm1 +paddd %xmm7,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm10 +psrld $23,%xmm2 +pxor %xmm2,%xmm10 +movdqa %xmm12,%xmm1 +paddd %xmm5,%xmm1 +movdqa %xmm1,%xmm2 
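
The counter setup at ._bytesatleast256 above expands the 64-bit block counter saved at 472(%rsp) into four per-lane counters n through n+3, storing the low halves at 352(%rsp) and the high halves at 368(%rsp), before writing the counter advanced by four back. A scalar sketch of that expansion (expand_block_counters is an illustrative name, not from this source):

#include <stdint.h>

static void
expand_block_counters(uint64_t n, uint32_t lo[4], uint32_t hi[4])
{
    int j;

    for (j = 0; j < 4; j++) {
        lo[j] = (uint32_t) (n + (uint64_t) j);
        hi[j] = (uint32_t) ((n + (uint64_t) j) >> 32);
    }
}
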
+pslld $9,%xmm1 +pxor %xmm1,%xmm8 +psrld $23,%xmm2 +pxor %xmm2,%xmm8 +movdqa %xmm7,%xmm1 +paddd %xmm10,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm4 +psrld $19,%xmm2 +pxor %xmm2,%xmm4 +movdqa %xmm5,%xmm1 +paddd %xmm8,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm14 +psrld $19,%xmm2 +pxor %xmm2,%xmm14 +movdqa %xmm10,%xmm1 +paddd %xmm4,%xmm1 +movdqa %xmm1,%xmm2 +pslld $18,%xmm1 +pxor %xmm1,%xmm0 +psrld $14,%xmm2 +pxor %xmm2,%xmm0 +movdqa 384(%rsp),%xmm1 +movdqa %xmm0,384(%rsp) +movdqa %xmm8,%xmm0 +paddd %xmm14,%xmm0 +movdqa %xmm0,%xmm2 +pslld $18,%xmm0 +pxor %xmm0,%xmm12 +psrld $14,%xmm2 +pxor %xmm2,%xmm12 +movdqa %xmm11,%xmm0 +paddd %xmm1,%xmm0 +movdqa %xmm0,%xmm2 +pslld $7,%xmm0 +pxor %xmm0,%xmm6 +psrld $25,%xmm2 +pxor %xmm2,%xmm6 +movdqa 400(%rsp),%xmm2 +movdqa %xmm12,400(%rsp) +movdqa %xmm3,%xmm0 +paddd %xmm2,%xmm0 +movdqa %xmm0,%xmm12 +pslld $7,%xmm0 +pxor %xmm0,%xmm13 +psrld $25,%xmm12 +pxor %xmm12,%xmm13 +movdqa %xmm1,%xmm0 +paddd %xmm6,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm15 +psrld $23,%xmm12 +pxor %xmm12,%xmm15 +movdqa %xmm2,%xmm0 +paddd %xmm13,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm9 +psrld $23,%xmm12 +pxor %xmm12,%xmm9 +movdqa %xmm6,%xmm0 +paddd %xmm15,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm11 +psrld $19,%xmm12 +pxor %xmm12,%xmm11 +movdqa %xmm13,%xmm0 +paddd %xmm9,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm3 +psrld $19,%xmm12 +pxor %xmm12,%xmm3 +movdqa %xmm15,%xmm0 +paddd %xmm11,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm1 +psrld $14,%xmm12 +pxor %xmm12,%xmm1 +movdqa %xmm9,%xmm0 +paddd %xmm3,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm2 +psrld $14,%xmm12 +pxor %xmm12,%xmm2 +movdqa 384(%rsp),%xmm12 +movdqa 400(%rsp),%xmm0 +sub $2,%rdx +ja ._mainloop1 + +paddd 176(%rsp),%xmm12 +paddd 240(%rsp),%xmm7 +paddd 288(%rsp),%xmm10 +paddd 336(%rsp),%xmm4 +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 0(%rsi),%edx +xorl 4(%rsi),%ecx +xorl 8(%rsi),%r8d +xorl 12(%rsi),%r9d +movl %edx,0(%rdi) +movl %ecx,4(%rdi) +movl %r8d,8(%rdi) +movl %r9d,12(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 64(%rsi),%edx +xorl 68(%rsi),%ecx +xorl 72(%rsi),%r8d +xorl 76(%rsi),%r9d +movl %edx,64(%rdi) +movl %ecx,68(%rdi) +movl %r8d,72(%rdi) +movl %r9d,76(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 128(%rsi),%edx +xorl 132(%rsi),%ecx +xorl 136(%rsi),%r8d +xorl 140(%rsi),%r9d +movl %edx,128(%rdi) +movl %ecx,132(%rdi) +movl %r8d,136(%rdi) +movl %r9d,140(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +xorl 192(%rsi),%edx +xorl 196(%rsi),%ecx +xorl 200(%rsi),%r8d +xorl 204(%rsi),%r9d +movl %edx,192(%rdi) +movl %ecx,196(%rdi) +movl %r8d,200(%rdi) +movl %r9d,204(%rdi) +paddd 304(%rsp),%xmm14 +paddd 128(%rsp),%xmm0 +paddd 192(%rsp),%xmm5 +paddd 256(%rsp),%xmm8 +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 16(%rsi),%edx +xorl 20(%rsi),%ecx +xorl 24(%rsi),%r8d +xorl 28(%rsi),%r9d +movl %edx,16(%rdi) +movl 
%ecx,20(%rdi) +movl %r8d,24(%rdi) +movl %r9d,28(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 80(%rsi),%edx +xorl 84(%rsi),%ecx +xorl 88(%rsi),%r8d +xorl 92(%rsi),%r9d +movl %edx,80(%rdi) +movl %ecx,84(%rdi) +movl %r8d,88(%rdi) +movl %r9d,92(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 144(%rsi),%edx +xorl 148(%rsi),%ecx +xorl 152(%rsi),%r8d +xorl 156(%rsi),%r9d +movl %edx,144(%rdi) +movl %ecx,148(%rdi) +movl %r8d,152(%rdi) +movl %r9d,156(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +xorl 208(%rsi),%edx +xorl 212(%rsi),%ecx +xorl 216(%rsi),%r8d +xorl 220(%rsi),%r9d +movl %edx,208(%rdi) +movl %ecx,212(%rdi) +movl %r8d,216(%rdi) +movl %r9d,220(%rdi) +paddd 352(%rsp),%xmm15 +paddd 368(%rsp),%xmm11 +paddd 144(%rsp),%xmm1 +paddd 208(%rsp),%xmm6 +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 32(%rsi),%edx +xorl 36(%rsi),%ecx +xorl 40(%rsi),%r8d +xorl 44(%rsi),%r9d +movl %edx,32(%rdi) +movl %ecx,36(%rdi) +movl %r8d,40(%rdi) +movl %r9d,44(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 96(%rsi),%edx +xorl 100(%rsi),%ecx +xorl 104(%rsi),%r8d +xorl 108(%rsi),%r9d +movl %edx,96(%rdi) +movl %ecx,100(%rdi) +movl %r8d,104(%rdi) +movl %r9d,108(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 160(%rsi),%edx +xorl 164(%rsi),%ecx +xorl 168(%rsi),%r8d +xorl 172(%rsi),%r9d +movl %edx,160(%rdi) +movl %ecx,164(%rdi) +movl %r8d,168(%rdi) +movl %r9d,172(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +xorl 224(%rsi),%edx +xorl 228(%rsi),%ecx +xorl 232(%rsi),%r8d +xorl 236(%rsi),%r9d +movl %edx,224(%rdi) +movl %ecx,228(%rdi) +movl %r8d,232(%rdi) +movl %r9d,236(%rdi) +paddd 224(%rsp),%xmm13 +paddd 272(%rsp),%xmm9 +paddd 320(%rsp),%xmm3 +paddd 160(%rsp),%xmm2 +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 48(%rsi),%edx +xorl 52(%rsi),%ecx +xorl 56(%rsi),%r8d +xorl 60(%rsi),%r9d +movl %edx,48(%rdi) +movl %ecx,52(%rdi) +movl %r8d,56(%rdi) +movl %r9d,60(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 112(%rsi),%edx +xorl 116(%rsi),%ecx +xorl 120(%rsi),%r8d +xorl 124(%rsi),%r9d +movl %edx,112(%rdi) +movl %ecx,116(%rdi) +movl %r8d,120(%rdi) +movl %r9d,124(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 176(%rsi),%edx +xorl 180(%rsi),%ecx +xorl 184(%rsi),%r8d +xorl 188(%rsi),%r9d +movl %edx,176(%rdi) +movl %ecx,180(%rdi) +movl %r8d,184(%rdi) +movl %r9d,188(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +xorl 240(%rsi),%edx +xorl 244(%rsi),%ecx +xorl 248(%rsi),%r8d 
+xorl 252(%rsi),%r9d +movl %edx,240(%rdi) +movl %ecx,244(%rdi) +movl %r8d,248(%rdi) +movl %r9d,252(%rdi) +movq 480(%rsp),%r9 +sub $256,%r9 +add $256,%rsi +add $256,%rdi +cmp $256,%r9 +jae ._bytesatleast256 + +cmp $0,%r9 +jbe ._done + +._bytesbetween1and255: +cmp $64,%r9 +jae ._nocopy + +mov %rdi,%rdx +leaq 0(%rsp),%rdi +mov %r9,%rcx +rep movsb +leaq 0(%rsp),%rdi +leaq 0(%rsp),%rsi + +._nocopy: +movq %r9,480(%rsp) +movdqa 112(%rsp),%xmm0 +movdqa 64(%rsp),%xmm1 +movdqa 80(%rsp),%xmm2 +movdqa 96(%rsp),%xmm3 +movdqa %xmm1,%xmm4 +mov $20,%rcx + +.p2align 4 +._mainloop2: +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm3 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm3,%xmm3 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm1 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm1 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm1,%xmm1 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm3 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm3,%xmm3 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm3 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm3,%xmm3 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm1 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm1 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm1,%xmm1 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm3 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm3 +sub $4,%rcx +paddd %xmm3,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +pxor %xmm7,%xmm7 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm3,%xmm3 +pxor %xmm6,%xmm0 +ja ._mainloop2 + +paddd 112(%rsp),%xmm0 +paddd 64(%rsp),%xmm1 +paddd 80(%rsp),%xmm2 +paddd 96(%rsp),%xmm3 +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 +pshufd $0x39,%xmm3,%xmm3 +xorl 0(%rsi),%ecx +xorl 48(%rsi),%r8d +xorl 32(%rsi),%r9d +xorl 16(%rsi),%eax +movl %ecx,0(%rdi) +movl %r8d,48(%rdi) +movl %r9d,32(%rdi) +movl %eax,16(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 
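
For a final fragment shorter than 64 bytes, ._bytesbetween1and255 above stages the input in the 64-byte scratch area at 0(%rsp) with rep movsb so the single-block code can read and write whole blocks, and the valid prefix is copied back out after ._bytesatleast64. A sketch of that staging under the stated assumption (block_xor stands in for the single-block keystream XOR and is not a function in this source):

#include <string.h>

static void
xor_final_partial_block(unsigned char *c, const unsigned char *m,
                        size_t left, void (*block_xor)(unsigned char buf[64]))
{
    unsigned char buf[64] = { 0 };

    memcpy(buf, m, left); /* stage the short input in a full block */
    block_xor(buf);       /* transform one whole 64-byte block in place */
    memcpy(c, buf, left); /* emit only the bytes that really exist */
}
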
+pshufd $0x39,%xmm3,%xmm3 +xorl 20(%rsi),%ecx +xorl 4(%rsi),%r8d +xorl 52(%rsi),%r9d +xorl 36(%rsi),%eax +movl %ecx,20(%rdi) +movl %r8d,4(%rdi) +movl %r9d,52(%rdi) +movl %eax,36(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 +pshufd $0x39,%xmm3,%xmm3 +xorl 40(%rsi),%ecx +xorl 24(%rsi),%r8d +xorl 8(%rsi),%r9d +xorl 56(%rsi),%eax +movl %ecx,40(%rdi) +movl %r8d,24(%rdi) +movl %r9d,8(%rdi) +movl %eax,56(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +xorl 60(%rsi),%ecx +xorl 44(%rsi),%r8d +xorl 28(%rsi),%r9d +xorl 12(%rsi),%eax +movl %ecx,60(%rdi) +movl %r8d,44(%rdi) +movl %r9d,28(%rdi) +movl %eax,12(%rdi) +movq 480(%rsp),%r9 +movq 472(%rsp),%rcx +add $1,%rcx +mov %rcx,%r8 +shr $32,%r8 +movl %ecx,80(%rsp) +movl %r8d,4+96(%rsp) +movq %rcx,472(%rsp) +cmp $64,%r9 +ja ._bytesatleast65 +jae ._bytesatleast64 + +mov %rdi,%rsi +mov %rdx,%rdi +mov %r9,%rcx +rep movsb + +._bytesatleast64: +._done: +movq 416(%rsp),%r11 +movq 424(%rsp),%r12 +movq 432(%rsp),%r13 +movq 440(%rsp),%r14 +movq 448(%rsp),%r15 +movq 456(%rsp),%rbx +movq 464(%rsp),%rbp +add %r11,%rsp +xor %rax,%rax +mov %rsi,%rdx +ret + +._bytesatleast65: +sub $64,%r9 +add $64,%rdi +add $64,%rsi +jmp ._bytesbetween1and255 + +#endif + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c index 0a6fee0f3e..504727038f 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c @@ -1,31 +1,31 @@ - -#include - -#include "utils.h" - -#include "../stream_salsa20.h" -#include "salsa20_xmm6.h" - -#ifdef HAVE_AMD64_ASM - -#ifdef __cplusplus -extern "C" { -#endif -extern int stream_salsa20_xmm6(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k); - -extern int stream_salsa20_xmm6_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, - uint64_t ic, const unsigned char *k); -#ifdef __cplusplus -} -#endif - -struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6_implementation = { - SODIUM_C99(.stream =) stream_salsa20_xmm6, - SODIUM_C99(.stream_xor_ic =) stream_salsa20_xmm6_xor_ic, - }; - -#endif + +#include + +#include "utils.h" + +#include "../stream_salsa20.h" +#include "salsa20_xmm6.h" + +#ifdef HAVE_AMD64_ASM + +#ifdef __cplusplus +extern "C" { +#endif +extern int stream_salsa20_xmm6(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + +extern int stream_salsa20_xmm6_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, + uint64_t ic, const unsigned char *k); +#ifdef __cplusplus +} +#endif + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6_implementation = { + SODIUM_C99(.stream =) stream_salsa20_xmm6, + SODIUM_C99(.stream_xor_ic =) stream_salsa20_xmm6_xor_ic, + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h index d38473a9ff..3ccbb5e8e6 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_salsa20.h" -#include 
"crypto_stream_salsa20.h" - -extern struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6_implementation; + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6_implementation; diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c index 18d4773ec9..95bb63fd13 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c @@ -1,131 +1,131 @@ - -#include -#include -#include - -#include "crypto_stream_salsa20.h" -#include "private/common.h" -#include "private/sse2_64_32.h" -#include "utils.h" - -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) - -# ifdef __GNUC__ -# pragma GCC target("sse2") -# pragma GCC target("ssse3") -# pragma GCC target("sse4.1") -# pragma GCC target("avx2") -# endif - -#include -#include -#include -#include - -# include "../stream_salsa20.h" -# include "salsa20_xmm6int-avx2.h" - -# define ROUNDS 20 - -typedef struct salsa_ctx { - uint32_t input[16]; -} salsa_ctx; - -static const int TR[16] = { - 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 -}; - -static void -salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) -{ - ctx->input[TR[1]] = LOAD32_LE(k + 0); - ctx->input[TR[2]] = LOAD32_LE(k + 4); - ctx->input[TR[3]] = LOAD32_LE(k + 8); - ctx->input[TR[4]] = LOAD32_LE(k + 12); - ctx->input[TR[11]] = LOAD32_LE(k + 16); - ctx->input[TR[12]] = LOAD32_LE(k + 20); - ctx->input[TR[13]] = LOAD32_LE(k + 24); - ctx->input[TR[14]] = LOAD32_LE(k + 28); - ctx->input[TR[0]] = 0x61707865; - ctx->input[TR[5]] = 0x3320646e; - ctx->input[TR[10]] = 0x79622d32; - ctx->input[TR[15]] = 0x6b206574; -} - -static void -salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[TR[6]] = LOAD32_LE(iv + 0); - ctx->input[TR[7]] = LOAD32_LE(iv + 4); - ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); - ctx->input[TR[9]] = counter == NULL ? 
0 : LOAD32_LE(counter + 4); -} - -static void -salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t * const x = &ctx->input[0]; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } - -#include "u8.h" -#include "u4.h" -#include "u1.h" -#include "u0.h" -} - -static int -stream_avx2(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct salsa_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); - salsa_keysetup(&ctx, k); - salsa_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - salsa20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_avx2_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct salsa_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = (uint32_t) (ic >> 32); - ic_low = (uint32_t) ic; - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - salsa_keysetup(&ctx, k); - salsa_ivsetup(&ctx, n, ic_bytes); - salsa20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6int_avx2_implementation = { - SODIUM_C99(.stream =) stream_avx2, - SODIUM_C99(.stream_xor_ic =) stream_avx2_xor_ic - }; - -#endif + +#include +#include +#include + +#include "crypto_stream_salsa20.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# pragma GCC target("sse4.1") +# pragma GCC target("avx2") +# endif + +#include +#include +#include +#include + +# include "../stream_salsa20.h" +# include "salsa20_xmm6int-avx2.h" + +# define ROUNDS 20 + +typedef struct salsa_ctx { + uint32_t input[16]; +} salsa_ctx; + +static const int TR[16] = { + 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 +}; + +static void +salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) +{ + ctx->input[TR[1]] = LOAD32_LE(k + 0); + ctx->input[TR[2]] = LOAD32_LE(k + 4); + ctx->input[TR[3]] = LOAD32_LE(k + 8); + ctx->input[TR[4]] = LOAD32_LE(k + 12); + ctx->input[TR[11]] = LOAD32_LE(k + 16); + ctx->input[TR[12]] = LOAD32_LE(k + 20); + ctx->input[TR[13]] = LOAD32_LE(k + 24); + ctx->input[TR[14]] = LOAD32_LE(k + 28); + ctx->input[TR[0]] = 0x61707865; + ctx->input[TR[5]] = 0x3320646e; + ctx->input[TR[10]] = 0x79622d32; + ctx->input[TR[15]] = 0x6b206574; +} + +static void +salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[TR[6]] = LOAD32_LE(iv + 0); + ctx->input[TR[7]] = LOAD32_LE(iv + 4); + ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[TR[9]] = counter == NULL ? 
0 : LOAD32_LE(counter + 4); +} + +static void +salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + +#include "u8.h" +#include "u4.h" +#include "u1.h" +#include "u0.h" +} + +static int +stream_avx2(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct salsa_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + salsa20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_avx2_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct salsa_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) ic; + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, ic_bytes); + salsa20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_avx2_implementation = { + SODIUM_C99(.stream =) stream_avx2, + SODIUM_C99(.stream_xor_ic =) stream_avx2_xor_ic + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h index 0924e9baff..a84ea0d2d0 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_salsa20.h" -#include "crypto_stream_salsa20.h" - -extern struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6int_avx2_implementation; + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_avx2_implementation; diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c index d8e53a6554..41dc8193fc 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c @@ -1,122 +1,122 @@ - -#include -#include -#include - -#include "crypto_stream_salsa20.h" -#include "private/common.h" -#include "private/sse2_64_32.h" -#include "utils.h" - -#ifdef HAVE_EMMINTRIN_H - -# ifdef __GNUC__ -# pragma GCC target("sse2") -# endif -# include - -# include "../stream_salsa20.h" -# include "salsa20_xmm6int-sse2.h" - -# define ROUNDS 20 - -typedef struct salsa_ctx { - uint32_t input[16]; -} salsa_ctx; - -static const int TR[16] = { - 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 -}; - -static void -salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) -{ - ctx->input[TR[1]] = LOAD32_LE(k + 0); - ctx->input[TR[2]] = LOAD32_LE(k + 4); - ctx->input[TR[3]] = LOAD32_LE(k + 8); - ctx->input[TR[4]] = LOAD32_LE(k + 12); - ctx->input[TR[11]] = LOAD32_LE(k + 16); - ctx->input[TR[12]] = LOAD32_LE(k + 20); - ctx->input[TR[13]] = LOAD32_LE(k + 24); - ctx->input[TR[14]] = LOAD32_LE(k + 28); - ctx->input[TR[0]] = 
0x61707865; - ctx->input[TR[5]] = 0x3320646e; - ctx->input[TR[10]] = 0x79622d32; - ctx->input[TR[15]] = 0x6b206574; -} - -static void -salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[TR[6]] = LOAD32_LE(iv + 0); - ctx->input[TR[7]] = LOAD32_LE(iv + 4); - ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); - ctx->input[TR[9]] = counter == NULL ? 0 : LOAD32_LE(counter + 4); -} - -static void -salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t * const x = &ctx->input[0]; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } - -#include "u4.h" -#include "u1.h" -#include "u0.h" -} - -static int -stream_sse2(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct salsa_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); - salsa_keysetup(&ctx, k); - salsa_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - salsa20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_sse2_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct salsa_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = (uint32_t) (ic >> 32); - ic_low = (uint32_t) (ic); - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - salsa_keysetup(&ctx, k); - salsa_ivsetup(&ctx, n, ic_bytes); - salsa20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6int_sse2_implementation = { - SODIUM_C99(.stream =) stream_sse2, - SODIUM_C99(.stream_xor_ic =) stream_sse2_xor_ic - }; - -#endif + +#include +#include +#include + +#include "crypto_stream_salsa20.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#ifdef HAVE_EMMINTRIN_H + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# endif +# include + +# include "../stream_salsa20.h" +# include "salsa20_xmm6int-sse2.h" + +# define ROUNDS 20 + +typedef struct salsa_ctx { + uint32_t input[16]; +} salsa_ctx; + +static const int TR[16] = { + 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 +}; + +static void +salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) +{ + ctx->input[TR[1]] = LOAD32_LE(k + 0); + ctx->input[TR[2]] = LOAD32_LE(k + 4); + ctx->input[TR[3]] = LOAD32_LE(k + 8); + ctx->input[TR[4]] = LOAD32_LE(k + 12); + ctx->input[TR[11]] = LOAD32_LE(k + 16); + ctx->input[TR[12]] = LOAD32_LE(k + 20); + ctx->input[TR[13]] = LOAD32_LE(k + 24); + ctx->input[TR[14]] = LOAD32_LE(k + 28); + ctx->input[TR[0]] = 0x61707865; + ctx->input[TR[5]] = 0x3320646e; + ctx->input[TR[10]] = 0x79622d32; + ctx->input[TR[15]] = 0x6b206574; +} + +static void +salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[TR[6]] = LOAD32_LE(iv + 0); + ctx->input[TR[7]] = LOAD32_LE(iv + 4); + ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[TR[9]] = counter == NULL ? 
0 : LOAD32_LE(counter + 4); +} + +static void +salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + +#include "u4.h" +#include "u1.h" +#include "u0.h" +} + +static int +stream_sse2(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct salsa_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + salsa20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_sse2_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct salsa_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) (ic); + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, ic_bytes); + salsa20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_sse2_implementation = { + SODIUM_C99(.stream =) stream_sse2, + SODIUM_C99(.stream_xor_ic =) stream_sse2_xor_ic + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h index ed52a8bcbe..627f3f80fd 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_salsa20.h" -#include "crypto_stream_salsa20.h" - -extern struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6int_sse2_implementation; + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_sse2_implementation; diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h index b2d4168058..e2634b4a3e 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h @@ -1,195 +1,195 @@ -if (bytes > 0) { - __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0)); - __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4)); - __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8)); - __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12)); - __m128i a0, a1, a2, a3, a4, a5, a6, a7; - __m128i b0, b1, b2, b3, b4, b5, b6, b7; - uint8_t partialblock[64]; - - unsigned int i; - - a0 = diag1; - for (i = 0; i < ROUNDS; i += 4) { - a0 = _mm_add_epi32(a0, diag0); - a1 = diag0; - b0 = a0; - a0 = _mm_slli_epi32(a0, 7); - b0 = _mm_srli_epi32(b0, 25); - diag3 = _mm_xor_si128(diag3, a0); - - diag3 = _mm_xor_si128(diag3, b0); - - a1 = _mm_add_epi32(a1, diag3); - a2 = diag3; - b1 = a1; - a1 = _mm_slli_epi32(a1, 9); - b1 = _mm_srli_epi32(b1, 23); - diag2 = _mm_xor_si128(diag2, a1); - diag3 = _mm_shuffle_epi32(diag3, 0x93); - diag2 = _mm_xor_si128(diag2, b1); - - a2 = _mm_add_epi32(a2, diag2); - a3 = diag2; - b2 = a2; - a2 = _mm_slli_epi32(a2, 13); - b2 = _mm_srli_epi32(b2, 19); - diag1 = _mm_xor_si128(diag1, 
a2); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag1 = _mm_xor_si128(diag1, b2); - - a3 = _mm_add_epi32(a3, diag1); - a4 = diag3; - b3 = a3; - a3 = _mm_slli_epi32(a3, 18); - b3 = _mm_srli_epi32(b3, 14); - diag0 = _mm_xor_si128(diag0, a3); - diag1 = _mm_shuffle_epi32(diag1, 0x39); - diag0 = _mm_xor_si128(diag0, b3); - - a4 = _mm_add_epi32(a4, diag0); - a5 = diag0; - b4 = a4; - a4 = _mm_slli_epi32(a4, 7); - b4 = _mm_srli_epi32(b4, 25); - diag1 = _mm_xor_si128(diag1, a4); - - diag1 = _mm_xor_si128(diag1, b4); - - a5 = _mm_add_epi32(a5, diag1); - a6 = diag1; - b5 = a5; - a5 = _mm_slli_epi32(a5, 9); - b5 = _mm_srli_epi32(b5, 23); - diag2 = _mm_xor_si128(diag2, a5); - diag1 = _mm_shuffle_epi32(diag1, 0x93); - diag2 = _mm_xor_si128(diag2, b5); - - a6 = _mm_add_epi32(a6, diag2); - a7 = diag2; - b6 = a6; - a6 = _mm_slli_epi32(a6, 13); - b6 = _mm_srli_epi32(b6, 19); - diag3 = _mm_xor_si128(diag3, a6); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag3 = _mm_xor_si128(diag3, b6); - - a7 = _mm_add_epi32(a7, diag3); - a0 = diag1; - b7 = a7; - a7 = _mm_slli_epi32(a7, 18); - b7 = _mm_srli_epi32(b7, 14); - diag0 = _mm_xor_si128(diag0, a7); - diag3 = _mm_shuffle_epi32(diag3, 0x39); - diag0 = _mm_xor_si128(diag0, b7); - - a0 = _mm_add_epi32(a0, diag0); - a1 = diag0; - b0 = a0; - a0 = _mm_slli_epi32(a0, 7); - b0 = _mm_srli_epi32(b0, 25); - diag3 = _mm_xor_si128(diag3, a0); - - diag3 = _mm_xor_si128(diag3, b0); - - a1 = _mm_add_epi32(a1, diag3); - a2 = diag3; - b1 = a1; - a1 = _mm_slli_epi32(a1, 9); - b1 = _mm_srli_epi32(b1, 23); - diag2 = _mm_xor_si128(diag2, a1); - diag3 = _mm_shuffle_epi32(diag3, 0x93); - diag2 = _mm_xor_si128(diag2, b1); - - a2 = _mm_add_epi32(a2, diag2); - a3 = diag2; - b2 = a2; - a2 = _mm_slli_epi32(a2, 13); - b2 = _mm_srli_epi32(b2, 19); - diag1 = _mm_xor_si128(diag1, a2); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag1 = _mm_xor_si128(diag1, b2); - - a3 = _mm_add_epi32(a3, diag1); - a4 = diag3; - b3 = a3; - a3 = _mm_slli_epi32(a3, 18); - b3 = _mm_srli_epi32(b3, 14); - diag0 = _mm_xor_si128(diag0, a3); - diag1 = _mm_shuffle_epi32(diag1, 0x39); - diag0 = _mm_xor_si128(diag0, b3); - - a4 = _mm_add_epi32(a4, diag0); - a5 = diag0; - b4 = a4; - a4 = _mm_slli_epi32(a4, 7); - b4 = _mm_srli_epi32(b4, 25); - diag1 = _mm_xor_si128(diag1, a4); - - diag1 = _mm_xor_si128(diag1, b4); - - a5 = _mm_add_epi32(a5, diag1); - a6 = diag1; - b5 = a5; - a5 = _mm_slli_epi32(a5, 9); - b5 = _mm_srli_epi32(b5, 23); - diag2 = _mm_xor_si128(diag2, a5); - diag1 = _mm_shuffle_epi32(diag1, 0x93); - diag2 = _mm_xor_si128(diag2, b5); - - a6 = _mm_add_epi32(a6, diag2); - a7 = diag2; - b6 = a6; - a6 = _mm_slli_epi32(a6, 13); - b6 = _mm_srli_epi32(b6, 19); - diag3 = _mm_xor_si128(diag3, a6); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag3 = _mm_xor_si128(diag3, b6); - - a7 = _mm_add_epi32(a7, diag3); - a0 = diag1; - b7 = a7; - a7 = _mm_slli_epi32(a7, 18); - b7 = _mm_srli_epi32(b7, 14); - diag0 = _mm_xor_si128(diag0, a7); - diag3 = _mm_shuffle_epi32(diag3, 0x39); - diag0 = _mm_xor_si128(diag0, b7); - } - - diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0))); - diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4))); - diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8))); - diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12))); - -#define ONEQUAD_SHUFFLE(A, B, C, D) \ - do { \ - uint32_t in##A = _mm_cvtsi128_si32(diag0); \ - uint32_t in##B = _mm_cvtsi128_si32(diag1); \ - uint32_t in##C = _mm_cvtsi128_si32(diag2); \ - uint32_t in##D = _mm_cvtsi128_si32(diag3); \ - 
diag0 = _mm_shuffle_epi32(diag0, 0x39); \ - diag1 = _mm_shuffle_epi32(diag1, 0x39); \ - diag2 = _mm_shuffle_epi32(diag2, 0x39); \ - diag3 = _mm_shuffle_epi32(diag3, 0x39); \ - *(uint32_t *) (partialblock + (A * 4)) = in##A; \ - *(uint32_t *) (partialblock + (B * 4)) = in##B; \ - *(uint32_t *) (partialblock + (C * 4)) = in##C; \ - *(uint32_t *) (partialblock + (D * 4)) = in##D; \ - } while (0) - -#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) - - ONEQUAD(0, 12, 8, 4); - ONEQUAD(5, 1, 13, 9); - ONEQUAD(10, 6, 2, 14); - ONEQUAD(15, 11, 7, 3); - -#undef ONEQUAD -#undef ONEQUAD_SHUFFLE - - for (i = 0; i < bytes; i++) { - c[i] = m[i] ^ partialblock[i]; - } - - sodium_memzero(partialblock, sizeof partialblock); -} +if (bytes > 0) { + __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0)); + __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4)); + __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8)); + __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12)); + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i b0, b1, b2, b3, b4, b5, b6, b7; + uint8_t partialblock[64]; + + unsigned int i; + + a0 = diag1; + for (i = 0; i < ROUNDS; i += 4) { + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = 
_mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + } + + diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0))); + diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4))); + diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8))); + diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12))); + +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + do { \ + uint32_t in##A = _mm_cvtsi128_si32(diag0); \ + uint32_t in##B = _mm_cvtsi128_si32(diag1); \ + uint32_t in##C = _mm_cvtsi128_si32(diag2); \ + uint32_t in##D = _mm_cvtsi128_si32(diag3); \ + diag0 = _mm_shuffle_epi32(diag0, 0x39); \ + diag1 = _mm_shuffle_epi32(diag1, 0x39); \ + diag2 = _mm_shuffle_epi32(diag2, 0x39); \ + diag3 = _mm_shuffle_epi32(diag3, 0x39); \ + *(uint32_t *) (partialblock + (A * 4)) = in##A; \ + *(uint32_t *) (partialblock + (B * 4)) = in##B; \ + *(uint32_t *) (partialblock + (C * 4)) = in##C; \ + *(uint32_t *) (partialblock + (D * 4)) = in##D; \ + } while (0) + +#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) + + ONEQUAD(0, 12, 8, 4); + ONEQUAD(5, 1, 13, 9); + ONEQUAD(10, 6, 2, 14); + ONEQUAD(15, 11, 7, 3); + +#undef ONEQUAD +#undef ONEQUAD_SHUFFLE + + for (i = 0; i < bytes; i++) { + c[i] = m[i] ^ partialblock[i]; + } + + sodium_memzero(partialblock, sizeof partialblock); +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h index c245d9565f..e246027e5c 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h @@ -1,207 +1,207 @@ -while (bytes >= 64) { - __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0)); - __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4)); - __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8)); - __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12)); - __m128i a0, a1, a2, a3, a4, a5, a6, a7; - __m128i b0, b1, b2, b3, b4, b5, b6, b7; - - uint32_t in8; - uint32_t in9; - int i; - - a0 = diag1; - for (i = 0; i < ROUNDS; i += 4) { - a0 = _mm_add_epi32(a0, diag0); - a1 = diag0; - b0 = a0; - a0 = _mm_slli_epi32(a0, 7); - b0 = _mm_srli_epi32(b0, 25); - diag3 = _mm_xor_si128(diag3, a0); - - diag3 = _mm_xor_si128(diag3, b0); - - a1 = _mm_add_epi32(a1, diag3); - a2 = diag3; - b1 = a1; - a1 = _mm_slli_epi32(a1, 9); - b1 = _mm_srli_epi32(b1, 
23); - diag2 = _mm_xor_si128(diag2, a1); - diag3 = _mm_shuffle_epi32(diag3, 0x93); - diag2 = _mm_xor_si128(diag2, b1); - - a2 = _mm_add_epi32(a2, diag2); - a3 = diag2; - b2 = a2; - a2 = _mm_slli_epi32(a2, 13); - b2 = _mm_srli_epi32(b2, 19); - diag1 = _mm_xor_si128(diag1, a2); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag1 = _mm_xor_si128(diag1, b2); - - a3 = _mm_add_epi32(a3, diag1); - a4 = diag3; - b3 = a3; - a3 = _mm_slli_epi32(a3, 18); - b3 = _mm_srli_epi32(b3, 14); - diag0 = _mm_xor_si128(diag0, a3); - diag1 = _mm_shuffle_epi32(diag1, 0x39); - diag0 = _mm_xor_si128(diag0, b3); - - a4 = _mm_add_epi32(a4, diag0); - a5 = diag0; - b4 = a4; - a4 = _mm_slli_epi32(a4, 7); - b4 = _mm_srli_epi32(b4, 25); - diag1 = _mm_xor_si128(diag1, a4); - - diag1 = _mm_xor_si128(diag1, b4); - - a5 = _mm_add_epi32(a5, diag1); - a6 = diag1; - b5 = a5; - a5 = _mm_slli_epi32(a5, 9); - b5 = _mm_srli_epi32(b5, 23); - diag2 = _mm_xor_si128(diag2, a5); - diag1 = _mm_shuffle_epi32(diag1, 0x93); - diag2 = _mm_xor_si128(diag2, b5); - - a6 = _mm_add_epi32(a6, diag2); - a7 = diag2; - b6 = a6; - a6 = _mm_slli_epi32(a6, 13); - b6 = _mm_srli_epi32(b6, 19); - diag3 = _mm_xor_si128(diag3, a6); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag3 = _mm_xor_si128(diag3, b6); - - a7 = _mm_add_epi32(a7, diag3); - a0 = diag1; - b7 = a7; - a7 = _mm_slli_epi32(a7, 18); - b7 = _mm_srli_epi32(b7, 14); - diag0 = _mm_xor_si128(diag0, a7); - diag3 = _mm_shuffle_epi32(diag3, 0x39); - diag0 = _mm_xor_si128(diag0, b7); - - a0 = _mm_add_epi32(a0, diag0); - a1 = diag0; - b0 = a0; - a0 = _mm_slli_epi32(a0, 7); - b0 = _mm_srli_epi32(b0, 25); - diag3 = _mm_xor_si128(diag3, a0); - - diag3 = _mm_xor_si128(diag3, b0); - - a1 = _mm_add_epi32(a1, diag3); - a2 = diag3; - b1 = a1; - a1 = _mm_slli_epi32(a1, 9); - b1 = _mm_srli_epi32(b1, 23); - diag2 = _mm_xor_si128(diag2, a1); - diag3 = _mm_shuffle_epi32(diag3, 0x93); - diag2 = _mm_xor_si128(diag2, b1); - - a2 = _mm_add_epi32(a2, diag2); - a3 = diag2; - b2 = a2; - a2 = _mm_slli_epi32(a2, 13); - b2 = _mm_srli_epi32(b2, 19); - diag1 = _mm_xor_si128(diag1, a2); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag1 = _mm_xor_si128(diag1, b2); - - a3 = _mm_add_epi32(a3, diag1); - a4 = diag3; - b3 = a3; - a3 = _mm_slli_epi32(a3, 18); - b3 = _mm_srli_epi32(b3, 14); - diag0 = _mm_xor_si128(diag0, a3); - diag1 = _mm_shuffle_epi32(diag1, 0x39); - diag0 = _mm_xor_si128(diag0, b3); - - a4 = _mm_add_epi32(a4, diag0); - a5 = diag0; - b4 = a4; - a4 = _mm_slli_epi32(a4, 7); - b4 = _mm_srli_epi32(b4, 25); - diag1 = _mm_xor_si128(diag1, a4); - - diag1 = _mm_xor_si128(diag1, b4); - - a5 = _mm_add_epi32(a5, diag1); - a6 = diag1; - b5 = a5; - a5 = _mm_slli_epi32(a5, 9); - b5 = _mm_srli_epi32(b5, 23); - diag2 = _mm_xor_si128(diag2, a5); - diag1 = _mm_shuffle_epi32(diag1, 0x93); - diag2 = _mm_xor_si128(diag2, b5); - - a6 = _mm_add_epi32(a6, diag2); - a7 = diag2; - b6 = a6; - a6 = _mm_slli_epi32(a6, 13); - b6 = _mm_srli_epi32(b6, 19); - diag3 = _mm_xor_si128(diag3, a6); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag3 = _mm_xor_si128(diag3, b6); - - a7 = _mm_add_epi32(a7, diag3); - a0 = diag1; - b7 = a7; - a7 = _mm_slli_epi32(a7, 18); - b7 = _mm_srli_epi32(b7, 14); - diag0 = _mm_xor_si128(diag0, a7); - diag3 = _mm_shuffle_epi32(diag3, 0x39); - diag0 = _mm_xor_si128(diag0, b7); - } - - diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0))); - diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4))); - diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8))); - diag3 = _mm_add_epi32(diag3, 
_mm_loadu_si128((__m128i *) (x + 12))); - -#define ONEQUAD_SHUFFLE(A, B, C, D) \ - do { \ - uint32_t in##A = _mm_cvtsi128_si32(diag0); \ - uint32_t in##B = _mm_cvtsi128_si32(diag1); \ - uint32_t in##C = _mm_cvtsi128_si32(diag2); \ - uint32_t in##D = _mm_cvtsi128_si32(diag3); \ - diag0 = _mm_shuffle_epi32(diag0, 0x39); \ - diag1 = _mm_shuffle_epi32(diag1, 0x39); \ - diag2 = _mm_shuffle_epi32(diag2, 0x39); \ - diag3 = _mm_shuffle_epi32(diag3, 0x39); \ - in##A ^= *(uint32_t *) (m + (A * 4)); \ - in##B ^= *(uint32_t *) (m + (B * 4)); \ - in##C ^= *(uint32_t *) (m + (C * 4)); \ - in##D ^= *(uint32_t *) (m + (D * 4)); \ - *(uint32_t *) (c + (A * 4)) = in##A; \ - *(uint32_t *) (c + (B * 4)) = in##B; \ - *(uint32_t *) (c + (C * 4)) = in##C; \ - *(uint32_t *) (c + (D * 4)) = in##D; \ - } while (0) - -#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) - - ONEQUAD(0, 12, 8, 4); - ONEQUAD(5, 1, 13, 9); - ONEQUAD(10, 6, 2, 14); - ONEQUAD(15, 11, 7, 3); - -#undef ONEQUAD -#undef ONEQUAD_SHUFFLE - - in8 = x[8]; - in9 = x[13]; - in8++; - if (in8 == 0) { - in9++; - } - x[8] = in8; - x[13] = in9; - - c += 64; - m += 64; - bytes -= 64; -} +while (bytes >= 64) { + __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0)); + __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4)); + __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8)); + __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12)); + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i b0, b1, b2, b3, b4, b5, b6, b7; + + uint32_t in8; + uint32_t in9; + int i; + + a0 = diag1; + for (i = 0; i < ROUNDS; i += 4) { + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + 
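+        /* [editor's note, not part of the upstream change] The add / slli /
+         * srli / xor / xor pattern repeated throughout this loop is the SSE2
+         * idiom for a 32-bit vector rotate, which the ISA lacks as a single
+         * instruction: the XOR above folds in the left-shifted half and the
+         * XOR below folds in the right-shifted half, and since the two
+         * halves share no bits, the pair of XORs equals one XOR with the
+         * rotated sum. A scalar sketch of one such step (helper name
+         * hypothetical; diag names refer to values before this round's
+         * shuffles):
+         *
+         *     static uint32_t rotl32(uint32_t v, int k)
+         *     {
+         *         return (v << k) | (v >> (32 - k));
+         *     }
+         *     // one quarter-round operation per 32-bit lane, e.g. here:
+         *     // diag3 ^= rotl32(diag1 + diag0, 7);
+         */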
+ diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + } + + diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0))); + diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4))); + diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8))); + diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12))); + +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + do { \ + uint32_t in##A = _mm_cvtsi128_si32(diag0); \ + uint32_t in##B = _mm_cvtsi128_si32(diag1); \ + uint32_t in##C = _mm_cvtsi128_si32(diag2); \ + uint32_t in##D = _mm_cvtsi128_si32(diag3); \ + diag0 = _mm_shuffle_epi32(diag0, 0x39); \ + diag1 = _mm_shuffle_epi32(diag1, 0x39); \ + diag2 = _mm_shuffle_epi32(diag2, 0x39); \ + diag3 = _mm_shuffle_epi32(diag3, 0x39); \ + in##A ^= *(const uint32_t *) (m + (A * 4)); \ + in##B ^= *(const uint32_t *) (m + (B * 4)); \ + in##C ^= *(const uint32_t *) (m + (C * 4)); \ + in##D ^= *(const uint32_t *) (m + (D * 4)); \ + *(uint32_t *) (c + (A * 4)) = in##A; \ + *(uint32_t *) (c + (B * 4)) = in##B; \ + *(uint32_t *) (c + (C * 4)) = in##C; \ + *(uint32_t *) (c + (D * 4)) = in##D; \ + } while (0) + +#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) + + ONEQUAD(0, 12, 8, 4); + ONEQUAD(5, 1, 13, 9); + ONEQUAD(10, 6, 2, 14); + ONEQUAD(15, 11, 7, 3); + +#undef ONEQUAD +#undef ONEQUAD_SHUFFLE + + in8 = x[8]; + in9 = x[13]; + in8++; + if (in8 == 0) { + in9++; + } + x[8] = in8; + x[13] = in9; + + c += 64; + m += 64; + bytes -= 64; +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h index 61d935fc90..50a59e8c25 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h @@ -1,547 +1,547 @@ -if (bytes >= 256) { - __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, - y15; - __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, 
z14, - z15; - __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8, - orig9, orig10, orig11, orig12, orig13, orig14, orig15; - - uint32_t in8; - uint32_t in9; - int i; - - /* element broadcast immediate for _mm_shuffle_epi32 are in order: - 0x00, 0x55, 0xaa, 0xff */ - z0 = _mm_loadu_si128((__m128i *) (x + 0)); - z5 = _mm_shuffle_epi32(z0, 0x55); - z10 = _mm_shuffle_epi32(z0, 0xaa); - z15 = _mm_shuffle_epi32(z0, 0xff); - z0 = _mm_shuffle_epi32(z0, 0x00); - z1 = _mm_loadu_si128((__m128i *) (x + 4)); - z6 = _mm_shuffle_epi32(z1, 0xaa); - z11 = _mm_shuffle_epi32(z1, 0xff); - z12 = _mm_shuffle_epi32(z1, 0x00); - z1 = _mm_shuffle_epi32(z1, 0x55); - z2 = _mm_loadu_si128((__m128i *) (x + 8)); - z7 = _mm_shuffle_epi32(z2, 0xff); - z13 = _mm_shuffle_epi32(z2, 0x55); - z2 = _mm_shuffle_epi32(z2, 0xaa); - /* no z8 -> first half of the nonce, will fill later */ - z3 = _mm_loadu_si128((__m128i *) (x + 12)); - z4 = _mm_shuffle_epi32(z3, 0x00); - z14 = _mm_shuffle_epi32(z3, 0xaa); - z3 = _mm_shuffle_epi32(z3, 0xff); - /* no z9 -> second half of the nonce, will fill later */ - orig0 = z0; - orig1 = z1; - orig2 = z2; - orig3 = z3; - orig4 = z4; - orig5 = z5; - orig6 = z6; - orig7 = z7; - orig10 = z10; - orig11 = z11; - orig12 = z12; - orig13 = z13; - orig14 = z14; - orig15 = z15; - - while (bytes >= 256) { - /* vector implementation for z8 and z9 */ - /* not sure if it helps for only 4 blocks */ - const __m128i addv8 = _mm_set_epi64x(1, 0); - const __m128i addv9 = _mm_set_epi64x(3, 2); - __m128i t8, t9; - uint64_t in89; - - in8 = x[8]; - in9 = x[13]; - in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); - t8 = _mm_set1_epi64x(in89); - t9 = _mm_set1_epi64x(in89); - - z8 = _mm_add_epi64(addv8, t8); - z9 = _mm_add_epi64(addv9, t9); - - t8 = _mm_unpacklo_epi32(z8, z9); - t9 = _mm_unpackhi_epi32(z8, z9); - - z8 = _mm_unpacklo_epi32(t8, t9); - z9 = _mm_unpackhi_epi32(t8, t9); - - orig8 = z8; - orig9 = z9; - - in89 += 4; - - x[8] = in89 & 0xFFFFFFFF; - x[13] = (in89 >> 32) & 0xFFFFFFFF; - - z5 = orig5; - z10 = orig10; - z15 = orig15; - z14 = orig14; - z3 = orig3; - z6 = orig6; - z11 = orig11; - z1 = orig1; - - z7 = orig7; - z13 = orig13; - z2 = orig2; - z9 = orig9; - z0 = orig0; - z12 = orig12; - z4 = orig4; - z8 = orig8; - - for (i = 0; i < ROUNDS; i += 2) { - /* the inner loop is a direct translation (regexp search/replace) - * from the amd64-xmm6 ASM */ - __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, - r14, r15; - - y4 = z12; - y4 = _mm_add_epi32(y4, z0); - r4 = y4; - y4 = _mm_slli_epi32(y4, 7); - z4 = _mm_xor_si128(z4, y4); - r4 = _mm_srli_epi32(r4, 25); - z4 = _mm_xor_si128(z4, r4); - - y9 = z1; - y9 = _mm_add_epi32(y9, z5); - r9 = y9; - y9 = _mm_slli_epi32(y9, 7); - z9 = _mm_xor_si128(z9, y9); - r9 = _mm_srli_epi32(r9, 25); - z9 = _mm_xor_si128(z9, r9); - - y8 = z0; - y8 = _mm_add_epi32(y8, z4); - r8 = y8; - y8 = _mm_slli_epi32(y8, 9); - z8 = _mm_xor_si128(z8, y8); - r8 = _mm_srli_epi32(r8, 23); - z8 = _mm_xor_si128(z8, r8); - - y13 = z5; - y13 = _mm_add_epi32(y13, z9); - r13 = y13; - y13 = _mm_slli_epi32(y13, 9); - z13 = _mm_xor_si128(z13, y13); - r13 = _mm_srli_epi32(r13, 23); - z13 = _mm_xor_si128(z13, r13); - - y12 = z4; - y12 = _mm_add_epi32(y12, z8); - r12 = y12; - y12 = _mm_slli_epi32(y12, 13); - z12 = _mm_xor_si128(z12, y12); - r12 = _mm_srli_epi32(r12, 19); - z12 = _mm_xor_si128(z12, r12); - - y1 = z9; - y1 = _mm_add_epi32(y1, z13); - r1 = y1; - y1 = _mm_slli_epi32(y1, 13); - z1 = _mm_xor_si128(z1, y1); - r1 = _mm_srli_epi32(r1, 19); - z1 = _mm_xor_si128(z1, 
r1); - - y0 = z8; - y0 = _mm_add_epi32(y0, z12); - r0 = y0; - y0 = _mm_slli_epi32(y0, 18); - z0 = _mm_xor_si128(z0, y0); - r0 = _mm_srli_epi32(r0, 14); - z0 = _mm_xor_si128(z0, r0); - - y5 = z13; - y5 = _mm_add_epi32(y5, z1); - r5 = y5; - y5 = _mm_slli_epi32(y5, 18); - z5 = _mm_xor_si128(z5, y5); - r5 = _mm_srli_epi32(r5, 14); - z5 = _mm_xor_si128(z5, r5); - - y14 = z6; - y14 = _mm_add_epi32(y14, z10); - r14 = y14; - y14 = _mm_slli_epi32(y14, 7); - z14 = _mm_xor_si128(z14, y14); - r14 = _mm_srli_epi32(r14, 25); - z14 = _mm_xor_si128(z14, r14); - - y3 = z11; - y3 = _mm_add_epi32(y3, z15); - r3 = y3; - y3 = _mm_slli_epi32(y3, 7); - z3 = _mm_xor_si128(z3, y3); - r3 = _mm_srli_epi32(r3, 25); - z3 = _mm_xor_si128(z3, r3); - - y2 = z10; - y2 = _mm_add_epi32(y2, z14); - r2 = y2; - y2 = _mm_slli_epi32(y2, 9); - z2 = _mm_xor_si128(z2, y2); - r2 = _mm_srli_epi32(r2, 23); - z2 = _mm_xor_si128(z2, r2); - - y7 = z15; - y7 = _mm_add_epi32(y7, z3); - r7 = y7; - y7 = _mm_slli_epi32(y7, 9); - z7 = _mm_xor_si128(z7, y7); - r7 = _mm_srli_epi32(r7, 23); - z7 = _mm_xor_si128(z7, r7); - - y6 = z14; - y6 = _mm_add_epi32(y6, z2); - r6 = y6; - y6 = _mm_slli_epi32(y6, 13); - z6 = _mm_xor_si128(z6, y6); - r6 = _mm_srli_epi32(r6, 19); - z6 = _mm_xor_si128(z6, r6); - - y11 = z3; - y11 = _mm_add_epi32(y11, z7); - r11 = y11; - y11 = _mm_slli_epi32(y11, 13); - z11 = _mm_xor_si128(z11, y11); - r11 = _mm_srli_epi32(r11, 19); - z11 = _mm_xor_si128(z11, r11); - - y10 = z2; - y10 = _mm_add_epi32(y10, z6); - r10 = y10; - y10 = _mm_slli_epi32(y10, 18); - z10 = _mm_xor_si128(z10, y10); - r10 = _mm_srli_epi32(r10, 14); - z10 = _mm_xor_si128(z10, r10); - - y1 = z3; - y1 = _mm_add_epi32(y1, z0); - r1 = y1; - y1 = _mm_slli_epi32(y1, 7); - z1 = _mm_xor_si128(z1, y1); - r1 = _mm_srli_epi32(r1, 25); - z1 = _mm_xor_si128(z1, r1); - - y15 = z7; - y15 = _mm_add_epi32(y15, z11); - r15 = y15; - y15 = _mm_slli_epi32(y15, 18); - z15 = _mm_xor_si128(z15, y15); - r15 = _mm_srli_epi32(r15, 14); - z15 = _mm_xor_si128(z15, r15); - - y6 = z4; - y6 = _mm_add_epi32(y6, z5); - r6 = y6; - y6 = _mm_slli_epi32(y6, 7); - z6 = _mm_xor_si128(z6, y6); - r6 = _mm_srli_epi32(r6, 25); - z6 = _mm_xor_si128(z6, r6); - - y2 = z0; - y2 = _mm_add_epi32(y2, z1); - r2 = y2; - y2 = _mm_slli_epi32(y2, 9); - z2 = _mm_xor_si128(z2, y2); - r2 = _mm_srli_epi32(r2, 23); - z2 = _mm_xor_si128(z2, r2); - - y7 = z5; - y7 = _mm_add_epi32(y7, z6); - r7 = y7; - y7 = _mm_slli_epi32(y7, 9); - z7 = _mm_xor_si128(z7, y7); - r7 = _mm_srli_epi32(r7, 23); - z7 = _mm_xor_si128(z7, r7); - - y3 = z1; - y3 = _mm_add_epi32(y3, z2); - r3 = y3; - y3 = _mm_slli_epi32(y3, 13); - z3 = _mm_xor_si128(z3, y3); - r3 = _mm_srli_epi32(r3, 19); - z3 = _mm_xor_si128(z3, r3); - - y4 = z6; - y4 = _mm_add_epi32(y4, z7); - r4 = y4; - y4 = _mm_slli_epi32(y4, 13); - z4 = _mm_xor_si128(z4, y4); - r4 = _mm_srli_epi32(r4, 19); - z4 = _mm_xor_si128(z4, r4); - - y0 = z2; - y0 = _mm_add_epi32(y0, z3); - r0 = y0; - y0 = _mm_slli_epi32(y0, 18); - z0 = _mm_xor_si128(z0, y0); - r0 = _mm_srli_epi32(r0, 14); - z0 = _mm_xor_si128(z0, r0); - - y5 = z7; - y5 = _mm_add_epi32(y5, z4); - r5 = y5; - y5 = _mm_slli_epi32(y5, 18); - z5 = _mm_xor_si128(z5, y5); - r5 = _mm_srli_epi32(r5, 14); - z5 = _mm_xor_si128(z5, r5); - - y11 = z9; - y11 = _mm_add_epi32(y11, z10); - r11 = y11; - y11 = _mm_slli_epi32(y11, 7); - z11 = _mm_xor_si128(z11, y11); - r11 = _mm_srli_epi32(r11, 25); - z11 = _mm_xor_si128(z11, r11); - - y12 = z14; - y12 = _mm_add_epi32(y12, z15); - r12 = y12; - y12 = _mm_slli_epi32(y12, 7); - z12 = _mm_xor_si128(z12, y12); - 
r12 = _mm_srli_epi32(r12, 25); - z12 = _mm_xor_si128(z12, r12); - - y8 = z10; - y8 = _mm_add_epi32(y8, z11); - r8 = y8; - y8 = _mm_slli_epi32(y8, 9); - z8 = _mm_xor_si128(z8, y8); - r8 = _mm_srli_epi32(r8, 23); - z8 = _mm_xor_si128(z8, r8); - - y13 = z15; - y13 = _mm_add_epi32(y13, z12); - r13 = y13; - y13 = _mm_slli_epi32(y13, 9); - z13 = _mm_xor_si128(z13, y13); - r13 = _mm_srli_epi32(r13, 23); - z13 = _mm_xor_si128(z13, r13); - - y9 = z11; - y9 = _mm_add_epi32(y9, z8); - r9 = y9; - y9 = _mm_slli_epi32(y9, 13); - z9 = _mm_xor_si128(z9, y9); - r9 = _mm_srli_epi32(r9, 19); - z9 = _mm_xor_si128(z9, r9); - - y14 = z12; - y14 = _mm_add_epi32(y14, z13); - r14 = y14; - y14 = _mm_slli_epi32(y14, 13); - z14 = _mm_xor_si128(z14, y14); - r14 = _mm_srli_epi32(r14, 19); - z14 = _mm_xor_si128(z14, r14); - - y10 = z8; - y10 = _mm_add_epi32(y10, z9); - r10 = y10; - y10 = _mm_slli_epi32(y10, 18); - z10 = _mm_xor_si128(z10, y10); - r10 = _mm_srli_epi32(r10, 14); - z10 = _mm_xor_si128(z10, r10); - - y15 = z13; - y15 = _mm_add_epi32(y15, z14); - r15 = y15; - y15 = _mm_slli_epi32(y15, 18); - z15 = _mm_xor_si128(z15, y15); - r15 = _mm_srli_epi32(r15, 14); - z15 = _mm_xor_si128(z15, r15); - } - -/* store data ; this macro replicates the original amd64-xmm6 code */ -#define ONEQUAD_SHUFFLE(A, B, C, D) \ - z##A = _mm_add_epi32(z##A, orig##A); \ - z##B = _mm_add_epi32(z##B, orig##B); \ - z##C = _mm_add_epi32(z##C, orig##C); \ - z##D = _mm_add_epi32(z##D, orig##D); \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - z##A = _mm_shuffle_epi32(z##A, 0x39); \ - z##B = _mm_shuffle_epi32(z##B, 0x39); \ - z##C = _mm_shuffle_epi32(z##C, 0x39); \ - z##D = _mm_shuffle_epi32(z##D, 0x39); \ - \ - in##A ^= *(uint32_t *) (m + 0); \ - in##B ^= *(uint32_t *) (m + 4); \ - in##C ^= *(uint32_t *) (m + 8); \ - in##D ^= *(uint32_t *) (m + 12); \ - \ - *(uint32_t *) (c + 0) = in##A; \ - *(uint32_t *) (c + 4) = in##B; \ - *(uint32_t *) (c + 8) = in##C; \ - *(uint32_t *) (c + 12) = in##D; \ - \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - z##A = _mm_shuffle_epi32(z##A, 0x39); \ - z##B = _mm_shuffle_epi32(z##B, 0x39); \ - z##C = _mm_shuffle_epi32(z##C, 0x39); \ - z##D = _mm_shuffle_epi32(z##D, 0x39); \ - \ - in##A ^= *(uint32_t *) (m + 64); \ - in##B ^= *(uint32_t *) (m + 68); \ - in##C ^= *(uint32_t *) (m + 72); \ - in##D ^= *(uint32_t *) (m + 76); \ - *(uint32_t *) (c + 64) = in##A; \ - *(uint32_t *) (c + 68) = in##B; \ - *(uint32_t *) (c + 72) = in##C; \ - *(uint32_t *) (c + 76) = in##D; \ - \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - z##A = _mm_shuffle_epi32(z##A, 0x39); \ - z##B = _mm_shuffle_epi32(z##B, 0x39); \ - z##C = _mm_shuffle_epi32(z##C, 0x39); \ - z##D = _mm_shuffle_epi32(z##D, 0x39); \ - \ - in##A ^= *(uint32_t *) (m + 128); \ - in##B ^= *(uint32_t *) (m + 132); \ - in##C ^= *(uint32_t *) (m + 136); \ - in##D ^= *(uint32_t *) (m + 140); \ - *(uint32_t *) (c + 128) = in##A; \ - *(uint32_t *) (c + 132) = in##B; \ - *(uint32_t *) (c + 136) = in##C; \ - *(uint32_t *) (c + 140) = in##D; \ - \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - \ - in##A ^= *(uint32_t *) (m + 192); \ - in##B ^= *(uint32_t *) (m + 196); \ - 
in##C ^= *(uint32_t *) (m + 200); \ - in##D ^= *(uint32_t *) (m + 204); \ - *(uint32_t *) (c + 192) = in##A; \ - *(uint32_t *) (c + 196) = in##B; \ - *(uint32_t *) (c + 200) = in##C; \ - *(uint32_t *) (c + 204) = in##D - -/* store data ; this macro replaces shuffle+mov by a direct extract; not much - * difference */ -#define ONEQUAD_EXTRACT(A, B, C, D) \ - z##A = _mm_add_epi32(z##A, orig##A); \ - z##B = _mm_add_epi32(z##B, orig##B); \ - z##C = _mm_add_epi32(z##C, orig##C); \ - z##D = _mm_add_epi32(z##D, orig##D); \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - in##A ^= *(uint32_t *) (m + 0); \ - in##B ^= *(uint32_t *) (m + 4); \ - in##C ^= *(uint32_t *) (m + 8); \ - in##D ^= *(uint32_t *) (m + 12); \ - *(uint32_t *) (c + 0) = in##A; \ - *(uint32_t *) (c + 4) = in##B; \ - *(uint32_t *) (c + 8) = in##C; \ - *(uint32_t *) (c + 12) = in##D; \ - \ - in##A = _mm_extract_epi32(z##A, 1); \ - in##B = _mm_extract_epi32(z##B, 1); \ - in##C = _mm_extract_epi32(z##C, 1); \ - in##D = _mm_extract_epi32(z##D, 1); \ - \ - in##A ^= *(uint32_t *) (m + 64); \ - in##B ^= *(uint32_t *) (m + 68); \ - in##C ^= *(uint32_t *) (m + 72); \ - in##D ^= *(uint32_t *) (m + 76); \ - *(uint32_t *) (c + 64) = in##A; \ - *(uint32_t *) (c + 68) = in##B; \ - *(uint32_t *) (c + 72) = in##C; \ - *(uint32_t *) (c + 76) = in##D; \ - \ - in##A = _mm_extract_epi32(z##A, 2); \ - in##B = _mm_extract_epi32(z##B, 2); \ - in##C = _mm_extract_epi32(z##C, 2); \ - in##D = _mm_extract_epi32(z##D, 2); \ - \ - in##A ^= *(uint32_t *) (m + 128); \ - in##B ^= *(uint32_t *) (m + 132); \ - in##C ^= *(uint32_t *) (m + 136); \ - in##D ^= *(uint32_t *) (m + 140); \ - *(uint32_t *) (c + 128) = in##A; \ - *(uint32_t *) (c + 132) = in##B; \ - *(uint32_t *) (c + 136) = in##C; \ - *(uint32_t *) (c + 140) = in##D; \ - \ - in##A = _mm_extract_epi32(z##A, 3); \ - in##B = _mm_extract_epi32(z##B, 3); \ - in##C = _mm_extract_epi32(z##C, 3); \ - in##D = _mm_extract_epi32(z##D, 3); \ - \ - in##A ^= *(uint32_t *) (m + 192); \ - in##B ^= *(uint32_t *) (m + 196); \ - in##C ^= *(uint32_t *) (m + 200); \ - in##D ^= *(uint32_t *) (m + 204); \ - *(uint32_t *) (c + 192) = in##A; \ - *(uint32_t *) (c + 196) = in##B; \ - *(uint32_t *) (c + 200) = in##C; \ - *(uint32_t *) (c + 204) = in##D - -/* store data ; this macro first transpose data in-registers, and then store - * them in memory. much faster with icc. 
*/ -#define ONEQUAD_TRANSPOSE(A, B, C, D) \ - z##A = _mm_add_epi32(z##A, orig##A); \ - z##B = _mm_add_epi32(z##B, orig##B); \ - z##C = _mm_add_epi32(z##C, orig##C); \ - z##D = _mm_add_epi32(z##D, orig##D); \ - y##A = _mm_unpacklo_epi32(z##A, z##B); \ - y##B = _mm_unpacklo_epi32(z##C, z##D); \ - y##C = _mm_unpackhi_epi32(z##A, z##B); \ - y##D = _mm_unpackhi_epi32(z##C, z##D); \ - z##A = _mm_unpacklo_epi64(y##A, y##B); \ - z##B = _mm_unpackhi_epi64(y##A, y##B); \ - z##C = _mm_unpacklo_epi64(y##C, y##D); \ - z##D = _mm_unpackhi_epi64(y##C, y##D); \ - y##A = _mm_xor_si128(z##A, _mm_loadu_si128((__m128i *) (m + 0))); \ - _mm_storeu_si128((__m128i *) (c + 0), y##A); \ - y##B = _mm_xor_si128(z##B, _mm_loadu_si128((__m128i *) (m + 64))); \ - _mm_storeu_si128((__m128i *) (c + 64), y##B); \ - y##C = _mm_xor_si128(z##C, _mm_loadu_si128((__m128i *) (m + 128))); \ - _mm_storeu_si128((__m128i *) (c + 128), y##C); \ - y##D = _mm_xor_si128(z##D, _mm_loadu_si128((__m128i *) (m + 192))); \ - _mm_storeu_si128((__m128i *) (c + 192), y##D) - -#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) - - ONEQUAD(0, 1, 2, 3); - m += 16; - c += 16; - ONEQUAD(4, 5, 6, 7); - m += 16; - c += 16; - ONEQUAD(8, 9, 10, 11); - m += 16; - c += 16; - ONEQUAD(12, 13, 14, 15); - m -= 48; - c -= 48; - -#undef ONEQUAD -#undef ONEQUAD_TRANSPOSE -#undef ONEQUAD_EXTRACT -#undef ONEQUAD_SHUFFLE - - bytes -= 256; - c += 256; - m += 256; - } -} +if (bytes >= 256) { + __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, + y15; + __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, + z15; + __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8, + orig9, orig10, orig11, orig12, orig13, orig14, orig15; + + uint32_t in8; + uint32_t in9; + int i; + + /* element broadcast immediate for _mm_shuffle_epi32 are in order: + 0x00, 0x55, 0xaa, 0xff */ + z0 = _mm_loadu_si128((const __m128i *) (x + 0)); + z5 = _mm_shuffle_epi32(z0, 0x55); + z10 = _mm_shuffle_epi32(z0, 0xaa); + z15 = _mm_shuffle_epi32(z0, 0xff); + z0 = _mm_shuffle_epi32(z0, 0x00); + z1 = _mm_loadu_si128((const __m128i *) (x + 4)); + z6 = _mm_shuffle_epi32(z1, 0xaa); + z11 = _mm_shuffle_epi32(z1, 0xff); + z12 = _mm_shuffle_epi32(z1, 0x00); + z1 = _mm_shuffle_epi32(z1, 0x55); + z2 = _mm_loadu_si128((const __m128i *) (x + 8)); + z7 = _mm_shuffle_epi32(z2, 0xff); + z13 = _mm_shuffle_epi32(z2, 0x55); + z2 = _mm_shuffle_epi32(z2, 0xaa); + /* no z8 -> first half of the nonce, will fill later */ + z3 = _mm_loadu_si128((const __m128i *) (x + 12)); + z4 = _mm_shuffle_epi32(z3, 0x00); + z14 = _mm_shuffle_epi32(z3, 0xaa); + z3 = _mm_shuffle_epi32(z3, 0xff); + /* no z9 -> second half of the nonce, will fill later */ + orig0 = z0; + orig1 = z1; + orig2 = z2; + orig3 = z3; + orig4 = z4; + orig5 = z5; + orig6 = z6; + orig7 = z7; + orig10 = z10; + orig11 = z11; + orig12 = z12; + orig13 = z13; + orig14 = z14; + orig15 = z15; + + while (bytes >= 256) { + /* vector implementation for z8 and z9 */ + /* not sure if it helps for only 4 blocks */ + const __m128i addv8 = _mm_set_epi64x(1, 0); + const __m128i addv9 = _mm_set_epi64x(3, 2); + __m128i t8, t9; + uint64_t in89; + + in8 = x[8]; + in9 = x[13]; + in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); + t8 = _mm_set1_epi64x(in89); + t9 = _mm_set1_epi64x(in89); + + z8 = _mm_add_epi64(addv8, t8); + z9 = _mm_add_epi64(addv9, t9); + + t8 = _mm_unpacklo_epi32(z8, z9); + t9 = _mm_unpackhi_epi32(z8, z9); + + z8 = _mm_unpacklo_epi32(t8, t9); + z9 = _mm_unpackhi_epi32(t8, t9); + + orig8 = z8; + orig9 = z9; 
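+        /* [editor's note, not part of the upstream change] The unpack dance
+         * above distributes four consecutive 64-bit block counters across
+         * the SIMD lanes: z8 ends up holding the four low 32-bit halves
+         * (state word x[8]) and z9 the four high halves (x[13]). An
+         * equivalent scalar sketch, with hypothetical names:
+         *
+         *     uint32_t lo[4], hi[4];
+         *     int b;
+         *     for (b = 0; b < 4; b++) {
+         *         uint64_t ctr = in89 + (uint64_t) b;
+         *         lo[b] = (uint32_t) ctr;         // lane b of z8
+         *         hi[b] = (uint32_t) (ctr >> 32); // lane b of z9
+         *     }
+         */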
+ + in89 += 4; + + x[8] = in89 & 0xFFFFFFFF; + x[13] = (in89 >> 32) & 0xFFFFFFFF; + + z5 = orig5; + z10 = orig10; + z15 = orig15; + z14 = orig14; + z3 = orig3; + z6 = orig6; + z11 = orig11; + z1 = orig1; + + z7 = orig7; + z13 = orig13; + z2 = orig2; + z9 = orig9; + z0 = orig0; + z12 = orig12; + z4 = orig4; + z8 = orig8; + + for (i = 0; i < ROUNDS; i += 2) { + /* the inner loop is a direct translation (regexp search/replace) + * from the amd64-xmm6 ASM */ + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, + r14, r15; + + y4 = z12; + y4 = _mm_add_epi32(y4, z0); + r4 = y4; + y4 = _mm_slli_epi32(y4, 7); + z4 = _mm_xor_si128(z4, y4); + r4 = _mm_srli_epi32(r4, 25); + z4 = _mm_xor_si128(z4, r4); + + y9 = z1; + y9 = _mm_add_epi32(y9, z5); + r9 = y9; + y9 = _mm_slli_epi32(y9, 7); + z9 = _mm_xor_si128(z9, y9); + r9 = _mm_srli_epi32(r9, 25); + z9 = _mm_xor_si128(z9, r9); + + y8 = z0; + y8 = _mm_add_epi32(y8, z4); + r8 = y8; + y8 = _mm_slli_epi32(y8, 9); + z8 = _mm_xor_si128(z8, y8); + r8 = _mm_srli_epi32(r8, 23); + z8 = _mm_xor_si128(z8, r8); + + y13 = z5; + y13 = _mm_add_epi32(y13, z9); + r13 = y13; + y13 = _mm_slli_epi32(y13, 9); + z13 = _mm_xor_si128(z13, y13); + r13 = _mm_srli_epi32(r13, 23); + z13 = _mm_xor_si128(z13, r13); + + y12 = z4; + y12 = _mm_add_epi32(y12, z8); + r12 = y12; + y12 = _mm_slli_epi32(y12, 13); + z12 = _mm_xor_si128(z12, y12); + r12 = _mm_srli_epi32(r12, 19); + z12 = _mm_xor_si128(z12, r12); + + y1 = z9; + y1 = _mm_add_epi32(y1, z13); + r1 = y1; + y1 = _mm_slli_epi32(y1, 13); + z1 = _mm_xor_si128(z1, y1); + r1 = _mm_srli_epi32(r1, 19); + z1 = _mm_xor_si128(z1, r1); + + y0 = z8; + y0 = _mm_add_epi32(y0, z12); + r0 = y0; + y0 = _mm_slli_epi32(y0, 18); + z0 = _mm_xor_si128(z0, y0); + r0 = _mm_srli_epi32(r0, 14); + z0 = _mm_xor_si128(z0, r0); + + y5 = z13; + y5 = _mm_add_epi32(y5, z1); + r5 = y5; + y5 = _mm_slli_epi32(y5, 18); + z5 = _mm_xor_si128(z5, y5); + r5 = _mm_srli_epi32(r5, 14); + z5 = _mm_xor_si128(z5, r5); + + y14 = z6; + y14 = _mm_add_epi32(y14, z10); + r14 = y14; + y14 = _mm_slli_epi32(y14, 7); + z14 = _mm_xor_si128(z14, y14); + r14 = _mm_srli_epi32(r14, 25); + z14 = _mm_xor_si128(z14, r14); + + y3 = z11; + y3 = _mm_add_epi32(y3, z15); + r3 = y3; + y3 = _mm_slli_epi32(y3, 7); + z3 = _mm_xor_si128(z3, y3); + r3 = _mm_srli_epi32(r3, 25); + z3 = _mm_xor_si128(z3, r3); + + y2 = z10; + y2 = _mm_add_epi32(y2, z14); + r2 = y2; + y2 = _mm_slli_epi32(y2, 9); + z2 = _mm_xor_si128(z2, y2); + r2 = _mm_srli_epi32(r2, 23); + z2 = _mm_xor_si128(z2, r2); + + y7 = z15; + y7 = _mm_add_epi32(y7, z3); + r7 = y7; + y7 = _mm_slli_epi32(y7, 9); + z7 = _mm_xor_si128(z7, y7); + r7 = _mm_srli_epi32(r7, 23); + z7 = _mm_xor_si128(z7, r7); + + y6 = z14; + y6 = _mm_add_epi32(y6, z2); + r6 = y6; + y6 = _mm_slli_epi32(y6, 13); + z6 = _mm_xor_si128(z6, y6); + r6 = _mm_srli_epi32(r6, 19); + z6 = _mm_xor_si128(z6, r6); + + y11 = z3; + y11 = _mm_add_epi32(y11, z7); + r11 = y11; + y11 = _mm_slli_epi32(y11, 13); + z11 = _mm_xor_si128(z11, y11); + r11 = _mm_srli_epi32(r11, 19); + z11 = _mm_xor_si128(z11, r11); + + y10 = z2; + y10 = _mm_add_epi32(y10, z6); + r10 = y10; + y10 = _mm_slli_epi32(y10, 18); + z10 = _mm_xor_si128(z10, y10); + r10 = _mm_srli_epi32(r10, 14); + z10 = _mm_xor_si128(z10, r10); + + y1 = z3; + y1 = _mm_add_epi32(y1, z0); + r1 = y1; + y1 = _mm_slli_epi32(y1, 7); + z1 = _mm_xor_si128(z1, y1); + r1 = _mm_srli_epi32(r1, 25); + z1 = _mm_xor_si128(z1, r1); + + y15 = z7; + y15 = _mm_add_epi32(y15, z11); + r15 = y15; + y15 = _mm_slli_epi32(y15, 18); + z15 = _mm_xor_si128(z15, 
y15); + r15 = _mm_srli_epi32(r15, 14); + z15 = _mm_xor_si128(z15, r15); + + y6 = z4; + y6 = _mm_add_epi32(y6, z5); + r6 = y6; + y6 = _mm_slli_epi32(y6, 7); + z6 = _mm_xor_si128(z6, y6); + r6 = _mm_srli_epi32(r6, 25); + z6 = _mm_xor_si128(z6, r6); + + y2 = z0; + y2 = _mm_add_epi32(y2, z1); + r2 = y2; + y2 = _mm_slli_epi32(y2, 9); + z2 = _mm_xor_si128(z2, y2); + r2 = _mm_srli_epi32(r2, 23); + z2 = _mm_xor_si128(z2, r2); + + y7 = z5; + y7 = _mm_add_epi32(y7, z6); + r7 = y7; + y7 = _mm_slli_epi32(y7, 9); + z7 = _mm_xor_si128(z7, y7); + r7 = _mm_srli_epi32(r7, 23); + z7 = _mm_xor_si128(z7, r7); + + y3 = z1; + y3 = _mm_add_epi32(y3, z2); + r3 = y3; + y3 = _mm_slli_epi32(y3, 13); + z3 = _mm_xor_si128(z3, y3); + r3 = _mm_srli_epi32(r3, 19); + z3 = _mm_xor_si128(z3, r3); + + y4 = z6; + y4 = _mm_add_epi32(y4, z7); + r4 = y4; + y4 = _mm_slli_epi32(y4, 13); + z4 = _mm_xor_si128(z4, y4); + r4 = _mm_srli_epi32(r4, 19); + z4 = _mm_xor_si128(z4, r4); + + y0 = z2; + y0 = _mm_add_epi32(y0, z3); + r0 = y0; + y0 = _mm_slli_epi32(y0, 18); + z0 = _mm_xor_si128(z0, y0); + r0 = _mm_srli_epi32(r0, 14); + z0 = _mm_xor_si128(z0, r0); + + y5 = z7; + y5 = _mm_add_epi32(y5, z4); + r5 = y5; + y5 = _mm_slli_epi32(y5, 18); + z5 = _mm_xor_si128(z5, y5); + r5 = _mm_srli_epi32(r5, 14); + z5 = _mm_xor_si128(z5, r5); + + y11 = z9; + y11 = _mm_add_epi32(y11, z10); + r11 = y11; + y11 = _mm_slli_epi32(y11, 7); + z11 = _mm_xor_si128(z11, y11); + r11 = _mm_srli_epi32(r11, 25); + z11 = _mm_xor_si128(z11, r11); + + y12 = z14; + y12 = _mm_add_epi32(y12, z15); + r12 = y12; + y12 = _mm_slli_epi32(y12, 7); + z12 = _mm_xor_si128(z12, y12); + r12 = _mm_srli_epi32(r12, 25); + z12 = _mm_xor_si128(z12, r12); + + y8 = z10; + y8 = _mm_add_epi32(y8, z11); + r8 = y8; + y8 = _mm_slli_epi32(y8, 9); + z8 = _mm_xor_si128(z8, y8); + r8 = _mm_srli_epi32(r8, 23); + z8 = _mm_xor_si128(z8, r8); + + y13 = z15; + y13 = _mm_add_epi32(y13, z12); + r13 = y13; + y13 = _mm_slli_epi32(y13, 9); + z13 = _mm_xor_si128(z13, y13); + r13 = _mm_srli_epi32(r13, 23); + z13 = _mm_xor_si128(z13, r13); + + y9 = z11; + y9 = _mm_add_epi32(y9, z8); + r9 = y9; + y9 = _mm_slli_epi32(y9, 13); + z9 = _mm_xor_si128(z9, y9); + r9 = _mm_srli_epi32(r9, 19); + z9 = _mm_xor_si128(z9, r9); + + y14 = z12; + y14 = _mm_add_epi32(y14, z13); + r14 = y14; + y14 = _mm_slli_epi32(y14, 13); + z14 = _mm_xor_si128(z14, y14); + r14 = _mm_srli_epi32(r14, 19); + z14 = _mm_xor_si128(z14, r14); + + y10 = z8; + y10 = _mm_add_epi32(y10, z9); + r10 = y10; + y10 = _mm_slli_epi32(y10, 18); + z10 = _mm_xor_si128(z10, y10); + r10 = _mm_srli_epi32(r10, 14); + z10 = _mm_xor_si128(z10, r10); + + y15 = z13; + y15 = _mm_add_epi32(y15, z14); + r15 = y15; + y15 = _mm_slli_epi32(y15, 18); + z15 = _mm_xor_si128(z15, y15); + r15 = _mm_srli_epi32(r15, 14); + z15 = _mm_xor_si128(z15, r15); + } + +/* store data ; this macro replicates the original amd64-xmm6 code */ +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 0); \ + in##B ^= *(uint32_t *) (m + 4); \ + in##C ^= *(uint32_t *) (m + 8); \ + in##D ^= *(uint32_t *) (m + 12); \ + 
\ + *(uint32_t *) (c + 0) = in##A; \ + *(uint32_t *) (c + 4) = in##B; \ + *(uint32_t *) (c + 8) = in##C; \ + *(uint32_t *) (c + 12) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 64); \ + in##B ^= *(uint32_t *) (m + 68); \ + in##C ^= *(uint32_t *) (m + 72); \ + in##D ^= *(uint32_t *) (m + 76); \ + *(uint32_t *) (c + 64) = in##A; \ + *(uint32_t *) (c + 68) = in##B; \ + *(uint32_t *) (c + 72) = in##C; \ + *(uint32_t *) (c + 76) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 128); \ + in##B ^= *(uint32_t *) (m + 132); \ + in##C ^= *(uint32_t *) (m + 136); \ + in##D ^= *(uint32_t *) (m + 140); \ + *(uint32_t *) (c + 128) = in##A; \ + *(uint32_t *) (c + 132) = in##B; \ + *(uint32_t *) (c + 136) = in##C; \ + *(uint32_t *) (c + 140) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + \ + in##A ^= *(uint32_t *) (m + 192); \ + in##B ^= *(uint32_t *) (m + 196); \ + in##C ^= *(uint32_t *) (m + 200); \ + in##D ^= *(uint32_t *) (m + 204); \ + *(uint32_t *) (c + 192) = in##A; \ + *(uint32_t *) (c + 196) = in##B; \ + *(uint32_t *) (c + 200) = in##C; \ + *(uint32_t *) (c + 204) = in##D + +/* store data ; this macro replaces shuffle+mov by a direct extract; not much + * difference */ +#define ONEQUAD_EXTRACT(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + in##A ^= *(uint32_t *) (m + 0); \ + in##B ^= *(uint32_t *) (m + 4); \ + in##C ^= *(uint32_t *) (m + 8); \ + in##D ^= *(uint32_t *) (m + 12); \ + *(uint32_t *) (c + 0) = in##A; \ + *(uint32_t *) (c + 4) = in##B; \ + *(uint32_t *) (c + 8) = in##C; \ + *(uint32_t *) (c + 12) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 1); \ + in##B = _mm_extract_epi32(z##B, 1); \ + in##C = _mm_extract_epi32(z##C, 1); \ + in##D = _mm_extract_epi32(z##D, 1); \ + \ + in##A ^= *(uint32_t *) (m + 64); \ + in##B ^= *(uint32_t *) (m + 68); \ + in##C ^= *(uint32_t *) (m + 72); \ + in##D ^= *(uint32_t *) (m + 76); \ + *(uint32_t *) (c + 64) = in##A; \ + *(uint32_t *) (c + 68) = in##B; \ + *(uint32_t *) (c + 72) = in##C; \ + *(uint32_t *) (c + 76) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 2); \ + in##B = _mm_extract_epi32(z##B, 2); \ + in##C = _mm_extract_epi32(z##C, 2); \ + in##D = _mm_extract_epi32(z##D, 2); \ + \ + in##A ^= *(uint32_t *) (m + 128); \ + in##B ^= *(uint32_t *) (m + 132); \ + in##C ^= *(uint32_t *) (m + 136); \ + in##D ^= *(uint32_t *) (m + 140); \ + *(uint32_t *) (c + 128) = in##A; \ + *(uint32_t *) (c + 132) = in##B; \ + *(uint32_t *) (c + 136) = in##C; \ + *(uint32_t *) (c + 140) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 3); \ + in##B = 
_mm_extract_epi32(z##B, 3); \ + in##C = _mm_extract_epi32(z##C, 3); \ + in##D = _mm_extract_epi32(z##D, 3); \ + \ + in##A ^= *(uint32_t *) (m + 192); \ + in##B ^= *(uint32_t *) (m + 196); \ + in##C ^= *(uint32_t *) (m + 200); \ + in##D ^= *(uint32_t *) (m + 204); \ + *(uint32_t *) (c + 192) = in##A; \ + *(uint32_t *) (c + 196) = in##B; \ + *(uint32_t *) (c + 200) = in##C; \ + *(uint32_t *) (c + 204) = in##D + +/* store data ; this macro first transpose data in-registers, and then store + * them in memory. much faster with icc. */ +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + y##A = _mm_unpacklo_epi32(z##A, z##B); \ + y##B = _mm_unpacklo_epi32(z##C, z##D); \ + y##C = _mm_unpackhi_epi32(z##A, z##B); \ + y##D = _mm_unpackhi_epi32(z##C, z##D); \ + z##A = _mm_unpacklo_epi64(y##A, y##B); \ + z##B = _mm_unpackhi_epi64(y##A, y##B); \ + z##C = _mm_unpacklo_epi64(y##C, y##D); \ + z##D = _mm_unpackhi_epi64(y##C, y##D); \ + y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0))); \ + _mm_storeu_si128((__m128i *) (c + 0), y##A); \ + y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64))); \ + _mm_storeu_si128((__m128i *) (c + 64), y##B); \ + y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \ + _mm_storeu_si128((__m128i *) (c + 128), y##C); \ + y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \ + _mm_storeu_si128((__m128i *) (c + 192), y##D) + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + + ONEQUAD(0, 1, 2, 3); + m += 16; + c += 16; + ONEQUAD(4, 5, 6, 7); + m += 16; + c += 16; + ONEQUAD(8, 9, 10, 11); + m += 16; + c += 16; + ONEQUAD(12, 13, 14, 15); + m -= 48; + c -= 48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_EXTRACT +#undef ONEQUAD_SHUFFLE + + bytes -= 256; + c += 256; + m += 256; + } +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h index 467a961299..ce5fb2664e 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h @@ -1,476 +1,477 @@ -if (bytes >= 512) { - __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, - y15; - - /* the naive way seems as fast (if not a bit faster) than the vector way */ - __m256i z0 = _mm256_set1_epi32(x[0]); - __m256i z5 = _mm256_set1_epi32(x[1]); - __m256i z10 = _mm256_set1_epi32(x[2]); - __m256i z15 = _mm256_set1_epi32(x[3]); - __m256i z12 = _mm256_set1_epi32(x[4]); - __m256i z1 = _mm256_set1_epi32(x[5]); - __m256i z6 = _mm256_set1_epi32(x[6]); - __m256i z11 = _mm256_set1_epi32(x[7]); - __m256i z8; /* useless */ - __m256i z13 = _mm256_set1_epi32(x[9]); - __m256i z2 = _mm256_set1_epi32(x[10]); - __m256i z7 = _mm256_set1_epi32(x[11]); - __m256i z4 = _mm256_set1_epi32(x[12]); - __m256i z9; /* useless */ - __m256i z14 = _mm256_set1_epi32(x[14]); - __m256i z3 = _mm256_set1_epi32(x[15]); - - __m256i orig0 = z0; - __m256i orig1 = z1; - __m256i orig2 = z2; - __m256i orig3 = z3; - __m256i orig4 = z4; - __m256i orig5 = z5; - __m256i orig6 = z6; - __m256i orig7 = z7; - __m256i orig8; - __m256i orig9; - __m256i orig10 = z10; - __m256i orig11 = z11; - __m256i orig12 = z12; - __m256i orig13 = z13; - __m256i orig14 = z14; - __m256i orig15 = z15; - - uint32_t in8; - uint32_t in9; - int i; - - while (bytes >= 512) { - /* vector implementation for z8 and 
z9 */ - /* faster than the naive version for 8 blocks */ - const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0); - const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4); - const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - - __m256i t8, t9; - uint64_t in89; - - in8 = x[8]; - in9 = x[13]; /* see arrays above for the address translation */ - in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); - - z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89)); - - t8 = _mm256_add_epi64(addv8, z8); - t9 = _mm256_add_epi64(addv9, z9); - - z8 = _mm256_unpacklo_epi32(t8, t9); - z9 = _mm256_unpackhi_epi32(t8, t9); - - t8 = _mm256_unpacklo_epi32(z8, z9); - t9 = _mm256_unpackhi_epi32(z8, z9); - - /* required because unpack* are intra-lane */ - z8 = _mm256_permutevar8x32_epi32(t8, permute); - z9 = _mm256_permutevar8x32_epi32(t9, permute); - - orig8 = z8; - orig9 = z9; - - in89 += 8; - - x[8] = in89 & 0xFFFFFFFF; - x[13] = (in89 >> 32) & 0xFFFFFFFF; - - z5 = orig5; - z10 = orig10; - z15 = orig15; - z14 = orig14; - z3 = orig3; - z6 = orig6; - z11 = orig11; - z1 = orig1; - - z7 = orig7; - z13 = orig13; - z2 = orig2; - z9 = orig9; - z0 = orig0; - z12 = orig12; - z4 = orig4; - z8 = orig8; - - for (i = 0; i < ROUNDS; i += 2) { - /* the inner loop is a direct translation (regexp search/replace) - * from the amd64-xmm6 ASM */ - __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, - r14, r15; - - y4 = z12; - y4 = _mm256_add_epi32(y4, z0); - r4 = y4; - y4 = _mm256_slli_epi32(y4, 7); - z4 = _mm256_xor_si256(z4, y4); - r4 = _mm256_srli_epi32(r4, 25); - z4 = _mm256_xor_si256(z4, r4); - - y9 = z1; - y9 = _mm256_add_epi32(y9, z5); - r9 = y9; - y9 = _mm256_slli_epi32(y9, 7); - z9 = _mm256_xor_si256(z9, y9); - r9 = _mm256_srli_epi32(r9, 25); - z9 = _mm256_xor_si256(z9, r9); - - y8 = z0; - y8 = _mm256_add_epi32(y8, z4); - r8 = y8; - y8 = _mm256_slli_epi32(y8, 9); - z8 = _mm256_xor_si256(z8, y8); - r8 = _mm256_srli_epi32(r8, 23); - z8 = _mm256_xor_si256(z8, r8); - - y13 = z5; - y13 = _mm256_add_epi32(y13, z9); - r13 = y13; - y13 = _mm256_slli_epi32(y13, 9); - z13 = _mm256_xor_si256(z13, y13); - r13 = _mm256_srli_epi32(r13, 23); - z13 = _mm256_xor_si256(z13, r13); - - y12 = z4; - y12 = _mm256_add_epi32(y12, z8); - r12 = y12; - y12 = _mm256_slli_epi32(y12, 13); - z12 = _mm256_xor_si256(z12, y12); - r12 = _mm256_srli_epi32(r12, 19); - z12 = _mm256_xor_si256(z12, r12); - - y1 = z9; - y1 = _mm256_add_epi32(y1, z13); - r1 = y1; - y1 = _mm256_slli_epi32(y1, 13); - z1 = _mm256_xor_si256(z1, y1); - r1 = _mm256_srli_epi32(r1, 19); - z1 = _mm256_xor_si256(z1, r1); - - y0 = z8; - y0 = _mm256_add_epi32(y0, z12); - r0 = y0; - y0 = _mm256_slli_epi32(y0, 18); - z0 = _mm256_xor_si256(z0, y0); - r0 = _mm256_srli_epi32(r0, 14); - z0 = _mm256_xor_si256(z0, r0); - - y5 = z13; - y5 = _mm256_add_epi32(y5, z1); - r5 = y5; - y5 = _mm256_slli_epi32(y5, 18); - z5 = _mm256_xor_si256(z5, y5); - r5 = _mm256_srli_epi32(r5, 14); - z5 = _mm256_xor_si256(z5, r5); - - y14 = z6; - y14 = _mm256_add_epi32(y14, z10); - r14 = y14; - y14 = _mm256_slli_epi32(y14, 7); - z14 = _mm256_xor_si256(z14, y14); - r14 = _mm256_srli_epi32(r14, 25); - z14 = _mm256_xor_si256(z14, r14); - - y3 = z11; - y3 = _mm256_add_epi32(y3, z15); - r3 = y3; - y3 = _mm256_slli_epi32(y3, 7); - z3 = _mm256_xor_si256(z3, y3); - r3 = _mm256_srli_epi32(r3, 25); - z3 = _mm256_xor_si256(z3, r3); - - y2 = z10; - y2 = _mm256_add_epi32(y2, z14); - r2 = y2; - y2 = _mm256_slli_epi32(y2, 9); - z2 = _mm256_xor_si256(z2, y2); - r2 = _mm256_srli_epi32(r2, 23); - z2 = 
_mm256_xor_si256(z2, r2); - - y7 = z15; - y7 = _mm256_add_epi32(y7, z3); - r7 = y7; - y7 = _mm256_slli_epi32(y7, 9); - z7 = _mm256_xor_si256(z7, y7); - r7 = _mm256_srli_epi32(r7, 23); - z7 = _mm256_xor_si256(z7, r7); - - y6 = z14; - y6 = _mm256_add_epi32(y6, z2); - r6 = y6; - y6 = _mm256_slli_epi32(y6, 13); - z6 = _mm256_xor_si256(z6, y6); - r6 = _mm256_srli_epi32(r6, 19); - z6 = _mm256_xor_si256(z6, r6); - - y11 = z3; - y11 = _mm256_add_epi32(y11, z7); - r11 = y11; - y11 = _mm256_slli_epi32(y11, 13); - z11 = _mm256_xor_si256(z11, y11); - r11 = _mm256_srli_epi32(r11, 19); - z11 = _mm256_xor_si256(z11, r11); - - y10 = z2; - y10 = _mm256_add_epi32(y10, z6); - r10 = y10; - y10 = _mm256_slli_epi32(y10, 18); - z10 = _mm256_xor_si256(z10, y10); - r10 = _mm256_srli_epi32(r10, 14); - z10 = _mm256_xor_si256(z10, r10); - - y1 = z3; - y1 = _mm256_add_epi32(y1, z0); - r1 = y1; - y1 = _mm256_slli_epi32(y1, 7); - z1 = _mm256_xor_si256(z1, y1); - r1 = _mm256_srli_epi32(r1, 25); - z1 = _mm256_xor_si256(z1, r1); - - y15 = z7; - y15 = _mm256_add_epi32(y15, z11); - r15 = y15; - y15 = _mm256_slli_epi32(y15, 18); - z15 = _mm256_xor_si256(z15, y15); - r15 = _mm256_srli_epi32(r15, 14); - z15 = _mm256_xor_si256(z15, r15); - - y6 = z4; - y6 = _mm256_add_epi32(y6, z5); - r6 = y6; - y6 = _mm256_slli_epi32(y6, 7); - z6 = _mm256_xor_si256(z6, y6); - r6 = _mm256_srli_epi32(r6, 25); - z6 = _mm256_xor_si256(z6, r6); - - y2 = z0; - y2 = _mm256_add_epi32(y2, z1); - r2 = y2; - y2 = _mm256_slli_epi32(y2, 9); - z2 = _mm256_xor_si256(z2, y2); - r2 = _mm256_srli_epi32(r2, 23); - z2 = _mm256_xor_si256(z2, r2); - - y7 = z5; - y7 = _mm256_add_epi32(y7, z6); - r7 = y7; - y7 = _mm256_slli_epi32(y7, 9); - z7 = _mm256_xor_si256(z7, y7); - r7 = _mm256_srli_epi32(r7, 23); - z7 = _mm256_xor_si256(z7, r7); - - y3 = z1; - y3 = _mm256_add_epi32(y3, z2); - r3 = y3; - y3 = _mm256_slli_epi32(y3, 13); - z3 = _mm256_xor_si256(z3, y3); - r3 = _mm256_srli_epi32(r3, 19); - z3 = _mm256_xor_si256(z3, r3); - - y4 = z6; - y4 = _mm256_add_epi32(y4, z7); - r4 = y4; - y4 = _mm256_slli_epi32(y4, 13); - z4 = _mm256_xor_si256(z4, y4); - r4 = _mm256_srli_epi32(r4, 19); - z4 = _mm256_xor_si256(z4, r4); - - y0 = z2; - y0 = _mm256_add_epi32(y0, z3); - r0 = y0; - y0 = _mm256_slli_epi32(y0, 18); - z0 = _mm256_xor_si256(z0, y0); - r0 = _mm256_srli_epi32(r0, 14); - z0 = _mm256_xor_si256(z0, r0); - - y5 = z7; - y5 = _mm256_add_epi32(y5, z4); - r5 = y5; - y5 = _mm256_slli_epi32(y5, 18); - z5 = _mm256_xor_si256(z5, y5); - r5 = _mm256_srli_epi32(r5, 14); - z5 = _mm256_xor_si256(z5, r5); - - y11 = z9; - y11 = _mm256_add_epi32(y11, z10); - r11 = y11; - y11 = _mm256_slli_epi32(y11, 7); - z11 = _mm256_xor_si256(z11, y11); - r11 = _mm256_srli_epi32(r11, 25); - z11 = _mm256_xor_si256(z11, r11); - - y12 = z14; - y12 = _mm256_add_epi32(y12, z15); - r12 = y12; - y12 = _mm256_slli_epi32(y12, 7); - z12 = _mm256_xor_si256(z12, y12); - r12 = _mm256_srli_epi32(r12, 25); - z12 = _mm256_xor_si256(z12, r12); - - y8 = z10; - y8 = _mm256_add_epi32(y8, z11); - r8 = y8; - y8 = _mm256_slli_epi32(y8, 9); - z8 = _mm256_xor_si256(z8, y8); - r8 = _mm256_srli_epi32(r8, 23); - z8 = _mm256_xor_si256(z8, r8); - - y13 = z15; - y13 = _mm256_add_epi32(y13, z12); - r13 = y13; - y13 = _mm256_slli_epi32(y13, 9); - z13 = _mm256_xor_si256(z13, y13); - r13 = _mm256_srli_epi32(r13, 23); - z13 = _mm256_xor_si256(z13, r13); - - y9 = z11; - y9 = _mm256_add_epi32(y9, z8); - r9 = y9; - y9 = _mm256_slli_epi32(y9, 13); - z9 = _mm256_xor_si256(z9, y9); - r9 = _mm256_srli_epi32(r9, 19); - z9 = _mm256_xor_si256(z9, 
r9); - - y14 = z12; - y14 = _mm256_add_epi32(y14, z13); - r14 = y14; - y14 = _mm256_slli_epi32(y14, 13); - z14 = _mm256_xor_si256(z14, y14); - r14 = _mm256_srli_epi32(r14, 19); - z14 = _mm256_xor_si256(z14, r14); - - y10 = z8; - y10 = _mm256_add_epi32(y10, z9); - r10 = y10; - y10 = _mm256_slli_epi32(y10, 18); - z10 = _mm256_xor_si256(z10, y10); - r10 = _mm256_srli_epi32(r10, 14); - z10 = _mm256_xor_si256(z10, r10); - - y15 = z13; - y15 = _mm256_add_epi32(y15, z14); - r15 = y15; - y15 = _mm256_slli_epi32(y15, 18); - z15 = _mm256_xor_si256(z15, y15); - r15 = _mm256_srli_epi32(r15, 14); - z15 = _mm256_xor_si256(z15, r15); - } - -/* store data ; this macro first transpose data in-registers, and then store - * them in memory. much faster with icc. */ -#define ONEQUAD_TRANSPOSE(A, B, C, D) \ - { \ - __m128i t0, t1, t2, t3; \ - z##A = _mm256_add_epi32(z##A, orig##A); \ - z##B = _mm256_add_epi32(z##B, orig##B); \ - z##C = _mm256_add_epi32(z##C, orig##C); \ - z##D = _mm256_add_epi32(z##D, orig##D); \ - y##A = _mm256_unpacklo_epi32(z##A, z##B); \ - y##B = _mm256_unpacklo_epi32(z##C, z##D); \ - y##C = _mm256_unpackhi_epi32(z##A, z##B); \ - y##D = _mm256_unpackhi_epi32(z##C, z##D); \ - z##A = _mm256_unpacklo_epi64(y##A, y##B); \ - z##B = _mm256_unpackhi_epi64(y##A, y##B); \ - z##C = _mm256_unpacklo_epi64(y##C, y##D); \ - z##D = _mm256_unpackhi_epi64(y##C, y##D); \ - t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \ - _mm_loadu_si128((__m128i*) (m + 0))); \ - _mm_storeu_si128((__m128i*) (c + 0), t0); \ - t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \ - _mm_loadu_si128((__m128i*) (m + 64))); \ - _mm_storeu_si128((__m128i*) (c + 64), t1); \ - t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \ - _mm_loadu_si128((__m128i*) (m + 128))); \ - _mm_storeu_si128((__m128i*) (c + 128), t2); \ - t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \ - _mm_loadu_si128((__m128i*) (m + 192))); \ - _mm_storeu_si128((__m128i*) (c + 192), t3); \ - t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \ - _mm_loadu_si128((__m128i*) (m + 256))); \ - _mm_storeu_si128((__m128i*) (c + 256), t0); \ - t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \ - _mm_loadu_si128((__m128i*) (m + 320))); \ - _mm_storeu_si128((__m128i*) (c + 320), t1); \ - t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \ - _mm_loadu_si128((__m128i*) (m + 384))); \ - _mm_storeu_si128((__m128i*) (c + 384), t2); \ - t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \ - _mm_loadu_si128((__m128i*) (m + 448))); \ - _mm_storeu_si128((__m128i*) (c + 448), t3); \ - } - -#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) - -#define ONEQUAD_UNPCK(A, B, C, D) \ - { \ - z##A = _mm256_add_epi32(z##A, orig##A); \ - z##B = _mm256_add_epi32(z##B, orig##B); \ - z##C = _mm256_add_epi32(z##C, orig##C); \ - z##D = _mm256_add_epi32(z##D, orig##D); \ - y##A = _mm256_unpacklo_epi32(z##A, z##B); \ - y##B = _mm256_unpacklo_epi32(z##C, z##D); \ - y##C = _mm256_unpackhi_epi32(z##A, z##B); \ - y##D = _mm256_unpackhi_epi32(z##C, z##D); \ - z##A = _mm256_unpacklo_epi64(y##A, y##B); \ - z##B = _mm256_unpackhi_epi64(y##A, y##B); \ - z##C = _mm256_unpacklo_epi64(y##C, y##D); \ - z##D = _mm256_unpackhi_epi64(y##C, y##D); \ - } - -#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ - { \ - ONEQUAD_UNPCK(A, B, C, D); \ - ONEQUAD_UNPCK(A2, B2, C2, D2); \ - y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \ - y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \ - y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \ - y##B2 = 
_mm256_permute2x128_si256(z##B, z##B2, 0x31); \ - y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \ - y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \ - y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \ - y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \ - y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \ - y##B = \ - _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64))); \ - y##C = \ - _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128))); \ - y##D = \ - _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192))); \ - y##A2 = \ - _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \ - y##B2 = \ - _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \ - y##C2 = \ - _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \ - y##D2 = \ - _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \ - _mm256_storeu_si256((__m256i*) (c + 0), y##A); \ - _mm256_storeu_si256((__m256i*) (c + 64), y##B); \ - _mm256_storeu_si256((__m256i*) (c + 128), y##C); \ - _mm256_storeu_si256((__m256i*) (c + 192), y##D); \ - _mm256_storeu_si256((__m256i*) (c + 256), y##A2); \ - _mm256_storeu_si256((__m256i*) (c + 320), y##B2); \ - _mm256_storeu_si256((__m256i*) (c + 384), y##C2); \ - _mm256_storeu_si256((__m256i*) (c + 448), y##D2); \ - } - - ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); - m += 32; - c += 32; - ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); - m -= 32; - c -= 32; - -#undef ONEQUAD -#undef ONEQUAD_TRANSPOSE -#undef ONEQUAD_UNPCK -#undef ONEOCTO - - bytes -= 512; - c += 512; - m += 512; - } -} +if (bytes >= 512) { + __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, + y15; + + /* the naive way seems as fast (if not a bit faster) than the vector way */ + __m256i z0 = _mm256_set1_epi32(x[0]); + __m256i z5 = _mm256_set1_epi32(x[1]); + __m256i z10 = _mm256_set1_epi32(x[2]); + __m256i z15 = _mm256_set1_epi32(x[3]); + __m256i z12 = _mm256_set1_epi32(x[4]); + __m256i z1 = _mm256_set1_epi32(x[5]); + __m256i z6 = _mm256_set1_epi32(x[6]); + __m256i z11 = _mm256_set1_epi32(x[7]); + __m256i z8; /* useless */ + __m256i z13 = _mm256_set1_epi32(x[9]); + __m256i z2 = _mm256_set1_epi32(x[10]); + __m256i z7 = _mm256_set1_epi32(x[11]); + __m256i z4 = _mm256_set1_epi32(x[12]); + __m256i z9; /* useless */ + __m256i z14 = _mm256_set1_epi32(x[14]); + __m256i z3 = _mm256_set1_epi32(x[15]); + + __m256i orig0 = z0; + __m256i orig1 = z1; + __m256i orig2 = z2; + __m256i orig3 = z3; + __m256i orig4 = z4; + __m256i orig5 = z5; + __m256i orig6 = z6; + __m256i orig7 = z7; + __m256i orig8; + __m256i orig9; + __m256i orig10 = z10; + __m256i orig11 = z11; + __m256i orig12 = z12; + __m256i orig13 = z13; + __m256i orig14 = z14; + __m256i orig15 = z15; + + uint32_t in8; + uint32_t in9; + int i; + + while (bytes >= 512) { + /* vector implementation for z8 and z9 */ + /* faster than the naive version for 8 blocks */ + const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0); + const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4); + const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256i t8, t9; + uint64_t in89; + + in8 = x[8]; + in9 = x[13]; /* see arrays above for the address translation */ + in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); + + z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89)); + + t8 = _mm256_add_epi64(addv8, z8); + t9 = _mm256_add_epi64(addv9, z9); + + z8 = _mm256_unpacklo_epi32(t8, t9); + z9 = _mm256_unpackhi_epi32(t8, t9); + + t8 = _mm256_unpacklo_epi32(z8, z9); + t9 = 
_mm256_unpackhi_epi32(z8, z9); + + /* required because unpack* are intra-lane */ + z8 = _mm256_permutevar8x32_epi32(t8, permute); + z9 = _mm256_permutevar8x32_epi32(t9, permute); + + orig8 = z8; + orig9 = z9; + + in89 += 8; + + x[8] = in89 & 0xFFFFFFFF; + x[13] = (in89 >> 32) & 0xFFFFFFFF; + + z5 = orig5; + z10 = orig10; + z15 = orig15; + z14 = orig14; + z3 = orig3; + z6 = orig6; + z11 = orig11; + z1 = orig1; + + z7 = orig7; + z13 = orig13; + z2 = orig2; + z9 = orig9; + z0 = orig0; + z12 = orig12; + z4 = orig4; + z8 = orig8; + + for (i = 0; i < ROUNDS; i += 2) { + /* the inner loop is a direct translation (regexp search/replace) + * from the amd64-xmm6 ASM */ + __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, + r14, r15; + + y4 = z12; + y4 = _mm256_add_epi32(y4, z0); + r4 = y4; + y4 = _mm256_slli_epi32(y4, 7); + z4 = _mm256_xor_si256(z4, y4); + r4 = _mm256_srli_epi32(r4, 25); + z4 = _mm256_xor_si256(z4, r4); + + y9 = z1; + y9 = _mm256_add_epi32(y9, z5); + r9 = y9; + y9 = _mm256_slli_epi32(y9, 7); + z9 = _mm256_xor_si256(z9, y9); + r9 = _mm256_srli_epi32(r9, 25); + z9 = _mm256_xor_si256(z9, r9); + + y8 = z0; + y8 = _mm256_add_epi32(y8, z4); + r8 = y8; + y8 = _mm256_slli_epi32(y8, 9); + z8 = _mm256_xor_si256(z8, y8); + r8 = _mm256_srli_epi32(r8, 23); + z8 = _mm256_xor_si256(z8, r8); + + y13 = z5; + y13 = _mm256_add_epi32(y13, z9); + r13 = y13; + y13 = _mm256_slli_epi32(y13, 9); + z13 = _mm256_xor_si256(z13, y13); + r13 = _mm256_srli_epi32(r13, 23); + z13 = _mm256_xor_si256(z13, r13); + + y12 = z4; + y12 = _mm256_add_epi32(y12, z8); + r12 = y12; + y12 = _mm256_slli_epi32(y12, 13); + z12 = _mm256_xor_si256(z12, y12); + r12 = _mm256_srli_epi32(r12, 19); + z12 = _mm256_xor_si256(z12, r12); + + y1 = z9; + y1 = _mm256_add_epi32(y1, z13); + r1 = y1; + y1 = _mm256_slli_epi32(y1, 13); + z1 = _mm256_xor_si256(z1, y1); + r1 = _mm256_srli_epi32(r1, 19); + z1 = _mm256_xor_si256(z1, r1); + + y0 = z8; + y0 = _mm256_add_epi32(y0, z12); + r0 = y0; + y0 = _mm256_slli_epi32(y0, 18); + z0 = _mm256_xor_si256(z0, y0); + r0 = _mm256_srli_epi32(r0, 14); + z0 = _mm256_xor_si256(z0, r0); + + y5 = z13; + y5 = _mm256_add_epi32(y5, z1); + r5 = y5; + y5 = _mm256_slli_epi32(y5, 18); + z5 = _mm256_xor_si256(z5, y5); + r5 = _mm256_srli_epi32(r5, 14); + z5 = _mm256_xor_si256(z5, r5); + + y14 = z6; + y14 = _mm256_add_epi32(y14, z10); + r14 = y14; + y14 = _mm256_slli_epi32(y14, 7); + z14 = _mm256_xor_si256(z14, y14); + r14 = _mm256_srli_epi32(r14, 25); + z14 = _mm256_xor_si256(z14, r14); + + y3 = z11; + y3 = _mm256_add_epi32(y3, z15); + r3 = y3; + y3 = _mm256_slli_epi32(y3, 7); + z3 = _mm256_xor_si256(z3, y3); + r3 = _mm256_srli_epi32(r3, 25); + z3 = _mm256_xor_si256(z3, r3); + + y2 = z10; + y2 = _mm256_add_epi32(y2, z14); + r2 = y2; + y2 = _mm256_slli_epi32(y2, 9); + z2 = _mm256_xor_si256(z2, y2); + r2 = _mm256_srli_epi32(r2, 23); + z2 = _mm256_xor_si256(z2, r2); + + y7 = z15; + y7 = _mm256_add_epi32(y7, z3); + r7 = y7; + y7 = _mm256_slli_epi32(y7, 9); + z7 = _mm256_xor_si256(z7, y7); + r7 = _mm256_srli_epi32(r7, 23); + z7 = _mm256_xor_si256(z7, r7); + + y6 = z14; + y6 = _mm256_add_epi32(y6, z2); + r6 = y6; + y6 = _mm256_slli_epi32(y6, 13); + z6 = _mm256_xor_si256(z6, y6); + r6 = _mm256_srli_epi32(r6, 19); + z6 = _mm256_xor_si256(z6, r6); + + y11 = z3; + y11 = _mm256_add_epi32(y11, z7); + r11 = y11; + y11 = _mm256_slli_epi32(y11, 13); + z11 = _mm256_xor_si256(z11, y11); + r11 = _mm256_srli_epi32(r11, 19); + z11 = _mm256_xor_si256(z11, r11); + + y10 = z2; + y10 = _mm256_add_epi32(y10, z6); + r10 = y10; + y10 
= _mm256_slli_epi32(y10, 18); + z10 = _mm256_xor_si256(z10, y10); + r10 = _mm256_srli_epi32(r10, 14); + z10 = _mm256_xor_si256(z10, r10); + + y1 = z3; + y1 = _mm256_add_epi32(y1, z0); + r1 = y1; + y1 = _mm256_slli_epi32(y1, 7); + z1 = _mm256_xor_si256(z1, y1); + r1 = _mm256_srli_epi32(r1, 25); + z1 = _mm256_xor_si256(z1, r1); + + y15 = z7; + y15 = _mm256_add_epi32(y15, z11); + r15 = y15; + y15 = _mm256_slli_epi32(y15, 18); + z15 = _mm256_xor_si256(z15, y15); + r15 = _mm256_srli_epi32(r15, 14); + z15 = _mm256_xor_si256(z15, r15); + + y6 = z4; + y6 = _mm256_add_epi32(y6, z5); + r6 = y6; + y6 = _mm256_slli_epi32(y6, 7); + z6 = _mm256_xor_si256(z6, y6); + r6 = _mm256_srli_epi32(r6, 25); + z6 = _mm256_xor_si256(z6, r6); + + y2 = z0; + y2 = _mm256_add_epi32(y2, z1); + r2 = y2; + y2 = _mm256_slli_epi32(y2, 9); + z2 = _mm256_xor_si256(z2, y2); + r2 = _mm256_srli_epi32(r2, 23); + z2 = _mm256_xor_si256(z2, r2); + + y7 = z5; + y7 = _mm256_add_epi32(y7, z6); + r7 = y7; + y7 = _mm256_slli_epi32(y7, 9); + z7 = _mm256_xor_si256(z7, y7); + r7 = _mm256_srli_epi32(r7, 23); + z7 = _mm256_xor_si256(z7, r7); + + y3 = z1; + y3 = _mm256_add_epi32(y3, z2); + r3 = y3; + y3 = _mm256_slli_epi32(y3, 13); + z3 = _mm256_xor_si256(z3, y3); + r3 = _mm256_srli_epi32(r3, 19); + z3 = _mm256_xor_si256(z3, r3); + + y4 = z6; + y4 = _mm256_add_epi32(y4, z7); + r4 = y4; + y4 = _mm256_slli_epi32(y4, 13); + z4 = _mm256_xor_si256(z4, y4); + r4 = _mm256_srli_epi32(r4, 19); + z4 = _mm256_xor_si256(z4, r4); + + y0 = z2; + y0 = _mm256_add_epi32(y0, z3); + r0 = y0; + y0 = _mm256_slli_epi32(y0, 18); + z0 = _mm256_xor_si256(z0, y0); + r0 = _mm256_srli_epi32(r0, 14); + z0 = _mm256_xor_si256(z0, r0); + + y5 = z7; + y5 = _mm256_add_epi32(y5, z4); + r5 = y5; + y5 = _mm256_slli_epi32(y5, 18); + z5 = _mm256_xor_si256(z5, y5); + r5 = _mm256_srli_epi32(r5, 14); + z5 = _mm256_xor_si256(z5, r5); + + y11 = z9; + y11 = _mm256_add_epi32(y11, z10); + r11 = y11; + y11 = _mm256_slli_epi32(y11, 7); + z11 = _mm256_xor_si256(z11, y11); + r11 = _mm256_srli_epi32(r11, 25); + z11 = _mm256_xor_si256(z11, r11); + + y12 = z14; + y12 = _mm256_add_epi32(y12, z15); + r12 = y12; + y12 = _mm256_slli_epi32(y12, 7); + z12 = _mm256_xor_si256(z12, y12); + r12 = _mm256_srli_epi32(r12, 25); + z12 = _mm256_xor_si256(z12, r12); + + y8 = z10; + y8 = _mm256_add_epi32(y8, z11); + r8 = y8; + y8 = _mm256_slli_epi32(y8, 9); + z8 = _mm256_xor_si256(z8, y8); + r8 = _mm256_srli_epi32(r8, 23); + z8 = _mm256_xor_si256(z8, r8); + + y13 = z15; + y13 = _mm256_add_epi32(y13, z12); + r13 = y13; + y13 = _mm256_slli_epi32(y13, 9); + z13 = _mm256_xor_si256(z13, y13); + r13 = _mm256_srli_epi32(r13, 23); + z13 = _mm256_xor_si256(z13, r13); + + y9 = z11; + y9 = _mm256_add_epi32(y9, z8); + r9 = y9; + y9 = _mm256_slli_epi32(y9, 13); + z9 = _mm256_xor_si256(z9, y9); + r9 = _mm256_srli_epi32(r9, 19); + z9 = _mm256_xor_si256(z9, r9); + + y14 = z12; + y14 = _mm256_add_epi32(y14, z13); + r14 = y14; + y14 = _mm256_slli_epi32(y14, 13); + z14 = _mm256_xor_si256(z14, y14); + r14 = _mm256_srli_epi32(r14, 19); + z14 = _mm256_xor_si256(z14, r14); + + y10 = z8; + y10 = _mm256_add_epi32(y10, z9); + r10 = y10; + y10 = _mm256_slli_epi32(y10, 18); + z10 = _mm256_xor_si256(z10, y10); + r10 = _mm256_srli_epi32(r10, 14); + z10 = _mm256_xor_si256(z10, r10); + + y15 = z13; + y15 = _mm256_add_epi32(y15, z14); + r15 = y15; + y15 = _mm256_slli_epi32(y15, 18); + z15 = _mm256_xor_si256(z15, y15); + r15 = _mm256_srli_epi32(r15, 14); + z15 = _mm256_xor_si256(z15, r15); + } + +/* store data ; this macro first transpose data 
in-registers, and then store + * them in memory. much faster with icc. */ +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + z##A = _mm256_add_epi32(z##A, orig##A); \ + z##B = _mm256_add_epi32(z##B, orig##B); \ + z##C = _mm256_add_epi32(z##C, orig##C); \ + z##D = _mm256_add_epi32(z##D, orig##D); \ + y##A = _mm256_unpacklo_epi32(z##A, z##B); \ + y##B = _mm256_unpacklo_epi32(z##C, z##D); \ + y##C = _mm256_unpackhi_epi32(z##A, z##B); \ + y##D = _mm256_unpackhi_epi32(z##C, z##D); \ + z##A = _mm256_unpacklo_epi64(y##A, y##B); \ + z##B = _mm256_unpackhi_epi64(y##A, y##B); \ + z##C = _mm256_unpacklo_epi64(y##C, y##D); \ + z##D = _mm256_unpackhi_epi64(y##C, y##D); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \ + _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \ + _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \ + _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \ + _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \ + _mm_loadu_si128((const __m128i*) (m + 256))); \ + _mm_storeu_si128((__m128i*) (c + 256), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \ + _mm_loadu_si128((const __m128i*) (m + 320))); \ + _mm_storeu_si128((__m128i*) (c + 320), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \ + _mm_loadu_si128((const __m128i*) (m + 384))); \ + _mm_storeu_si128((__m128i*) (c + 384), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \ + _mm_loadu_si128((const __m128i*) (m + 448))); \ + _mm_storeu_si128((__m128i*) (c + 448), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + +#define ONEQUAD_UNPCK(A, B, C, D) \ + { \ + z##A = _mm256_add_epi32(z##A, orig##A); \ + z##B = _mm256_add_epi32(z##B, orig##B); \ + z##C = _mm256_add_epi32(z##C, orig##C); \ + z##D = _mm256_add_epi32(z##D, orig##D); \ + y##A = _mm256_unpacklo_epi32(z##A, z##B); \ + y##B = _mm256_unpacklo_epi32(z##C, z##D); \ + y##C = _mm256_unpackhi_epi32(z##A, z##B); \ + y##D = _mm256_unpackhi_epi32(z##C, z##D); \ + z##A = _mm256_unpacklo_epi64(y##A, y##B); \ + z##B = _mm256_unpackhi_epi64(y##A, y##B); \ + z##C = _mm256_unpacklo_epi64(y##C, y##D); \ + z##D = _mm256_unpackhi_epi64(y##C, y##D); \ + } + +#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ + { \ + ONEQUAD_UNPCK(A, B, C, D); \ + ONEQUAD_UNPCK(A2, B2, C2, D2); \ + y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \ + y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \ + y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \ + y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \ + y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \ + y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \ + y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \ + y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \ + y##A = _mm256_xor_si256(y##A, \ + _mm256_loadu_si256((const __m256i*) (m + 0))); \ + y##B = _mm256_xor_si256( \ + y##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \ + y##C = _mm256_xor_si256( \ + y##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \ + y##D = _mm256_xor_si256( \ + y##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \ + y##A2 = _mm256_xor_si256( \ + 
y##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \ + y##B2 = _mm256_xor_si256( \ + y##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \ + y##C2 = _mm256_xor_si256( \ + y##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \ + y##D2 = _mm256_xor_si256( \ + y##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \ + _mm256_storeu_si256((__m256i*) (c + 0), y##A); \ + _mm256_storeu_si256((__m256i*) (c + 64), y##B); \ + _mm256_storeu_si256((__m256i*) (c + 128), y##C); \ + _mm256_storeu_si256((__m256i*) (c + 192), y##D); \ + _mm256_storeu_si256((__m256i*) (c + 256), y##A2); \ + _mm256_storeu_si256((__m256i*) (c + 320), y##B2); \ + _mm256_storeu_si256((__m256i*) (c + 384), y##C2); \ + _mm256_storeu_si256((__m256i*) (c + 448), y##D2); \ + } + + ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); + m += 32; + c += 32; + ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); + m -= 32; + c -= 32; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_UNPCK +#undef ONEOCTO + + bytes -= 512; + c += 512; + m += 512; + } +} diff --git a/libs/libsodium/src/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c b/libs/libsodium/src/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c index bfdfeedba3..239becfc9e 100644 --- a/libs/libsodium/src/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c +++ b/libs/libsodium/src/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c @@ -1,106 +1,106 @@ -/* -version 20140420 -D. J. Bernstein -Public domain. -*/ - -#include - -#include "crypto_core_salsa2012.h" -#include "crypto_stream_salsa2012.h" -#include "utils.h" - -int -crypto_stream_salsa2012(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!clen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (clen >= 64) { - crypto_core_salsa2012(c, in, kcopy, NULL); - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - clen -= 64; - c += 64; - } - if (clen) { - crypto_core_salsa2012(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)clen; ++i) { - c[i] = block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -int -crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!mlen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (mlen >= 64) { - crypto_core_salsa2012(block, in, kcopy, NULL); - for (i = 0; i < 64; ++i) { - c[i] = m[i] ^ block[i]; - } - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - mlen -= 64; - c += 64; - m += 64; - } - if (mlen) { - crypto_core_salsa2012(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)mlen; ++i) { - c[i] = m[i] ^ block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} +/* +version 20140420 +D. J. Bernstein +Public domain. 
+*/ + +#include + +#include "crypto_core_salsa2012.h" +#include "crypto_stream_salsa2012.h" +#include "utils.h" + +int +crypto_stream_salsa2012(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!clen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (clen >= 64) { + crypto_core_salsa2012(c, in, kcopy, NULL); + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + clen -= 64; + c += 64; + } + if (clen) { + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)clen; ++i) { + c[i] = block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +int +crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (mlen >= 64) { + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < 64; ++i) { + c[i] = m[i] ^ block[i]; + } + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)mlen; ++i) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} diff --git a/libs/libsodium/src/crypto_stream/salsa2012/stream_salsa2012.c b/libs/libsodium/src/crypto_stream/salsa2012/stream_salsa2012.c index d0cc0f68ee..506ec57624 100644 --- a/libs/libsodium/src/crypto_stream/salsa2012/stream_salsa2012.c +++ b/libs/libsodium/src/crypto_stream/salsa2012/stream_salsa2012.c @@ -1,26 +1,26 @@ -#include "crypto_stream_salsa2012.h" -#include "randombytes.h" - -size_t -crypto_stream_salsa2012_keybytes(void) -{ - return crypto_stream_salsa2012_KEYBYTES; -} - -size_t -crypto_stream_salsa2012_noncebytes(void) -{ - return crypto_stream_salsa2012_NONCEBYTES; -} - -size_t -crypto_stream_salsa2012_messagebytes_max(void) -{ - return crypto_stream_salsa2012_MESSAGEBYTES_MAX; -} - -void -crypto_stream_salsa2012_keygen(unsigned char k[crypto_stream_salsa2012_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_salsa2012_KEYBYTES); -} +#include "crypto_stream_salsa2012.h" +#include "randombytes.h" + +size_t +crypto_stream_salsa2012_keybytes(void) +{ + return crypto_stream_salsa2012_KEYBYTES; +} + +size_t +crypto_stream_salsa2012_noncebytes(void) +{ + return crypto_stream_salsa2012_NONCEBYTES; +} + +size_t +crypto_stream_salsa2012_messagebytes_max(void) +{ + return crypto_stream_salsa2012_MESSAGEBYTES_MAX; +} + +void +crypto_stream_salsa2012_keygen(unsigned char k[crypto_stream_salsa2012_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_salsa2012_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_stream/salsa208/ref/stream_salsa208_ref.c b/libs/libsodium/src/crypto_stream/salsa208/ref/stream_salsa208_ref.c index 7ec0c4e78e..e52a573d40 100644 --- 
a/libs/libsodium/src/crypto_stream/salsa208/ref/stream_salsa208_ref.c +++ b/libs/libsodium/src/crypto_stream/salsa208/ref/stream_salsa208_ref.c @@ -1,106 +1,106 @@ -/* -version 20140420 -D. J. Bernstein -Public domain. -*/ - -#include - -#include "crypto_core_salsa208.h" -#include "crypto_stream_salsa208.h" -#include "utils.h" - -int -crypto_stream_salsa208(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!clen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (clen >= 64) { - crypto_core_salsa208(c, in, kcopy, NULL); - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - clen -= 64; - c += 64; - } - if (clen) { - crypto_core_salsa208(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)clen; ++i) { - c[i] = block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -int -crypto_stream_salsa208_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!mlen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (mlen >= 64) { - crypto_core_salsa208(block, in, kcopy, NULL); - for (i = 0; i < 64; ++i) { - c[i] = m[i] ^ block[i]; - } - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - mlen -= 64; - c += 64; - m += 64; - } - if (mlen) { - crypto_core_salsa208(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)mlen; ++i) { - c[i] = m[i] ^ block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} +/* +version 20140420 +D. J. Bernstein +Public domain. 
+*/ + +#include + +#include "crypto_core_salsa208.h" +#include "crypto_stream_salsa208.h" +#include "utils.h" + +int +crypto_stream_salsa208(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!clen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (clen >= 64) { + crypto_core_salsa208(c, in, kcopy, NULL); + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + clen -= 64; + c += 64; + } + if (clen) { + crypto_core_salsa208(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)clen; ++i) { + c[i] = block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +int +crypto_stream_salsa208_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (mlen >= 64) { + crypto_core_salsa208(block, in, kcopy, NULL); + for (i = 0; i < 64; ++i) { + c[i] = m[i] ^ block[i]; + } + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { + crypto_core_salsa208(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)mlen; ++i) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} diff --git a/libs/libsodium/src/crypto_stream/salsa208/stream_salsa208.c b/libs/libsodium/src/crypto_stream/salsa208/stream_salsa208.c index b79bda5ec2..fb7111b407 100644 --- a/libs/libsodium/src/crypto_stream/salsa208/stream_salsa208.c +++ b/libs/libsodium/src/crypto_stream/salsa208/stream_salsa208.c @@ -1,26 +1,26 @@ -#include "crypto_stream_salsa208.h" -#include "randombytes.h" - -size_t -crypto_stream_salsa208_keybytes(void) -{ - return crypto_stream_salsa208_KEYBYTES; -} - -size_t -crypto_stream_salsa208_noncebytes(void) -{ - return crypto_stream_salsa208_NONCEBYTES; -} - -size_t -crypto_stream_salsa208_messagebytes_max(void) -{ - return crypto_stream_salsa208_MESSAGEBYTES_MAX; -} - -void -crypto_stream_salsa208_keygen(unsigned char k[crypto_stream_salsa208_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_salsa208_KEYBYTES); -} +#include "crypto_stream_salsa208.h" +#include "randombytes.h" + +size_t +crypto_stream_salsa208_keybytes(void) +{ + return crypto_stream_salsa208_KEYBYTES; +} + +size_t +crypto_stream_salsa208_noncebytes(void) +{ + return crypto_stream_salsa208_NONCEBYTES; +} + +size_t +crypto_stream_salsa208_messagebytes_max(void) +{ + return crypto_stream_salsa208_MESSAGEBYTES_MAX; +} + +void +crypto_stream_salsa208_keygen(unsigned char k[crypto_stream_salsa208_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_salsa208_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_stream/xchacha20/stream_xchacha20.c b/libs/libsodium/src/crypto_stream/xchacha20/stream_xchacha20.c index 8b1bc09abd..47807e0a44 100644 --- a/libs/libsodium/src/crypto_stream/xchacha20/stream_xchacha20.c +++ 
b/libs/libsodium/src/crypto_stream/xchacha20/stream_xchacha20.c @@ -1,69 +1,69 @@ - -#include - -#include "crypto_core_hchacha20.h" -#include "crypto_stream_chacha20.h" -#include "crypto_stream_xchacha20.h" -#include "private/common.h" -#include "randombytes.h" - -size_t -crypto_stream_xchacha20_keybytes(void) -{ - return crypto_stream_xchacha20_KEYBYTES; -} - -size_t -crypto_stream_xchacha20_noncebytes(void) -{ - return crypto_stream_xchacha20_NONCEBYTES; -} - -size_t -crypto_stream_xchacha20_messagebytes_max(void) -{ - return crypto_stream_xchacha20_MESSAGEBYTES_MAX; -} - -int -crypto_stream_xchacha20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - unsigned char k2[crypto_core_hchacha20_OUTPUTBYTES]; - - crypto_core_hchacha20(k2, n, k, NULL); - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES <= sizeof k2); - COMPILER_ASSERT(crypto_stream_chacha20_NONCEBYTES == - crypto_stream_xchacha20_NONCEBYTES - - crypto_core_hchacha20_INPUTBYTES); - - return crypto_stream_chacha20(c, clen, n + crypto_core_hchacha20_INPUTBYTES, - k2); -} - -int -crypto_stream_xchacha20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint64_t ic, const unsigned char *k) -{ - unsigned char k2[crypto_core_hchacha20_OUTPUTBYTES]; - - crypto_core_hchacha20(k2, n, k, NULL); - return crypto_stream_chacha20_xor_ic( - c, m, mlen, n + crypto_core_hchacha20_INPUTBYTES, ic, k2); -} - -int -crypto_stream_xchacha20_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - return crypto_stream_xchacha20_xor_ic(c, m, mlen, n, 0U, k); -} - -void -crypto_stream_xchacha20_keygen( - unsigned char k[crypto_stream_xchacha20_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_xchacha20_KEYBYTES); -} + +#include + +#include "crypto_core_hchacha20.h" +#include "crypto_stream_chacha20.h" +#include "crypto_stream_xchacha20.h" +#include "private/common.h" +#include "randombytes.h" + +size_t +crypto_stream_xchacha20_keybytes(void) +{ + return crypto_stream_xchacha20_KEYBYTES; +} + +size_t +crypto_stream_xchacha20_noncebytes(void) +{ + return crypto_stream_xchacha20_NONCEBYTES; +} + +size_t +crypto_stream_xchacha20_messagebytes_max(void) +{ + return crypto_stream_xchacha20_MESSAGEBYTES_MAX; +} + +int +crypto_stream_xchacha20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char k2[crypto_core_hchacha20_OUTPUTBYTES]; + + crypto_core_hchacha20(k2, n, k, NULL); + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES <= sizeof k2); + COMPILER_ASSERT(crypto_stream_chacha20_NONCEBYTES == + crypto_stream_xchacha20_NONCEBYTES - + crypto_core_hchacha20_INPUTBYTES); + + return crypto_stream_chacha20(c, clen, n + crypto_core_hchacha20_INPUTBYTES, + k2); +} + +int +crypto_stream_xchacha20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint64_t ic, const unsigned char *k) +{ + unsigned char k2[crypto_core_hchacha20_OUTPUTBYTES]; + + crypto_core_hchacha20(k2, n, k, NULL); + return crypto_stream_chacha20_xor_ic( + c, m, mlen, n + crypto_core_hchacha20_INPUTBYTES, ic, k2); +} + +int +crypto_stream_xchacha20_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + return crypto_stream_xchacha20_xor_ic(c, m, mlen, n, 0U, k); +} + +void +crypto_stream_xchacha20_keygen( + unsigned char k[crypto_stream_xchacha20_KEYBYTES]) +{ 
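+    /* fill k with crypto_stream_xchacha20_KEYBYTES bytes from the CSPRNG */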
+ randombytes_buf(k, crypto_stream_xchacha20_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_stream/xsalsa20/stream_xsalsa20.c b/libs/libsodium/src/crypto_stream/xsalsa20/stream_xsalsa20.c index dc831a94d8..30b2929794 100644 --- a/libs/libsodium/src/crypto_stream/xsalsa20/stream_xsalsa20.c +++ b/libs/libsodium/src/crypto_stream/xsalsa20/stream_xsalsa20.c @@ -1,66 +1,66 @@ -#include "crypto_core_hsalsa20.h" -#include "crypto_stream_salsa20.h" -#include "crypto_stream_xsalsa20.h" -#include "randombytes.h" -#include "utils.h" - -int -crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - unsigned char subkey[32]; - int ret; - - crypto_core_hsalsa20(subkey, n, k, NULL); - ret = crypto_stream_salsa20(c, clen, n + 16, subkey); - sodium_memzero(subkey, sizeof subkey); - - return ret; -} - -int -crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint64_t ic, const unsigned char *k) -{ - unsigned char subkey[32]; - int ret; - - crypto_core_hsalsa20(subkey, n, k, NULL); - ret = crypto_stream_salsa20_xor_ic(c, m, mlen, n + 16, ic, subkey); - sodium_memzero(subkey, sizeof subkey); - - return ret; -} - -int -crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - return crypto_stream_xsalsa20_xor_ic(c, m, mlen, n, 0ULL, k); -} - -size_t -crypto_stream_xsalsa20_keybytes(void) -{ - return crypto_stream_xsalsa20_KEYBYTES; -} - -size_t -crypto_stream_xsalsa20_noncebytes(void) -{ - return crypto_stream_xsalsa20_NONCEBYTES; -} - -size_t -crypto_stream_xsalsa20_messagebytes_max(void) -{ - return crypto_stream_xsalsa20_MESSAGEBYTES_MAX; -} - -void -crypto_stream_xsalsa20_keygen(unsigned char k[crypto_stream_xsalsa20_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_xsalsa20_KEYBYTES); -} +#include "crypto_core_hsalsa20.h" +#include "crypto_stream_salsa20.h" +#include "crypto_stream_xsalsa20.h" +#include "randombytes.h" +#include "utils.h" + +int +crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char subkey[32]; + int ret; + + crypto_core_hsalsa20(subkey, n, k, NULL); + ret = crypto_stream_salsa20(c, clen, n + 16, subkey); + sodium_memzero(subkey, sizeof subkey); + + return ret; +} + +int +crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint64_t ic, const unsigned char *k) +{ + unsigned char subkey[32]; + int ret; + + crypto_core_hsalsa20(subkey, n, k, NULL); + ret = crypto_stream_salsa20_xor_ic(c, m, mlen, n + 16, ic, subkey); + sodium_memzero(subkey, sizeof subkey); + + return ret; +} + +int +crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + return crypto_stream_xsalsa20_xor_ic(c, m, mlen, n, 0ULL, k); +} + +size_t +crypto_stream_xsalsa20_keybytes(void) +{ + return crypto_stream_xsalsa20_KEYBYTES; +} + +size_t +crypto_stream_xsalsa20_noncebytes(void) +{ + return crypto_stream_xsalsa20_NONCEBYTES; +} + +size_t +crypto_stream_xsalsa20_messagebytes_max(void) +{ + return crypto_stream_xsalsa20_MESSAGEBYTES_MAX; +} + +void +crypto_stream_xsalsa20_keygen(unsigned char k[crypto_stream_xsalsa20_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_xsalsa20_KEYBYTES); +} -- cgit v1.2.3
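
Notes on the code touched above (the C sketches below are illustrative
editor sketches, not part of the upstream patch):

1. The AVX2 Salsa20 rounds implement every 32-bit rotation as a
   shift-left/shift-right pair folded in with two XORs: the two shifted
   halves occupy disjoint bits, so z ^= rotl32(y, n) needs no OR. A
   scalar model of one such step:

    #include <stdint.h>

    /* z ^= rotl32(y, n) for 0 < n < 32, decomposed the way the
     * _mm256_slli_epi32/_mm256_srli_epi32 pairs above do it: the two
     * shifted halves have disjoint bits, so XOR-ing them in one at a
     * time equals XOR-ing in the full rotation. */
    static uint32_t
    quarter_step(uint32_t z, uint32_t y, int n)
    {
        z ^= y << n;
        z ^= y >> (32 - n);
        return z;
    }

2. The in89 bookkeeping splits the 64-bit Salsa20 block counter across
   state words 8 (low) and 13 (high) and advances it by 8 blocks, i.e.
   512 bytes, per pass. The same logic in scalar form:

    #include <stdint.h>

    /* Advance the 64-bit block counter held in x[8] (low) and x[13]
     * (high) by the 8 blocks processed per AVX2 pass. */
    static void
    bump_counter_by_8(uint32_t x[16])
    {
        uint64_t in89 = ((uint64_t) x[13] << 32) | x[8];

        in89 += 8;
        x[8]  = (uint32_t) (in89 & 0xFFFFFFFF);
        x[13] = (uint32_t) ((in89 >> 32) & 0xFFFFFFFF);
    }

3. The xsalsa20/xchacha20 wrappers above follow one pattern: derive a
   subkey with the H-function (crypto_core_hsalsa20/hchacha20), then run
   the underlying stream over the remaining nonce bytes. A minimal
   round-trip caller, assuming <sodium.h> is available (the *_xor call
   is unauthenticated keystream XOR, so applying it twice with the same
   key and nonce decrypts):

    #include <string.h>
    #include <sodium.h>

    int
    main(void)
    {
        unsigned char key[crypto_stream_xsalsa20_KEYBYTES];
        unsigned char nonce[crypto_stream_xsalsa20_NONCEBYTES];
        unsigned char msg[] = "example plaintext";
        unsigned char ct[sizeof msg];

        if (sodium_init() < 0) {
            return 1;
        }
        crypto_stream_xsalsa20_keygen(key);
        randombytes_buf(nonce, sizeof nonce);

        crypto_stream_xsalsa20_xor(ct, msg, sizeof msg, nonce, key); /* encrypt */
        crypto_stream_xsalsa20_xor(msg, ct, sizeof ct, nonce, key);  /* decrypt */

        return memcmp(msg, "example plaintext", sizeof msg) != 0;
    }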