summaryrefslogtreecommitdiff
path: root/libs/libsodium/src/crypto_stream/salsa20
diff options
context:
space:
mode:
Diffstat (limited to 'libs/libsodium/src/crypto_stream/salsa20')
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c120
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h8
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c100
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h16
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S960
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c31
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h8
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c131
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h8
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c122
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h8
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h195
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h207
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h547
-rw-r--r--libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h476
15 files changed, 2937 insertions, 0 deletions
diff --git a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c
new file mode 100644
index 0000000000..f0854ebf7e
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c
@@ -0,0 +1,120 @@
+/*
+version 20140420
+D. J. Bernstein
+Public domain.
+*/
+
+#include <stdint.h>
+
+#include "crypto_core_salsa20.h"
+#include "crypto_stream_salsa20.h"
+#include "utils.h"
+
+#include "../stream_salsa20.h"
+#include "salsa20_ref.h"
+
+#ifndef HAVE_AMD64_ASM
+
+/*
+ * Portable Salsa20 keystream generator.
+ *
+ * Fills c with clen keystream bytes derived from the 8-byte nonce n and the
+ * 32-byte key k.  The 64-bit block counter occupies bytes 8..15 of the core
+ * input, least significant byte first, and starts at zero.
+ * Always returns 0.
+ */
+static int
+stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n,
+           const unsigned char *k)
+{
+    unsigned char core_in[16];
+    unsigned char last_block[64];
+    unsigned char key[32];
+    unsigned int  pos;
+    unsigned int  carry;
+
+    if (clen == 0) {
+        return 0;
+    }
+    for (pos = 0; pos < 32; pos++) {
+        key[pos] = k[pos];
+    }
+    /* core input: nonce in bytes 0..7, zeroed block counter in bytes 8..15 */
+    for (pos = 0; pos < 16; pos++) {
+        core_in[pos] = (pos < 8) ? n[pos] : 0;
+    }
+    /* whole blocks are produced directly in the caller's buffer */
+    for (; clen >= 64; clen -= 64, c += 64) {
+        crypto_core_salsa20(c, core_in, key, NULL);
+        /* counter += 1, rippling the carry through all eight bytes */
+        carry = 1;
+        for (pos = 8; pos < 16; pos++) {
+            carry += (unsigned int) core_in[pos];
+            core_in[pos] = (unsigned char) carry;
+            carry >>= 8;
+        }
+    }
+    /* trailing partial block: generate 64 bytes aside, copy what is needed */
+    if (clen != 0) {
+        crypto_core_salsa20(last_block, core_in, key, NULL);
+        for (pos = 0; pos < (unsigned int) clen; pos++) {
+            c[pos] = last_block[pos];
+        }
+    }
+    sodium_memzero(last_block, sizeof last_block);
+    sodium_memzero(key, sizeof key);
+
+    return 0;
+}
+
+/*
+ * Portable Salsa20 encryption with an explicit initial block counter.
+ *
+ * Writes c[i] = m[i] ^ keystream[i] for mlen bytes.  The keystream is derived
+ * from the 8-byte nonce n and the 32-byte key k, with the 64-bit block
+ * counter starting at ic (stored least significant byte first in bytes 8..15
+ * of the core input).
+ * Always returns 0.
+ */
+static int
+stream_ref_xor_ic(unsigned char *c, const unsigned char *m,
+                  unsigned long long mlen, const unsigned char *n, uint64_t ic,
+                  const unsigned char *k)
+{
+    unsigned char core_in[16];
+    unsigned char keystream[64];
+    unsigned char key[32];
+    unsigned int  pos;
+    unsigned int  carry;
+
+    if (mlen == 0) {
+        return 0;
+    }
+    for (pos = 0; pos < 32; pos++) {
+        key[pos] = k[pos];
+    }
+    /* core input: nonce in bytes 0..7, initial counter in bytes 8..15 */
+    for (pos = 0; pos < 16; pos++) {
+        if (pos < 8) {
+            core_in[pos] = n[pos];
+        } else {
+            core_in[pos] = (unsigned char) (ic & 0xff);
+            ic >>= 8;
+        }
+    }
+    for (; mlen >= 64; mlen -= 64, c += 64, m += 64) {
+        crypto_core_salsa20(keystream, core_in, key, NULL);
+        for (pos = 0; pos < 64; pos++) {
+            c[pos] = m[pos] ^ keystream[pos];
+        }
+        /* counter += 1, rippling the carry through all eight bytes */
+        carry = 1;
+        for (pos = 8; pos < 16; pos++) {
+            carry += (unsigned int) core_in[pos];
+            core_in[pos] = (unsigned char) carry;
+            carry >>= 8;
+        }
+    }
+    /* trailing partial block */
+    if (mlen != 0) {
+        crypto_core_salsa20(keystream, core_in, key, NULL);
+        for (pos = 0; pos < (unsigned int) mlen; pos++) {
+            c[pos] = m[pos] ^ keystream[pos];
+        }
+    }
+    sodium_memzero(keystream, sizeof keystream);
+    sodium_memzero(key, sizeof key);
+
+    return 0;
+}
+
+struct crypto_stream_salsa20_implementation
+ crypto_stream_salsa20_ref_implementation = { /* function table exposing the portable reference backend of this file */
+ SODIUM_C99(.stream =) stream_ref, /* keystream only, block counter starts at 0 */
+ SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, /* XOR with a caller-supplied initial block counter */
+ };
+
+#endif
diff --git a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h
new file mode 100644
index 0000000000..8716cb4048
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h
@@ -0,0 +1,8 @@
+
+#include <stdint.h>
+
+#include "../stream_salsa20.h"
+#include "crypto_stream_salsa20.h"
+
+extern struct crypto_stream_salsa20_implementation
+ crypto_stream_salsa20_ref_implementation; /* defined in salsa20_ref.c, only when HAVE_AMD64_ASM is not defined */
diff --git a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c
new file mode 100644
index 0000000000..4529850136
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c
@@ -0,0 +1,100 @@
+#include "crypto_stream_salsa20.h"
+#include "private/common.h"
+#include "private/implementations.h"
+#include "randombytes.h"
+#include "runtime.h"
+#include "stream_salsa20.h"
+
+#ifdef HAVE_AMD64_ASM
+# include "xmm6/salsa20_xmm6.h"
+#else
+# include "ref/salsa20_ref.h"
+#endif
+#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H)
+# include "xmm6int/salsa20_xmm6int-sse2.h"
+#endif
+#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
+ defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)
+# include "xmm6int/salsa20_xmm6int-avx2.h"
+#endif
+
+/*
+ * Backend used by the public crypto_stream_salsa20*() functions until
+ * _crypto_stream_salsa20_pick_best_implementation() selects another one.
+ *
+ * The guard is #ifdef (not #if) so that it matches the header selection at
+ * the top of this file: whichever implementation is referenced here is the
+ * one whose declaration was included, even if HAVE_AMD64_ASM is defined
+ * without a value or as 0.
+ */
+#ifdef HAVE_AMD64_ASM
+static const crypto_stream_salsa20_implementation *implementation =
+    &crypto_stream_salsa20_xmm6_implementation;
+#else
+static const crypto_stream_salsa20_implementation *implementation =
+    &crypto_stream_salsa20_ref_implementation;
+#endif
+
+size_t
+crypto_stream_salsa20_keybytes(void) /* function form of the crypto_stream_salsa20_KEYBYTES constant */
+{
+ return crypto_stream_salsa20_KEYBYTES;
+}
+
+size_t
+crypto_stream_salsa20_noncebytes(void) /* function form of the crypto_stream_salsa20_NONCEBYTES constant */
+{
+ return crypto_stream_salsa20_NONCEBYTES;
+}
+
+size_t
+crypto_stream_salsa20_messagebytes_max(void) /* function form of the crypto_stream_salsa20_MESSAGEBYTES_MAX constant */
+{
+ return crypto_stream_salsa20_MESSAGEBYTES_MAX;
+}
+
+int
+crypto_stream_salsa20(unsigned char *c, unsigned long long clen, /* write clen keystream bytes for nonce n and key k into c */
+ const unsigned char *n, const unsigned char *k)
+{
+ return implementation->stream(c, clen, n, k); /* forwards to the currently selected backend and returns its result */
+}
+
+int
+crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m, /* c = m XOR keystream, with the block counter starting at ic */
+ unsigned long long mlen,
+ const unsigned char *n, uint64_t ic,
+ const unsigned char *k)
+{
+ return implementation->stream_xor_ic(c, m, mlen, n, ic, k); /* forwards to the currently selected backend and returns its result */
+}
+
+int
+crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m, /* c = m XOR keystream, with the block counter starting at 0 */
+ unsigned long long mlen, const unsigned char *n,
+ const unsigned char *k)
+{
+ return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); /* same backend entry as the _xor_ic variant, with ic fixed to 0 */
+}
+
+void
+crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]) /* fill k with a freshly generated key */
+{
+ randombytes_buf(k, crypto_stream_salsa20_KEYBYTES); /* all KEYBYTES bytes come from randombytes_buf() */
+}
+
+int
+_crypto_stream_salsa20_pick_best_implementation(void) /* choose the backend used by the public functions; always returns 0 */
+{
+#ifdef HAVE_AMD64_ASM
+ implementation = &crypto_stream_salsa20_xmm6_implementation; /* baseline when the amd64 assembly is built */
+#else
+ implementation = &crypto_stream_salsa20_ref_implementation; /* portable baseline otherwise */
+#endif
+
+#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
+ defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)
+ if (sodium_runtime_has_avx2()) { /* run-time check: the AVX2 backend overrides the baseline */
+ implementation = &crypto_stream_salsa20_xmm6int_avx2_implementation;
+ return 0;
+ }
+#endif
+#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H)
+ if (sodium_runtime_has_sse2()) { /* SSE2 intrinsics backend, only compiled in when the assembly is not */
+ implementation = &crypto_stream_salsa20_xmm6int_sse2_implementation;
+ return 0;
+ }
+#endif
+ return 0; /* LCOV_EXCL_LINE */
+}
diff --git a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h
new file mode 100644
index 0000000000..1949d38113
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h
@@ -0,0 +1,16 @@
+
+#ifndef stream_salsa20_H
+#define stream_salsa20_H
+
+#include <stdint.h>
+
+typedef struct crypto_stream_salsa20_implementation { /* function table provided by each Salsa20 backend */
+ int (*stream)(unsigned char *c, unsigned long long clen, /* write clen keystream bytes into c */
+ const unsigned char *n, const unsigned char *k);
+ int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, /* c = m XOR keystream, block counter starting at ic */
+ unsigned long long mlen,
+ const unsigned char *n, uint64_t ic,
+ const unsigned char *k);
+} crypto_stream_salsa20_implementation;
+
+#endif
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S
new file mode 100644
index 0000000000..6d9f354e10
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S
@@ -0,0 +1,960 @@
+#ifdef HAVE_AMD64_ASM
+
+.text
+.p2align 5
+
+#ifdef ASM_HIDE_SYMBOL
+ASM_HIDE_SYMBOL stream_salsa20_xmm6
+ASM_HIDE_SYMBOL _stream_salsa20_xmm6
+#endif
+.globl stream_salsa20_xmm6
+.globl _stream_salsa20_xmm6
+#ifdef __ELF__
+.type stream_salsa20_xmm6, @function
+.type _stream_salsa20_xmm6, @function
+#endif
+stream_salsa20_xmm6: /* keystream-only entry; assuming SysV AMD64 argument order for the C prototype: rdi=c, rsi=clen, rdx=n, rcx=k */
+_stream_salsa20_xmm6:
+mov %rsp,%r11
+and $31,%r11
+add $512,%r11 /* frame size = 512 + (rsp & 31), which leaves rsp 32-byte aligned after the sub */
+sub %r11,%rsp
+movq %r11,416(%rsp) /* keep the frame size so ._done can undo the adjustment */
+movq %r12,424(%rsp) /* r12-r15, rbx and rbp are saved here and reloaded at ._done */
+movq %r13,432(%rsp)
+movq %r14,440(%rsp)
+movq %r15,448(%rsp)
+movq %rbx,456(%rsp)
+movq %rbp,464(%rsp)
+mov %rsi,%r9 /* r9 = clen (remaining byte count from here on) */
+mov %rdi,%rdi
+mov %rdi,%rsi /* input pointer = output pointer: the shared body XORs keystream into the zeroed output */
+mov %rdx,%rdx
+mov %rcx,%r10 /* r10 = k */
+cmp $0,%r9
+jbe ._done /* clen == 0: nothing to generate */
+mov $0,%rax
+mov %r9,%rcx
+rep stosb /* zero all clen output bytes */
+sub %r9,%rdi /* rep stosb advanced rdi by clen; rewind to the start of c */
+movq $0,472(%rsp) /* 64-bit block counter starts at 0 */
+jmp ._start /* continue in the body shared with stream_salsa20_xmm6_xor_ic */
+
+.text
+.p2align 5
+
+#ifdef ASM_HIDE_SYMBOL
+ASM_HIDE_SYMBOL stream_salsa20_xmm6_xor_ic
+ASM_HIDE_SYMBOL _stream_salsa20_xmm6_xor_ic
+#endif
+.globl stream_salsa20_xmm6_xor_ic
+.globl _stream_salsa20_xmm6_xor_ic
+#ifdef __ELF__
+.type stream_salsa20_xmm6_xor_ic, @function
+.type _stream_salsa20_xmm6_xor_ic, @function
+#endif
+stream_salsa20_xmm6_xor_ic: /* XOR entry; assuming SysV AMD64 argument order for the C prototype: rdi=c, rsi=m, rdx=mlen, rcx=n, r8=ic, r9=k */
+_stream_salsa20_xmm6_xor_ic:
+
+mov %rsp,%r11
+and $31,%r11
+add $512,%r11 /* frame size = 512 + (rsp & 31), which leaves rsp 32-byte aligned after the sub */
+sub %r11,%rsp
+movq %r11,416(%rsp) /* keep the frame size so ._done can undo the adjustment */
+movq %r12,424(%rsp) /* r12-r15, rbx and rbp are saved here and reloaded at ._done */
+movq %r13,432(%rsp)
+movq %r14,440(%rsp)
+movq %r15,448(%rsp)
+movq %rbx,456(%rsp)
+movq %rbp,464(%rsp)
+mov %rdi,%rdi
+mov %rsi,%rsi
+mov %r9,%r10 /* r10 = k (must be moved before r9 is reused for the length) */
+movq %r8,472(%rsp) /* 64-bit block counter starts at ic */
+mov %rdx,%r9 /* r9 = mlen (remaining byte count from here on) */
+mov %rcx,%rdx /* rdx = n */
+cmp $0,%r9
+jbe ._done /* mlen == 0: nothing to do */
+
+._start: /* shared body: on entry rdi=out, rsi=in, r9=length, r10=k, rdx=n, counter at 472(%rsp) */
+movl 20(%r10),%ecx
+movl 0(%r10),%r8d
+movl 0(%rdx),%eax
+movl 16(%r10),%r11d
+movl %ecx,64(%rsp) /* row at 64(%rsp): k[20..23], k[0..3], n[0..3], k[16..19] */
+movl %r8d,4+64(%rsp)
+movl %eax,8+64(%rsp)
+movl %r11d,12+64(%rsp)
+movl 24(%r10),%r8d
+movl 4(%r10),%eax
+movl 4(%rdx),%edx
+movq 472(%rsp),%rcx /* rcx = 64-bit block counter */
+movl %ecx,80(%rsp) /* row at 80(%rsp): counter low word, k[24..27], k[4..7], n[4..7] */
+movl %r8d,4+80(%rsp)
+movl %eax,8+80(%rsp)
+movl %edx,12+80(%rsp)
+movl 12(%r10),%edx
+shr $32,%rcx /* ecx = counter high word */
+movl 28(%r10),%r8d
+movl 8(%r10),%eax
+movl %edx,96(%rsp) /* row at 96(%rsp): k[12..15], counter high word, k[28..31], k[8..11] */
+movl %ecx,4+96(%rsp)
+movl %r8d,8+96(%rsp)
+movl %eax,12+96(%rsp)
+mov $1634760805,%rdx /* 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574: ASCII "expand 32-byte k" as little-endian words */
+mov $857760878,%rcx
+mov $2036477234,%r8
+mov $1797285236,%rax
+movl %edx,112(%rsp) /* row at 112(%rsp): the four constants */
+movl %ecx,4+112(%rsp)
+movl %r8d,8+112(%rsp)
+movl %eax,12+112(%rsp)
+cmp $256,%r9
+jb ._bytesbetween1and255 /* fewer than 256 bytes: use the one-block-at-a-time path */
+movdqa 112(%rsp),%xmm0 /* from here: replicate each state word into all four lanes of its own 16-byte slot */
+pshufd $0x55,%xmm0,%xmm1
+pshufd $0xaa,%xmm0,%xmm2
+pshufd $0xff,%xmm0,%xmm3
+pshufd $0x00,%xmm0,%xmm0
+movdqa %xmm1,128(%rsp)
+movdqa %xmm2,144(%rsp)
+movdqa %xmm3,160(%rsp)
+movdqa %xmm0,176(%rsp)
+movdqa 64(%rsp),%xmm0
+pshufd $0xaa,%xmm0,%xmm1
+pshufd $0xff,%xmm0,%xmm2
+pshufd $0x00,%xmm0,%xmm3
+pshufd $0x55,%xmm0,%xmm0
+movdqa %xmm1,192(%rsp)
+movdqa %xmm2,208(%rsp)
+movdqa %xmm3,224(%rsp)
+movdqa %xmm0,240(%rsp)
+movdqa 80(%rsp),%xmm0 /* element 0 of this row (counter low word) is not broadcast; its lanes are built at ._bytesatleast256 */
+pshufd $0xff,%xmm0,%xmm1
+pshufd $0x55,%xmm0,%xmm2
+pshufd $0xaa,%xmm0,%xmm0
+movdqa %xmm1,256(%rsp)
+movdqa %xmm2,272(%rsp)
+movdqa %xmm0,288(%rsp)
+movdqa 96(%rsp),%xmm0 /* element 1 of this row (counter high word) is not broadcast either */
+pshufd $0x00,%xmm0,%xmm1
+pshufd $0xaa,%xmm0,%xmm2
+pshufd $0xff,%xmm0,%xmm0
+movdqa %xmm1,304(%rsp)
+movdqa %xmm2,320(%rsp)
+movdqa %xmm0,336(%rsp)
+
+.p2align 4
+._bytesatleast256: /* top of the 256-byte loop: four consecutive 64-byte blocks are computed in parallel */
+movq 472(%rsp),%rdx /* rdx = block counter of the first of the four blocks */
+mov %rdx,%rcx
+shr $32,%rcx
+movl %edx,352(%rsp) /* lane 0: counter low word at 352(%rsp), high word at 368(%rsp) */
+movl %ecx,368(%rsp)
+add $1,%rdx
+mov %rdx,%rcx
+shr $32,%rcx
+movl %edx,4+352(%rsp) /* lane 1: counter + 1 */
+movl %ecx,4+368(%rsp)
+add $1,%rdx
+mov %rdx,%rcx
+shr $32,%rcx
+movl %edx,8+352(%rsp) /* lane 2: counter + 2 */
+movl %ecx,8+368(%rsp)
+add $1,%rdx
+mov %rdx,%rcx
+shr $32,%rcx
+movl %edx,12+352(%rsp) /* lane 3: counter + 3 */
+movl %ecx,12+368(%rsp)
+add $1,%rdx
+mov %rdx,%rcx
+shr $32,%rcx
+movl %edx,80(%rsp) /* counter + 4 goes back into the scalar state rows for whatever runs next */
+movl %ecx,4+96(%rsp)
+movq %rdx,472(%rsp)
+movq %r9,480(%rsp) /* save the remaining length; r9 is reused as scratch when writing output */
+mov $20,%rdx /* rdx = round counter */
+movdqa 128(%rsp),%xmm0 /* load the sixteen broadcast state vectors into xmm0-xmm15 */
+movdqa 144(%rsp),%xmm1
+movdqa 160(%rsp),%xmm2
+movdqa 320(%rsp),%xmm3
+movdqa 336(%rsp),%xmm4
+movdqa 192(%rsp),%xmm5
+movdqa 208(%rsp),%xmm6
+movdqa 240(%rsp),%xmm7
+movdqa 256(%rsp),%xmm8
+movdqa 272(%rsp),%xmm9
+movdqa 288(%rsp),%xmm10
+movdqa 368(%rsp),%xmm11
+movdqa 176(%rsp),%xmm12
+movdqa 224(%rsp),%xmm13
+movdqa 304(%rsp),%xmm14
+movdqa 352(%rsp),%xmm15
+
+.p2align 4
+._mainloop1: /* round loop for four blocks at once; each add / shift-left / shift-right / xor group is one "x ^= rotl(a + b, r)" step with r in 7, 9, 13, 18 */
+movdqa %xmm1,384(%rsp) /* all 16 xmm registers hold state, so two of them are spilled to 384/400(%rsp) to free xmm1/xmm2 as temporaries */
+movdqa %xmm2,400(%rsp)
+movdqa %xmm13,%xmm1
+paddd %xmm12,%xmm1
+movdqa %xmm1,%xmm2
+pslld $7,%xmm1
+pxor %xmm1,%xmm14
+psrld $25,%xmm2
+pxor %xmm2,%xmm14
+movdqa %xmm7,%xmm1
+paddd %xmm0,%xmm1
+movdqa %xmm1,%xmm2
+pslld $7,%xmm1
+pxor %xmm1,%xmm11
+psrld $25,%xmm2
+pxor %xmm2,%xmm11
+movdqa %xmm12,%xmm1
+paddd %xmm14,%xmm1
+movdqa %xmm1,%xmm2
+pslld $9,%xmm1
+pxor %xmm1,%xmm15
+psrld $23,%xmm2
+pxor %xmm2,%xmm15
+movdqa %xmm0,%xmm1
+paddd %xmm11,%xmm1
+movdqa %xmm1,%xmm2
+pslld $9,%xmm1
+pxor %xmm1,%xmm9
+psrld $23,%xmm2
+pxor %xmm2,%xmm9
+movdqa %xmm14,%xmm1
+paddd %xmm15,%xmm1
+movdqa %xmm1,%xmm2
+pslld $13,%xmm1
+pxor %xmm1,%xmm13
+psrld $19,%xmm2
+pxor %xmm2,%xmm13
+movdqa %xmm11,%xmm1
+paddd %xmm9,%xmm1
+movdqa %xmm1,%xmm2
+pslld $13,%xmm1
+pxor %xmm1,%xmm7
+psrld $19,%xmm2
+pxor %xmm2,%xmm7
+movdqa %xmm15,%xmm1
+paddd %xmm13,%xmm1
+movdqa %xmm1,%xmm2
+pslld $18,%xmm1
+pxor %xmm1,%xmm12
+psrld $14,%xmm2
+pxor %xmm2,%xmm12
+movdqa 384(%rsp),%xmm1 /* swap: reload the spilled word and park xmm12 in its slot */
+movdqa %xmm12,384(%rsp)
+movdqa %xmm9,%xmm2
+paddd %xmm7,%xmm2
+movdqa %xmm2,%xmm12
+pslld $18,%xmm2
+pxor %xmm2,%xmm0
+psrld $14,%xmm12
+pxor %xmm12,%xmm0
+movdqa %xmm5,%xmm2
+paddd %xmm1,%xmm2
+movdqa %xmm2,%xmm12
+pslld $7,%xmm2
+pxor %xmm2,%xmm3
+psrld $25,%xmm12
+pxor %xmm12,%xmm3
+movdqa 400(%rsp),%xmm2 /* swap: reload the second spilled word and park xmm0 in its slot */
+movdqa %xmm0,400(%rsp)
+movdqa %xmm6,%xmm0
+paddd %xmm2,%xmm0
+movdqa %xmm0,%xmm12
+pslld $7,%xmm0
+pxor %xmm0,%xmm4
+psrld $25,%xmm12
+pxor %xmm12,%xmm4
+movdqa %xmm1,%xmm0
+paddd %xmm3,%xmm0
+movdqa %xmm0,%xmm12
+pslld $9,%xmm0
+pxor %xmm0,%xmm10
+psrld $23,%xmm12
+pxor %xmm12,%xmm10
+movdqa %xmm2,%xmm0
+paddd %xmm4,%xmm0
+movdqa %xmm0,%xmm12
+pslld $9,%xmm0
+pxor %xmm0,%xmm8
+psrld $23,%xmm12
+pxor %xmm12,%xmm8
+movdqa %xmm3,%xmm0
+paddd %xmm10,%xmm0
+movdqa %xmm0,%xmm12
+pslld $13,%xmm0
+pxor %xmm0,%xmm5
+psrld $19,%xmm12
+pxor %xmm12,%xmm5
+movdqa %xmm4,%xmm0
+paddd %xmm8,%xmm0
+movdqa %xmm0,%xmm12
+pslld $13,%xmm0
+pxor %xmm0,%xmm6
+psrld $19,%xmm12
+pxor %xmm12,%xmm6
+movdqa %xmm10,%xmm0
+paddd %xmm5,%xmm0
+movdqa %xmm0,%xmm12
+pslld $18,%xmm0
+pxor %xmm0,%xmm1
+psrld $14,%xmm12
+pxor %xmm12,%xmm1
+movdqa 384(%rsp),%xmm0
+movdqa %xmm1,384(%rsp)
+movdqa %xmm4,%xmm1
+paddd %xmm0,%xmm1
+movdqa %xmm1,%xmm12
+pslld $7,%xmm1
+pxor %xmm1,%xmm7
+psrld $25,%xmm12
+pxor %xmm12,%xmm7
+movdqa %xmm8,%xmm1
+paddd %xmm6,%xmm1
+movdqa %xmm1,%xmm12
+pslld $18,%xmm1
+pxor %xmm1,%xmm2
+psrld $14,%xmm12
+pxor %xmm12,%xmm2
+movdqa 400(%rsp),%xmm12
+movdqa %xmm2,400(%rsp)
+movdqa %xmm14,%xmm1
+paddd %xmm12,%xmm1
+movdqa %xmm1,%xmm2
+pslld $7,%xmm1
+pxor %xmm1,%xmm5
+psrld $25,%xmm2
+pxor %xmm2,%xmm5
+movdqa %xmm0,%xmm1
+paddd %xmm7,%xmm1
+movdqa %xmm1,%xmm2
+pslld $9,%xmm1
+pxor %xmm1,%xmm10
+psrld $23,%xmm2
+pxor %xmm2,%xmm10
+movdqa %xmm12,%xmm1
+paddd %xmm5,%xmm1
+movdqa %xmm1,%xmm2
+pslld $9,%xmm1
+pxor %xmm1,%xmm8
+psrld $23,%xmm2
+pxor %xmm2,%xmm8
+movdqa %xmm7,%xmm1
+paddd %xmm10,%xmm1
+movdqa %xmm1,%xmm2
+pslld $13,%xmm1
+pxor %xmm1,%xmm4
+psrld $19,%xmm2
+pxor %xmm2,%xmm4
+movdqa %xmm5,%xmm1
+paddd %xmm8,%xmm1
+movdqa %xmm1,%xmm2
+pslld $13,%xmm1
+pxor %xmm1,%xmm14
+psrld $19,%xmm2
+pxor %xmm2,%xmm14
+movdqa %xmm10,%xmm1
+paddd %xmm4,%xmm1
+movdqa %xmm1,%xmm2
+pslld $18,%xmm1
+pxor %xmm1,%xmm0
+psrld $14,%xmm2
+pxor %xmm2,%xmm0
+movdqa 384(%rsp),%xmm1
+movdqa %xmm0,384(%rsp)
+movdqa %xmm8,%xmm0
+paddd %xmm14,%xmm0
+movdqa %xmm0,%xmm2
+pslld $18,%xmm0
+pxor %xmm0,%xmm12
+psrld $14,%xmm2
+pxor %xmm2,%xmm12
+movdqa %xmm11,%xmm0
+paddd %xmm1,%xmm0
+movdqa %xmm0,%xmm2
+pslld $7,%xmm0
+pxor %xmm0,%xmm6
+psrld $25,%xmm2
+pxor %xmm2,%xmm6
+movdqa 400(%rsp),%xmm2
+movdqa %xmm12,400(%rsp)
+movdqa %xmm3,%xmm0
+paddd %xmm2,%xmm0
+movdqa %xmm0,%xmm12
+pslld $7,%xmm0
+pxor %xmm0,%xmm13
+psrld $25,%xmm12
+pxor %xmm12,%xmm13
+movdqa %xmm1,%xmm0
+paddd %xmm6,%xmm0
+movdqa %xmm0,%xmm12
+pslld $9,%xmm0
+pxor %xmm0,%xmm15
+psrld $23,%xmm12
+pxor %xmm12,%xmm15
+movdqa %xmm2,%xmm0
+paddd %xmm13,%xmm0
+movdqa %xmm0,%xmm12
+pslld $9,%xmm0
+pxor %xmm0,%xmm9
+psrld $23,%xmm12
+pxor %xmm12,%xmm9
+movdqa %xmm6,%xmm0
+paddd %xmm15,%xmm0
+movdqa %xmm0,%xmm12
+pslld $13,%xmm0
+pxor %xmm0,%xmm11
+psrld $19,%xmm12
+pxor %xmm12,%xmm11
+movdqa %xmm13,%xmm0
+paddd %xmm9,%xmm0
+movdqa %xmm0,%xmm12
+pslld $13,%xmm0
+pxor %xmm0,%xmm3
+psrld $19,%xmm12
+pxor %xmm12,%xmm3
+movdqa %xmm15,%xmm0
+paddd %xmm11,%xmm0
+movdqa %xmm0,%xmm12
+pslld $18,%xmm0
+pxor %xmm0,%xmm1
+psrld $14,%xmm12
+pxor %xmm12,%xmm1
+movdqa %xmm9,%xmm0
+paddd %xmm3,%xmm0
+movdqa %xmm0,%xmm12
+pslld $18,%xmm0
+pxor %xmm0,%xmm2
+psrld $14,%xmm12
+pxor %xmm12,%xmm2
+movdqa 384(%rsp),%xmm12 /* bring the two parked words back so the register assignment matches the loop entry */
+movdqa 400(%rsp),%xmm0
+sub $2,%rdx /* the round counter drops by 2 per pass: 20 -> 0, i.e. ten passes */
+ja ._mainloop1
+
+paddd 176(%rsp),%xmm12 /* feed-forward: add the saved input words back; these four registers become output bytes 0..15 of each block */
+paddd 240(%rsp),%xmm7
+paddd 288(%rsp),%xmm10
+paddd 336(%rsp),%xmm4
+movd %xmm12,%rdx /* lane 0 = first of the four blocks */
+movd %xmm7,%rcx
+movd %xmm10,%r8
+movd %xmm4,%r9
+pshufd $0x39,%xmm12,%xmm12 /* rotate the lanes so the next block's word moves into lane 0 */
+pshufd $0x39,%xmm7,%xmm7
+pshufd $0x39,%xmm10,%xmm10
+pshufd $0x39,%xmm4,%xmm4
+xorl 0(%rsi),%edx /* XOR with the input and store to the output, block 0 */
+xorl 4(%rsi),%ecx
+xorl 8(%rsi),%r8d
+xorl 12(%rsi),%r9d
+movl %edx,0(%rdi)
+movl %ecx,4(%rdi)
+movl %r8d,8(%rdi)
+movl %r9d,12(%rdi)
+movd %xmm12,%rdx
+movd %xmm7,%rcx
+movd %xmm10,%r8
+movd %xmm4,%r9
+pshufd $0x39,%xmm12,%xmm12
+pshufd $0x39,%xmm7,%xmm7
+pshufd $0x39,%xmm10,%xmm10
+pshufd $0x39,%xmm4,%xmm4
+xorl 64(%rsi),%edx /* same words, block 1 (byte offset +64) */
+xorl 68(%rsi),%ecx
+xorl 72(%rsi),%r8d
+xorl 76(%rsi),%r9d
+movl %edx,64(%rdi)
+movl %ecx,68(%rdi)
+movl %r8d,72(%rdi)
+movl %r9d,76(%rdi)
+movd %xmm12,%rdx
+movd %xmm7,%rcx
+movd %xmm10,%r8
+movd %xmm4,%r9
+pshufd $0x39,%xmm12,%xmm12
+pshufd $0x39,%xmm7,%xmm7
+pshufd $0x39,%xmm10,%xmm10
+pshufd $0x39,%xmm4,%xmm4
+xorl 128(%rsi),%edx /* block 2 (byte offset +128) */
+xorl 132(%rsi),%ecx
+xorl 136(%rsi),%r8d
+xorl 140(%rsi),%r9d
+movl %edx,128(%rdi)
+movl %ecx,132(%rdi)
+movl %r8d,136(%rdi)
+movl %r9d,140(%rdi)
+movd %xmm12,%rdx
+movd %xmm7,%rcx
+movd %xmm10,%r8
+movd %xmm4,%r9
+xorl 192(%rsi),%edx /* block 3 (byte offset +192) */
+xorl 196(%rsi),%ecx
+xorl 200(%rsi),%r8d
+xorl 204(%rsi),%r9d
+movl %edx,192(%rdi)
+movl %ecx,196(%rdi)
+movl %r8d,200(%rdi)
+movl %r9d,204(%rdi)
+paddd 304(%rsp),%xmm14 /* next group: output bytes 16..31 of each block, same extract / XOR / store pattern */
+paddd 128(%rsp),%xmm0
+paddd 192(%rsp),%xmm5
+paddd 256(%rsp),%xmm8
+movd %xmm14,%rdx
+movd %xmm0,%rcx
+movd %xmm5,%r8
+movd %xmm8,%r9
+pshufd $0x39,%xmm14,%xmm14
+pshufd $0x39,%xmm0,%xmm0
+pshufd $0x39,%xmm5,%xmm5
+pshufd $0x39,%xmm8,%xmm8
+xorl 16(%rsi),%edx
+xorl 20(%rsi),%ecx
+xorl 24(%rsi),%r8d
+xorl 28(%rsi),%r9d
+movl %edx,16(%rdi)
+movl %ecx,20(%rdi)
+movl %r8d,24(%rdi)
+movl %r9d,28(%rdi)
+movd %xmm14,%rdx
+movd %xmm0,%rcx
+movd %xmm5,%r8
+movd %xmm8,%r9
+pshufd $0x39,%xmm14,%xmm14
+pshufd $0x39,%xmm0,%xmm0
+pshufd $0x39,%xmm5,%xmm5
+pshufd $0x39,%xmm8,%xmm8
+xorl 80(%rsi),%edx
+xorl 84(%rsi),%ecx
+xorl 88(%rsi),%r8d
+xorl 92(%rsi),%r9d
+movl %edx,80(%rdi)
+movl %ecx,84(%rdi)
+movl %r8d,88(%rdi)
+movl %r9d,92(%rdi)
+movd %xmm14,%rdx
+movd %xmm0,%rcx
+movd %xmm5,%r8
+movd %xmm8,%r9
+pshufd $0x39,%xmm14,%xmm14
+pshufd $0x39,%xmm0,%xmm0
+pshufd $0x39,%xmm5,%xmm5
+pshufd $0x39,%xmm8,%xmm8
+xorl 144(%rsi),%edx
+xorl 148(%rsi),%ecx
+xorl 152(%rsi),%r8d
+xorl 156(%rsi),%r9d
+movl %edx,144(%rdi)
+movl %ecx,148(%rdi)
+movl %r8d,152(%rdi)
+movl %r9d,156(%rdi)
+movd %xmm14,%rdx
+movd %xmm0,%rcx
+movd %xmm5,%r8
+movd %xmm8,%r9
+xorl 208(%rsi),%edx
+xorl 212(%rsi),%ecx
+xorl 216(%rsi),%r8d
+xorl 220(%rsi),%r9d
+movl %edx,208(%rdi)
+movl %ecx,212(%rdi)
+movl %r8d,216(%rdi)
+movl %r9d,220(%rdi)
+paddd 352(%rsp),%xmm15 /* next group: output bytes 32..47; 352/368(%rsp) are the per-lane counter words */
+paddd 368(%rsp),%xmm11
+paddd 144(%rsp),%xmm1
+paddd 208(%rsp),%xmm6
+movd %xmm15,%rdx
+movd %xmm11,%rcx
+movd %xmm1,%r8
+movd %xmm6,%r9
+pshufd $0x39,%xmm15,%xmm15
+pshufd $0x39,%xmm11,%xmm11
+pshufd $0x39,%xmm1,%xmm1
+pshufd $0x39,%xmm6,%xmm6
+xorl 32(%rsi),%edx
+xorl 36(%rsi),%ecx
+xorl 40(%rsi),%r8d
+xorl 44(%rsi),%r9d
+movl %edx,32(%rdi)
+movl %ecx,36(%rdi)
+movl %r8d,40(%rdi)
+movl %r9d,44(%rdi)
+movd %xmm15,%rdx
+movd %xmm11,%rcx
+movd %xmm1,%r8
+movd %xmm6,%r9
+pshufd $0x39,%xmm15,%xmm15
+pshufd $0x39,%xmm11,%xmm11
+pshufd $0x39,%xmm1,%xmm1
+pshufd $0x39,%xmm6,%xmm6
+xorl 96(%rsi),%edx
+xorl 100(%rsi),%ecx
+xorl 104(%rsi),%r8d
+xorl 108(%rsi),%r9d
+movl %edx,96(%rdi)
+movl %ecx,100(%rdi)
+movl %r8d,104(%rdi)
+movl %r9d,108(%rdi)
+movd %xmm15,%rdx
+movd %xmm11,%rcx
+movd %xmm1,%r8
+movd %xmm6,%r9
+pshufd $0x39,%xmm15,%xmm15
+pshufd $0x39,%xmm11,%xmm11
+pshufd $0x39,%xmm1,%xmm1
+pshufd $0x39,%xmm6,%xmm6
+xorl 160(%rsi),%edx
+xorl 164(%rsi),%ecx
+xorl 168(%rsi),%r8d
+xorl 172(%rsi),%r9d
+movl %edx,160(%rdi)
+movl %ecx,164(%rdi)
+movl %r8d,168(%rdi)
+movl %r9d,172(%rdi)
+movd %xmm15,%rdx
+movd %xmm11,%rcx
+movd %xmm1,%r8
+movd %xmm6,%r9
+xorl 224(%rsi),%edx
+xorl 228(%rsi),%ecx
+xorl 232(%rsi),%r8d
+xorl 236(%rsi),%r9d
+movl %edx,224(%rdi)
+movl %ecx,228(%rdi)
+movl %r8d,232(%rdi)
+movl %r9d,236(%rdi)
+paddd 224(%rsp),%xmm13 /* last group: output bytes 48..63 of each block */
+paddd 272(%rsp),%xmm9
+paddd 320(%rsp),%xmm3
+paddd 160(%rsp),%xmm2
+movd %xmm13,%rdx
+movd %xmm9,%rcx
+movd %xmm3,%r8
+movd %xmm2,%r9
+pshufd $0x39,%xmm13,%xmm13
+pshufd $0x39,%xmm9,%xmm9
+pshufd $0x39,%xmm3,%xmm3
+pshufd $0x39,%xmm2,%xmm2
+xorl 48(%rsi),%edx
+xorl 52(%rsi),%ecx
+xorl 56(%rsi),%r8d
+xorl 60(%rsi),%r9d
+movl %edx,48(%rdi)
+movl %ecx,52(%rdi)
+movl %r8d,56(%rdi)
+movl %r9d,60(%rdi)
+movd %xmm13,%rdx
+movd %xmm9,%rcx
+movd %xmm3,%r8
+movd %xmm2,%r9
+pshufd $0x39,%xmm13,%xmm13
+pshufd $0x39,%xmm9,%xmm9
+pshufd $0x39,%xmm3,%xmm3
+pshufd $0x39,%xmm2,%xmm2
+xorl 112(%rsi),%edx
+xorl 116(%rsi),%ecx
+xorl 120(%rsi),%r8d
+xorl 124(%rsi),%r9d
+movl %edx,112(%rdi)
+movl %ecx,116(%rdi)
+movl %r8d,120(%rdi)
+movl %r9d,124(%rdi)
+movd %xmm13,%rdx
+movd %xmm9,%rcx
+movd %xmm3,%r8
+movd %xmm2,%r9
+pshufd $0x39,%xmm13,%xmm13
+pshufd $0x39,%xmm9,%xmm9
+pshufd $0x39,%xmm3,%xmm3
+pshufd $0x39,%xmm2,%xmm2
+xorl 176(%rsi),%edx
+xorl 180(%rsi),%ecx
+xorl 184(%rsi),%r8d
+xorl 188(%rsi),%r9d
+movl %edx,176(%rdi)
+movl %ecx,180(%rdi)
+movl %r8d,184(%rdi)
+movl %r9d,188(%rdi)
+movd %xmm13,%rdx
+movd %xmm9,%rcx
+movd %xmm3,%r8
+movd %xmm2,%r9
+xorl 240(%rsi),%edx
+xorl 244(%rsi),%ecx
+xorl 248(%rsi),%r8d
+xorl 252(%rsi),%r9d
+movl %edx,240(%rdi)
+movl %ecx,244(%rdi)
+movl %r8d,248(%rdi)
+movl %r9d,252(%rdi)
+movq 480(%rsp),%r9 /* restore the remaining length (r9 was used as scratch above) */
+sub $256,%r9
+add $256,%rsi
+add $256,%rdi
+cmp $256,%r9
+jae ._bytesatleast256 /* another full group of four blocks is available */
+
+cmp $0,%r9
+jbe ._done /* length was a multiple of 256: finished */
+
+._bytesbetween1and255: /* one-block-at-a-time path; r9 = remaining bytes (non-zero) */
+cmp $64,%r9
+jae ._nocopy /* a whole block remains: work directly on the caller's buffers */
+
+mov %rdi,%rdx /* partial block: remember the real destination in rdx */
+leaq 0(%rsp),%rdi
+mov %r9,%rcx
+rep movsb /* copy the r9 remaining input bytes into the scratch area at 0(%rsp) */
+leaq 0(%rsp),%rdi /* process the block in place inside the scratch area */
+leaq 0(%rsp),%rsi
+
+._nocopy: /* set up one block: the four 16-byte state rows built at ._start go into xmm0-xmm3 */
+movq %r9,480(%rsp) /* save the remaining length; r9 is reused as scratch when writing output */
+movdqa 112(%rsp),%xmm0
+movdqa 64(%rsp),%xmm1
+movdqa 80(%rsp),%xmm2
+movdqa 96(%rsp),%xmm3
+movdqa %xmm1,%xmm4 /* xmm4 starts as a copy of xmm1: first addend of the first round step */
+mov $20,%rcx /* rcx = round counter */
+
+.p2align 4
+._mainloop2: /* round loop for a single block: whole rows are processed per instruction, pshufd re-aligns the rows between steps */
+paddd %xmm0,%xmm4
+movdqa %xmm0,%xmm5
+movdqa %xmm4,%xmm6
+pslld $7,%xmm4
+psrld $25,%xmm6
+pxor %xmm4,%xmm3
+pxor %xmm6,%xmm3
+paddd %xmm3,%xmm5
+movdqa %xmm3,%xmm4
+movdqa %xmm5,%xmm6
+pslld $9,%xmm5
+psrld $23,%xmm6
+pxor %xmm5,%xmm2
+pshufd $0x93,%xmm3,%xmm3
+pxor %xmm6,%xmm2
+paddd %xmm2,%xmm4
+movdqa %xmm2,%xmm5
+movdqa %xmm4,%xmm6
+pslld $13,%xmm4
+psrld $19,%xmm6
+pxor %xmm4,%xmm1
+pshufd $0x4e,%xmm2,%xmm2
+pxor %xmm6,%xmm1
+paddd %xmm1,%xmm5
+movdqa %xmm3,%xmm4
+movdqa %xmm5,%xmm6
+pslld $18,%xmm5
+psrld $14,%xmm6
+pxor %xmm5,%xmm0
+pshufd $0x39,%xmm1,%xmm1
+pxor %xmm6,%xmm0
+paddd %xmm0,%xmm4
+movdqa %xmm0,%xmm5
+movdqa %xmm4,%xmm6
+pslld $7,%xmm4
+psrld $25,%xmm6
+pxor %xmm4,%xmm1
+pxor %xmm6,%xmm1
+paddd %xmm1,%xmm5
+movdqa %xmm1,%xmm4
+movdqa %xmm5,%xmm6
+pslld $9,%xmm5
+psrld $23,%xmm6
+pxor %xmm5,%xmm2
+pshufd $0x93,%xmm1,%xmm1
+pxor %xmm6,%xmm2
+paddd %xmm2,%xmm4
+movdqa %xmm2,%xmm5
+movdqa %xmm4,%xmm6
+pslld $13,%xmm4
+psrld $19,%xmm6
+pxor %xmm4,%xmm3
+pshufd $0x4e,%xmm2,%xmm2
+pxor %xmm6,%xmm3
+paddd %xmm3,%xmm5
+movdqa %xmm1,%xmm4
+movdqa %xmm5,%xmm6
+pslld $18,%xmm5
+psrld $14,%xmm6
+pxor %xmm5,%xmm0
+pshufd $0x39,%xmm3,%xmm3
+pxor %xmm6,%xmm0
+paddd %xmm0,%xmm4
+movdqa %xmm0,%xmm5
+movdqa %xmm4,%xmm6
+pslld $7,%xmm4
+psrld $25,%xmm6
+pxor %xmm4,%xmm3
+pxor %xmm6,%xmm3
+paddd %xmm3,%xmm5
+movdqa %xmm3,%xmm4
+movdqa %xmm5,%xmm6
+pslld $9,%xmm5
+psrld $23,%xmm6
+pxor %xmm5,%xmm2
+pshufd $0x93,%xmm3,%xmm3
+pxor %xmm6,%xmm2
+paddd %xmm2,%xmm4
+movdqa %xmm2,%xmm5
+movdqa %xmm4,%xmm6
+pslld $13,%xmm4
+psrld $19,%xmm6
+pxor %xmm4,%xmm1
+pshufd $0x4e,%xmm2,%xmm2
+pxor %xmm6,%xmm1
+paddd %xmm1,%xmm5
+movdqa %xmm3,%xmm4
+movdqa %xmm5,%xmm6
+pslld $18,%xmm5
+psrld $14,%xmm6
+pxor %xmm5,%xmm0
+pshufd $0x39,%xmm1,%xmm1
+pxor %xmm6,%xmm0
+paddd %xmm0,%xmm4
+movdqa %xmm0,%xmm5
+movdqa %xmm4,%xmm6
+pslld $7,%xmm4
+psrld $25,%xmm6
+pxor %xmm4,%xmm1
+pxor %xmm6,%xmm1
+paddd %xmm1,%xmm5
+movdqa %xmm1,%xmm4
+movdqa %xmm5,%xmm6
+pslld $9,%xmm5
+psrld $23,%xmm6
+pxor %xmm5,%xmm2
+pshufd $0x93,%xmm1,%xmm1
+pxor %xmm6,%xmm2
+paddd %xmm2,%xmm4
+movdqa %xmm2,%xmm5
+movdqa %xmm4,%xmm6
+pslld $13,%xmm4
+psrld $19,%xmm6
+pxor %xmm4,%xmm3
+pshufd $0x4e,%xmm2,%xmm2
+pxor %xmm6,%xmm3
+sub $4,%rcx /* round counter drops by 4 per pass (20 -> 0); the SSE instructions below leave these flags intact for the ja */
+paddd %xmm3,%xmm5
+movdqa %xmm1,%xmm4
+movdqa %xmm5,%xmm6
+pslld $18,%xmm5
+pxor %xmm7,%xmm7 /* clears xmm7; NOTE(review): xmm7 is not read again on this path in the visible code */
+psrld $14,%xmm6
+pxor %xmm5,%xmm0
+pshufd $0x39,%xmm3,%xmm3
+pxor %xmm6,%xmm0
+ja ._mainloop2 /* taken while the counter left by "sub $4,%rcx" is still above zero */
+
+paddd 112(%rsp),%xmm0 /* feed-forward: add the saved input rows back */
+paddd 64(%rsp),%xmm1
+paddd 80(%rsp),%xmm2
+paddd 96(%rsp),%xmm3
+movd %xmm0,%rcx /* take lane 0 of each row, then rotate the lanes with pshufd $0x39 for the next pass */
+movd %xmm1,%r8
+movd %xmm2,%r9
+movd %xmm3,%rax
+pshufd $0x39,%xmm0,%xmm0
+pshufd $0x39,%xmm1,%xmm1
+pshufd $0x39,%xmm2,%xmm2
+pshufd $0x39,%xmm3,%xmm3
+xorl 0(%rsi),%ecx /* the rows hold the words in the permuted order set up at ._start, hence the scattered byte offsets */
+xorl 48(%rsi),%r8d
+xorl 32(%rsi),%r9d
+xorl 16(%rsi),%eax
+movl %ecx,0(%rdi)
+movl %r8d,48(%rdi)
+movl %r9d,32(%rdi)
+movl %eax,16(%rdi)
+movd %xmm0,%rcx
+movd %xmm1,%r8
+movd %xmm2,%r9
+movd %xmm3,%rax
+pshufd $0x39,%xmm0,%xmm0
+pshufd $0x39,%xmm1,%xmm1
+pshufd $0x39,%xmm2,%xmm2
+pshufd $0x39,%xmm3,%xmm3
+xorl 20(%rsi),%ecx
+xorl 4(%rsi),%r8d
+xorl 52(%rsi),%r9d
+xorl 36(%rsi),%eax
+movl %ecx,20(%rdi)
+movl %r8d,4(%rdi)
+movl %r9d,52(%rdi)
+movl %eax,36(%rdi)
+movd %xmm0,%rcx
+movd %xmm1,%r8
+movd %xmm2,%r9
+movd %xmm3,%rax
+pshufd $0x39,%xmm0,%xmm0
+pshufd $0x39,%xmm1,%xmm1
+pshufd $0x39,%xmm2,%xmm2
+pshufd $0x39,%xmm3,%xmm3
+xorl 40(%rsi),%ecx
+xorl 24(%rsi),%r8d
+xorl 8(%rsi),%r9d
+xorl 56(%rsi),%eax
+movl %ecx,40(%rdi)
+movl %r8d,24(%rdi)
+movl %r9d,8(%rdi)
+movl %eax,56(%rdi)
+movd %xmm0,%rcx
+movd %xmm1,%r8
+movd %xmm2,%r9
+movd %xmm3,%rax
+xorl 60(%rsi),%ecx
+xorl 44(%rsi),%r8d
+xorl 28(%rsi),%r9d
+xorl 12(%rsi),%eax
+movl %ecx,60(%rdi)
+movl %r8d,44(%rdi)
+movl %r9d,28(%rdi)
+movl %eax,12(%rdi)
+movq 480(%rsp),%r9 /* restore the remaining length (r9 was used as scratch above) */
+movq 472(%rsp),%rcx
+add $1,%rcx /* advance the 64-bit block counter and refresh its two words in the state rows */
+mov %rcx,%r8
+shr $32,%r8
+movl %ecx,80(%rsp)
+movl %r8d,4+96(%rsp)
+movq %rcx,472(%rsp)
+cmp $64,%r9
+ja ._bytesatleast65 /* more than one block was left: advance and go round again */
+jae ._bytesatleast64 /* exactly one block was left: finished */
+
+mov %rdi,%rsi /* partial block: rdi points at the scratch area, rdx at the real destination saved earlier */
+mov %rdx,%rdi
+mov %r9,%rcx
+rep movsb /* copy only the r9 valid bytes out to the caller's buffer */
+
+._bytesatleast64: /* falls through to the common epilogue */
+._done:
+movq 416(%rsp),%r11 /* frame size stored by the prologue */
+movq 424(%rsp),%r12 /* reload the registers saved by the prologue */
+movq 432(%rsp),%r13
+movq 440(%rsp),%r14
+movq 448(%rsp),%r15
+movq 456(%rsp),%rbx
+movq 464(%rsp),%rbp
+add %r11,%rsp /* undo the aligned frame allocation */
+xor %rax,%rax /* return value 0 */
+mov %rsi,%rdx /* NOTE(review): rdx is not part of the int return value; purpose not evident from this file */
+ret
+
+._bytesatleast65: /* a full block was just written and more data remains */
+sub $64,%r9 /* 64 fewer bytes to go */
+add $64,%rdi /* advance output and input pointers by one block */
+add $64,%rsi
+jmp ._bytesbetween1and255 /* handle the next (possibly partial) block */
+
+#endif
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c
new file mode 100644
index 0000000000..0a6fee0f3e
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c
@@ -0,0 +1,31 @@
+
+#include <stdint.h>
+
+#include "utils.h"
+
+#include "../stream_salsa20.h"
+#include "salsa20_xmm6.h"
+
+#ifdef HAVE_AMD64_ASM
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int stream_salsa20_xmm6(unsigned char *c, unsigned long long clen, /* implemented in salsa20_xmm6-asm.S: keystream only */
+ const unsigned char *n, const unsigned char *k);
+
+extern int stream_salsa20_xmm6_xor_ic(unsigned char *c, const unsigned char *m, /* implemented in salsa20_xmm6-asm.S: XOR with initial block counter ic */
+ unsigned long long mlen,
+ const unsigned char *n,
+ uint64_t ic, const unsigned char *k);
+#ifdef __cplusplus
+}
+#endif
+
+struct crypto_stream_salsa20_implementation
+ crypto_stream_salsa20_xmm6_implementation = { /* function table exposing the amd64 assembly backend */
+ SODIUM_C99(.stream =) stream_salsa20_xmm6, /* keystream only */
+ SODIUM_C99(.stream_xor_ic =) stream_salsa20_xmm6_xor_ic, /* XOR with a caller-supplied initial block counter */
+ };
+
+#endif
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h
new file mode 100644
index 0000000000..d38473a9ff
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h
@@ -0,0 +1,8 @@
+
+#include <stdint.h>
+
+#include "../stream_salsa20.h"
+#include "crypto_stream_salsa20.h"
+
+extern struct crypto_stream_salsa20_implementation
+ crypto_stream_salsa20_xmm6_implementation; /* defined in salsa20_xmm6.c, only when HAVE_AMD64_ASM is defined */
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c
new file mode 100644
index 0000000000..18d4773ec9
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c
@@ -0,0 +1,131 @@
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "crypto_stream_salsa20.h"
+#include "private/common.h"
+#include "private/sse2_64_32.h"
+#include "utils.h"
+
+#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
+ defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)
+
+# ifdef __GNUC__
+# pragma GCC target("sse2")
+# pragma GCC target("ssse3")
+# pragma GCC target("sse4.1")
+# pragma GCC target("avx2")
+# endif
+
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+# include "../stream_salsa20.h"
+# include "salsa20_xmm6int-avx2.h"
+
+# define ROUNDS 20
+
+typedef struct salsa_ctx { /* cipher state handed to salsa20_encrypt_bytes() */
+ uint32_t input[16]; /* sixteen 32-bit state words; logical word i is stored at input[TR[i]] */
+} salsa_ctx;
+
+static const int TR[16] = { /* index permutation: logical state word i lives at input[TR[i]] (see salsa_keysetup / salsa_ivsetup) */
+ 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3
+};
+
+/*
+ * Install the 32-byte key k and the four fixed constants into ctx.
+ *
+ * Key bytes 0..15 become logical state words 1..4, key bytes 16..31 become
+ * logical words 11..14, and the constants occupy logical words 0, 5, 10 and
+ * 15.  Logical word i is stored at ctx->input[TR[i]].
+ */
+static void
+salsa_keysetup(salsa_ctx *ctx, const uint8_t *k)
+{
+    uint32_t *const state = ctx->input;
+    int             w;
+
+    /* first key half */
+    for (w = 0; w < 4; w++) {
+        state[TR[1 + w]] = LOAD32_LE(k + 4 * w);
+    }
+    /* second key half */
+    for (w = 0; w < 4; w++) {
+        state[TR[11 + w]] = LOAD32_LE(k + 16 + 4 * w);
+    }
+    /* "expand 32-byte k", read as four little-endian words */
+    state[TR[0]]  = 0x61707865;
+    state[TR[5]]  = 0x3320646e;
+    state[TR[10]] = 0x79622d32;
+    state[TR[15]] = 0x6b206574;
+}
+
+/*
+ * Install the 8-byte nonce iv (logical words 6 and 7) and the 8-byte
+ * little-endian block counter (logical words 8 and 9) into ctx.
+ * A NULL counter means "start at block 0".
+ */
+static void
+salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter)
+{
+    uint32_t ctr_lo = 0;
+    uint32_t ctr_hi = 0;
+
+    ctx->input[TR[6]] = LOAD32_LE(iv + 0);
+    ctx->input[TR[7]] = LOAD32_LE(iv + 4);
+    if (counter != NULL) {
+        ctr_lo = LOAD32_LE(counter + 0);
+        ctr_hi = LOAD32_LE(counter + 4);
+    }
+    ctx->input[TR[8]] = ctr_lo;
+    ctx->input[TR[9]] = ctr_hi;
+}
+
+static void
+salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, /* the processing code is supplied by the u*.h fragments included in the body */
+ unsigned long long bytes)
+{
+ uint32_t * const x = &ctx->input[0]; /* alias for the state words; presumably what the included fragments operate on - confirm in u*.h */
+
+ if (!bytes) {
+ return; /* LCOV_EXCL_LINE */
+ }
+
+#include "u8.h" /* NOTE(review): fragment contents are outside this view; the names suggest decreasing batch widths - confirm */
+#include "u4.h"
+#include "u1.h"
+#include "u0.h"
+}
+
+/*
+ * AVX2 backend, keystream only.
+ *
+ * Zeroes c and then encrypts it in place, so that c ends up holding clen
+ * bytes of keystream for nonce n and key k, with the block counter starting
+ * at 0.  The cipher state is wiped before returning.  Always returns 0.
+ */
+static int
+stream_avx2(unsigned char *c, unsigned long long clen, const unsigned char *n,
+            const unsigned char *k)
+{
+    struct salsa_ctx state;
+
+    if (clen != 0) {
+        COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8);
+        salsa_keysetup(&state, k);
+        salsa_ivsetup(&state, n, NULL);
+        memset(c, 0, clen);
+        salsa20_encrypt_bytes(&state, c, c, clen);
+        sodium_memzero(&state, sizeof state);
+    }
+    return 0;
+}
+
+/*
+ * AVX2 backend, XOR with explicit initial block counter.
+ *
+ * Encrypts mlen bytes of m into c for nonce n and key k, with the 64-bit
+ * block counter starting at ic.  The counter is handed to salsa_ivsetup()
+ * as eight little-endian bytes.  The cipher state is wiped before returning.
+ * Always returns 0.
+ */
+static int
+stream_avx2_xor_ic(unsigned char *c, const unsigned char *m,
+                   unsigned long long mlen, const unsigned char *n, uint64_t ic,
+                   const unsigned char *k)
+{
+    struct salsa_ctx state;
+    uint8_t          counter_le[8];
+
+    if (mlen == 0) {
+        return 0;
+    }
+    /* low word first, then high word */
+    STORE32_LE(&counter_le[0], (uint32_t) ic);
+    STORE32_LE(&counter_le[4], (uint32_t) (ic >> 32));
+
+    salsa_keysetup(&state, k);
+    salsa_ivsetup(&state, n, counter_le);
+    salsa20_encrypt_bytes(&state, m, c, mlen);
+    sodium_memzero(&state, sizeof state);
+
+    return 0;
+}
+
+struct crypto_stream_salsa20_implementation
+ crypto_stream_salsa20_xmm6int_avx2_implementation = { /* function table exposing the AVX2 intrinsics backend */
+ SODIUM_C99(.stream =) stream_avx2, /* keystream only */
+ SODIUM_C99(.stream_xor_ic =) stream_avx2_xor_ic /* XOR with a caller-supplied initial block counter */
+ };
+
+#endif
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h
new file mode 100644
index 0000000000..0924e9baff
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h
@@ -0,0 +1,8 @@
+
+#include <stdint.h>
+
+#include "../stream_salsa20.h"
+#include "crypto_stream_salsa20.h"
+
+/* Salsa20 function table for the AVX2 code path (defined in the matching .c file). */
+extern struct crypto_stream_salsa20_implementation
+ crypto_stream_salsa20_xmm6int_avx2_implementation;
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c
new file mode 100644
index 0000000000..d8e53a6554
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c
@@ -0,0 +1,122 @@
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "crypto_stream_salsa20.h"
+#include "private/common.h"
+#include "private/sse2_64_32.h"
+#include "utils.h"
+
+#ifdef HAVE_EMMINTRIN_H
+
+# ifdef __GNUC__
+# pragma GCC target("sse2")
+# endif
+# include <emmintrin.h>
+
+# include "../stream_salsa20.h"
+# include "salsa20_xmm6int-sse2.h"
+
+# define ROUNDS 20
+
+/*
+ * Salsa20 state: sixteen 32-bit words. The words are not kept in natural
+ * order; word i of the cipher state lives at input[TR[i]].
+ */
+typedef struct salsa_ctx {
+ uint32_t input[16];
+} salsa_ctx;
+
+/*
+ * Index translation: TR[i] is the slot of salsa_ctx.input[] that stores
+ * Salsa20 state word i. With this layout words 0, 5, 10 and 15 (the constants
+ * written by salsa_keysetup) occupy slots 0..3, i.e. one 128-bit load.
+ */
+static const int TR[16] = {
+ 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3
+};
+
+/*
+ * Load the 32-byte key `k` and the four fixed constants into `ctx`.
+ *
+ * Key bytes 0..15 become state words 1..4, key bytes 16..31 become state
+ * words 11..14 (each read as a little-endian 32-bit word), and the constants
+ * become state words 0, 5, 10 and 15. Every word is stored at its permuted
+ * slot input[TR[word]].
+ */
+static void
+salsa_keysetup(salsa_ctx *ctx, const uint8_t *k)
+{
+    /* little-endian encoding of the ASCII string "expand 32-byte k" */
+    static const uint32_t sigma[4] = {
+        0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+    };
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        ctx->input[TR[1 + i]]  = LOAD32_LE(k + 4 * i);
+        ctx->input[TR[11 + i]] = LOAD32_LE(k + 16 + 4 * i);
+        ctx->input[TR[5 * i]]  = sigma[i];
+    }
+}
+
+/*
+ * Load the 8-byte nonce `iv` into state words 6..7 and the 8-byte
+ * little-endian `counter` into state words 8..9. A NULL `counter` selects
+ * counter value 0.
+ */
+static void
+salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter)
+{
+    uint32_t counter_lo = 0;
+    uint32_t counter_hi = 0;
+
+    if (counter != NULL) {
+        counter_lo = LOAD32_LE(counter + 0);
+        counter_hi = LOAD32_LE(counter + 4);
+    }
+    ctx->input[TR[6]] = LOAD32_LE(iv + 0);
+    ctx->input[TR[7]] = LOAD32_LE(iv + 4);
+    ctx->input[TR[8]] = counter_lo;
+    ctx->input[TR[9]] = counter_hi;
+}
+
+/*
+ * XOR `bytes` bytes of `m` with the Salsa20 keystream derived from `ctx` and
+ * write the result to `c`.
+ *
+ * The body is assembled from textually included fragments, in the order
+ * u4.h (runs while at least 256 bytes remain), u1.h (while at least 64 bytes
+ * remain) and u0.h (any remainder). Those fragments refer to the locals `x`,
+ * `m`, `c` and `bytes` by exactly these names, so they must not be renamed.
+ */
+static void
+salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c,
+                      unsigned long long bytes)
+{
+    uint32_t * const x = ctx->input;
+
+    if (bytes == 0) {
+        return; /* LCOV_EXCL_LINE */
+    }
+
+#include "u4.h"
+#include "u1.h"
+#include "u0.h"
+}
+
+/*
+ * Write `clen` bytes of raw Salsa20 keystream for key `k` and nonce `n` to
+ * `c`. The counter argument passed to salsa_ivsetup() is NULL, so the stream
+ * starts at counter 0. The output is first zeroed and then encrypted in
+ * place, which leaves the keystream itself. Always returns 0.
+ */
+static int
+stream_sse2(unsigned char *c, unsigned long long clen, const unsigned char *n,
+            const unsigned char *k)
+{
+    struct salsa_ctx ctx;
+
+    COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8);
+    if (clen != 0) {
+        salsa_keysetup(&ctx, k);
+        salsa_ivsetup(&ctx, n, NULL);
+        memset(c, 0, clen);
+        salsa20_encrypt_bytes(&ctx, c, c, clen);
+        /* the context holds key material: wipe it before returning */
+        sodium_memzero(&ctx, sizeof ctx);
+    }
+    return 0;
+}
+
+/*
+ * XOR `mlen` bytes of `m` with the Salsa20 keystream for key `k` and nonce
+ * `n`, starting at the 64-bit counter value `ic`, and store the result in
+ * `c`. Always returns 0.
+ */
+static int
+stream_sse2_xor_ic(unsigned char *c, const unsigned char *m,
+                   unsigned long long mlen, const unsigned char *n, uint64_t ic,
+                   const unsigned char *k)
+{
+    struct salsa_ctx ctx;
+    uint8_t          ic_bytes[8];
+    uint32_t         counter_lo;
+    uint32_t         counter_hi;
+
+    if (mlen == 0) {
+        return 0;
+    }
+    /* serialise the counter as two little-endian 32-bit words, low first */
+    counter_lo = (uint32_t) (ic);
+    counter_hi = (uint32_t) (ic >> 32);
+    STORE32_LE(&ic_bytes[0], counter_lo);
+    STORE32_LE(&ic_bytes[4], counter_hi);
+
+    salsa_keysetup(&ctx, k);
+    salsa_ivsetup(&ctx, n, ic_bytes);
+    salsa20_encrypt_bytes(&ctx, m, c, mlen);
+    /* the context holds key material: wipe it before returning */
+    sodium_memzero(&ctx, sizeof ctx);
+
+    return 0;
+}
+
+/*
+ * Function table exposing the SSE2 code path: `stream` is bound to
+ * stream_sse2() and `stream_xor_ic` to stream_sse2_xor_ic().
+ */
+struct crypto_stream_salsa20_implementation
+ crypto_stream_salsa20_xmm6int_sse2_implementation = {
+ SODIUM_C99(.stream =) stream_sse2,
+ SODIUM_C99(.stream_xor_ic =) stream_sse2_xor_ic
+ };
+
+#endif
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h
new file mode 100644
index 0000000000..ed52a8bcbe
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h
@@ -0,0 +1,8 @@
+
+#include <stdint.h>
+
+#include "../stream_salsa20.h"
+#include "crypto_stream_salsa20.h"
+
+/* Salsa20 function table for the SSE2 code path (defined in salsa20_xmm6int-sse2.c). */
+extern struct crypto_stream_salsa20_implementation
+ crypto_stream_salsa20_xmm6int_sse2_implementation;
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h
new file mode 100644
index 0000000000..b2d4168058
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h
@@ -0,0 +1,195 @@
+/*
+ * Tail fragment, textually included as the last step of
+ * salsa20_encrypt_bytes().
+ *
+ * Computes one 64-byte keystream block from the state words in `x`, stages
+ * it in a local buffer and XORs its first `bytes` bytes with `m` into `c`.
+ * The staging buffer is 64 bytes, so fewer than 64 bytes are expected here;
+ * in both includers the u1.h loop (`while (bytes >= 64)`) runs first.
+ * `x`, `m`, `c` and `bytes` are not updated.
+ */
+if (bytes > 0) {
+    __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
+    __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
+    __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
+    __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
+    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+    __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+    uint8_t partialblock[64];
+
+    unsigned int i;
+
+    a0 = diag1;
+    /*
+     * Each iteration performs one double round (two Salsa20 rounds), so
+     * ROUNDS must be even. Statement order matters: several temporaries are
+     * captured before the vector they copy is lane-rotated by a shuffle.
+     */
+    for (i = 0; i < ROUNDS; i += 2) {
+        a0    = _mm_add_epi32(a0, diag0);
+        a1    = diag0;
+        b0    = a0;
+        a0    = _mm_slli_epi32(a0, 7);
+        b0    = _mm_srli_epi32(b0, 25);
+        diag3 = _mm_xor_si128(diag3, a0);
+
+        diag3 = _mm_xor_si128(diag3, b0);
+
+        a1    = _mm_add_epi32(a1, diag3);
+        a2    = diag3;
+        b1    = a1;
+        a1    = _mm_slli_epi32(a1, 9);
+        b1    = _mm_srli_epi32(b1, 23);
+        diag2 = _mm_xor_si128(diag2, a1);
+        diag3 = _mm_shuffle_epi32(diag3, 0x93);
+        diag2 = _mm_xor_si128(diag2, b1);
+
+        a2    = _mm_add_epi32(a2, diag2);
+        a3    = diag2;
+        b2    = a2;
+        a2    = _mm_slli_epi32(a2, 13);
+        b2    = _mm_srli_epi32(b2, 19);
+        diag1 = _mm_xor_si128(diag1, a2);
+        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+        diag1 = _mm_xor_si128(diag1, b2);
+
+        a3    = _mm_add_epi32(a3, diag1);
+        a4    = diag3;
+        b3    = a3;
+        a3    = _mm_slli_epi32(a3, 18);
+        b3    = _mm_srli_epi32(b3, 14);
+        diag0 = _mm_xor_si128(diag0, a3);
+        diag1 = _mm_shuffle_epi32(diag1, 0x39);
+        diag0 = _mm_xor_si128(diag0, b3);
+
+        a4    = _mm_add_epi32(a4, diag0);
+        a5    = diag0;
+        b4    = a4;
+        a4    = _mm_slli_epi32(a4, 7);
+        b4    = _mm_srli_epi32(b4, 25);
+        diag1 = _mm_xor_si128(diag1, a4);
+
+        diag1 = _mm_xor_si128(diag1, b4);
+
+        a5    = _mm_add_epi32(a5, diag1);
+        a6    = diag1;
+        b5    = a5;
+        a5    = _mm_slli_epi32(a5, 9);
+        b5    = _mm_srli_epi32(b5, 23);
+        diag2 = _mm_xor_si128(diag2, a5);
+        diag1 = _mm_shuffle_epi32(diag1, 0x93);
+        diag2 = _mm_xor_si128(diag2, b5);
+
+        a6    = _mm_add_epi32(a6, diag2);
+        a7    = diag2;
+        b6    = a6;
+        a6    = _mm_slli_epi32(a6, 13);
+        b6    = _mm_srli_epi32(b6, 19);
+        diag3 = _mm_xor_si128(diag3, a6);
+        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+        diag3 = _mm_xor_si128(diag3, b6);
+
+        a7    = _mm_add_epi32(a7, diag3);
+        a0    = diag1; /* operand for the first step of the next double round */
+        b7    = a7;
+        a7    = _mm_slli_epi32(a7, 18);
+        b7    = _mm_srli_epi32(b7, 14);
+        diag0 = _mm_xor_si128(diag0, a7);
+        diag3 = _mm_shuffle_epi32(diag3, 0x39);
+        diag0 = _mm_xor_si128(diag0, b7);
+    }
+
+    /* feed-forward: add the input state to the permuted state */
+    diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
+    diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
+    diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
+    diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));
+
+/*
+ * Take the lowest lane of each diagonal vector, rotate the vectors by one
+ * lane, and write the four words to keystream word positions A, B, C, D.
+ * memcpy is used because partialblock is a byte array with no alignment
+ * guarantee for 32-bit stores.
+ */
+#define ONEQUAD(A, B, C, D)                                    \
+    do {                                                       \
+        const uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
+        const uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
+        const uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
+        const uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
+        diag0 = _mm_shuffle_epi32(diag0, 0x39);                \
+        diag1 = _mm_shuffle_epi32(diag1, 0x39);                \
+        diag2 = _mm_shuffle_epi32(diag2, 0x39);                \
+        diag3 = _mm_shuffle_epi32(diag3, 0x39);                \
+        memcpy(partialblock + (A * 4), &in##A, sizeof in##A);  \
+        memcpy(partialblock + (B * 4), &in##B, sizeof in##B);  \
+        memcpy(partialblock + (C * 4), &in##C, sizeof in##C);  \
+        memcpy(partialblock + (D * 4), &in##D, sizeof in##D);  \
+    } while (0)
+
+    ONEQUAD(0, 12, 8, 4);
+    ONEQUAD(5, 1, 13, 9);
+    ONEQUAD(10, 6, 2, 14);
+    ONEQUAD(15, 11, 7, 3);
+
+#undef ONEQUAD
+
+    for (i = 0; i < bytes; i++) {
+        c[i] = m[i] ^ partialblock[i];
+    }
+
+    /* unused keystream bytes must not linger on the stack */
+    sodium_memzero(partialblock, sizeof partialblock);
+}
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h
new file mode 100644
index 0000000000..c245d9565f
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h
@@ -0,0 +1,207 @@
+/*
+ * Single-block fragment, textually included into salsa20_encrypt_bytes().
+ *
+ * While at least 64 bytes remain, computes one keystream block from the
+ * state words in `x`, XORs it with 64 bytes of `m` into `c`, increments the
+ * 64-bit block counter kept in x[8] (low word) and x[13] (high word), and
+ * advances `m`, `c` and `bytes` by one block.
+ */
+while (bytes >= 64) {
+    __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0));
+    __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4));
+    __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8));
+    __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12));
+    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+    __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+
+    uint32_t in8;
+    uint32_t in9;
+    int      i;
+
+    a0 = diag1;
+    /*
+     * Each iteration performs one double round (two Salsa20 rounds), so
+     * ROUNDS must be even. Statement order matters: several temporaries are
+     * captured before the vector they copy is lane-rotated by a shuffle.
+     */
+    for (i = 0; i < ROUNDS; i += 2) {
+        a0    = _mm_add_epi32(a0, diag0);
+        a1    = diag0;
+        b0    = a0;
+        a0    = _mm_slli_epi32(a0, 7);
+        b0    = _mm_srli_epi32(b0, 25);
+        diag3 = _mm_xor_si128(diag3, a0);
+
+        diag3 = _mm_xor_si128(diag3, b0);
+
+        a1    = _mm_add_epi32(a1, diag3);
+        a2    = diag3;
+        b1    = a1;
+        a1    = _mm_slli_epi32(a1, 9);
+        b1    = _mm_srli_epi32(b1, 23);
+        diag2 = _mm_xor_si128(diag2, a1);
+        diag3 = _mm_shuffle_epi32(diag3, 0x93);
+        diag2 = _mm_xor_si128(diag2, b1);
+
+        a2    = _mm_add_epi32(a2, diag2);
+        a3    = diag2;
+        b2    = a2;
+        a2    = _mm_slli_epi32(a2, 13);
+        b2    = _mm_srli_epi32(b2, 19);
+        diag1 = _mm_xor_si128(diag1, a2);
+        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+        diag1 = _mm_xor_si128(diag1, b2);
+
+        a3    = _mm_add_epi32(a3, diag1);
+        a4    = diag3;
+        b3    = a3;
+        a3    = _mm_slli_epi32(a3, 18);
+        b3    = _mm_srli_epi32(b3, 14);
+        diag0 = _mm_xor_si128(diag0, a3);
+        diag1 = _mm_shuffle_epi32(diag1, 0x39);
+        diag0 = _mm_xor_si128(diag0, b3);
+
+        a4    = _mm_add_epi32(a4, diag0);
+        a5    = diag0;
+        b4    = a4;
+        a4    = _mm_slli_epi32(a4, 7);
+        b4    = _mm_srli_epi32(b4, 25);
+        diag1 = _mm_xor_si128(diag1, a4);
+
+        diag1 = _mm_xor_si128(diag1, b4);
+
+        a5    = _mm_add_epi32(a5, diag1);
+        a6    = diag1;
+        b5    = a5;
+        a5    = _mm_slli_epi32(a5, 9);
+        b5    = _mm_srli_epi32(b5, 23);
+        diag2 = _mm_xor_si128(diag2, a5);
+        diag1 = _mm_shuffle_epi32(diag1, 0x93);
+        diag2 = _mm_xor_si128(diag2, b5);
+
+        a6    = _mm_add_epi32(a6, diag2);
+        a7    = diag2;
+        b6    = a6;
+        a6    = _mm_slli_epi32(a6, 13);
+        b6    = _mm_srli_epi32(b6, 19);
+        diag3 = _mm_xor_si128(diag3, a6);
+        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+        diag3 = _mm_xor_si128(diag3, b6);
+
+        a7    = _mm_add_epi32(a7, diag3);
+        a0    = diag1; /* operand for the first step of the next double round */
+        b7    = a7;
+        a7    = _mm_slli_epi32(a7, 18);
+        b7    = _mm_srli_epi32(b7, 14);
+        diag0 = _mm_xor_si128(diag0, a7);
+        diag3 = _mm_shuffle_epi32(diag3, 0x39);
+        diag0 = _mm_xor_si128(diag0, b7);
+    }
+
+    /* feed-forward: add the input state to the permuted state */
+    diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0)));
+    diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4)));
+    diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8)));
+    diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12)));
+
+/*
+ * Take the lowest lane of each diagonal vector, rotate the vectors by one
+ * lane, and XOR the four keystream words into message word positions
+ * A, B, C, D. `m` and `c` are caller buffers of arbitrary alignment, so the
+ * 32-bit accesses go through memcpy. All four words are read before any is
+ * written, as in-place operation (m == c) is used by the callers.
+ */
+#define ONEQUAD(A, B, C, D)                                    \
+    do {                                                       \
+        uint32_t in##A = _mm_cvtsi128_si32(diag0);             \
+        uint32_t in##B = _mm_cvtsi128_si32(diag1);             \
+        uint32_t in##C = _mm_cvtsi128_si32(diag2);             \
+        uint32_t in##D = _mm_cvtsi128_si32(diag3);             \
+        uint32_t msg_[4];                                      \
+        diag0 = _mm_shuffle_epi32(diag0, 0x39);                \
+        diag1 = _mm_shuffle_epi32(diag1, 0x39);                \
+        diag2 = _mm_shuffle_epi32(diag2, 0x39);                \
+        diag3 = _mm_shuffle_epi32(diag3, 0x39);                \
+        memcpy(&msg_[0], m + (A * 4), sizeof msg_[0]);         \
+        memcpy(&msg_[1], m + (B * 4), sizeof msg_[1]);         \
+        memcpy(&msg_[2], m + (C * 4), sizeof msg_[2]);         \
+        memcpy(&msg_[3], m + (D * 4), sizeof msg_[3]);         \
+        in##A ^= msg_[0];                                      \
+        in##B ^= msg_[1];                                      \
+        in##C ^= msg_[2];                                      \
+        in##D ^= msg_[3];                                      \
+        memcpy(c + (A * 4), &in##A, sizeof in##A);             \
+        memcpy(c + (B * 4), &in##B, sizeof in##B);             \
+        memcpy(c + (C * 4), &in##C, sizeof in##C);             \
+        memcpy(c + (D * 4), &in##D, sizeof in##D);             \
+    } while (0)
+
+    ONEQUAD(0, 12, 8, 4);
+    ONEQUAD(5, 1, 13, 9);
+    ONEQUAD(10, 6, 2, 14);
+    ONEQUAD(15, 11, 7, 3);
+
+#undef ONEQUAD
+
+    /* 64-bit counter increment with carry from the low to the high word */
+    in8 = x[8];
+    in9 = x[13];
+    in8++;
+    if (in8 == 0) {
+        in9++;
+    }
+    x[8]  = in8;
+    x[13] = in9;
+
+    c += 64;
+    m += 64;
+    bytes -= 64;
+}
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h
new file mode 100644
index 0000000000..61d935fc90
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h
@@ -0,0 +1,547 @@
+/*
+ * Four-blocks-at-a-time SSE2 fragment, textually included into
+ * salsa20_encrypt_bytes().
+ *
+ * While at least 256 bytes remain, four consecutive 64-byte blocks are
+ * computed in parallel: vector zK holds Salsa20 state word K for the four
+ * blocks, one block per 32-bit lane. `x` stores the state words in permuted
+ * order, which the broadcasts below undo. x[8] and x[13] are the low and
+ * high halves of the 64-bit block counter (state words 8 and 9); they differ
+ * per lane and are advanced by 4 on every iteration. `m`, `c` and `bytes`
+ * are advanced by 256 per iteration.
+ */
+if (bytes >= 256) {
+    /* scratch vectors used by the transpose in SALSA20_U4_STORE */
+    __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
+        y15;
+    __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
+        z15;
+    __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
+        orig9, orig10, orig11, orig12, orig13, orig14, orig15;
+    __m128i row0, row1, row2, row3;
+
+    uint32_t in8;
+    uint32_t in9;
+    int      i;
+
+/* zDST ^= ROTL32(zS1 + zS2, N) in every lane */
+#define SALSA20_U4_STEP(DST, S1, S2, N)                                   \
+    do {                                                                  \
+        const __m128i sum_ = _mm_add_epi32(z##S1, z##S2);                 \
+        z##DST = _mm_xor_si128(z##DST, _mm_slli_epi32(sum_, (N)));        \
+        z##DST = _mm_xor_si128(z##DST, _mm_srli_epi32(sum_, 32 - (N)));   \
+    } while (0)
+
+/*
+ * Add the input words back to zA..zD, transpose the 4x4 word matrix so that
+ * each vector holds words A..D of a single block, then XOR with the message
+ * at byte offset OFF inside each of the four 64-byte blocks and store.
+ */
+#define SALSA20_U4_STORE(A, B, C, D, OFF)                                 \
+    do {                                                                  \
+        z##A = _mm_add_epi32(z##A, orig##A);                              \
+        z##B = _mm_add_epi32(z##B, orig##B);                              \
+        z##C = _mm_add_epi32(z##C, orig##C);                              \
+        z##D = _mm_add_epi32(z##D, orig##D);                              \
+        y##A = _mm_unpacklo_epi32(z##A, z##B);                            \
+        y##B = _mm_unpacklo_epi32(z##C, z##D);                            \
+        y##C = _mm_unpackhi_epi32(z##A, z##B);                            \
+        y##D = _mm_unpackhi_epi32(z##C, z##D);                            \
+        z##A = _mm_unpacklo_epi64(y##A, y##B);                            \
+        z##B = _mm_unpackhi_epi64(y##A, y##B);                            \
+        z##C = _mm_unpacklo_epi64(y##C, y##D);                            \
+        z##D = _mm_unpackhi_epi64(y##C, y##D);                            \
+        y##A = _mm_xor_si128(                                             \
+            z##A, _mm_loadu_si128((const __m128i *) (m + (OFF) + 0)));    \
+        _mm_storeu_si128((__m128i *) (c + (OFF) + 0), y##A);              \
+        y##B = _mm_xor_si128(                                             \
+            z##B, _mm_loadu_si128((const __m128i *) (m + (OFF) + 64)));   \
+        _mm_storeu_si128((__m128i *) (c + (OFF) + 64), y##B);             \
+        y##C = _mm_xor_si128(                                             \
+            z##C, _mm_loadu_si128((const __m128i *) (m + (OFF) + 128)));  \
+        _mm_storeu_si128((__m128i *) (c + (OFF) + 128), y##C);            \
+        y##D = _mm_xor_si128(                                             \
+            z##D, _mm_loadu_si128((const __m128i *) (m + (OFF) + 192)));  \
+        _mm_storeu_si128((__m128i *) (c + (OFF) + 192), y##D);            \
+    } while (0)
+
+    /*
+     * Broadcast every block-independent state word to all four lanes.
+     * Shuffle immediates 0x00, 0x55, 0xaa, 0xff broadcast lanes 0, 1, 2, 3.
+     */
+    row0   = _mm_loadu_si128((const __m128i *) (x + 0));
+    row1   = _mm_loadu_si128((const __m128i *) (x + 4));
+    row2   = _mm_loadu_si128((const __m128i *) (x + 8));
+    row3   = _mm_loadu_si128((const __m128i *) (x + 12));
+    orig0  = _mm_shuffle_epi32(row0, 0x00);
+    orig5  = _mm_shuffle_epi32(row0, 0x55);
+    orig10 = _mm_shuffle_epi32(row0, 0xaa);
+    orig15 = _mm_shuffle_epi32(row0, 0xff);
+    orig12 = _mm_shuffle_epi32(row1, 0x00);
+    orig1  = _mm_shuffle_epi32(row1, 0x55);
+    orig6  = _mm_shuffle_epi32(row1, 0xaa);
+    orig11 = _mm_shuffle_epi32(row1, 0xff);
+    /* lane 0 of row2 is x[8], the counter low word: filled per iteration */
+    orig13 = _mm_shuffle_epi32(row2, 0x55);
+    orig2  = _mm_shuffle_epi32(row2, 0xaa);
+    orig7  = _mm_shuffle_epi32(row2, 0xff);
+    orig4  = _mm_shuffle_epi32(row3, 0x00);
+    /* lane 1 of row3 is x[13], the counter high word: filled per iteration */
+    orig14 = _mm_shuffle_epi32(row3, 0xaa);
+    orig3  = _mm_shuffle_epi32(row3, 0xff);
+
+    while (bytes >= 256) {
+        const __m128i ctr_step01 = _mm_set_epi64x(1, 0);
+        const __m128i ctr_step23 = _mm_set_epi64x(3, 2);
+        __m128i       ctr01, ctr23, mix_lo, mix_hi;
+        uint64_t      in89;
+
+        in8  = x[8];
+        in9  = x[13];
+        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
+
+        /* 64-bit counters n+0, n+1 and n+2, n+3 */
+        ctr01 = _mm_add_epi64(ctr_step01, _mm_set1_epi64x(in89));
+        ctr23 = _mm_add_epi64(ctr_step23, _mm_set1_epi64x(in89));
+
+        /* split into low words (word 8) and high words (word 9), per lane */
+        mix_lo = _mm_unpacklo_epi32(ctr01, ctr23);
+        mix_hi = _mm_unpackhi_epi32(ctr01, ctr23);
+        orig8  = _mm_unpacklo_epi32(mix_lo, mix_hi);
+        orig9  = _mm_unpackhi_epi32(mix_lo, mix_hi);
+
+        in89 += 4;
+
+        x[8]  = in89 & 0xFFFFFFFF;
+        x[13] = (in89 >> 32) & 0xFFFFFFFF;
+
+        z0  = orig0;
+        z1  = orig1;
+        z2  = orig2;
+        z3  = orig3;
+        z4  = orig4;
+        z5  = orig5;
+        z6  = orig6;
+        z7  = orig7;
+        z8  = orig8;
+        z9  = orig9;
+        z10 = orig10;
+        z11 = orig11;
+        z12 = orig12;
+        z13 = orig13;
+        z14 = orig14;
+        z15 = orig15;
+
+        /*
+         * One double round per iteration. The step order follows the
+         * amd64-xmm6 assembly this code was translated from; note that the
+         * first step on z1 of the second round is issued before the last
+         * step on z15 of the first round (the two are independent).
+         */
+        for (i = 0; i < ROUNDS; i += 2) {
+            SALSA20_U4_STEP(4, 12, 0, 7);
+            SALSA20_U4_STEP(9, 1, 5, 7);
+            SALSA20_U4_STEP(8, 0, 4, 9);
+            SALSA20_U4_STEP(13, 5, 9, 9);
+            SALSA20_U4_STEP(12, 4, 8, 13);
+            SALSA20_U4_STEP(1, 9, 13, 13);
+            SALSA20_U4_STEP(0, 8, 12, 18);
+            SALSA20_U4_STEP(5, 13, 1, 18);
+            SALSA20_U4_STEP(14, 6, 10, 7);
+            SALSA20_U4_STEP(3, 11, 15, 7);
+            SALSA20_U4_STEP(2, 10, 14, 9);
+            SALSA20_U4_STEP(7, 15, 3, 9);
+            SALSA20_U4_STEP(6, 14, 2, 13);
+            SALSA20_U4_STEP(11, 3, 7, 13);
+            SALSA20_U4_STEP(10, 2, 6, 18);
+            SALSA20_U4_STEP(1, 3, 0, 7);
+            SALSA20_U4_STEP(15, 7, 11, 18);
+            SALSA20_U4_STEP(6, 4, 5, 7);
+            SALSA20_U4_STEP(2, 0, 1, 9);
+            SALSA20_U4_STEP(7, 5, 6, 9);
+            SALSA20_U4_STEP(3, 1, 2, 13);
+            SALSA20_U4_STEP(4, 6, 7, 13);
+            SALSA20_U4_STEP(0, 2, 3, 18);
+            SALSA20_U4_STEP(5, 7, 4, 18);
+            SALSA20_U4_STEP(11, 9, 10, 7);
+            SALSA20_U4_STEP(12, 14, 15, 7);
+            SALSA20_U4_STEP(8, 10, 11, 9);
+            SALSA20_U4_STEP(13, 15, 12, 9);
+            SALSA20_U4_STEP(9, 11, 8, 13);
+            SALSA20_U4_STEP(14, 12, 13, 13);
+            SALSA20_U4_STEP(10, 8, 9, 18);
+            SALSA20_U4_STEP(15, 13, 14, 18);
+        }
+
+        /* words 0..3, 4..7, 8..11, 12..15 of each of the four blocks */
+        SALSA20_U4_STORE(0, 1, 2, 3, 0);
+        SALSA20_U4_STORE(4, 5, 6, 7, 16);
+        SALSA20_U4_STORE(8, 9, 10, 11, 32);
+        SALSA20_U4_STORE(12, 13, 14, 15, 48);
+
+        bytes -= 256;
+        c += 256;
+        m += 256;
+    }
+
+#undef SALSA20_U4_STORE
+#undef SALSA20_U4_STEP
+}
diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h
new file mode 100644
index 0000000000..467a961299
--- /dev/null
+++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h
@@ -0,0 +1,476 @@
+if (bytes >= 512) { /* 8-way path: each pass of the loop below handles 512 bytes (eight 64-byte blocks) */
+ __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
+ y15;
+
+ /* broadcast each state word to all eight 32-bit lanes; the naive way seems as fast as (if not a bit faster than) the vector way */
+ __m256i z0 = _mm256_set1_epi32(x[0]);
+ __m256i z5 = _mm256_set1_epi32(x[1]);
+ __m256i z10 = _mm256_set1_epi32(x[2]);
+ __m256i z15 = _mm256_set1_epi32(x[3]);
+ __m256i z12 = _mm256_set1_epi32(x[4]);
+ __m256i z1 = _mm256_set1_epi32(x[5]);
+ __m256i z6 = _mm256_set1_epi32(x[6]);
+ __m256i z11 = _mm256_set1_epi32(x[7]);
+ __m256i z8; /* no initializer needed: rebuilt from x[8]/x[13] at the top of each pass */
+ __m256i z13 = _mm256_set1_epi32(x[9]);
+ __m256i z2 = _mm256_set1_epi32(x[10]);
+ __m256i z7 = _mm256_set1_epi32(x[11]);
+ __m256i z4 = _mm256_set1_epi32(x[12]);
+ __m256i z9; /* no initializer needed: rebuilt from x[8]/x[13] at the top of each pass */
+ __m256i z14 = _mm256_set1_epi32(x[14]);
+ __m256i z3 = _mm256_set1_epi32(x[15]);
+
+ __m256i orig0 = z0;
+ __m256i orig1 = z1;
+ __m256i orig2 = z2;
+ __m256i orig3 = z3;
+ __m256i orig4 = z4;
+ __m256i orig5 = z5;
+ __m256i orig6 = z6;
+ __m256i orig7 = z7;
+ __m256i orig8; /* assigned inside the loop, after z8 is rebuilt */
+ __m256i orig9; /* assigned inside the loop, after z9 is rebuilt */
+ __m256i orig10 = z10;
+ __m256i orig11 = z11;
+ __m256i orig12 = z12;
+ __m256i orig13 = z13;
+ __m256i orig14 = z14;
+ __m256i orig15 = z15;
+
+ uint32_t in8;
+ uint32_t in9;
+ int i;
+
+ while (bytes >= 512) {
+ /* vector implementation for z8 and z9 (the two counter words) */
+ /* faster than the naive version for 8 blocks */
+ const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0); /* per-lane counter offsets 0..3 */
+ const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4); /* per-lane counter offsets 4..7 */
+ const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256i t8, t9;
+ uint64_t in89; /* 64-bit counter: in8 is the low word, in9 the high word */
+
+ in8 = x[8];
+ in9 = x[13]; /* the high counter word is kept at x[13]; the "arrays above" of the original note are not in this header */
+ in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
+
+ z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89)); /* counter copied into all four 64-bit lanes */
+
+ t8 = _mm256_add_epi64(addv8, z8); /* counter + 0..3 */
+ t9 = _mm256_add_epi64(addv9, z9); /* counter + 4..7 */
+ /* two 32-bit unpack passes gather the low counter words in t8 and the high counter words in t9 */
+ z8 = _mm256_unpacklo_epi32(t8, t9);
+ z9 = _mm256_unpackhi_epi32(t8, t9);
+
+ t8 = _mm256_unpacklo_epi32(z8, z9);
+ t9 = _mm256_unpackhi_epi32(z8, z9);
+
+ /* required because unpack* are intra-lane: restore the order so that 32-bit lane j holds the word for counter + j */
+ z8 = _mm256_permutevar8x32_epi32(t8, permute);
+ z9 = _mm256_permutevar8x32_epi32(t9, permute);
+
+ orig8 = z8;
+ orig9 = z9;
+
+ in89 += 8; /* eight counter values are used per pass */
+
+ x[8] = in89 & 0xFFFFFFFF; /* write the advanced counter back, low word first */
+ x[13] = (in89 >> 32) & 0xFFFFFFFF;
+ /* start every pass from the saved input state */
+ z5 = orig5;
+ z10 = orig10;
+ z15 = orig15;
+ z14 = orig14;
+ z3 = orig3;
+ z6 = orig6;
+ z11 = orig11;
+ z1 = orig1;
+
+ z7 = orig7;
+ z13 = orig13;
+ z2 = orig2;
+ z9 = orig9;
+ z0 = orig0;
+ z12 = orig12;
+ z4 = orig4;
+ z8 = orig8;
+
+ for (i = 0; i < ROUNDS; i += 2) { /* the counter advances by 2 because the body performs two rounds */
+ /* the inner loop is a direct translation (regexp search/replace)
+ * from the amd64-xmm6 ASM */
+ __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
+ r14, r15;
+ /* every 7-line group below computes zT ^= ROTL32(zA + zB, k), the rotate being split into slli k and srli (32 - k) */
+ y4 = z12;
+ y4 = _mm256_add_epi32(y4, z0);
+ r4 = y4;
+ y4 = _mm256_slli_epi32(y4, 7);
+ z4 = _mm256_xor_si256(z4, y4);
+ r4 = _mm256_srli_epi32(r4, 25);
+ z4 = _mm256_xor_si256(z4, r4);
+
+ y9 = z1;
+ y9 = _mm256_add_epi32(y9, z5);
+ r9 = y9;
+ y9 = _mm256_slli_epi32(y9, 7);
+ z9 = _mm256_xor_si256(z9, y9);
+ r9 = _mm256_srli_epi32(r9, 25);
+ z9 = _mm256_xor_si256(z9, r9);
+
+ y8 = z0;
+ y8 = _mm256_add_epi32(y8, z4);
+ r8 = y8;
+ y8 = _mm256_slli_epi32(y8, 9);
+ z8 = _mm256_xor_si256(z8, y8);
+ r8 = _mm256_srli_epi32(r8, 23);
+ z8 = _mm256_xor_si256(z8, r8);
+
+ y13 = z5;
+ y13 = _mm256_add_epi32(y13, z9);
+ r13 = y13;
+ y13 = _mm256_slli_epi32(y13, 9);
+ z13 = _mm256_xor_si256(z13, y13);
+ r13 = _mm256_srli_epi32(r13, 23);
+ z13 = _mm256_xor_si256(z13, r13);
+
+ y12 = z4;
+ y12 = _mm256_add_epi32(y12, z8);
+ r12 = y12;
+ y12 = _mm256_slli_epi32(y12, 13);
+ z12 = _mm256_xor_si256(z12, y12);
+ r12 = _mm256_srli_epi32(r12, 19);
+ z12 = _mm256_xor_si256(z12, r12);
+
+ y1 = z9;
+ y1 = _mm256_add_epi32(y1, z13);
+ r1 = y1;
+ y1 = _mm256_slli_epi32(y1, 13);
+ z1 = _mm256_xor_si256(z1, y1);
+ r1 = _mm256_srli_epi32(r1, 19);
+ z1 = _mm256_xor_si256(z1, r1);
+
+ y0 = z8;
+ y0 = _mm256_add_epi32(y0, z12);
+ r0 = y0;
+ y0 = _mm256_slli_epi32(y0, 18);
+ z0 = _mm256_xor_si256(z0, y0);
+ r0 = _mm256_srli_epi32(r0, 14);
+ z0 = _mm256_xor_si256(z0, r0);
+
+ y5 = z13;
+ y5 = _mm256_add_epi32(y5, z1);
+ r5 = y5;
+ y5 = _mm256_slli_epi32(y5, 18);
+ z5 = _mm256_xor_si256(z5, y5);
+ r5 = _mm256_srli_epi32(r5, 14);
+ z5 = _mm256_xor_si256(z5, r5);
+
+ y14 = z6;
+ y14 = _mm256_add_epi32(y14, z10);
+ r14 = y14;
+ y14 = _mm256_slli_epi32(y14, 7);
+ z14 = _mm256_xor_si256(z14, y14);
+ r14 = _mm256_srli_epi32(r14, 25);
+ z14 = _mm256_xor_si256(z14, r14);
+
+ y3 = z11;
+ y3 = _mm256_add_epi32(y3, z15);
+ r3 = y3;
+ y3 = _mm256_slli_epi32(y3, 7);
+ z3 = _mm256_xor_si256(z3, y3);
+ r3 = _mm256_srli_epi32(r3, 25);
+ z3 = _mm256_xor_si256(z3, r3);
+
+ y2 = z10;
+ y2 = _mm256_add_epi32(y2, z14);
+ r2 = y2;
+ y2 = _mm256_slli_epi32(y2, 9);
+ z2 = _mm256_xor_si256(z2, y2);
+ r2 = _mm256_srli_epi32(r2, 23);
+ z2 = _mm256_xor_si256(z2, r2);
+
+ y7 = z15;
+ y7 = _mm256_add_epi32(y7, z3);
+ r7 = y7;
+ y7 = _mm256_slli_epi32(y7, 9);
+ z7 = _mm256_xor_si256(z7, y7);
+ r7 = _mm256_srli_epi32(r7, 23);
+ z7 = _mm256_xor_si256(z7, r7);
+
+ y6 = z14;
+ y6 = _mm256_add_epi32(y6, z2);
+ r6 = y6;
+ y6 = _mm256_slli_epi32(y6, 13);
+ z6 = _mm256_xor_si256(z6, y6);
+ r6 = _mm256_srli_epi32(r6, 19);
+ z6 = _mm256_xor_si256(z6, r6);
+
+ y11 = z3;
+ y11 = _mm256_add_epi32(y11, z7);
+ r11 = y11;
+ y11 = _mm256_slli_epi32(y11, 13);
+ z11 = _mm256_xor_si256(z11, y11);
+ r11 = _mm256_srli_epi32(r11, 19);
+ z11 = _mm256_xor_si256(z11, r11);
+
+ y10 = z2;
+ y10 = _mm256_add_epi32(y10, z6);
+ r10 = y10;
+ y10 = _mm256_slli_epi32(y10, 18);
+ z10 = _mm256_xor_si256(z10, y10);
+ r10 = _mm256_srli_epi32(r10, 14);
+ z10 = _mm256_xor_si256(z10, r10);
+
+ y1 = z3;
+ y1 = _mm256_add_epi32(y1, z0);
+ r1 = y1;
+ y1 = _mm256_slli_epi32(y1, 7);
+ z1 = _mm256_xor_si256(z1, y1);
+ r1 = _mm256_srli_epi32(r1, 25);
+ z1 = _mm256_xor_si256(z1, r1);
+
+ y15 = z7;
+ y15 = _mm256_add_epi32(y15, z11);
+ r15 = y15;
+ y15 = _mm256_slli_epi32(y15, 18);
+ z15 = _mm256_xor_si256(z15, y15);
+ r15 = _mm256_srli_epi32(r15, 14);
+ z15 = _mm256_xor_si256(z15, r15);
+
+ y6 = z4;
+ y6 = _mm256_add_epi32(y6, z5);
+ r6 = y6;
+ y6 = _mm256_slli_epi32(y6, 7);
+ z6 = _mm256_xor_si256(z6, y6);
+ r6 = _mm256_srli_epi32(r6, 25);
+ z6 = _mm256_xor_si256(z6, r6);
+
+ y2 = z0;
+ y2 = _mm256_add_epi32(y2, z1);
+ r2 = y2;
+ y2 = _mm256_slli_epi32(y2, 9);
+ z2 = _mm256_xor_si256(z2, y2);
+ r2 = _mm256_srli_epi32(r2, 23);
+ z2 = _mm256_xor_si256(z2, r2);
+
+ y7 = z5;
+ y7 = _mm256_add_epi32(y7, z6);
+ r7 = y7;
+ y7 = _mm256_slli_epi32(y7, 9);
+ z7 = _mm256_xor_si256(z7, y7);
+ r7 = _mm256_srli_epi32(r7, 23);
+ z7 = _mm256_xor_si256(z7, r7);
+
+ y3 = z1;
+ y3 = _mm256_add_epi32(y3, z2);
+ r3 = y3;
+ y3 = _mm256_slli_epi32(y3, 13);
+ z3 = _mm256_xor_si256(z3, y3);
+ r3 = _mm256_srli_epi32(r3, 19);
+ z3 = _mm256_xor_si256(z3, r3);
+
+ y4 = z6;
+ y4 = _mm256_add_epi32(y4, z7);
+ r4 = y4;
+ y4 = _mm256_slli_epi32(y4, 13);
+ z4 = _mm256_xor_si256(z4, y4);
+ r4 = _mm256_srli_epi32(r4, 19);
+ z4 = _mm256_xor_si256(z4, r4);
+
+ y0 = z2;
+ y0 = _mm256_add_epi32(y0, z3);
+ r0 = y0;
+ y0 = _mm256_slli_epi32(y0, 18);
+ z0 = _mm256_xor_si256(z0, y0);
+ r0 = _mm256_srli_epi32(r0, 14);
+ z0 = _mm256_xor_si256(z0, r0);
+
+ y5 = z7;
+ y5 = _mm256_add_epi32(y5, z4);
+ r5 = y5;
+ y5 = _mm256_slli_epi32(y5, 18);
+ z5 = _mm256_xor_si256(z5, y5);
+ r5 = _mm256_srli_epi32(r5, 14);
+ z5 = _mm256_xor_si256(z5, r5);
+
+ y11 = z9;
+ y11 = _mm256_add_epi32(y11, z10);
+ r11 = y11;
+ y11 = _mm256_slli_epi32(y11, 7);
+ z11 = _mm256_xor_si256(z11, y11);
+ r11 = _mm256_srli_epi32(r11, 25);
+ z11 = _mm256_xor_si256(z11, r11);
+
+ y12 = z14;
+ y12 = _mm256_add_epi32(y12, z15);
+ r12 = y12;
+ y12 = _mm256_slli_epi32(y12, 7);
+ z12 = _mm256_xor_si256(z12, y12);
+ r12 = _mm256_srli_epi32(r12, 25);
+ z12 = _mm256_xor_si256(z12, r12);
+
+ y8 = z10;
+ y8 = _mm256_add_epi32(y8, z11);
+ r8 = y8;
+ y8 = _mm256_slli_epi32(y8, 9);
+ z8 = _mm256_xor_si256(z8, y8);
+ r8 = _mm256_srli_epi32(r8, 23);
+ z8 = _mm256_xor_si256(z8, r8);
+
+ y13 = z15;
+ y13 = _mm256_add_epi32(y13, z12);
+ r13 = y13;
+ y13 = _mm256_slli_epi32(y13, 9);
+ z13 = _mm256_xor_si256(z13, y13);
+ r13 = _mm256_srli_epi32(r13, 23);
+ z13 = _mm256_xor_si256(z13, r13);
+
+ y9 = z11;
+ y9 = _mm256_add_epi32(y9, z8);
+ r9 = y9;
+ y9 = _mm256_slli_epi32(y9, 13);
+ z9 = _mm256_xor_si256(z9, y9);
+ r9 = _mm256_srli_epi32(r9, 19);
+ z9 = _mm256_xor_si256(z9, r9);
+
+ y14 = z12;
+ y14 = _mm256_add_epi32(y14, z13);
+ r14 = y14;
+ y14 = _mm256_slli_epi32(y14, 13);
+ z14 = _mm256_xor_si256(z14, y14);
+ r14 = _mm256_srli_epi32(r14, 19);
+ z14 = _mm256_xor_si256(z14, r14);
+
+ y10 = z8;
+ y10 = _mm256_add_epi32(y10, z9);
+ r10 = y10;
+ y10 = _mm256_slli_epi32(y10, 18);
+ z10 = _mm256_xor_si256(z10, y10);
+ r10 = _mm256_srli_epi32(r10, 14);
+ z10 = _mm256_xor_si256(z10, r10);
+
+ y15 = z13;
+ y15 = _mm256_add_epi32(y15, z14);
+ r15 = y15;
+ y15 = _mm256_slli_epi32(y15, 18);
+ z15 = _mm256_xor_si256(z15, y15);
+ r15 = _mm256_srli_epi32(r15, 14);
+ z15 = _mm256_xor_si256(z15, r15);
+ }
+
+/* Store data: this macro first adds the saved input words, transposes the data
+ * in registers, then XORs it with m and stores it to c. Much faster with icc. */
+#define ONEQUAD_TRANSPOSE(A, B, C, D) \
+ { \
+ __m128i t0, t1, t2, t3; \
+ z##A = _mm256_add_epi32(z##A, orig##A); \
+ z##B = _mm256_add_epi32(z##B, orig##B); \
+ z##C = _mm256_add_epi32(z##C, orig##C); \
+ z##D = _mm256_add_epi32(z##D, orig##D); \
+ y##A = _mm256_unpacklo_epi32(z##A, z##B); \
+ y##B = _mm256_unpacklo_epi32(z##C, z##D); \
+ y##C = _mm256_unpackhi_epi32(z##A, z##B); \
+ y##D = _mm256_unpackhi_epi32(z##C, z##D); \
+ z##A = _mm256_unpacklo_epi64(y##A, y##B); \
+ z##B = _mm256_unpackhi_epi64(y##A, y##B); \
+ z##C = _mm256_unpacklo_epi64(y##C, y##D); \
+ z##D = _mm256_unpackhi_epi64(y##C, y##D); \
+ t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
+ _mm_loadu_si128((__m128i*) (m + 0))); \
+ _mm_storeu_si128((__m128i*) (c + 0), t0); \
+ t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
+ _mm_loadu_si128((__m128i*) (m + 64))); \
+ _mm_storeu_si128((__m128i*) (c + 64), t1); \
+ t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
+ _mm_loadu_si128((__m128i*) (m + 128))); \
+ _mm_storeu_si128((__m128i*) (c + 128), t2); \
+ t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
+ _mm_loadu_si128((__m128i*) (m + 192))); \
+ _mm_storeu_si128((__m128i*) (c + 192), t3); \
+ t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
+ _mm_loadu_si128((__m128i*) (m + 256))); \
+ _mm_storeu_si128((__m128i*) (c + 256), t0); \
+ t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
+ _mm_loadu_si128((__m128i*) (m + 320))); \
+ _mm_storeu_si128((__m128i*) (c + 320), t1); \
+ t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
+ _mm_loadu_si128((__m128i*) (m + 384))); \
+ _mm_storeu_si128((__m128i*) (c + 384), t2); \
+ t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
+ _mm_loadu_si128((__m128i*) (m + 448))); \
+ _mm_storeu_si128((__m128i*) (c + 448), t3); \
+ }
+/* ONEQUAD (and therefore ONEQUAD_TRANSPOSE) is defined but never invoked in this header; only ONEOCTO is used below. */
+#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
+/* ONEQUAD_UNPCK: add the saved input words back, then 4x4-transpose the 32-bit words within each 128-bit lane. */
+#define ONEQUAD_UNPCK(A, B, C, D) \
+ { \
+ z##A = _mm256_add_epi32(z##A, orig##A); \
+ z##B = _mm256_add_epi32(z##B, orig##B); \
+ z##C = _mm256_add_epi32(z##C, orig##C); \
+ z##D = _mm256_add_epi32(z##D, orig##D); \
+ y##A = _mm256_unpacklo_epi32(z##A, z##B); \
+ y##B = _mm256_unpacklo_epi32(z##C, z##D); \
+ y##C = _mm256_unpackhi_epi32(z##A, z##B); \
+ y##D = _mm256_unpackhi_epi32(z##C, z##D); \
+ z##A = _mm256_unpacklo_epi64(y##A, y##B); \
+ z##B = _mm256_unpackhi_epi64(y##A, y##B); \
+ z##C = _mm256_unpacklo_epi64(y##C, y##D); \
+ z##D = _mm256_unpackhi_epi64(y##C, y##D); \
+ }
+/* ONEOCTO: unpack two word quads, pair up their 128-bit halves, XOR eight 32-byte pieces of m (64 bytes apart) and store them to c. */
+#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
+ { \
+ ONEQUAD_UNPCK(A, B, C, D); \
+ ONEQUAD_UNPCK(A2, B2, C2, D2); \
+ y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
+ y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
+ y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
+ y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
+ y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
+ y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
+ y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
+ y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
+ y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
+ y##B = \
+ _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
+ y##C = \
+ _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
+ y##D = \
+ _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
+ y##A2 = \
+ _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \
+ y##B2 = \
+ _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \
+ y##C2 = \
+ _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \
+ y##D2 = \
+ _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \
+ _mm256_storeu_si256((__m256i*) (c + 0), y##A); \
+ _mm256_storeu_si256((__m256i*) (c + 64), y##B); \
+ _mm256_storeu_si256((__m256i*) (c + 128), y##C); \
+ _mm256_storeu_si256((__m256i*) (c + 192), y##D); \
+ _mm256_storeu_si256((__m256i*) (c + 256), y##A2); \
+ _mm256_storeu_si256((__m256i*) (c + 320), y##B2); \
+ _mm256_storeu_si256((__m256i*) (c + 384), y##C2); \
+ _mm256_storeu_si256((__m256i*) (c + 448), y##D2); \
+ }
+ /* words 0..7 go to the first 32 bytes of each 64-byte block, words 8..15 to the last 32 */
+ ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
+ m += 32;
+ c += 32;
+ ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
+ m -= 32; /* undo the temporary +32 offset */
+ c -= 32;
+
+#undef ONEQUAD
+#undef ONEQUAD_TRANSPOSE
+#undef ONEQUAD_UNPCK
+#undef ONEOCTO
+
+ bytes -= 512; /* 512 bytes were processed in this pass */
+ c += 512;
+ m += 512;
+ }
+}