summaryrefslogtreecommitdiff
path: root/plugins/MirOTR/Libgcrypt/cipher/rijndael.c
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/MirOTR/Libgcrypt/cipher/rijndael.c')
-rw-r--r--plugins/MirOTR/Libgcrypt/cipher/rijndael.c2352
1 files changed, 1980 insertions, 372 deletions
diff --git a/plugins/MirOTR/Libgcrypt/cipher/rijndael.c b/plugins/MirOTR/Libgcrypt/cipher/rijndael.c
index d43b349b41..8019f0aad8 100644
--- a/plugins/MirOTR/Libgcrypt/cipher/rijndael.c
+++ b/plugins/MirOTR/Libgcrypt/cipher/rijndael.c
@@ -1,6 +1,6 @@
/* Rijndael (AES) for GnuPG
* Copyright (C) 2000, 2001, 2002, 2003, 2007,
- * 2008 Free Software Foundation, Inc.
+ * 2008, 2011, 2012 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
@@ -45,77 +45,434 @@
#include "types.h" /* for byte and u32 typedefs */
#include "g10lib.h"
#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
#define MAXKC (256/32)
#define MAXROUNDS 14
#define BLOCKSIZE (128/8)
+/* Helper macro to force alignment to 16 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
+
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# define USE_AMD64_ASM 1
+#endif
+
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
+# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+# define USE_ARM_ASM 1
+# endif
+#endif
+
/* USE_PADLOCK indicates whether to compile the padlock specific
code. */
#undef USE_PADLOCK
#ifdef ENABLE_PADLOCK_SUPPORT
-# if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__)
-# define USE_PADLOCK
+# ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# if (defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__)
+# define USE_PADLOCK 1
+# endif
# endif
#endif /*ENABLE_PADLOCK_SUPPORT*/
-static const char *selftest(void);
+/* USE_AESNI indicates whether to compile with Intel AES-NI code. We
+ need the vector-size attribute which seems to be available since
+ gcc 3. However, to be on the safe side we require at least gcc 4. */
+#undef USE_AESNI
+#ifdef ENABLE_AESNI_SUPPORT
+# if ((defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+# if __GNUC__ >= 4
+# define USE_AESNI 1
+# endif
+# endif
+#endif /* ENABLE_AESNI_SUPPORT */
-typedef struct
-{
- int ROUNDS; /* Key-length-dependent number of rounds. */
- int decryption_prepared; /* The decryption key schedule is available. */
-#ifdef USE_PADLOCK
- int use_padlock; /* Padlock shall be used. */
- /* The key as passed to the padlock engine. */
- unsigned char padlock_key[16] __attribute__ ((aligned (16)));
+#ifdef USE_AESNI
+ typedef struct u128_s { u32 a, b, c, d; } u128_t;
+#endif /*USE_AESNI*/
+
+/* Define an u32 variant for the sake of gcc 4.4's strict aliasing. */
+#if __GNUC__ > 4 || ( __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )
+typedef u32 __attribute__ ((__may_alias__)) u32_a_t;
+#else
+typedef u32 u32_a_t;
#endif
+
+
+#ifdef USE_AMD64_ASM
+/* AMD64 assembly implementations of AES */
+extern void _gcry_aes_amd64_encrypt_block(const void *keysched_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ int rounds);
+
+extern void _gcry_aes_amd64_decrypt_block(const void *keysched_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ int rounds);
+#endif /*USE_AMD64_ASM*/
+
+#ifdef USE_ARM_ASM
+/* ARM assembly implementations of AES */
+extern void _gcry_aes_arm_encrypt_block(const void *keysched_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ int rounds);
+
+extern void _gcry_aes_arm_decrypt_block(const void *keysched_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ int rounds);
+#endif /*USE_ARM_ASM*/
+
+
+
+/* Our context object. */
+typedef struct
+{
+ /* The first fields are the keyschedule arrays. This is so that
+ they are aligned on a 16 byte boundary if using gcc. This
+ alignment is required for the AES-NI code and a good idea in any
+ case. The alignment is guaranteed due to the way cipher.c
+ allocates the space for the context. The PROPERLY_ALIGNED_TYPE
+ hack is used to force a minimal alignment if not using gcc or if
+ the alignment requirement is higher than 16 bytes. */
union
{
PROPERLY_ALIGNED_TYPE dummy;
byte keyschedule[MAXROUNDS+1][4][4];
+#ifdef USE_PADLOCK
+ /* The key as passed to the padlock engine. It is only used if
+ the padlock engine is used (USE_PADLOCK, below). */
+ unsigned char padlock_key[16] __attribute__ ((aligned (16)));
+#endif /*USE_PADLOCK*/
} u1;
union
{
PROPERLY_ALIGNED_TYPE dummy;
- byte keyschedule[MAXROUNDS+1][4][4];
+ byte keyschedule[MAXROUNDS+1][4][4];
} u2;
-} RIJNDAEL_context;
+ int rounds; /* Key-length-dependent number of rounds. */
+ unsigned int decryption_prepared:1; /* The decryption key schedule is available. */
+#ifdef USE_PADLOCK
+ unsigned int use_padlock:1; /* Padlock shall be used. */
+#endif /*USE_PADLOCK*/
+#ifdef USE_AESNI
+ unsigned int use_aesni:1; /* AES-NI shall be used. */
+#endif /*USE_AESNI*/
+} RIJNDAEL_context ATTR_ALIGNED_16;
+
+/* Macros defining alias for the keyschedules. */
+#define keyschenc u1.keyschedule
+#define keyschdec u2.keyschedule
+#define padlockkey u1.padlock_key
+
+/* Two macros to be called prior and after the use of AESNI
+ instructions. There should be no external function calls between
+ the use of these macros. Their purpose is to make sure that the
+ SSE registers are cleared and won't reveal any information about
+ the key or the data. */
+#ifdef USE_AESNI
+# define aesni_prepare() do { } while (0)
+# define aesni_cleanup() \
+ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
+ "pxor %%xmm1, %%xmm1\n" :: ); \
+ } while (0)
+# define aesni_cleanup_2_6() \
+ do { asm volatile ("pxor %%xmm2, %%xmm2\n\t" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ "pxor %%xmm6, %%xmm6\n":: ); \
+ } while (0)
+#else
+# define aesni_prepare() do { } while (0)
+# define aesni_cleanup() do { } while (0)
+#endif
-#define keySched u1.keyschedule
-#define keySched2 u2.keyschedule
/* All the numbers. */
#include "rijndael-tables.h"
-/* Perform the key setup. */
+
+/* Function prototypes. */
+#if defined(__i386__) && defined(USE_AESNI)
+/* We don't want to inline these functions on i386 to help gcc allocate enough
+ registers. */
+static void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr,
+ unsigned char *b, const unsigned char *a)
+ __attribute__ ((__noinline__));
+static void do_aesni_ctr_4 (const RIJNDAEL_context *ctx, unsigned char *ctr,
+ unsigned char *b, const unsigned char *a)
+ __attribute__ ((__noinline__));
+#endif /*USE_AESNI*/
+
+static const char *selftest(void);
+
+
+
+#ifdef USE_AESNI
+static void
+aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ aesni_prepare();
+
+ if (ctx->rounds < 12)
+ {
+ /* 128-bit key */
+#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
+ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
+#define AESKEY_EXPAND128 \
+ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \
+ "movdqa %%xmm1, %%xmm3\n\t" \
+ "pslldq $4, %%xmm3\n\t" \
+ "pxor %%xmm3, %%xmm1\n\t" \
+ "pslldq $4, %%xmm3\n\t" \
+ "pxor %%xmm3, %%xmm1\n\t" \
+ "pslldq $4, %%xmm3\n\t" \
+ "pxor %%xmm3, %%xmm2\n\t" \
+ "pxor %%xmm2, %%xmm1\n\t"
+
+ asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key */
+ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x01)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x02)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x04)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x08)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x10)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x20)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x40)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x80)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x1b)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x36)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */
+ :
+ : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+ : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm1_xmm2
+#undef AESKEY_EXPAND128
+ }
+ else if (ctx->rounds == 12)
+ {
+ /* 192-bit key */
+#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
+ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
+#define AESKEY_EXPAND192 \
+ "pshufd $0x55, %%xmm2, %%xmm2\n\t" \
+ "movdqu %%xmm1, %%xmm4\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pxor %%xmm2, %%xmm1\n\t" \
+ "pshufd $0xff, %%xmm1, %%xmm2\n\t" \
+ "movdqu %%xmm3, %%xmm4\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm3\n\t" \
+ "pxor %%xmm2, %%xmm3\n\t"
+
+ asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */
+ "movq 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..23] */
+ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+
+ AESKEYGENASSIST_xmm3_xmm2(0x01)
+ AESKEY_EXPAND192
+ "shufpd $0, %%xmm1, %%xmm5\n\t"
+ "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5 */
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "shufpd $1, %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6 */
+ AESKEYGENASSIST_xmm3_xmm2(0x02)
+ AESKEY_EXPAND192
+ "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+
+ AESKEYGENASSIST_xmm3_xmm2(0x04)
+ AESKEY_EXPAND192
+ "shufpd $0, %%xmm1, %%xmm5\n\t"
+ "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5 */
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "shufpd $1, %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6 */
+ AESKEYGENASSIST_xmm3_xmm2(0x08)
+ AESKEY_EXPAND192
+ "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+
+ AESKEYGENASSIST_xmm3_xmm2(0x10)
+ AESKEY_EXPAND192
+ "shufpd $0, %%xmm1, %%xmm5\n\t"
+ "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5 */
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "shufpd $1, %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6 */
+ AESKEYGENASSIST_xmm3_xmm2(0x20)
+ AESKEY_EXPAND192
+ "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+
+ AESKEYGENASSIST_xmm3_xmm2(0x40)
+ AESKEY_EXPAND192
+ "shufpd $0, %%xmm1, %%xmm5\n\t"
+ "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5 */
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "shufpd $1, %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6 */
+ AESKEYGENASSIST_xmm3_xmm2(0x80)
+ AESKEY_EXPAND192
+ "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */
+ :
+ : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+ : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm3_xmm2
+#undef AESKEY_EXPAND192
+ }
+ else if (ctx->rounds > 12)
+ {
+ /* 256-bit key */
+#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
+ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
+#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
+ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
+#define AESKEY_EXPAND256_A \
+ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \
+ "movdqa %%xmm1, %%xmm4\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pxor %%xmm2, %%xmm1\n\t"
+#define AESKEY_EXPAND256_B \
+ "pshufd $0xaa, %%xmm2, %%xmm2\n\t" \
+ "movdqa %%xmm3, %%xmm4\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm3\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm3\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm3\n\t" \
+ "pxor %%xmm2, %%xmm3\n\t"
+
+ asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */
+ "movdqu 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..31] */
+ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */
+ "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x01)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x02)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x04)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x08)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x10)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x20)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x40)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1 */
+
+ :
+ : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+ : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm1_xmm2
+#undef AESKEYGENASSIST_xmm3_xmm2
+#undef AESKEY_EXPAND256_A
+#undef AESKEY_EXPAND256_B
+ }
+
+ aesni_cleanup();
+ aesni_cleanup_2_6();
+}
+#endif /*USE_AESNI*/
+
+
+
+/* Perform the key setup. */
static gcry_err_code_t
do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
static int initialized = 0;
static const char *selftest_failed=0;
- int ROUNDS;
+ int rounds;
int i,j, r, t, rconpointer = 0;
int KC;
- union
- {
- PROPERLY_ALIGNED_TYPE dummy;
- byte k[MAXKC][4];
- } k;
-#define k k.k
- union
- {
- PROPERLY_ALIGNED_TYPE dummy;
- byte tk[MAXKC][4];
- } tk;
-#define tk tk.tk
+#if defined(USE_AESNI) || defined(USE_PADLOCK)
+ unsigned int hwfeatures;
+#endif
/* The on-the-fly self tests are only run in non-fips mode. In fips
mode explicit self-tests are required. Actually the on-the-fly
self-tests are not fully thread-safe and it might happen that a
- failed self-test won't get noticed in another thread.
+ failed self-test won't get noticed in another thread.
FIXME: We might want to have a central registry of succeeded
self-tests. */
@@ -129,65 +486,115 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
if (selftest_failed)
return GPG_ERR_SELFTEST_FAILED;
+#if defined(USE_AESNI) || defined(USE_PADLOCK)
+ hwfeatures = _gcry_get_hw_features ();
+#endif
+
ctx->decryption_prepared = 0;
#ifdef USE_PADLOCK
ctx->use_padlock = 0;
#endif
+#ifdef USE_AESNI
+ ctx->use_aesni = 0;
+#endif
if( keylen == 128/8 )
{
- ROUNDS = 10;
+ rounds = 10;
KC = 4;
+
+ if (0)
+ {
+ ;
+ }
#ifdef USE_PADLOCK
- if ((_gcry_get_hw_features () & HWF_PADLOCK_AES))
+ else if (hwfeatures & HWF_PADLOCK_AES)
{
ctx->use_padlock = 1;
- memcpy (ctx->padlock_key, key, keylen);
+ memcpy (ctx->padlockkey, key, keylen);
+ }
+#endif
+#ifdef USE_AESNI
+ else if (hwfeatures & HWF_INTEL_AESNI)
+ {
+ ctx->use_aesni = 1;
}
#endif
}
else if ( keylen == 192/8 )
{
- ROUNDS = 12;
+ rounds = 12;
KC = 6;
+
+ if (0)
+ {
+ ;
+ }
+#ifdef USE_AESNI
+ else if (hwfeatures & HWF_INTEL_AESNI)
+ {
+ ctx->use_aesni = 1;
+ }
+#endif
}
else if ( keylen == 256/8 )
{
- ROUNDS = 14;
+ rounds = 14;
KC = 8;
+
+ if (0)
+ {
+ ;
+ }
+#ifdef USE_AESNI
+ else if (hwfeatures & HWF_INTEL_AESNI)
+ {
+ ctx->use_aesni = 1;
+ }
+#endif
}
else
return GPG_ERR_INV_KEYLEN;
- ctx->ROUNDS = ROUNDS;
+ ctx->rounds = rounds;
-#ifdef USE_PADLOCK
- if (ctx->use_padlock)
+ /* NB: We don't yet support Padlock hardware key generation. */
+
+ if (0)
{
- /* Nothing to do as we support only hardware key generation for
- now. */
+ ;
}
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
+ aesni_do_setkey(ctx, key);
+#endif
else
-#endif /*USE_PADLOCK*/
{
-#define W (ctx->keySched)
- for (i = 0; i < keylen; i++)
+ union
{
- k[i >> 2][i & 3] = key[i];
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte data[MAXKC][4];
+ } k, tk;
+#define k k.data
+#define tk tk.data
+#define W (ctx->keyschenc)
+ for (i = 0; i < keylen; i++)
+ {
+ k[i >> 2][i & 3] = key[i];
}
-
- for (j = KC-1; j >= 0; j--)
+
+ for (j = KC-1; j >= 0; j--)
{
- *((u32*)tk[j]) = *((u32*)k[j]);
+ *((u32_a_t*)tk[j]) = *((u32_a_t*)k[j]);
}
r = 0;
t = 0;
/* Copy values into round key array. */
- for (j = 0; (j < KC) && (r < ROUNDS + 1); )
+ for (j = 0; (j < KC) && (r < rounds + 1); )
{
for (; (j < KC) && (t < 4); j++, t++)
{
- *((u32*)W[r][t]) = *((u32*)tk[j]);
+ *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]);
}
if (t == 4)
{
@@ -195,8 +602,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
t = 0;
}
}
-
- while (r < ROUNDS + 1)
+
+ while (r < rounds + 1)
{
/* While not enough round key material calculated calculate
new values. */
@@ -205,19 +612,19 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
tk[0][2] ^= S[tk[KC-1][3]];
tk[0][3] ^= S[tk[KC-1][0]];
tk[0][0] ^= rcon[rconpointer++];
-
+
if (KC != 8)
{
- for (j = 1; j < KC; j++)
+ for (j = 1; j < KC; j++)
{
- *((u32*)tk[j]) ^= *((u32*)tk[j-1]);
+ *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
}
- }
- else
+ }
+ else
{
for (j = 1; j < KC/2; j++)
{
- *((u32*)tk[j]) ^= *((u32*)tk[j-1]);
+ *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
}
tk[KC/2][0] ^= S[tk[KC/2 - 1][0]];
tk[KC/2][1] ^= S[tk[KC/2 - 1][1]];
@@ -225,16 +632,16 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
tk[KC/2][3] ^= S[tk[KC/2 - 1][3]];
for (j = KC/2 + 1; j < KC; j++)
{
- *((u32*)tk[j]) ^= *((u32*)tk[j-1]);
+ *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
}
}
-
+
/* Copy values into round key array. */
- for (j = 0; (j < KC) && (r < ROUNDS + 1); )
+ for (j = 0; (j < KC) && (r < rounds + 1); )
{
for (; (j < KC) && (t < 4); j++, t++)
{
- *((u32*)W[r][t]) = *((u32*)tk[j]);
+ *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]);
}
if (t == 4)
{
@@ -242,13 +649,15 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
t = 0;
}
}
- }
-#undef W
+ }
+#undef W
+#undef tk
+#undef k
+ wipememory(&tk, sizeof(tk));
+ wipememory(&t, sizeof(t));
}
return 0;
-#undef tk
-#undef k
}
@@ -256,10 +665,7 @@ static gcry_err_code_t
rijndael_setkey (void *context, const byte *key, const unsigned keylen)
{
RIJNDAEL_context *ctx = context;
-
- int rc = do_setkey (ctx, key, keylen);
- _gcry_burn_stack ( 100 + 16*sizeof(int));
- return rc;
+ return do_setkey (ctx, key, keylen);
}
@@ -268,53 +674,113 @@ static void
prepare_decryption( RIJNDAEL_context *ctx )
{
int r;
- union
- {
- PROPERLY_ALIGNED_TYPE dummy;
- byte *w;
- } w;
-#define w w.w
- for (r=0; r < MAXROUNDS+1; r++ )
+#ifdef USE_AESNI
+ if (ctx->use_aesni)
{
- *((u32*)ctx->keySched2[r][0]) = *((u32*)ctx->keySched[r][0]);
- *((u32*)ctx->keySched2[r][1]) = *((u32*)ctx->keySched[r][1]);
- *((u32*)ctx->keySched2[r][2]) = *((u32*)ctx->keySched[r][2]);
- *((u32*)ctx->keySched2[r][3]) = *((u32*)ctx->keySched[r][3]);
+ /* The AES-NI decrypt instructions use the Equivalent Inverse
+ Cipher, thus we can't use the standard decrypt key
+ preparation. */
+ u128_t *ekey = (u128_t *)ctx->keyschenc;
+ u128_t *dkey = (u128_t *)ctx->keyschdec;
+ int rr;
+
+ aesni_prepare();
+
+#define DO_AESNI_AESIMC() \
+ asm volatile ("movdqa %[ekey], %%xmm1\n\t" \
+ /*"aesimc %%xmm1, %%xmm1\n\t"*/ \
+ ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \
+ "movdqa %%xmm1, %[dkey]" \
+ : [dkey] "=m" (dkey[r]) \
+ : [ekey] "m" (ekey[rr]) \
+ : "memory")
+
+ dkey[0] = ekey[ctx->rounds];
+ r=1;
+ rr=ctx->rounds-1;
+ DO_AESNI_AESIMC(); r++; rr--; /* round 1 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 2 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 3 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 4 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 5 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 6 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 7 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 8 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 9 */
+ if (ctx->rounds > 10)
+ {
+ DO_AESNI_AESIMC(); r++; rr--; /* round 10 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 11 */
+ if (ctx->rounds > 12)
+ {
+ DO_AESNI_AESIMC(); r++; rr--; /* round 12 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 13 */
+ }
+ }
+
+ dkey[r] = ekey[0];
+
+#undef DO_AESNI_AESIMC
+
+ aesni_cleanup();
}
-#define W (ctx->keySched2)
- for (r = 1; r < ctx->ROUNDS; r++)
+ else
+#endif /*USE_AESNI*/
{
- w = W[r][0];
- *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]])
- ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]);
-
- w = W[r][1];
- *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]])
- ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]);
-
- w = W[r][2];
- *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]])
- ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]);
-
- w = W[r][3];
- *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]])
- ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]);
- }
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte *w;
+ } w;
+#define w w.w
+
+ for (r=0; r < MAXROUNDS+1; r++ )
+ {
+ *((u32_a_t*)ctx->keyschdec[r][0]) = *((u32_a_t*)ctx->keyschenc[r][0]);
+ *((u32_a_t*)ctx->keyschdec[r][1]) = *((u32_a_t*)ctx->keyschenc[r][1]);
+ *((u32_a_t*)ctx->keyschdec[r][2]) = *((u32_a_t*)ctx->keyschenc[r][2]);
+ *((u32_a_t*)ctx->keyschdec[r][3]) = *((u32_a_t*)ctx->keyschenc[r][3]);
+ }
+#define W (ctx->keyschdec)
+ for (r = 1; r < ctx->rounds; r++)
+ {
+ w = W[r][0];
+ *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
+ ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+
+ w = W[r][1];
+ *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
+ ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+
+ w = W[r][2];
+ *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
+ ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+
+ w = W[r][3];
+ *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
+ ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+ }
#undef W
#undef w
-}
-
+ wipememory(&w, sizeof(w));
+ }
+}
/* Encrypt one block. A and B need to be aligned on a 4 byte
boundary. A and B may be the same. */
static void
-do_encrypt_aligned (const RIJNDAEL_context *ctx,
+do_encrypt_aligned (const RIJNDAEL_context *ctx,
unsigned char *b, const unsigned char *a)
{
-#define rk (ctx->keySched)
- int ROUNDS = ctx->ROUNDS;
+#ifdef USE_AMD64_ASM
+ _gcry_aes_amd64_encrypt_block(ctx->keyschenc, b, a, ctx->rounds);
+#elif defined(USE_ARM_ASM)
+ _gcry_aes_arm_encrypt_block(ctx->keyschenc, b, a, ctx->rounds);
+#else
+#define rk (ctx->keyschenc)
+ int rounds = ctx->rounds;
int r;
union
{
@@ -322,57 +788,57 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx,
byte temp[4][4];
} u;
- *((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]);
- *((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]);
- *((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]);
- *((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]);
- *((u32*)(b )) = (*((u32*)T1[u.temp[0][0]])
- ^ *((u32*)T2[u.temp[1][1]])
- ^ *((u32*)T3[u.temp[2][2]])
- ^ *((u32*)T4[u.temp[3][3]]));
- *((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]])
- ^ *((u32*)T2[u.temp[2][1]])
- ^ *((u32*)T3[u.temp[3][2]])
- ^ *((u32*)T4[u.temp[0][3]]));
- *((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]])
- ^ *((u32*)T2[u.temp[3][1]])
- ^ *((u32*)T3[u.temp[0][2]])
- ^ *((u32*)T4[u.temp[1][3]]));
- *((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]])
- ^ *((u32*)T2[u.temp[0][1]])
- ^ *((u32*)T3[u.temp[1][2]])
- ^ *((u32*)T4[u.temp[2][3]]));
-
- for (r = 1; r < ROUNDS-1; r++)
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[0][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[0][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[0][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[0][3]);
+ *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]])
+ ^ *((u32_a_t*)T2[u.temp[1][1]])
+ ^ *((u32_a_t*)T3[u.temp[2][2]])
+ ^ *((u32_a_t*)T4[u.temp[3][3]]));
+ *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]])
+ ^ *((u32_a_t*)T2[u.temp[2][1]])
+ ^ *((u32_a_t*)T3[u.temp[3][2]])
+ ^ *((u32_a_t*)T4[u.temp[0][3]]));
+ *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]])
+ ^ *((u32_a_t*)T2[u.temp[3][1]])
+ ^ *((u32_a_t*)T3[u.temp[0][2]])
+ ^ *((u32_a_t*)T4[u.temp[1][3]]));
+ *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]])
+ ^ *((u32_a_t*)T2[u.temp[0][1]])
+ ^ *((u32_a_t*)T3[u.temp[1][2]])
+ ^ *((u32_a_t*)T4[u.temp[2][3]]));
+
+ for (r = 1; r < rounds-1; r++)
{
- *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]);
- *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]);
- *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]);
- *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]);
-
- *((u32*)(b )) = (*((u32*)T1[u.temp[0][0]])
- ^ *((u32*)T2[u.temp[1][1]])
- ^ *((u32*)T3[u.temp[2][2]])
- ^ *((u32*)T4[u.temp[3][3]]));
- *((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]])
- ^ *((u32*)T2[u.temp[2][1]])
- ^ *((u32*)T3[u.temp[3][2]])
- ^ *((u32*)T4[u.temp[0][3]]));
- *((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]])
- ^ *((u32*)T2[u.temp[3][1]])
- ^ *((u32*)T3[u.temp[0][2]])
- ^ *((u32*)T4[u.temp[1][3]]));
- *((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]])
- ^ *((u32*)T2[u.temp[0][1]])
- ^ *((u32*)T3[u.temp[1][2]])
- ^ *((u32*)T4[u.temp[2][3]]));
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]);
+
+ *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]])
+ ^ *((u32_a_t*)T2[u.temp[1][1]])
+ ^ *((u32_a_t*)T3[u.temp[2][2]])
+ ^ *((u32_a_t*)T4[u.temp[3][3]]));
+ *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]])
+ ^ *((u32_a_t*)T2[u.temp[2][1]])
+ ^ *((u32_a_t*)T3[u.temp[3][2]])
+ ^ *((u32_a_t*)T4[u.temp[0][3]]));
+ *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]])
+ ^ *((u32_a_t*)T2[u.temp[3][1]])
+ ^ *((u32_a_t*)T3[u.temp[0][2]])
+ ^ *((u32_a_t*)T4[u.temp[1][3]]));
+ *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]])
+ ^ *((u32_a_t*)T2[u.temp[0][1]])
+ ^ *((u32_a_t*)T3[u.temp[1][2]])
+ ^ *((u32_a_t*)T4[u.temp[2][3]]));
}
- /* Last round is special. */
- *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]);
- *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]);
- *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]);
- *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]);
+ /* Last round is special. */
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[rounds-1][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[rounds-1][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[rounds-1][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[rounds-1][3]);
b[ 0] = T1[u.temp[0][0]][1];
b[ 1] = T1[u.temp[1][1]][1];
b[ 2] = T1[u.temp[2][2]][1];
@@ -389,11 +855,12 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx,
b[13] = T1[u.temp[0][1]][1];
b[14] = T1[u.temp[1][2]][1];
b[15] = T1[u.temp[2][3]][1];
- *((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]);
- *((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]);
- *((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]);
- *((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]);
+ *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[rounds][0]);
+ *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[rounds][1]);
+ *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]);
+ *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]);
#undef rk
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
}
@@ -401,22 +868,31 @@ static void
do_encrypt (const RIJNDAEL_context *ctx,
unsigned char *bx, const unsigned char *ax)
{
- /* BX and AX are not necessary correctly aligned. Thus we need to
- copy them here. */
- union
- {
- u32 dummy[4];
- byte a[16];
- } a;
- union
- {
- u32 dummy[4];
- byte b[16];
- } b;
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ /* BX and AX are not necessarily correctly aligned. Thus we might
+ need to copy them here. We try to align to 16 bytes. */
+ if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
+ {
+ union
+ {
+ u32 dummy[4];
+ byte a[16] ATTR_ALIGNED_16;
+ } a;
+ union
+ {
+ u32 dummy[4];
+ byte b[16] ATTR_ALIGNED_16;
+ } b;
- memcpy (a.a, ax, 16);
- do_encrypt_aligned (ctx, b.b, a.a);
- memcpy (bx, b.b, 16);
+ buf_cpy (a.a, ax, 16);
+ do_encrypt_aligned (ctx, b.b, a.a);
+ buf_cpy (bx, b.b, 16);
+ }
+ else
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+ {
+ do_encrypt_aligned (ctx, bx, ax);
+ }
}
@@ -432,11 +908,12 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag,
unsigned char a[16] __attribute__ ((aligned (16)));
unsigned char b[16] __attribute__ ((aligned (16)));
unsigned int cword[4] __attribute__ ((aligned (16)));
+ int blocks;
/* The control word fields are:
127:12 11:10 9 8 7 6 5 4 3:0
RESERVED KSIZE CRYPT INTER KEYGN CIPHR ALIGN DGEST ROUND */
- cword[0] = (ctx->ROUNDS & 15); /* (The mask is just a safeguard.) */
+ cword[0] = (ctx->rounds & 15); /* (The mask is just a safeguard.) */
cword[1] = 0;
cword[2] = 0;
cword[3] = 0;
@@ -444,18 +921,29 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag,
cword[0] |= 0x00000200;
memcpy (a, ax, 16);
-
- asm volatile
- ("pushfl\n\t" /* Force key reload. */
+
+ blocks = 1; /* Init counter for just one block. */
+#ifdef __x86_64__
+ asm volatile
+ ("pushfq\n\t" /* Force key reload. */
+ "popfq\n\t"
+ ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */
+ : /* No output */
+ : "S" (a), "D" (b), "d" (cword), "b" (ctx->padlockkey), "c" (blocks)
+ : "cc", "memory"
+ );
+#else
+ asm volatile
+ ("pushfl\n\t" /* Force key reload. */
"popfl\n\t"
"xchg %3, %%ebx\n\t" /* Load key. */
- "movl $1, %%ecx\n\t" /* Init counter for just one block. */
- ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XSTORE ECB. */
+ ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */
"xchg %3, %%ebx\n" /* Restore GOT register. */
: /* No output */
- : "S" (a), "D" (b), "d" (cword), "r" (ctx->padlock_key)
- : "%ecx", "cc", "memory"
+ : "S" (a), "D" (b), "d" (cword), "r" (ctx->padlockkey), "c" (blocks)
+ : "cc", "memory"
);
+#endif
memcpy (bx, b, 16);
@@ -463,23 +951,721 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag,
#endif /*USE_PADLOCK*/
+#ifdef USE_AESNI
+/* Encrypt one block using the Intel AES-NI instructions. A and B may
+ be the same.
+
+ Our problem here is that gcc does not allow the "x" constraint for
+ SSE registers in asm unless you compile with -msse. The common
+ wisdom is to use a separate file for SSE instructions and build it
+ separately. This would require a lot of extra build system stuff,
+ similar to what we do in mpi/ for the asm stuff. What we do
+ instead is to use standard registers and a bit more of plain asm
+ which copies the data and key stuff to the SSE registers and later
+ back. If we decide to implement some block modes with parallelized
+ AES instructions, it might indeed be better to use plain asm ala
+ mpi/. */
+static inline void
+do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
+ const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+ /* Note: For now we relax the alignment requirement for A and B: It
+ does not make much difference because in many cases we would need
+ to memcpy them to an extra buffer; using the movdqu is much faster
+ than memcpy and movdqa. For CFB we know that the IV is properly
+ aligned but that is a special case. We should better implement
+ CFB directly in asm. */
+ asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ "movdqu %%xmm0, %[dst]\n"
+ : [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+
+static inline void
+do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b,
+ const unsigned char *a)
+{
+#define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
+#define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
+ asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */
+ "movdqa (%[key]), %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Ldeclast%=:\n\t"
+ aesdeclast_xmm1_xmm0
+ "movdqu %%xmm0, %[dst]\n"
+ : [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesdec_xmm1_xmm0
+#undef aesdeclast_xmm1_xmm0
+}
+
+
+/* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4. */
+static void
+do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
+#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
+#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
+#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
+#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
+#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
+#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
+#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ aesenclast_xmm0_xmm1
+ aesenclast_xmm0_xmm2
+ aesenclast_xmm0_xmm3
+ aesenclast_xmm0_xmm4
+ : /* no output */
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesenc_xmm0_xmm1
+#undef aesenc_xmm0_xmm2
+#undef aesenc_xmm0_xmm3
+#undef aesenc_xmm0_xmm4
+#undef aesenclast_xmm0_xmm1
+#undef aesenclast_xmm0_xmm2
+#undef aesenclast_xmm0_xmm3
+#undef aesenclast_xmm0_xmm4
+}
+
+
+/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4. */
+static void
+do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
+#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
+#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
+#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
+#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
+#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
+#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
+#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ aesdeclast_xmm0_xmm1
+ aesdeclast_xmm0_xmm2
+ aesdeclast_xmm0_xmm3
+ aesdeclast_xmm0_xmm4
+ : /* no output */
+ : [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesdec_xmm0_xmm1
+#undef aesdec_xmm0_xmm2
+#undef aesdec_xmm0_xmm3
+#undef aesdec_xmm0_xmm4
+#undef aesdeclast_xmm0_xmm1
+#undef aesdeclast_xmm0_xmm2
+#undef aesdeclast_xmm0_xmm3
+#undef aesdeclast_xmm0_xmm4
+}
+
+
+/* Perform a CFB encryption or decryption round using the
+ initialization vector IV and the input block A. Write the result
+ to the output block B and update IV. IV needs to be 16 byte
+ aligned. */
+static void
+do_aesni_cfb (const RIJNDAEL_context *ctx, int decrypt_flag,
+ unsigned char *iv, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+ asm volatile ("movdqa %[iv], %%xmm0\n\t" /* xmm0 := IV */
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ "movdqu %[src], %%xmm1\n\t" /* Save input. */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 = input ^ IV */
+
+ "cmpl $1, %[decrypt]\n\t"
+ "jz .Ldecrypt_%=\n\t"
+ "movdqa %%xmm0, %[iv]\n\t" /* [encrypt] Store IV. */
+ "jmp .Lleave_%=\n"
+ ".Ldecrypt_%=:\n\t"
+ "movdqa %%xmm1, %[iv]\n" /* [decrypt] Store IV. */
+ ".Lleave_%=:\n\t"
+ "movdqu %%xmm0, %[dst]\n" /* Store output. */
+ : [iv] "+m" (*iv), [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [key] "r" (ctx->keyschenc),
+ [rounds] "g" (ctx->rounds),
+ [decrypt] "m" (decrypt_flag)
+ : "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+/* Perform a CTR encryption round using the counter CTR and the input
+ block A. Write the result to the output block B and update CTR.
+ CTR needs to be a 16 byte aligned big-endian value. */
+static void
+do_aesni_ctr (const RIJNDAEL_context *ctx,
+ unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+
+ asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 (low qword), 0 (high qword) */
+
+ "pshufb %%xmm6, %%xmm5\n\t"
+ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ (big endian) */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "cmpl $0xffffffff, 12(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "psubq %%xmm1, %%xmm5\n\t" /* add carry to upper 64bits */
+
+ ".Lno_carry%=:\n\t"
+
+ "pshufb %%xmm6, %%xmm5\n\t"
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */
+
+ "pxor (%[key]), %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ "movdqu %[src], %%xmm1\n\t" /* xmm1 := input */
+ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
+ "movdqu %%xmm0, %[dst]" /* Store EncCTR. */
+
+ : [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [ctr] "r" (ctr),
+ [key] "r" (ctx->keyschenc),
+ [rounds] "g" (ctx->rounds)
+ : "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+
+/* Four blocks at a time variant of do_aesni_ctr. */
static void
+do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
+ unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenc_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t"
+#define aesenc_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t"
+#define aesenc_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t"
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+#define aesenclast_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t"
+#define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t"
+#define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
+
+ /* Register usage:
+ esi temp (low counter word, then rounds)
+ xmm0 CTR-0
+ xmm1 temp / round key
+ xmm2 CTR-1
+ xmm3 CTR-2
+ xmm4 CTR-3
+ xmm5 copy of *ctr
+ xmm6 endian swapping mask
+ */
+
+ asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */
+ "movdqa %%xmm0, %%xmm2\n\t"
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 (low qword), 0 (high qword) */
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */
+ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */
+ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
+ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */
+ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
+ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */
+ "movdqa %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */
+ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "movl 12(%[ctr]), %%esi\n\t"
+ "bswapl %%esi\n\t"
+ "cmpl $0xfffffffc, %%esi\n\t"
+ "jb .Lno_carry%=\n\t" /* no carry */
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */
+ "cmpl $0xfffffffe, %%esi\n\t"
+ "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */
+ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */
+ /* esi == 0xffffffff */
+
+ "psubq %%xmm1, %%xmm2\n\t"
+ ".Lcarry_xmm3%=:\n\t"
+ "psubq %%xmm1, %%xmm3\n\t"
+ ".Lcarry_xmm4%=:\n\t"
+ "psubq %%xmm1, %%xmm4\n\t"
+ ".Lcarry_xmm5%=:\n\t"
+ "psubq %%xmm1, %%xmm5\n\t"
+
+ ".Lno_carry%=:\n\t"
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "movl %[rounds], %%esi\n\t"
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */
+ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */
+ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */
+ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */
+
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %%esi\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %%esi\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ aesenclast_xmm1_xmm2
+ aesenclast_xmm1_xmm3
+ aesenclast_xmm1_xmm4
+
+ "movdqu (%[src]), %%xmm1\n\t" /* Get block 1. */
+ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */
+ "movdqu %%xmm0, (%[dst])\n\t" /* Store block 1 */
+
+ "movdqu 16(%[src]), %%xmm1\n\t" /* Get block 2. */
+ "pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */
+ "movdqu %%xmm2, 16(%[dst])\n\t" /* Store block 2. */
+
+ "movdqu 32(%[src]), %%xmm1\n\t" /* Get block 3. */
+ "pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */
+ "movdqu %%xmm3, 32(%[dst])\n\t" /* Store block 3. */
+
+ "movdqu 48(%[src]), %%xmm1\n\t" /* Get block 4. */
+ "pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */
+ "movdqu %%xmm4, 48(%[dst])" /* Store block 4. */
+
+ :
+ : [ctr] "r" (ctr),
+ [src] "r" (a),
+ [dst] "r" (b),
+ [key] "r" (ctx->keyschenc),
+ [rounds] "g" (ctx->rounds)
+ : "%esi", "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenc_xmm1_xmm2
+#undef aesenc_xmm1_xmm3
+#undef aesenc_xmm1_xmm4
+#undef aesenclast_xmm1_xmm0
+#undef aesenclast_xmm1_xmm2
+#undef aesenclast_xmm1_xmm3
+#undef aesenclast_xmm1_xmm4
+}
+
+#endif /*USE_AESNI*/
+
+
+static unsigned int
rijndael_encrypt (void *context, byte *b, const byte *a)
{
RIJNDAEL_context *ctx = context;
+ unsigned int burn_stack;
+ if (0)
+ ;
#ifdef USE_PADLOCK
- if (ctx->use_padlock)
+ else if (ctx->use_padlock)
{
do_padlock (ctx, 0, b, a);
- _gcry_burn_stack (48 + 15 /* possible padding for alignment */);
+ burn_stack = (48 + 15 /* possible padding for alignment */);
}
- else
#endif /*USE_PADLOCK*/
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
+ {
+ aesni_prepare ();
+ do_aesni_enc (ctx, b, a);
+ aesni_cleanup ();
+ burn_stack = 0;
+ }
+#endif /*USE_AESNI*/
+ else
{
do_encrypt (ctx, b, a);
- _gcry_burn_stack (48 + 2*sizeof(int));
+ burn_stack = (56 + 2*sizeof(int));
}
+
+ return burn_stack;
}
@@ -488,18 +1674,19 @@ rijndael_encrypt (void *context, byte *b, const byte *a)
function is only intended for the bulk encryption feature of
cipher.c. */
void
-_gcry_aes_cfb_enc (void *context, unsigned char *iv,
+_gcry_aes_cfb_enc (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
- unsigned int nblocks)
+ size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char *ivp;
- int i;
+ unsigned int burn_depth = 48 + 2*sizeof(int);
+ if (0)
+ ;
#ifdef USE_PADLOCK
- if (ctx->use_padlock)
+ else if (ctx->use_padlock)
{
/* Fixme: Let Padlock do the CFBing. */
for ( ;nblocks; nblocks-- )
@@ -507,24 +1694,42 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
/* Encrypt the IV. */
do_padlock (ctx, 0, iv, iv);
/* XOR the input with the IV and store input into IV. */
- for (ivp=iv,i=0; i < BLOCKSIZE; i++ )
- *outbuf++ = (*ivp++ ^= *inbuf++);
+ buf_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
}
}
+#endif /*USE_PADLOCK*/
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
+ {
+ aesni_prepare ();
+ for ( ;nblocks; nblocks-- )
+ {
+ do_aesni_cfb (ctx, 0, iv, outbuf, inbuf);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+ aesni_cleanup ();
+
+ burn_depth = 0; /* No stack usage. */
+ }
+#endif /*USE_AESNI*/
else
-#endif /* USE_PADLOCK*/
{
for ( ;nblocks; nblocks-- )
{
/* Encrypt the IV. */
do_encrypt_aligned (ctx, iv, iv);
/* XOR the input with the IV and store input into IV. */
- for (ivp=iv,i=0; i < BLOCKSIZE; i++ )
- *outbuf++ = (*ivp++ ^= *inbuf++);
+ buf_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
}
}
- _gcry_burn_stack (48 + 2*sizeof(int));
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth);
}
@@ -533,35 +1738,173 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
function is only intended for the bulk encryption feature of
cipher.c. */
void
-_gcry_aes_cbc_enc (void *context, unsigned char *iv,
+_gcry_aes_cbc_enc (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
- unsigned int nblocks, int cbc_mac)
+ size_t nblocks, int cbc_mac)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char *ivp;
- int i;
+ unsigned char *last_iv;
+ unsigned int burn_depth = 48 + 2*sizeof(int);
+#ifdef USE_AESNI
+ int use_aesni = ctx->use_aesni;
+#endif
+
+#ifdef USE_AESNI
+ if (use_aesni)
+ aesni_prepare ();
+#endif /*USE_AESNI*/
+
+ last_iv = iv;
for ( ;nblocks; nblocks-- )
{
- for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
- outbuf[i] = inbuf[i] ^ *ivp++;
+ if (0)
+ ;
+#ifdef USE_AESNI
+ else if (use_aesni)
+ {
+ /* ~35% speed up on Sandy-Bridge when doing xoring and copying with
+ SSE registers. */
+ asm volatile ("movdqu %[iv], %%xmm0\n\t"
+ "movdqu %[inbuf], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf]\n\t"
+ : /* No output */
+ : [iv] "m" (*last_iv),
+ [inbuf] "m" (*inbuf),
+ [outbuf] "m" (*outbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx, outbuf, outbuf);
+ }
+#endif /*USE_AESNI*/
+ else
+ {
+ buf_xor(outbuf, inbuf, last_iv, BLOCKSIZE);
+ if (0)
+ ;
#ifdef USE_PADLOCK
- if (ctx->use_padlock)
- do_padlock (ctx, 0, outbuf, outbuf);
- else
+ else if (ctx->use_padlock)
+ do_padlock (ctx, 0, outbuf, outbuf);
#endif /*USE_PADLOCK*/
- do_encrypt (ctx, outbuf, outbuf );
+ else
+ do_encrypt (ctx, outbuf, outbuf );
+ }
- memcpy (iv, outbuf, BLOCKSIZE);
+ last_iv = outbuf;
inbuf += BLOCKSIZE;
if (!cbc_mac)
outbuf += BLOCKSIZE;
}
- _gcry_burn_stack (48 + 2*sizeof(int));
+ if (last_iv != iv)
+ {
+ if (0)
+ ;
+#ifdef USE_AESNI
+ else if (use_aesni)
+ asm volatile ("movdqu %[last], %%xmm0\n\t"
+ "movdqu %%xmm0, %[iv]\n\t"
+ : /* No output */
+ : [last] "m" (*last_iv),
+ [iv] "m" (*iv)
+ : "memory" );
+#endif /*USE_AESNI*/
+ else
+ buf_cpy (iv, last_iv, BLOCKSIZE);
+ }
+
+#ifdef USE_AESNI
+ if (use_aesni)
+ {
+ aesni_cleanup ();
+ burn_depth = 0; /* No stack usage. */
+ }
+#endif /*USE_AESNI*/
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth);
+}
+
+
+/* Bulk encryption of complete blocks in CTR mode. Caller needs to
+ make sure that CTR is aligned on a 16 byte boundary if AESNI is
+ used; the minimum alignment is that of a u32. This function is
+ only intended for the bulk encryption feature of cipher.c. CTR is
+ expected to be of size BLOCKSIZE. */
+void
+_gcry_aes_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 48 + 2*sizeof(int);
+ int i;
+
+ if (0)
+ ;
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
+ {
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+
+ aesni_prepare ();
+
+ asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
+ "movdqa %[ctr], %%xmm5\n\t" /* Preload CTR */
+ : /* No output */
+ : [mask] "m" (*be_mask),
+ [ctr] "m" (*ctr)
+ : "memory");
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
+ {
+ do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+ for ( ;nblocks; nblocks-- )
+ {
+ do_aesni_ctr (ctx, ctr, outbuf, inbuf);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+ aesni_cleanup ();
+ aesni_cleanup_2_6 ();
+
+ burn_depth = 0; /* No stack usage. */
+ }
+#endif /*USE_AESNI*/
+ else
+ {
+ union { unsigned char x1[16]; u32 x32[4]; } tmp;
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the counter. */
+ do_encrypt_aligned (ctx, tmp.x1, ctr);
+ /* XOR the input with the encrypted counter and store in output. */
+ buf_xor(outbuf, tmp.x1, inbuf, BLOCKSIZE);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ /* Increment the counter. */
+ for (i = BLOCKSIZE; i > 0; i--)
+ {
+ ctr[i-1]++;
+ if (ctr[i-1])
+ break;
+ }
+ }
+ }
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth);
}
@@ -570,70 +1913,75 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
and the decryption must have been prepared. A and B may be the
same. */
static void
-do_decrypt_aligned (RIJNDAEL_context *ctx,
+do_decrypt_aligned (RIJNDAEL_context *ctx,
unsigned char *b, const unsigned char *a)
{
-#define rk (ctx->keySched2)
- int ROUNDS = ctx->ROUNDS;
+#ifdef USE_AMD64_ASM
+ _gcry_aes_amd64_decrypt_block(ctx->keyschdec, b, a, ctx->rounds);
+#elif defined(USE_ARM_ASM)
+ _gcry_aes_arm_decrypt_block(ctx->keyschdec, b, a, ctx->rounds);
+#else
+#define rk (ctx->keyschdec)
+ int rounds = ctx->rounds;
int r;
- union
+ union
{
u32 tempu32[4]; /* Force correct alignment. */
byte temp[4][4];
} u;
- *((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[ROUNDS][0]);
- *((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[ROUNDS][1]);
- *((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[ROUNDS][2]);
- *((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[ROUNDS][3]);
-
- *((u32*)(b )) = (*((u32*)T5[u.temp[0][0]])
- ^ *((u32*)T6[u.temp[3][1]])
- ^ *((u32*)T7[u.temp[2][2]])
- ^ *((u32*)T8[u.temp[1][3]]));
- *((u32*)(b+ 4)) = (*((u32*)T5[u.temp[1][0]])
- ^ *((u32*)T6[u.temp[0][1]])
- ^ *((u32*)T7[u.temp[3][2]])
- ^ *((u32*)T8[u.temp[2][3]]));
- *((u32*)(b+ 8)) = (*((u32*)T5[u.temp[2][0]])
- ^ *((u32*)T6[u.temp[1][1]])
- ^ *((u32*)T7[u.temp[0][2]])
- ^ *((u32*)T8[u.temp[3][3]]));
- *((u32*)(b+12)) = (*((u32*)T5[u.temp[3][0]])
- ^ *((u32*)T6[u.temp[2][1]])
- ^ *((u32*)T7[u.temp[1][2]])
- ^ *((u32*)T8[u.temp[0][3]]));
-
- for (r = ROUNDS-1; r > 1; r--)
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[rounds][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[rounds][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[rounds][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[rounds][3]);
+
+ *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]])
+ ^ *((u32_a_t*)T6[u.temp[3][1]])
+ ^ *((u32_a_t*)T7[u.temp[2][2]])
+ ^ *((u32_a_t*)T8[u.temp[1][3]]));
+ *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]])
+ ^ *((u32_a_t*)T6[u.temp[0][1]])
+ ^ *((u32_a_t*)T7[u.temp[3][2]])
+ ^ *((u32_a_t*)T8[u.temp[2][3]]));
+ *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]])
+ ^ *((u32_a_t*)T6[u.temp[1][1]])
+ ^ *((u32_a_t*)T7[u.temp[0][2]])
+ ^ *((u32_a_t*)T8[u.temp[3][3]]));
+ *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]])
+ ^ *((u32_a_t*)T6[u.temp[2][1]])
+ ^ *((u32_a_t*)T7[u.temp[1][2]])
+ ^ *((u32_a_t*)T8[u.temp[0][3]]));
+
+ for (r = rounds-1; r > 1; r--)
{
- *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]);
- *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]);
- *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]);
- *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]);
- *((u32*)(b )) = (*((u32*)T5[u.temp[0][0]])
- ^ *((u32*)T6[u.temp[3][1]])
- ^ *((u32*)T7[u.temp[2][2]])
- ^ *((u32*)T8[u.temp[1][3]]));
- *((u32*)(b+ 4)) = (*((u32*)T5[u.temp[1][0]])
- ^ *((u32*)T6[u.temp[0][1]])
- ^ *((u32*)T7[u.temp[3][2]])
- ^ *((u32*)T8[u.temp[2][3]]));
- *((u32*)(b+ 8)) = (*((u32*)T5[u.temp[2][0]])
- ^ *((u32*)T6[u.temp[1][1]])
- ^ *((u32*)T7[u.temp[0][2]])
- ^ *((u32*)T8[u.temp[3][3]]));
- *((u32*)(b+12)) = (*((u32*)T5[u.temp[3][0]])
- ^ *((u32*)T6[u.temp[2][1]])
- ^ *((u32*)T7[u.temp[1][2]])
- ^ *((u32*)T8[u.temp[0][3]]));
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]);
+ *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]])
+ ^ *((u32_a_t*)T6[u.temp[3][1]])
+ ^ *((u32_a_t*)T7[u.temp[2][2]])
+ ^ *((u32_a_t*)T8[u.temp[1][3]]));
+ *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]])
+ ^ *((u32_a_t*)T6[u.temp[0][1]])
+ ^ *((u32_a_t*)T7[u.temp[3][2]])
+ ^ *((u32_a_t*)T8[u.temp[2][3]]));
+ *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]])
+ ^ *((u32_a_t*)T6[u.temp[1][1]])
+ ^ *((u32_a_t*)T7[u.temp[0][2]])
+ ^ *((u32_a_t*)T8[u.temp[3][3]]));
+ *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]])
+ ^ *((u32_a_t*)T6[u.temp[2][1]])
+ ^ *((u32_a_t*)T7[u.temp[1][2]])
+ ^ *((u32_a_t*)T8[u.temp[0][3]]));
}
- /* Last round is special. */
- *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[1][0]);
- *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[1][1]);
- *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[1][2]);
- *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[1][3]);
+ /* Last round is special. */
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[1][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[1][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[1][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[1][3]);
b[ 0] = S5[u.temp[0][0]];
b[ 1] = S5[u.temp[3][1]];
b[ 2] = S5[u.temp[2][2]];
@@ -650,11 +1998,12 @@ do_decrypt_aligned (RIJNDAEL_context *ctx,
b[13] = S5[u.temp[2][1]];
b[14] = S5[u.temp[1][2]];
b[15] = S5[u.temp[0][3]];
- *((u32*)(b )) ^= *((u32*)rk[0][0]);
- *((u32*)(b+ 4)) ^= *((u32*)rk[0][1]);
- *((u32*)(b+ 8)) ^= *((u32*)rk[0][2]);
- *((u32*)(b+12)) ^= *((u32*)rk[0][3]);
+ *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[0][0]);
+ *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[0][1]);
+ *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]);
+ *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]);
#undef rk
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
}
@@ -662,102 +2011,189 @@ do_decrypt_aligned (RIJNDAEL_context *ctx,
static void
do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax)
{
- /* BX and AX are not necessary correctly aligned. Thus we need to
- copy them here. */
- union
- {
- u32 dummy[4];
- byte a[16];
- } a;
- union
- {
- u32 dummy[4];
- byte b[16];
- } b;
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ /* BX and AX are not necessarily correctly aligned. Thus we might
+ need to copy them here. We try to align to 16 bytes. */
+ if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
+ {
+ union
+ {
+ u32 dummy[4];
+ byte a[16] ATTR_ALIGNED_16;
+ } a;
+ union
+ {
+ u32 dummy[4];
+ byte b[16] ATTR_ALIGNED_16;
+ } b;
- if ( !ctx->decryption_prepared )
+ buf_cpy (a.a, ax, 16);
+ do_decrypt_aligned (ctx, b.b, a.a);
+ buf_cpy (bx, b.b, 16);
+ }
+ else
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+ {
+ do_decrypt_aligned (ctx, bx, ax);
+ }
+}
+
+
+static inline void
+check_decryption_preparation (RIJNDAEL_context *ctx)
+{
+ if (0)
+ ;
+#ifdef USE_PADLOCK
+ else if (ctx->use_padlock)
+ { /* Padlock does not need decryption subkeys. */ }
+#endif /*USE_PADLOCK*/
+ else if ( !ctx->decryption_prepared )
{
prepare_decryption ( ctx );
- _gcry_burn_stack (64);
ctx->decryption_prepared = 1;
}
-
- memcpy (a.a, ax, 16);
- do_decrypt_aligned (ctx, b.b, a.a);
- memcpy (bx, b.b, 16);
-#undef rk
}
-
-
-static void
+static unsigned int
rijndael_decrypt (void *context, byte *b, const byte *a)
{
RIJNDAEL_context *ctx = context;
+ unsigned int burn_stack;
+ check_decryption_preparation (ctx);
+
+ if (0)
+ ;
#ifdef USE_PADLOCK
- if (ctx->use_padlock)
+ else if (ctx->use_padlock)
{
do_padlock (ctx, 1, b, a);
- _gcry_burn_stack (48 + 2*sizeof(int) /* FIXME */);
+ burn_stack = (48 + 2*sizeof(int) /* FIXME */);
}
- else
#endif /*USE_PADLOCK*/
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
+ {
+ aesni_prepare ();
+ do_aesni_dec (ctx, b, a);
+ aesni_cleanup ();
+ burn_stack = 0;
+ }
+#endif /*USE_AESNI*/
+ else
{
do_decrypt (ctx, b, a);
- _gcry_burn_stack (48+2*sizeof(int));
+ burn_stack = (56+2*sizeof(int));
}
+
+ return burn_stack;
}
/* Bulk decryption of complete blocks in CFB mode. Caller needs to
- make sure that IV is aligned on an unisgned lonhg boundary. This
+ make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
-_gcry_aes_cfb_dec (void *context, unsigned char *iv,
+_gcry_aes_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
- unsigned int nblocks)
+ size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char *ivp;
- unsigned char temp;
- int i;
+ unsigned int burn_depth = 48 + 2*sizeof(int);
+ if (0)
+ ;
#ifdef USE_PADLOCK
- if (ctx->use_padlock)
+ else if (ctx->use_padlock)
{
/* Fixme: Let Padlock do the CFBing. */
for ( ;nblocks; nblocks-- )
{
do_padlock (ctx, 0, iv, iv);
- for (ivp=iv,i=0; i < BLOCKSIZE; i++ )
- {
- temp = *inbuf++;
- *outbuf++ = *ivp ^ temp;
- *ivp++ = temp;
- }
+ buf_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
}
}
- else
#endif /*USE_PADLOCK*/
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
+ {
+ aesni_prepare ();
+
+ /* CFB decryption can be parallelized */
+ for ( ;nblocks >= 4; nblocks -= 4)
+ {
+ asm volatile
+ ("movdqu (%[iv]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+
+ "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */
+ "movdqu %%xmm0, (%[iv])\n\t"
+ : /* No output */
+ : [inbuf] "r" (inbuf), [iv] "r" (iv)
+ : "memory");
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+ "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+ "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+ "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_aesni_cfb (ctx, 1, iv, outbuf, inbuf);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+ aesni_cleanup ();
+ aesni_cleanup_2_6 ();
+
+ burn_depth = 0; /* No stack usage. */
+ }
+#endif /*USE_AESNI*/
+ else
{
for ( ;nblocks; nblocks-- )
{
do_encrypt_aligned (ctx, iv, iv);
- for (ivp=iv,i=0; i < BLOCKSIZE; i++ )
- {
- temp = *inbuf++;
- *outbuf++ = *ivp ^ temp;
- *ivp++ = temp;
- }
+ buf_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
}
}
- _gcry_burn_stack (48 + 2*sizeof(int));
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth);
}
@@ -766,38 +2202,133 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
function is only intended for the bulk encryption feature of
cipher.c. */
void
-_gcry_aes_cbc_dec (void *context, unsigned char *iv,
+_gcry_aes_cbc_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
- unsigned int nblocks)
+ size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char *ivp;
- int i;
- unsigned char savebuf[BLOCKSIZE];
+ unsigned int burn_depth = 48 + 2*sizeof(int) + 4*sizeof (char*);
- for ( ;nblocks; nblocks-- )
+ check_decryption_preparation (ctx);
+
+ if (0)
+ ;
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
{
- /* We need to save INBUF away because it may be identical to
- OUTBUF. */
- memcpy (savebuf, inbuf, BLOCKSIZE);
+ aesni_prepare ();
+
+ asm volatile
+ ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+ for ( ;nblocks > 3 ; nblocks -= 4 )
+ {
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+ : /* No output */
+ : [inbuf] "r" (inbuf)
+ : "memory");
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile
+ ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */
+ "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */
+ "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */
+ "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */
+ "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile
+ ("movdqu %[inbuf], %%xmm2\n\t" /* use xmm2 as savebuf */
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory");
+
+ /* uses only xmm0 and xmm1 */
+ do_aesni_dec (ctx, outbuf, inbuf);
+
+ asm volatile
+ ("movdqu %[outbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */
+ : /* No output */
+ : [outbuf] "m" (*outbuf)
+ : "memory");
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile
+ ("movdqu %%xmm5, %[iv]\n\t" /* store IV */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ aesni_cleanup ();
+ aesni_cleanup_2_6 ();
+
+ burn_depth = 0; /* No stack usage. */
+ }
+#endif /*USE_AESNI*/
+ else
+ {
+ unsigned char savebuf[BLOCKSIZE];
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+
+ if (0)
+ ;
#ifdef USE_PADLOCK
- if (ctx->use_padlock)
- do_padlock (ctx, 1, outbuf, inbuf);
- else
+ else if (ctx->use_padlock)
+ do_padlock (ctx, 1, savebuf, inbuf);
#endif /*USE_PADLOCK*/
- do_decrypt (ctx, outbuf, inbuf);
+ else
+ do_decrypt (ctx, savebuf, inbuf);
- for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
- outbuf[i] ^= *ivp++;
- memcpy (iv, savebuf, BLOCKSIZE);
- inbuf += BLOCKSIZE;
- outbuf += BLOCKSIZE;
+ buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE);
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
}
- _gcry_burn_stack (48 + 2*sizeof(int) + BLOCKSIZE + 4*sizeof (char*));
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth);
}
@@ -808,11 +2339,12 @@ static const char*
selftest_basic_128 (void)
{
RIJNDAEL_context ctx;
- unsigned char scratch[16];
+ unsigned char scratch[16];
/* The test vectors are from the AES supplied ones; more or less
randomly taken from ecb_tbl.txt (I=42,81,14) */
- static const unsigned char plaintext_128[16] =
+#if 1
+ static const unsigned char plaintext_128[16] =
{
0x01,0x4B,0xAF,0x22,0x78,0xA6,0x9D,0x33,
0x1D,0x51,0x80,0x10,0x36,0x43,0xE9,0x9A
@@ -827,7 +2359,28 @@ selftest_basic_128 (void)
0x67,0x43,0xC3,0xD1,0x51,0x9A,0xB4,0xF2,
0xCD,0x9A,0x78,0xAB,0x09,0xA5,0x11,0xBD
};
-
+#else
+ /* Test vectors from fips-197, appendix C. */
+# warning debug test vectors in use
+ static const unsigned char plaintext_128[16] =
+ {
+ 0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
+ 0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
+ };
+ static const unsigned char key_128[16] =
+ {
+ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+ 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ /* 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, */
+ /* 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c */
+ };
+ static const unsigned char ciphertext_128[16] =
+ {
+ 0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30,
+ 0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a
+ };
+#endif
+
rijndael_setkey (&ctx, key_128, sizeof (key_128));
rijndael_encrypt (&ctx, scratch, plaintext_128);
if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128)))
@@ -835,7 +2388,7 @@ selftest_basic_128 (void)
rijndael_decrypt (&ctx, scratch, scratch);
if (memcmp (scratch, plaintext_128, sizeof (plaintext_128)))
return "AES-128 test decryption failed.";
-
+
return NULL;
}
@@ -844,14 +2397,14 @@ static const char*
selftest_basic_192 (void)
{
RIJNDAEL_context ctx;
- unsigned char scratch[16];
-
- static unsigned char plaintext_192[16] =
+ unsigned char scratch[16];
+
+ static unsigned char plaintext_192[16] =
{
0x76,0x77,0x74,0x75,0xF1,0xF2,0xF3,0xF4,
0xF8,0xF9,0xE6,0xE7,0x77,0x70,0x71,0x72
};
- static unsigned char key_192[24] =
+ static unsigned char key_192[24] =
{
0x04,0x05,0x06,0x07,0x09,0x0A,0x0B,0x0C,
0x0E,0x0F,0x10,0x11,0x13,0x14,0x15,0x16,
@@ -862,7 +2415,7 @@ selftest_basic_192 (void)
0x5D,0x1E,0xF2,0x0D,0xCE,0xD6,0xBC,0xBC,
0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA
};
-
+
rijndael_setkey (&ctx, key_192, sizeof(key_192));
rijndael_encrypt (&ctx, scratch, plaintext_192);
if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192)))
@@ -870,7 +2423,7 @@ selftest_basic_192 (void)
rijndael_decrypt (&ctx, scratch, scratch);
if (memcmp (scratch, plaintext_192, sizeof (plaintext_192)))
return "AES-192 test decryption failed.";
-
+
return NULL;
}
@@ -880,21 +2433,21 @@ static const char*
selftest_basic_256 (void)
{
RIJNDAEL_context ctx;
- unsigned char scratch[16];
+ unsigned char scratch[16];
- static unsigned char plaintext_256[16] =
+ static unsigned char plaintext_256[16] =
{
0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
};
- static unsigned char key_256[32] =
+ static unsigned char key_256[32] =
{
0x08,0x09,0x0A,0x0B,0x0D,0x0E,0x0F,0x10,
0x12,0x13,0x14,0x15,0x17,0x18,0x19,0x1A,
0x1C,0x1D,0x1E,0x1F,0x21,0x22,0x23,0x24,
0x26,0x27,0x28,0x29,0x2B,0x2C,0x2D,0x2E
};
- static const unsigned char ciphertext_256[16] =
+ static const unsigned char ciphertext_256[16] =
{
0x08,0x0E,0x95,0x17,0xEB,0x16,0x77,0x71,
0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3
@@ -907,10 +2460,56 @@ selftest_basic_256 (void)
rijndael_decrypt (&ctx, scratch, scratch);
if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
return "AES-256 test decryption failed.";
-
+
return NULL;
}
+
+/* Run the self-tests for AES-CTR-128, tests IV increment of bulk CTR
+ encryption. Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+ const int nblocks = 8+1;
+ const int blocksize = BLOCKSIZE;
+ const int context_size = sizeof(RIJNDAEL_context);
+
+ return _gcry_selftest_helper_ctr("AES", &rijndael_setkey,
+ &rijndael_encrypt, &_gcry_aes_ctr_enc, nblocks, blocksize,
+ context_size);
+}
+
+
+/* Run the self-tests for AES-CBC-128, tests bulk CBC decryption.
+ Returns NULL on success. */
+static const char*
+selftest_cbc_128 (void)
+{
+ const int nblocks = 8+2;
+ const int blocksize = BLOCKSIZE;
+ const int context_size = sizeof(RIJNDAEL_context);
+
+ return _gcry_selftest_helper_cbc("AES", &rijndael_setkey,
+ &rijndael_encrypt, &_gcry_aes_cbc_dec, nblocks, blocksize,
+ context_size);
+}
+
+
+/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption.
+ Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 8+2;
+ const int blocksize = BLOCKSIZE;
+ const int context_size = sizeof(RIJNDAEL_context);
+
+ return _gcry_selftest_helper_cfb("AES", &rijndael_setkey,
+ &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize,
+ context_size);
+}
+
+
/* Run all the self-tests and return NULL on success. This function
is used for the on-the-fly self-tests. */
static const char *
@@ -923,6 +2522,15 @@ selftest (void)
|| (r = selftest_basic_256 ()) )
return r;
+ if ( (r = selftest_ctr_128 ()) )
+ return r;
+
+ if ( (r = selftest_cbc_128 ()) )
+ return r;
+
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
return r;
}
@@ -931,12 +2539,12 @@ selftest (void)
static const char *
selftest_fips_128_38a (int requested_mode)
{
- struct tv
+ static const struct tv
{
int mode;
const unsigned char key[16];
const unsigned char iv[16];
- struct
+ struct
{
const unsigned char input[16];
const unsigned char output[16];
@@ -947,24 +2555,24 @@ selftest_fips_128_38a (int requested_mode)
GCRY_CIPHER_MODE_CFB, /* F.3.13, CFB128-AES128 */
{ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c },
- { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
{
{ { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96,
0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a },
{ 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20,
0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } },
-
+
{ { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c,
0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 },
{ 0xc8, 0xa6, 0x45, 0x37, 0xa0, 0xb3, 0xa9, 0x3f,
0xcd, 0xe3, 0xcd, 0xad, 0x9f, 0x1c, 0xe5, 0x8b } },
-
- { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11,
+
+ { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11,
0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef },
{ 0x26, 0x75, 0x1f, 0x67, 0xa3, 0xcb, 0xb1, 0x40,
0xb1, 0x80, 0x8c, 0xf1, 0x87, 0xa4, 0xf4, 0xdf } },
-
+
{ { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17,
0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 },
{ 0xc0, 0x4b, 0x05, 0x35, 0x7c, 0x5d, 0x1c, 0x0e,
@@ -975,7 +2583,7 @@ selftest_fips_128_38a (int requested_mode)
GCRY_CIPHER_MODE_OFB,
{ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c },
- { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
{
{ { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96,
@@ -987,7 +2595,7 @@ selftest_fips_128_38a (int requested_mode)
0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 },
{ 0x77, 0x89, 0x50, 0x8d, 0x16, 0x91, 0x8f, 0x03,
0xf5, 0x3c, 0x52, 0xda, 0xc5, 0x4e, 0xd8, 0x25 } },
-
+
{ { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11,
0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef },
{ 0x97, 0x40, 0x05, 0x1e, 0x9c, 0x5f, 0xec, 0xf6,
@@ -1057,7 +2665,7 @@ selftest_fips_128_38a (int requested_mode)
#undef Fail
_gcry_cipher_close (hdenc);
- _gcry_cipher_close (hddec);
+ _gcry_cipher_close (hddec);
return NULL;
}
@@ -1068,7 +2676,7 @@ selftest_fips_128 (int extended, selftest_report_func_t report)
{
const char *what;
const char *errtxt;
-
+
what = "low-level";
errtxt = selftest_basic_128 ();
if (errtxt)
@@ -1080,7 +2688,7 @@ selftest_fips_128 (int extended, selftest_report_func_t report)
errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_CFB);
if (errtxt)
goto failed;
-
+
what = "ofb";
errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_OFB);
if (errtxt)
@@ -1125,7 +2733,7 @@ selftest_fips_256 (int extended, selftest_report_func_t report)
{
const char *what;
const char *errtxt;
-
+
(void)extended; /* No extended tests available. */
what = "low-level";
@@ -1163,7 +2771,7 @@ run_selftests (int algo, int extended, selftest_report_func_t report)
default:
ec = GPG_ERR_CIPHER_ALGO;
break;
-
+
}
return ec;
}
@@ -1190,14 +2798,15 @@ static gcry_cipher_oid_spec_t rijndael_oids[] =
gcry_cipher_spec_t _gcry_cipher_spec_aes =
{
- "AES", rijndael_names, rijndael_oids, 16, 128, sizeof (RIJNDAEL_context),
- rijndael_setkey, rijndael_encrypt, rijndael_decrypt
- };
-cipher_extra_spec_t _gcry_cipher_extraspec_aes =
- {
+ GCRY_CIPHER_AES, {0, 1},
+ "AES", rijndael_names, rijndael_oids, 16, 128,
+ sizeof (RIJNDAEL_context),
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
+ NULL, NULL,
run_selftests
};
+
static const char *rijndael192_names[] =
{
"RIJNDAEL192",
@@ -1216,14 +2825,15 @@ static gcry_cipher_oid_spec_t rijndael192_oids[] =
gcry_cipher_spec_t _gcry_cipher_spec_aes192 =
{
- "AES192", rijndael192_names, rijndael192_oids, 16, 192, sizeof (RIJNDAEL_context),
- rijndael_setkey, rijndael_encrypt, rijndael_decrypt
- };
-cipher_extra_spec_t _gcry_cipher_extraspec_aes192 =
- {
+ GCRY_CIPHER_AES192, {0, 1},
+ "AES192", rijndael192_names, rijndael192_oids, 16, 192,
+ sizeof (RIJNDAEL_context),
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
+ NULL, NULL,
run_selftests
};
+
static const char *rijndael256_names[] =
{
"RIJNDAEL256",
@@ -1242,12 +2852,10 @@ static gcry_cipher_oid_spec_t rijndael256_oids[] =
gcry_cipher_spec_t _gcry_cipher_spec_aes256 =
{
+ GCRY_CIPHER_AES256, {0, 1},
"AES256", rijndael256_names, rijndael256_oids, 16, 256,
sizeof (RIJNDAEL_context),
- rijndael_setkey, rijndael_encrypt, rijndael_decrypt
- };
-
-cipher_extra_spec_t _gcry_cipher_extraspec_aes256 =
- {
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
+ NULL, NULL,
run_selftests
};