diff options
Diffstat (limited to 'plugins/MirOTR/Libgcrypt/cipher/rijndael.c')
-rw-r--r-- | plugins/MirOTR/Libgcrypt/cipher/rijndael.c | 2352 |
1 files changed, 1980 insertions, 372 deletions
diff --git a/plugins/MirOTR/Libgcrypt/cipher/rijndael.c b/plugins/MirOTR/Libgcrypt/cipher/rijndael.c index d43b349b41..8019f0aad8 100644 --- a/plugins/MirOTR/Libgcrypt/cipher/rijndael.c +++ b/plugins/MirOTR/Libgcrypt/cipher/rijndael.c @@ -1,6 +1,6 @@ /* Rijndael (AES) for GnuPG * Copyright (C) 2000, 2001, 2002, 2003, 2007, - * 2008 Free Software Foundation, Inc. + * 2008, 2011, 2012 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * @@ -45,77 +45,434 @@ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" #define MAXKC (256/32) #define MAXROUNDS 14 #define BLOCKSIZE (128/8) +/* Helper macro to force alignment to 16 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) +#else +# define ATTR_ALIGNED_16 +#endif + + +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define USE_AMD64_ASM 1 +#endif + +/* USE_ARM_ASM indicates whether to use ARM assembly code. */ +#undef USE_ARM_ASM +#if defined(__ARMEL__) +# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS +# define USE_ARM_ASM 1 +# endif +#endif + /* USE_PADLOCK indicates whether to compile the padlock specific code. */ #undef USE_PADLOCK #ifdef ENABLE_PADLOCK_SUPPORT -# if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__) -# define USE_PADLOCK +# ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# if (defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__) +# define USE_PADLOCK 1 +# endif # endif #endif /*ENABLE_PADLOCK_SUPPORT*/ -static const char *selftest(void); +/* USE_AESNI inidicates whether to compile with Intel AES-NI code. We + need the vector-size attribute which seems to be available since + gcc 3. However, to be on the safe side we require at least gcc 4. */ +#undef USE_AESNI +#ifdef ENABLE_AESNI_SUPPORT +# if ((defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__)) +# if __GNUC__ >= 4 +# define USE_AESNI 1 +# endif +# endif +#endif /* ENABLE_AESNI_SUPPORT */ -typedef struct -{ - int ROUNDS; /* Key-length-dependent number of rounds. */ - int decryption_prepared; /* The decryption key schedule is available. */ -#ifdef USE_PADLOCK - int use_padlock; /* Padlock shall be used. */ - /* The key as passed to the padlock engine. */ - unsigned char padlock_key[16] __attribute__ ((aligned (16))); +#ifdef USE_AESNI + typedef struct u128_s { u32 a, b, c, d; } u128_t; +#endif /*USE_AESNI*/ + +/* Define an u32 variant for the sake of gcc 4.4's strict aliasing. */ +#if __GNUC__ > 4 || ( __GNUC__ == 4 && __GNUC_MINOR__ >= 4 ) +typedef u32 __attribute__ ((__may_alias__)) u32_a_t; +#else +typedef u32 u32_a_t; #endif + + +#ifdef USE_AMD64_ASM +/* AMD64 assembly implementations of AES */ +extern void _gcry_aes_amd64_encrypt_block(const void *keysched_enc, + unsigned char *out, + const unsigned char *in, + int rounds); + +extern void _gcry_aes_amd64_decrypt_block(const void *keysched_dec, + unsigned char *out, + const unsigned char *in, + int rounds); +#endif /*USE_AMD64_ASM*/ + +#ifdef USE_ARM_ASM +/* ARM assembly implementations of AES */ +extern void _gcry_aes_arm_encrypt_block(const void *keysched_enc, + unsigned char *out, + const unsigned char *in, + int rounds); + +extern void _gcry_aes_arm_decrypt_block(const void *keysched_dec, + unsigned char *out, + const unsigned char *in, + int rounds); +#endif /*USE_ARM_ASM*/ + + + +/* Our context object. */ +typedef struct +{ + /* The first fields are the keyschedule arrays. This is so that + they are aligned on a 16 byte boundary if using gcc. This + alignment is required for the AES-NI code and a good idea in any + case. The alignment is guaranteed due to the way cipher.c + allocates the space for the context. The PROPERLY_ALIGNED_TYPE + hack is used to force a minimal alignment if not using gcc of if + the alignment requirement is higher that 16 bytes. */ union { PROPERLY_ALIGNED_TYPE dummy; byte keyschedule[MAXROUNDS+1][4][4]; +#ifdef USE_PADLOCK + /* The key as passed to the padlock engine. It is only used if + the padlock engine is used (USE_PADLOCK, below). */ + unsigned char padlock_key[16] __attribute__ ((aligned (16))); +#endif /*USE_PADLOCK*/ } u1; union { PROPERLY_ALIGNED_TYPE dummy; - byte keyschedule[MAXROUNDS+1][4][4]; + byte keyschedule[MAXROUNDS+1][4][4]; } u2; -} RIJNDAEL_context; + int rounds; /* Key-length-dependent number of rounds. */ + unsigned int decryption_prepared:1; /* The decryption key schedule is available. */ +#ifdef USE_PADLOCK + unsigned int use_padlock:1; /* Padlock shall be used. */ +#endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + unsigned int use_aesni:1; /* AES-NI shall be used. */ +#endif /*USE_AESNI*/ +} RIJNDAEL_context ATTR_ALIGNED_16; + +/* Macros defining alias for the keyschedules. */ +#define keyschenc u1.keyschedule +#define keyschdec u2.keyschedule +#define padlockkey u1.padlock_key + +/* Two macros to be called prior and after the use of AESNI + instructions. There should be no external function calls between + the use of these macros. There purpose is to make sure that the + SSE regsiters are cleared and won't reveal any information about + the key or the data. */ +#ifdef USE_AESNI +# define aesni_prepare() do { } while (0) +# define aesni_cleanup() \ + do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ + "pxor %%xmm1, %%xmm1\n" :: ); \ + } while (0) +# define aesni_cleanup_2_6() \ + do { asm volatile ("pxor %%xmm2, %%xmm2\n\t" \ + "pxor %%xmm3, %%xmm3\n" \ + "pxor %%xmm4, %%xmm4\n" \ + "pxor %%xmm5, %%xmm5\n" \ + "pxor %%xmm6, %%xmm6\n":: ); \ + } while (0) +#else +# define aesni_prepare() do { } while (0) +# define aesni_cleanup() do { } while (0) +#endif -#define keySched u1.keyschedule -#define keySched2 u2.keyschedule /* All the numbers. */ #include "rijndael-tables.h" -/* Perform the key setup. */ + +/* Function prototypes. */ +#if defined(__i386__) && defined(USE_AESNI) +/* We don't want to inline these functions on i386 to help gcc allocate enough + registers. */ +static void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr, + unsigned char *b, const unsigned char *a) + __attribute__ ((__noinline__)); +static void do_aesni_ctr_4 (const RIJNDAEL_context *ctx, unsigned char *ctr, + unsigned char *b, const unsigned char *a) + __attribute__ ((__noinline__)); +#endif /*USE_AESNI*/ + +static const char *selftest(void); + + + +#ifdef USE_AESNI +static void +aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + aesni_prepare(); + + if (ctx->rounds < 12) + { + /* 128-bit key */ +#define AESKEYGENASSIST_xmm1_xmm2(imm8) \ + ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" +#define AESKEY_EXPAND128 \ + "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ + "movdqa %%xmm1, %%xmm3\n\t" \ + "pslldq $4, %%xmm3\n\t" \ + "pxor %%xmm3, %%xmm1\n\t" \ + "pslldq $4, %%xmm3\n\t" \ + "pxor %%xmm3, %%xmm1\n\t" \ + "pslldq $4, %%xmm3\n\t" \ + "pxor %%xmm3, %%xmm2\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" + + asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key */ + "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x01) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x02) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x04) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x08) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x10) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x20) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x40) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x80) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x1b) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x36) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ + : + : [key] "r" (key), [ksch] "r" (ctx->keyschenc) + : "cc", "memory" ); +#undef AESKEYGENASSIST_xmm1_xmm2 +#undef AESKEY_EXPAND128 + } + else if (ctx->rounds == 12) + { + /* 192-bit key */ +#define AESKEYGENASSIST_xmm3_xmm2(imm8) \ + ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" +#define AESKEY_EXPAND192 \ + "pshufd $0x55, %%xmm2, %%xmm2\n\t" \ + "movdqu %%xmm1, %%xmm4\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" \ + "pshufd $0xff, %%xmm1, %%xmm2\n\t" \ + "movdqu %%xmm3, %%xmm4\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm3\n\t" \ + "pxor %%xmm2, %%xmm3\n\t" + + asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ + "movq 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..23] */ + "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ + "movdqa %%xmm3, %%xmm5\n\t" + + AESKEYGENASSIST_xmm3_xmm2(0x01) + AESKEY_EXPAND192 + "shufpd $0, %%xmm1, %%xmm5\n\t" + "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5 */ + "movdqa %%xmm1, %%xmm6\n\t" + "shufpd $1, %%xmm3, %%xmm6\n\t" + "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6 */ + AESKEYGENASSIST_xmm3_xmm2(0x02) + AESKEY_EXPAND192 + "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ + "movdqa %%xmm3, %%xmm5\n\t" + + AESKEYGENASSIST_xmm3_xmm2(0x04) + AESKEY_EXPAND192 + "shufpd $0, %%xmm1, %%xmm5\n\t" + "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5 */ + "movdqa %%xmm1, %%xmm6\n\t" + "shufpd $1, %%xmm3, %%xmm6\n\t" + "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6 */ + AESKEYGENASSIST_xmm3_xmm2(0x08) + AESKEY_EXPAND192 + "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ + "movdqa %%xmm3, %%xmm5\n\t" + + AESKEYGENASSIST_xmm3_xmm2(0x10) + AESKEY_EXPAND192 + "shufpd $0, %%xmm1, %%xmm5\n\t" + "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5 */ + "movdqa %%xmm1, %%xmm6\n\t" + "shufpd $1, %%xmm3, %%xmm6\n\t" + "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6 */ + AESKEYGENASSIST_xmm3_xmm2(0x20) + AESKEY_EXPAND192 + "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ + "movdqa %%xmm3, %%xmm5\n\t" + + AESKEYGENASSIST_xmm3_xmm2(0x40) + AESKEY_EXPAND192 + "shufpd $0, %%xmm1, %%xmm5\n\t" + "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5 */ + "movdqa %%xmm1, %%xmm6\n\t" + "shufpd $1, %%xmm3, %%xmm6\n\t" + "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6 */ + AESKEYGENASSIST_xmm3_xmm2(0x80) + AESKEY_EXPAND192 + "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ + : + : [key] "r" (key), [ksch] "r" (ctx->keyschenc) + : "cc", "memory" ); +#undef AESKEYGENASSIST_xmm3_xmm2 +#undef AESKEY_EXPAND192 + } + else if (ctx->rounds > 12) + { + /* 256-bit key */ +#define AESKEYGENASSIST_xmm1_xmm2(imm8) \ + ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" +#define AESKEYGENASSIST_xmm3_xmm2(imm8) \ + ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" +#define AESKEY_EXPAND256_A \ + "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ + "movdqa %%xmm1, %%xmm4\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" +#define AESKEY_EXPAND256_B \ + "pshufd $0xaa, %%xmm2, %%xmm2\n\t" \ + "movdqa %%xmm3, %%xmm4\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm3\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm3\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm3\n\t" \ + "pxor %%xmm2, %%xmm3\n\t" + + asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ + "movdqu 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..31] */ + "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ + "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x01) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x02) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x04) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x08) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x10) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x20) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x40) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1 */ + + : + : [key] "r" (key), [ksch] "r" (ctx->keyschenc) + : "cc", "memory" ); +#undef AESKEYGENASSIST_xmm1_xmm2 +#undef AESKEYGENASSIST_xmm3_xmm2 +#undef AESKEY_EXPAND256_A +#undef AESKEY_EXPAND256_B + } + + aesni_cleanup(); + aesni_cleanup_2_6(); +} +#endif /*USE_AESNI*/ + + + +/* Perform the key setup. */ static gcry_err_code_t do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) { static int initialized = 0; static const char *selftest_failed=0; - int ROUNDS; + int rounds; int i,j, r, t, rconpointer = 0; int KC; - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte k[MAXKC][4]; - } k; -#define k k.k - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte tk[MAXKC][4]; - } tk; -#define tk tk.tk +#if defined(USE_AESNI) || defined(USE_PADLOCK) + unsigned int hwfeatures; +#endif /* The on-the-fly self tests are only run in non-fips mode. In fips mode explicit self-tests are required. Actually the on-the-fly self-tests are not fully thread-safe and it might happen that a - failed self-test won't get noticed in another thread. + failed self-test won't get noticed in another thread. FIXME: We might want to have a central registry of succeeded self-tests. */ @@ -129,65 +486,115 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; +#if defined(USE_AESNI) || defined(USE_PADLOCK) + hwfeatures = _gcry_get_hw_features (); +#endif + ctx->decryption_prepared = 0; #ifdef USE_PADLOCK ctx->use_padlock = 0; #endif +#ifdef USE_AESNI + ctx->use_aesni = 0; +#endif if( keylen == 128/8 ) { - ROUNDS = 10; + rounds = 10; KC = 4; + + if (0) + { + ; + } #ifdef USE_PADLOCK - if ((_gcry_get_hw_features () & HWF_PADLOCK_AES)) + else if (hwfeatures & HWF_PADLOCK_AES) { ctx->use_padlock = 1; - memcpy (ctx->padlock_key, key, keylen); + memcpy (ctx->padlockkey, key, keylen); + } +#endif +#ifdef USE_AESNI + else if (hwfeatures & HWF_INTEL_AESNI) + { + ctx->use_aesni = 1; } #endif } else if ( keylen == 192/8 ) { - ROUNDS = 12; + rounds = 12; KC = 6; + + if (0) + { + ; + } +#ifdef USE_AESNI + else if (hwfeatures & HWF_INTEL_AESNI) + { + ctx->use_aesni = 1; + } +#endif } else if ( keylen == 256/8 ) { - ROUNDS = 14; + rounds = 14; KC = 8; + + if (0) + { + ; + } +#ifdef USE_AESNI + else if (hwfeatures & HWF_INTEL_AESNI) + { + ctx->use_aesni = 1; + } +#endif } else return GPG_ERR_INV_KEYLEN; - ctx->ROUNDS = ROUNDS; + ctx->rounds = rounds; -#ifdef USE_PADLOCK - if (ctx->use_padlock) + /* NB: We don't yet support Padlock hardware key generation. */ + + if (0) { - /* Nothing to do as we support only hardware key generation for - now. */ + ; } +#ifdef USE_AESNI + else if (ctx->use_aesni) + aesni_do_setkey(ctx, key); +#endif else -#endif /*USE_PADLOCK*/ { -#define W (ctx->keySched) - for (i = 0; i < keylen; i++) + union { - k[i >> 2][i & 3] = key[i]; + PROPERLY_ALIGNED_TYPE dummy; + byte data[MAXKC][4]; + } k, tk; +#define k k.data +#define tk tk.data +#define W (ctx->keyschenc) + for (i = 0; i < keylen; i++) + { + k[i >> 2][i & 3] = key[i]; } - - for (j = KC-1; j >= 0; j--) + + for (j = KC-1; j >= 0; j--) { - *((u32*)tk[j]) = *((u32*)k[j]); + *((u32_a_t*)tk[j]) = *((u32_a_t*)k[j]); } r = 0; t = 0; /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < ROUNDS + 1); ) + for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { - *((u32*)W[r][t]) = *((u32*)tk[j]); + *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]); } if (t == 4) { @@ -195,8 +602,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) t = 0; } } - - while (r < ROUNDS + 1) + + while (r < rounds + 1) { /* While not enough round key material calculated calculate new values. */ @@ -205,19 +612,19 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) tk[0][2] ^= S[tk[KC-1][3]]; tk[0][3] ^= S[tk[KC-1][0]]; tk[0][0] ^= rcon[rconpointer++]; - + if (KC != 8) { - for (j = 1; j < KC; j++) + for (j = 1; j < KC; j++) { - *((u32*)tk[j]) ^= *((u32*)tk[j-1]); + *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]); } - } - else + } + else { for (j = 1; j < KC/2; j++) { - *((u32*)tk[j]) ^= *((u32*)tk[j-1]); + *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]); } tk[KC/2][0] ^= S[tk[KC/2 - 1][0]]; tk[KC/2][1] ^= S[tk[KC/2 - 1][1]]; @@ -225,16 +632,16 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) tk[KC/2][3] ^= S[tk[KC/2 - 1][3]]; for (j = KC/2 + 1; j < KC; j++) { - *((u32*)tk[j]) ^= *((u32*)tk[j-1]); + *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]); } } - + /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < ROUNDS + 1); ) + for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { - *((u32*)W[r][t]) = *((u32*)tk[j]); + *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]); } if (t == 4) { @@ -242,13 +649,15 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) t = 0; } } - } -#undef W + } +#undef W +#undef tk +#undef k + wipememory(&tk, sizeof(tk)); + wipememory(&t, sizeof(t)); } return 0; -#undef tk -#undef k } @@ -256,10 +665,7 @@ static gcry_err_code_t rijndael_setkey (void *context, const byte *key, const unsigned keylen) { RIJNDAEL_context *ctx = context; - - int rc = do_setkey (ctx, key, keylen); - _gcry_burn_stack ( 100 + 16*sizeof(int)); - return rc; + return do_setkey (ctx, key, keylen); } @@ -268,53 +674,113 @@ static void prepare_decryption( RIJNDAEL_context *ctx ) { int r; - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte *w; - } w; -#define w w.w - for (r=0; r < MAXROUNDS+1; r++ ) +#ifdef USE_AESNI + if (ctx->use_aesni) { - *((u32*)ctx->keySched2[r][0]) = *((u32*)ctx->keySched[r][0]); - *((u32*)ctx->keySched2[r][1]) = *((u32*)ctx->keySched[r][1]); - *((u32*)ctx->keySched2[r][2]) = *((u32*)ctx->keySched[r][2]); - *((u32*)ctx->keySched2[r][3]) = *((u32*)ctx->keySched[r][3]); + /* The AES-NI decrypt instructions use the Equivalent Inverse + Cipher, thus we can't use the the standard decrypt key + preparation. */ + u128_t *ekey = (u128_t *)ctx->keyschenc; + u128_t *dkey = (u128_t *)ctx->keyschdec; + int rr; + + aesni_prepare(); + +#define DO_AESNI_AESIMC() \ + asm volatile ("movdqa %[ekey], %%xmm1\n\t" \ + /*"aesimc %%xmm1, %%xmm1\n\t"*/ \ + ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \ + "movdqa %%xmm1, %[dkey]" \ + : [dkey] "=m" (dkey[r]) \ + : [ekey] "m" (ekey[rr]) \ + : "memory") + + dkey[0] = ekey[ctx->rounds]; + r=1; + rr=ctx->rounds-1; + DO_AESNI_AESIMC(); r++; rr--; /* round 1 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 2 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 3 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 4 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 5 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 6 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 7 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 8 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 9 */ + if (ctx->rounds > 10) + { + DO_AESNI_AESIMC(); r++; rr--; /* round 10 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 11 */ + if (ctx->rounds > 12) + { + DO_AESNI_AESIMC(); r++; rr--; /* round 12 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 13 */ + } + } + + dkey[r] = ekey[0]; + +#undef DO_AESNI_AESIMC + + aesni_cleanup(); } -#define W (ctx->keySched2) - for (r = 1; r < ctx->ROUNDS; r++) + else +#endif /*USE_AESNI*/ { - w = W[r][0]; - *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]]) - ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]); - - w = W[r][1]; - *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]]) - ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]); - - w = W[r][2]; - *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]]) - ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]); - - w = W[r][3]; - *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]]) - ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]); - } + union + { + PROPERLY_ALIGNED_TYPE dummy; + byte *w; + } w; +#define w w.w + + for (r=0; r < MAXROUNDS+1; r++ ) + { + *((u32_a_t*)ctx->keyschdec[r][0]) = *((u32_a_t*)ctx->keyschenc[r][0]); + *((u32_a_t*)ctx->keyschdec[r][1]) = *((u32_a_t*)ctx->keyschenc[r][1]); + *((u32_a_t*)ctx->keyschdec[r][2]) = *((u32_a_t*)ctx->keyschenc[r][2]); + *((u32_a_t*)ctx->keyschdec[r][3]) = *((u32_a_t*)ctx->keyschenc[r][3]); + } +#define W (ctx->keyschdec) + for (r = 1; r < ctx->rounds; r++) + { + w = W[r][0]; + *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]]) + ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]); + + w = W[r][1]; + *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]]) + ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]); + + w = W[r][2]; + *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]]) + ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]); + + w = W[r][3]; + *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]]) + ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]); + } #undef W #undef w -} - + wipememory(&w, sizeof(w)); + } +} /* Encrypt one block. A and B need to be aligned on a 4 byte boundary. A and B may be the same. */ static void -do_encrypt_aligned (const RIJNDAEL_context *ctx, +do_encrypt_aligned (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { -#define rk (ctx->keySched) - int ROUNDS = ctx->ROUNDS; +#ifdef USE_AMD64_ASM + _gcry_aes_amd64_encrypt_block(ctx->keyschenc, b, a, ctx->rounds); +#elif defined(USE_ARM_ASM) + _gcry_aes_arm_encrypt_block(ctx->keyschenc, b, a, ctx->rounds); +#else +#define rk (ctx->keyschenc) + int rounds = ctx->rounds; int r; union { @@ -322,57 +788,57 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx, byte temp[4][4]; } u; - *((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]); - *((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]); - *((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]); - *((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]); - *((u32*)(b )) = (*((u32*)T1[u.temp[0][0]]) - ^ *((u32*)T2[u.temp[1][1]]) - ^ *((u32*)T3[u.temp[2][2]]) - ^ *((u32*)T4[u.temp[3][3]])); - *((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]]) - ^ *((u32*)T2[u.temp[2][1]]) - ^ *((u32*)T3[u.temp[3][2]]) - ^ *((u32*)T4[u.temp[0][3]])); - *((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]]) - ^ *((u32*)T2[u.temp[3][1]]) - ^ *((u32*)T3[u.temp[0][2]]) - ^ *((u32*)T4[u.temp[1][3]])); - *((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]]) - ^ *((u32*)T2[u.temp[0][1]]) - ^ *((u32*)T3[u.temp[1][2]]) - ^ *((u32*)T4[u.temp[2][3]])); - - for (r = 1; r < ROUNDS-1; r++) + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[0][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[0][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[0][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[0][3]); + *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]]) + ^ *((u32_a_t*)T2[u.temp[1][1]]) + ^ *((u32_a_t*)T3[u.temp[2][2]]) + ^ *((u32_a_t*)T4[u.temp[3][3]])); + *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]]) + ^ *((u32_a_t*)T2[u.temp[2][1]]) + ^ *((u32_a_t*)T3[u.temp[3][2]]) + ^ *((u32_a_t*)T4[u.temp[0][3]])); + *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]]) + ^ *((u32_a_t*)T2[u.temp[3][1]]) + ^ *((u32_a_t*)T3[u.temp[0][2]]) + ^ *((u32_a_t*)T4[u.temp[1][3]])); + *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]]) + ^ *((u32_a_t*)T2[u.temp[0][1]]) + ^ *((u32_a_t*)T3[u.temp[1][2]]) + ^ *((u32_a_t*)T4[u.temp[2][3]])); + + for (r = 1; r < rounds-1; r++) { - *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]); - *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]); - *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]); - *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]); - - *((u32*)(b )) = (*((u32*)T1[u.temp[0][0]]) - ^ *((u32*)T2[u.temp[1][1]]) - ^ *((u32*)T3[u.temp[2][2]]) - ^ *((u32*)T4[u.temp[3][3]])); - *((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]]) - ^ *((u32*)T2[u.temp[2][1]]) - ^ *((u32*)T3[u.temp[3][2]]) - ^ *((u32*)T4[u.temp[0][3]])); - *((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]]) - ^ *((u32*)T2[u.temp[3][1]]) - ^ *((u32*)T3[u.temp[0][2]]) - ^ *((u32*)T4[u.temp[1][3]])); - *((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]]) - ^ *((u32*)T2[u.temp[0][1]]) - ^ *((u32*)T3[u.temp[1][2]]) - ^ *((u32*)T4[u.temp[2][3]])); + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]); + + *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]]) + ^ *((u32_a_t*)T2[u.temp[1][1]]) + ^ *((u32_a_t*)T3[u.temp[2][2]]) + ^ *((u32_a_t*)T4[u.temp[3][3]])); + *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]]) + ^ *((u32_a_t*)T2[u.temp[2][1]]) + ^ *((u32_a_t*)T3[u.temp[3][2]]) + ^ *((u32_a_t*)T4[u.temp[0][3]])); + *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]]) + ^ *((u32_a_t*)T2[u.temp[3][1]]) + ^ *((u32_a_t*)T3[u.temp[0][2]]) + ^ *((u32_a_t*)T4[u.temp[1][3]])); + *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]]) + ^ *((u32_a_t*)T2[u.temp[0][1]]) + ^ *((u32_a_t*)T3[u.temp[1][2]]) + ^ *((u32_a_t*)T4[u.temp[2][3]])); } - /* Last round is special. */ - *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]); - *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]); - *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]); - *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]); + /* Last round is special. */ + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[rounds-1][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[rounds-1][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[rounds-1][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[rounds-1][3]); b[ 0] = T1[u.temp[0][0]][1]; b[ 1] = T1[u.temp[1][1]][1]; b[ 2] = T1[u.temp[2][2]][1]; @@ -389,11 +855,12 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx, b[13] = T1[u.temp[0][1]][1]; b[14] = T1[u.temp[1][2]][1]; b[15] = T1[u.temp[2][3]][1]; - *((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]); - *((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]); - *((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]); - *((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]); + *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[rounds][0]); + *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[rounds][1]); + *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]); + *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]); #undef rk +#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/ } @@ -401,22 +868,31 @@ static void do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { - /* BX and AX are not necessary correctly aligned. Thus we need to - copy them here. */ - union - { - u32 dummy[4]; - byte a[16]; - } a; - union - { - u32 dummy[4]; - byte b[16]; - } b; +#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM) + /* BX and AX are not necessary correctly aligned. Thus we might + need to copy them here. We try to align to a 16 bytes. */ + if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f)) + { + union + { + u32 dummy[4]; + byte a[16] ATTR_ALIGNED_16; + } a; + union + { + u32 dummy[4]; + byte b[16] ATTR_ALIGNED_16; + } b; - memcpy (a.a, ax, 16); - do_encrypt_aligned (ctx, b.b, a.a); - memcpy (bx, b.b, 16); + buf_cpy (a.a, ax, 16); + do_encrypt_aligned (ctx, b.b, a.a); + buf_cpy (bx, b.b, 16); + } + else +#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/ + { + do_encrypt_aligned (ctx, bx, ax); + } } @@ -432,11 +908,12 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag, unsigned char a[16] __attribute__ ((aligned (16))); unsigned char b[16] __attribute__ ((aligned (16))); unsigned int cword[4] __attribute__ ((aligned (16))); + int blocks; /* The control word fields are: 127:12 11:10 9 8 7 6 5 4 3:0 RESERVED KSIZE CRYPT INTER KEYGN CIPHR ALIGN DGEST ROUND */ - cword[0] = (ctx->ROUNDS & 15); /* (The mask is just a safeguard.) */ + cword[0] = (ctx->rounds & 15); /* (The mask is just a safeguard.) */ cword[1] = 0; cword[2] = 0; cword[3] = 0; @@ -444,18 +921,29 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag, cword[0] |= 0x00000200; memcpy (a, ax, 16); - - asm volatile - ("pushfl\n\t" /* Force key reload. */ + + blocks = 1; /* Init counter for just one block. */ +#ifdef __x86_64__ + asm volatile + ("pushfq\n\t" /* Force key reload. */ + "popfq\n\t" + ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */ + : /* No output */ + : "S" (a), "D" (b), "d" (cword), "b" (ctx->padlockkey), "c" (blocks) + : "cc", "memory" + ); +#else + asm volatile + ("pushfl\n\t" /* Force key reload. */ "popfl\n\t" "xchg %3, %%ebx\n\t" /* Load key. */ - "movl $1, %%ecx\n\t" /* Init counter for just one block. */ - ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XSTORE ECB. */ + ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */ "xchg %3, %%ebx\n" /* Restore GOT register. */ : /* No output */ - : "S" (a), "D" (b), "d" (cword), "r" (ctx->padlock_key) - : "%ecx", "cc", "memory" + : "S" (a), "D" (b), "d" (cword), "r" (ctx->padlockkey), "c" (blocks) + : "cc", "memory" ); +#endif memcpy (bx, b, 16); @@ -463,23 +951,721 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag, #endif /*USE_PADLOCK*/ +#ifdef USE_AESNI +/* Encrypt one block using the Intel AES-NI instructions. A and B may + be the same. + + Our problem here is that gcc does not allow the "x" constraint for + SSE registers in asm unless you compile with -msse. The common + wisdom is to use a separate file for SSE instructions and build it + separately. This would require a lot of extra build system stuff, + similar to what we do in mpi/ for the asm stuff. What we do + instead is to use standard registers and a bit more of plain asm + which copies the data and key stuff to the SSE registers and later + back. If we decide to implement some block modes with parallelized + AES instructions, it might indeed be better to use plain asm ala + mpi/. */ +static inline void +do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b, + const unsigned char *a) +{ +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" + /* Note: For now we relax the alignment requirement for A and B: It + does not make much difference because in many case we would need + to memcpy them to an extra buffer; using the movdqu is much faster + that memcpy and movdqa. For CFB we know that the IV is properly + aligned but that is a special case. We should better implement + CFB direct in asm. */ + asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */ + "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + aesenclast_xmm1_xmm0 + "movdqu %%xmm0, %[dst]\n" + : [dst] "=m" (*b) + : [src] "m" (*a), + [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm1_xmm0 +} + + +static inline void +do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b, + const unsigned char *a) +{ +#define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t" +#define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t" + asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */ + "movdqa (%[key]), %%xmm1\n\t" + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Ldeclast%=:\n\t" + aesdeclast_xmm1_xmm0 + "movdqu %%xmm0, %[dst]\n" + : [dst] "=m" (*b) + : [src] "m" (*a), + [key] "r" (ctx->keyschdec), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesdec_xmm1_xmm0 +#undef aesdeclast_xmm1_xmm0 +} + + +/* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input + * and output through SSE registers xmm1 to xmm4. */ +static void +do_aesni_enc_vec4 (const RIJNDAEL_context *ctx) +{ +#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t" +#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t" +#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t" +#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t" +#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t" +#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t" +#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t" +#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t" + asm volatile ("movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x20(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x30(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x40(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x50(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x60(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x70(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x80(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x90(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xb0(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xd0(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + aesenclast_xmm0_xmm1 + aesenclast_xmm0_xmm2 + aesenclast_xmm0_xmm3 + aesenclast_xmm0_xmm4 + : /* no output */ + : [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesenc_xmm0_xmm1 +#undef aesenc_xmm0_xmm2 +#undef aesenc_xmm0_xmm3 +#undef aesenc_xmm0_xmm4 +#undef aesenclast_xmm0_xmm1 +#undef aesenclast_xmm0_xmm2 +#undef aesenclast_xmm0_xmm3 +#undef aesenclast_xmm0_xmm4 +} + + +/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input + * and output through SSE registers xmm1 to xmm4. */ +static void +do_aesni_dec_vec4 (const RIJNDAEL_context *ctx) +{ +#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t" +#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t" +#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t" +#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t" +#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t" +#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t" +#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t" +#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t" + asm volatile ("movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x20(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x30(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x40(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x50(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x60(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x70(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x80(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x90(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xb0(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xd0(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + aesdeclast_xmm0_xmm1 + aesdeclast_xmm0_xmm2 + aesdeclast_xmm0_xmm3 + aesdeclast_xmm0_xmm4 + : /* no output */ + : [key] "r" (ctx->keyschdec), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesdec_xmm0_xmm1 +#undef aesdec_xmm0_xmm2 +#undef aesdec_xmm0_xmm3 +#undef aesdec_xmm0_xmm4 +#undef aesdeclast_xmm0_xmm1 +#undef aesdeclast_xmm0_xmm2 +#undef aesdeclast_xmm0_xmm3 +#undef aesdeclast_xmm0_xmm4 +} + + +/* Perform a CFB encryption or decryption round using the + initialization vector IV and the input block A. Write the result + to the output block B and update IV. IV needs to be 16 byte + aligned. */ +static void +do_aesni_cfb (const RIJNDAEL_context *ctx, int decrypt_flag, + unsigned char *iv, unsigned char *b, const unsigned char *a) +{ +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" + asm volatile ("movdqa %[iv], %%xmm0\n\t" /* xmm0 := IV */ + "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + aesenclast_xmm1_xmm0 + "movdqu %[src], %%xmm1\n\t" /* Save input. */ + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 = input ^ IV */ + + "cmpl $1, %[decrypt]\n\t" + "jz .Ldecrypt_%=\n\t" + "movdqa %%xmm0, %[iv]\n\t" /* [encrypt] Store IV. */ + "jmp .Lleave_%=\n" + ".Ldecrypt_%=:\n\t" + "movdqa %%xmm1, %[iv]\n" /* [decrypt] Store IV. */ + ".Lleave_%=:\n\t" + "movdqu %%xmm0, %[dst]\n" /* Store output. */ + : [iv] "+m" (*iv), [dst] "=m" (*b) + : [src] "m" (*a), + [key] "r" (ctx->keyschenc), + [rounds] "g" (ctx->rounds), + [decrypt] "m" (decrypt_flag) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm1_xmm0 +} + +/* Perform a CTR encryption round using the counter CTR and the input + block A. Write the result to the output block B and update CTR. + CTR needs to be a 16 byte aligned little-endian value. */ +static void +do_aesni_ctr (const RIJNDAEL_context *ctx, + unsigned char *ctr, unsigned char *b, const unsigned char *a) +{ +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" + + asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ + "pcmpeqd %%xmm1, %%xmm1\n\t" + "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ + + "pshufb %%xmm6, %%xmm5\n\t" + "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ (big endian) */ + + /* detect if 64-bit carry handling is needed */ + "cmpl $0xffffffff, 8(%[ctr])\n\t" + "jne .Lno_carry%=\n\t" + "cmpl $0xffffffff, 12(%[ctr])\n\t" + "jne .Lno_carry%=\n\t" + + "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ + "psubq %%xmm1, %%xmm5\n\t" /* add carry to upper 64bits */ + + ".Lno_carry%=:\n\t" + + "pshufb %%xmm6, %%xmm5\n\t" + "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ + + "pxor (%[key]), %%xmm0\n\t" /* xmm1 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + aesenclast_xmm1_xmm0 + "movdqu %[src], %%xmm1\n\t" /* xmm1 := input */ + "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */ + "movdqu %%xmm0, %[dst]" /* Store EncCTR. */ + + : [dst] "=m" (*b) + : [src] "m" (*a), + [ctr] "r" (ctr), + [key] "r" (ctx->keyschenc), + [rounds] "g" (ctx->rounds) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm1_xmm0 +} + + +/* Four blocks at a time variant of do_aesni_ctr. */ static void +do_aesni_ctr_4 (const RIJNDAEL_context *ctx, + unsigned char *ctr, unsigned char *b, const unsigned char *a) +{ +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenc_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t" +#define aesenc_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t" +#define aesenc_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" +#define aesenclast_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t" +#define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t" +#define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t" + + /* Register usage: + esi keyschedule + xmm0 CTR-0 + xmm1 temp / round key + xmm2 CTR-1 + xmm3 CTR-2 + xmm4 CTR-3 + xmm5 copy of *ctr + xmm6 endian swapping mask + */ + + asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ + "movdqa %%xmm0, %%xmm2\n\t" + "pcmpeqd %%xmm1, %%xmm1\n\t" + "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ + + "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ + "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ + "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ + "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ + "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ + "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ + "movdqa %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */ + "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ + + /* detect if 64-bit carry handling is needed */ + "cmpl $0xffffffff, 8(%[ctr])\n\t" + "jne .Lno_carry%=\n\t" + "movl 12(%[ctr]), %%esi\n\t" + "bswapl %%esi\n\t" + "cmpl $0xfffffffc, %%esi\n\t" + "jb .Lno_carry%=\n\t" /* no carry */ + + "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ + "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */ + "cmpl $0xfffffffe, %%esi\n\t" + "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ + "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ + /* esi == 0xffffffff */ + + "psubq %%xmm1, %%xmm2\n\t" + ".Lcarry_xmm3%=:\n\t" + "psubq %%xmm1, %%xmm3\n\t" + ".Lcarry_xmm4%=:\n\t" + "psubq %%xmm1, %%xmm4\n\t" + ".Lcarry_xmm5%=:\n\t" + "psubq %%xmm1, %%xmm5\n\t" + + ".Lno_carry%=:\n\t" + "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ + "movl %[rounds], %%esi\n\t" + + "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ + "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ + "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ + "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ + "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ + + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %%esi\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %%esi\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + aesenclast_xmm1_xmm0 + aesenclast_xmm1_xmm2 + aesenclast_xmm1_xmm3 + aesenclast_xmm1_xmm4 + + "movdqu (%[src]), %%xmm1\n\t" /* Get block 1. */ + "pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */ + "movdqu %%xmm0, (%[dst])\n\t" /* Store block 1 */ + + "movdqu 16(%[src]), %%xmm1\n\t" /* Get block 2. */ + "pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */ + "movdqu %%xmm2, 16(%[dst])\n\t" /* Store block 2. */ + + "movdqu 32(%[src]), %%xmm1\n\t" /* Get block 3. */ + "pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */ + "movdqu %%xmm3, 32(%[dst])\n\t" /* Store block 3. */ + + "movdqu 48(%[src]), %%xmm1\n\t" /* Get block 4. */ + "pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */ + "movdqu %%xmm4, 48(%[dst])" /* Store block 4. */ + + : + : [ctr] "r" (ctr), + [src] "r" (a), + [dst] "r" (b), + [key] "r" (ctx->keyschenc), + [rounds] "g" (ctx->rounds) + : "%esi", "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenc_xmm1_xmm2 +#undef aesenc_xmm1_xmm3 +#undef aesenc_xmm1_xmm4 +#undef aesenclast_xmm1_xmm0 +#undef aesenclast_xmm1_xmm2 +#undef aesenclast_xmm1_xmm3 +#undef aesenclast_xmm1_xmm4 +} + +#endif /*USE_AESNI*/ + + +static unsigned int rijndael_encrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; + unsigned int burn_stack; + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) + else if (ctx->use_padlock) { do_padlock (ctx, 0, b, a); - _gcry_burn_stack (48 + 15 /* possible padding for alignment */); + burn_stack = (48 + 15 /* possible padding for alignment */); } - else #endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + aesni_prepare (); + do_aesni_enc (ctx, b, a); + aesni_cleanup (); + burn_stack = 0; + } +#endif /*USE_AESNI*/ + else { do_encrypt (ctx, b, a); - _gcry_burn_stack (48 + 2*sizeof(int)); + burn_stack = (56 + 2*sizeof(int)); } + + return burn_stack; } @@ -488,18 +1674,19 @@ rijndael_encrypt (void *context, byte *b, const byte *a) function is only intended for the bulk encryption feature of cipher.c. */ void -_gcry_aes_cfb_enc (void *context, unsigned char *iv, +_gcry_aes_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, - unsigned int nblocks) + size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; - unsigned char *ivp; - int i; + unsigned int burn_depth = 48 + 2*sizeof(int); + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) + else if (ctx->use_padlock) { /* Fixme: Let Padlock do the CFBing. */ for ( ;nblocks; nblocks-- ) @@ -507,24 +1694,42 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv, /* Encrypt the IV. */ do_padlock (ctx, 0, iv, iv); /* XOR the input with the IV and store input into IV. */ - for (ivp=iv,i=0; i < BLOCKSIZE; i++ ) - *outbuf++ = (*ivp++ ^= *inbuf++); + buf_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; } } +#endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + aesni_prepare (); + for ( ;nblocks; nblocks-- ) + { + do_aesni_cfb (ctx, 0, iv, outbuf, inbuf); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + aesni_cleanup (); + + burn_depth = 0; /* No stack usage. */ + } +#endif /*USE_AESNI*/ else -#endif /* USE_PADLOCK*/ { for ( ;nblocks; nblocks-- ) { /* Encrypt the IV. */ do_encrypt_aligned (ctx, iv, iv); /* XOR the input with the IV and store input into IV. */ - for (ivp=iv,i=0; i < BLOCKSIZE; i++ ) - *outbuf++ = (*ivp++ ^= *inbuf++); + buf_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; } } - _gcry_burn_stack (48 + 2*sizeof(int)); + if (burn_depth) + _gcry_burn_stack (burn_depth); } @@ -533,35 +1738,173 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv, function is only intended for the bulk encryption feature of cipher.c. */ void -_gcry_aes_cbc_enc (void *context, unsigned char *iv, +_gcry_aes_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, - unsigned int nblocks, int cbc_mac) + size_t nblocks, int cbc_mac) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; - unsigned char *ivp; - int i; + unsigned char *last_iv; + unsigned int burn_depth = 48 + 2*sizeof(int); +#ifdef USE_AESNI + int use_aesni = ctx->use_aesni; +#endif + +#ifdef USE_AESNI + if (use_aesni) + aesni_prepare (); +#endif /*USE_AESNI*/ + + last_iv = iv; for ( ;nblocks; nblocks-- ) { - for (ivp=iv, i=0; i < BLOCKSIZE; i++ ) - outbuf[i] = inbuf[i] ^ *ivp++; + if (0) + ; +#ifdef USE_AESNI + else if (use_aesni) + { + /* ~35% speed up on Sandy-Bridge when doing xoring and copying with + SSE registers. */ + asm volatile ("movdqu %[iv], %%xmm0\n\t" + "movdqu %[inbuf], %%xmm1\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "movdqu %%xmm1, %[outbuf]\n\t" + : /* No output */ + : [iv] "m" (*last_iv), + [inbuf] "m" (*inbuf), + [outbuf] "m" (*outbuf) + : "memory" ); + + do_aesni_enc (ctx, outbuf, outbuf); + } +#endif /*USE_AESNI*/ + else + { + buf_xor(outbuf, inbuf, last_iv, BLOCKSIZE); + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) - do_padlock (ctx, 0, outbuf, outbuf); - else + else if (ctx->use_padlock) + do_padlock (ctx, 0, outbuf, outbuf); #endif /*USE_PADLOCK*/ - do_encrypt (ctx, outbuf, outbuf ); + else + do_encrypt (ctx, outbuf, outbuf ); + } - memcpy (iv, outbuf, BLOCKSIZE); + last_iv = outbuf; inbuf += BLOCKSIZE; if (!cbc_mac) outbuf += BLOCKSIZE; } - _gcry_burn_stack (48 + 2*sizeof(int)); + if (last_iv != iv) + { + if (0) + ; +#ifdef USE_AESNI + else if (use_aesni) + asm volatile ("movdqu %[last], %%xmm0\n\t" + "movdqu %%xmm0, %[iv]\n\t" + : /* No output */ + : [last] "m" (*last_iv), + [iv] "m" (*iv) + : "memory" ); +#endif /*USE_AESNI*/ + else + buf_cpy (iv, last_iv, BLOCKSIZE); + } + +#ifdef USE_AESNI + if (use_aesni) + { + aesni_cleanup (); + burn_depth = 0; /* No stack usage. */ + } +#endif /*USE_AESNI*/ + + if (burn_depth) + _gcry_burn_stack (burn_depth); +} + + +/* Bulk encryption of complete blocks in CTR mode. Caller needs to + make sure that CTR is aligned on a 16 byte boundary if AESNI; the + minimum alignment is for an u32. This function is only intended + for the bulk encryption feature of cipher.c. CTR is expected to be + of size BLOCKSIZE. */ +void +_gcry_aes_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned int burn_depth = 48 + 2*sizeof(int); + int i; + + if (0) + ; +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + + aesni_prepare (); + + asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */ + "movdqa %[ctr], %%xmm5\n\t" /* Preload CTR */ + : /* No output */ + : [mask] "m" (*be_mask), + [ctr] "m" (*ctr) + : "memory"); + + for ( ;nblocks > 3 ; nblocks -= 4 ) + { + do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf); + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } + for ( ;nblocks; nblocks-- ) + { + do_aesni_ctr (ctx, ctr, outbuf, inbuf); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + aesni_cleanup (); + aesni_cleanup_2_6 (); + + burn_depth = 0; /* No stack usage. */ + } +#endif /*USE_AESNI*/ + else + { + union { unsigned char x1[16]; u32 x32[4]; } tmp; + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + do_encrypt_aligned (ctx, tmp.x1, ctr); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmp.x1, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + /* Increment the counter. */ + for (i = BLOCKSIZE; i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + } + + if (burn_depth) + _gcry_burn_stack (burn_depth); } @@ -570,70 +1913,75 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv, and the decryption must have been prepared. A and B may be the same. */ static void -do_decrypt_aligned (RIJNDAEL_context *ctx, +do_decrypt_aligned (RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { -#define rk (ctx->keySched2) - int ROUNDS = ctx->ROUNDS; +#ifdef USE_AMD64_ASM + _gcry_aes_amd64_decrypt_block(ctx->keyschdec, b, a, ctx->rounds); +#elif defined(USE_ARM_ASM) + _gcry_aes_arm_decrypt_block(ctx->keyschdec, b, a, ctx->rounds); +#else +#define rk (ctx->keyschdec) + int rounds = ctx->rounds; int r; - union + union { u32 tempu32[4]; /* Force correct alignment. */ byte temp[4][4]; } u; - *((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[ROUNDS][0]); - *((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[ROUNDS][1]); - *((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[ROUNDS][2]); - *((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[ROUNDS][3]); - - *((u32*)(b )) = (*((u32*)T5[u.temp[0][0]]) - ^ *((u32*)T6[u.temp[3][1]]) - ^ *((u32*)T7[u.temp[2][2]]) - ^ *((u32*)T8[u.temp[1][3]])); - *((u32*)(b+ 4)) = (*((u32*)T5[u.temp[1][0]]) - ^ *((u32*)T6[u.temp[0][1]]) - ^ *((u32*)T7[u.temp[3][2]]) - ^ *((u32*)T8[u.temp[2][3]])); - *((u32*)(b+ 8)) = (*((u32*)T5[u.temp[2][0]]) - ^ *((u32*)T6[u.temp[1][1]]) - ^ *((u32*)T7[u.temp[0][2]]) - ^ *((u32*)T8[u.temp[3][3]])); - *((u32*)(b+12)) = (*((u32*)T5[u.temp[3][0]]) - ^ *((u32*)T6[u.temp[2][1]]) - ^ *((u32*)T7[u.temp[1][2]]) - ^ *((u32*)T8[u.temp[0][3]])); - - for (r = ROUNDS-1; r > 1; r--) + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[rounds][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[rounds][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[rounds][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[rounds][3]); + + *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]]) + ^ *((u32_a_t*)T6[u.temp[3][1]]) + ^ *((u32_a_t*)T7[u.temp[2][2]]) + ^ *((u32_a_t*)T8[u.temp[1][3]])); + *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]]) + ^ *((u32_a_t*)T6[u.temp[0][1]]) + ^ *((u32_a_t*)T7[u.temp[3][2]]) + ^ *((u32_a_t*)T8[u.temp[2][3]])); + *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]]) + ^ *((u32_a_t*)T6[u.temp[1][1]]) + ^ *((u32_a_t*)T7[u.temp[0][2]]) + ^ *((u32_a_t*)T8[u.temp[3][3]])); + *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]]) + ^ *((u32_a_t*)T6[u.temp[2][1]]) + ^ *((u32_a_t*)T7[u.temp[1][2]]) + ^ *((u32_a_t*)T8[u.temp[0][3]])); + + for (r = rounds-1; r > 1; r--) { - *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]); - *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]); - *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]); - *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]); - *((u32*)(b )) = (*((u32*)T5[u.temp[0][0]]) - ^ *((u32*)T6[u.temp[3][1]]) - ^ *((u32*)T7[u.temp[2][2]]) - ^ *((u32*)T8[u.temp[1][3]])); - *((u32*)(b+ 4)) = (*((u32*)T5[u.temp[1][0]]) - ^ *((u32*)T6[u.temp[0][1]]) - ^ *((u32*)T7[u.temp[3][2]]) - ^ *((u32*)T8[u.temp[2][3]])); - *((u32*)(b+ 8)) = (*((u32*)T5[u.temp[2][0]]) - ^ *((u32*)T6[u.temp[1][1]]) - ^ *((u32*)T7[u.temp[0][2]]) - ^ *((u32*)T8[u.temp[3][3]])); - *((u32*)(b+12)) = (*((u32*)T5[u.temp[3][0]]) - ^ *((u32*)T6[u.temp[2][1]]) - ^ *((u32*)T7[u.temp[1][2]]) - ^ *((u32*)T8[u.temp[0][3]])); + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]); + *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]]) + ^ *((u32_a_t*)T6[u.temp[3][1]]) + ^ *((u32_a_t*)T7[u.temp[2][2]]) + ^ *((u32_a_t*)T8[u.temp[1][3]])); + *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]]) + ^ *((u32_a_t*)T6[u.temp[0][1]]) + ^ *((u32_a_t*)T7[u.temp[3][2]]) + ^ *((u32_a_t*)T8[u.temp[2][3]])); + *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]]) + ^ *((u32_a_t*)T6[u.temp[1][1]]) + ^ *((u32_a_t*)T7[u.temp[0][2]]) + ^ *((u32_a_t*)T8[u.temp[3][3]])); + *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]]) + ^ *((u32_a_t*)T6[u.temp[2][1]]) + ^ *((u32_a_t*)T7[u.temp[1][2]]) + ^ *((u32_a_t*)T8[u.temp[0][3]])); } - /* Last round is special. */ - *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[1][0]); - *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[1][1]); - *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[1][2]); - *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[1][3]); + /* Last round is special. */ + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[1][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[1][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[1][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[1][3]); b[ 0] = S5[u.temp[0][0]]; b[ 1] = S5[u.temp[3][1]]; b[ 2] = S5[u.temp[2][2]]; @@ -650,11 +1998,12 @@ do_decrypt_aligned (RIJNDAEL_context *ctx, b[13] = S5[u.temp[2][1]]; b[14] = S5[u.temp[1][2]]; b[15] = S5[u.temp[0][3]]; - *((u32*)(b )) ^= *((u32*)rk[0][0]); - *((u32*)(b+ 4)) ^= *((u32*)rk[0][1]); - *((u32*)(b+ 8)) ^= *((u32*)rk[0][2]); - *((u32*)(b+12)) ^= *((u32*)rk[0][3]); + *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[0][0]); + *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[0][1]); + *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]); + *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]); #undef rk +#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/ } @@ -662,102 +2011,189 @@ do_decrypt_aligned (RIJNDAEL_context *ctx, static void do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax) { - /* BX and AX are not necessary correctly aligned. Thus we need to - copy them here. */ - union - { - u32 dummy[4]; - byte a[16]; - } a; - union - { - u32 dummy[4]; - byte b[16]; - } b; +#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM) + /* BX and AX are not necessary correctly aligned. Thus we might + need to copy them here. We try to align to a 16 bytes. */ + if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f)) + { + union + { + u32 dummy[4]; + byte a[16] ATTR_ALIGNED_16; + } a; + union + { + u32 dummy[4]; + byte b[16] ATTR_ALIGNED_16; + } b; - if ( !ctx->decryption_prepared ) + buf_cpy (a.a, ax, 16); + do_decrypt_aligned (ctx, b.b, a.a); + buf_cpy (bx, b.b, 16); + } + else +#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/ + { + do_decrypt_aligned (ctx, bx, ax); + } +} + + +static inline void +check_decryption_preparation (RIJNDAEL_context *ctx) +{ + if (0) + ; +#ifdef USE_PADLOCK + else if (ctx->use_padlock) + { /* Padlock does not need decryption subkeys. */ } +#endif /*USE_PADLOCK*/ + else if ( !ctx->decryption_prepared ) { prepare_decryption ( ctx ); - _gcry_burn_stack (64); ctx->decryption_prepared = 1; } - - memcpy (a.a, ax, 16); - do_decrypt_aligned (ctx, b.b, a.a); - memcpy (bx, b.b, 16); -#undef rk } - - -static void +static unsigned int rijndael_decrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; + unsigned int burn_stack; + check_decryption_preparation (ctx); + + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) + else if (ctx->use_padlock) { do_padlock (ctx, 1, b, a); - _gcry_burn_stack (48 + 2*sizeof(int) /* FIXME */); + burn_stack = (48 + 2*sizeof(int) /* FIXME */); } - else #endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + aesni_prepare (); + do_aesni_dec (ctx, b, a); + aesni_cleanup (); + burn_stack = 0; + } +#endif /*USE_AESNI*/ + else { do_decrypt (ctx, b, a); - _gcry_burn_stack (48+2*sizeof(int)); + burn_stack = (56+2*sizeof(int)); } + + return burn_stack; } /* Bulk decryption of complete blocks in CFB mode. Caller needs to - make sure that IV is aligned on an unisgned lonhg boundary. This + make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. */ void -_gcry_aes_cfb_dec (void *context, unsigned char *iv, +_gcry_aes_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, - unsigned int nblocks) + size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; - unsigned char *ivp; - unsigned char temp; - int i; + unsigned int burn_depth = 48 + 2*sizeof(int); + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) + else if (ctx->use_padlock) { /* Fixme: Let Padlock do the CFBing. */ for ( ;nblocks; nblocks-- ) { do_padlock (ctx, 0, iv, iv); - for (ivp=iv,i=0; i < BLOCKSIZE; i++ ) - { - temp = *inbuf++; - *outbuf++ = *ivp ^ temp; - *ivp++ = temp; - } + buf_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; } } - else #endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + aesni_prepare (); + + /* CFB decryption can be parallelized */ + for ( ;nblocks >= 4; nblocks -= 4) + { + asm volatile + ("movdqu (%[iv]), %%xmm1\n\t" /* load input blocks */ + "movdqu 0*16(%[inbuf]), %%xmm2\n\t" + "movdqu 1*16(%[inbuf]), %%xmm3\n\t" + "movdqu 2*16(%[inbuf]), %%xmm4\n\t" + + "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */ + "movdqu %%xmm0, (%[iv])\n\t" + : /* No output */ + : [inbuf] "r" (inbuf), [iv] "r" (iv) + : "memory"); + + do_aesni_enc_vec4 (ctx); + + asm volatile + ("movdqu 0*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + + "movdqu 1*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + + "movdqu 2*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + + "movdqu 3*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf), + [outbuf] "r" (outbuf) + : "memory"); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } + + for ( ;nblocks; nblocks-- ) + { + do_aesni_cfb (ctx, 1, iv, outbuf, inbuf); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + aesni_cleanup (); + aesni_cleanup_2_6 (); + + burn_depth = 0; /* No stack usage. */ + } +#endif /*USE_AESNI*/ + else { for ( ;nblocks; nblocks-- ) { do_encrypt_aligned (ctx, iv, iv); - for (ivp=iv,i=0; i < BLOCKSIZE; i++ ) - { - temp = *inbuf++; - *outbuf++ = *ivp ^ temp; - *ivp++ = temp; - } + buf_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; } } - _gcry_burn_stack (48 + 2*sizeof(int)); + if (burn_depth) + _gcry_burn_stack (burn_depth); } @@ -766,38 +2202,133 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv, function is only intended for the bulk encryption feature of cipher.c. */ void -_gcry_aes_cbc_dec (void *context, unsigned char *iv, +_gcry_aes_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, - unsigned int nblocks) + size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; - unsigned char *ivp; - int i; - unsigned char savebuf[BLOCKSIZE]; + unsigned int burn_depth = 48 + 2*sizeof(int) + 4*sizeof (char*); - for ( ;nblocks; nblocks-- ) + check_decryption_preparation (ctx); + + if (0) + ; +#ifdef USE_AESNI + else if (ctx->use_aesni) { - /* We need to save INBUF away because it may be identical to - OUTBUF. */ - memcpy (savebuf, inbuf, BLOCKSIZE); + aesni_prepare (); + + asm volatile + ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */ + : /* No output */ + : [iv] "m" (*iv) + : "memory"); + for ( ;nblocks > 3 ; nblocks -= 4 ) + { + asm volatile + ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ + "movdqu 1*16(%[inbuf]), %%xmm2\n\t" + "movdqu 2*16(%[inbuf]), %%xmm3\n\t" + "movdqu 3*16(%[inbuf]), %%xmm4\n\t" + : /* No output */ + : [inbuf] "r" (inbuf) + : "memory"); + + do_aesni_dec_vec4 (ctx); + + asm volatile + ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */ + "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + + "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */ + "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + + "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */ + "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + + "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */ + "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf), + [outbuf] "r" (outbuf) + : "memory"); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } + + for ( ;nblocks; nblocks-- ) + { + asm volatile + ("movdqu %[inbuf], %%xmm2\n\t" /* use xmm2 as savebuf */ + : /* No output */ + : [inbuf] "m" (*inbuf) + : "memory"); + + /* uses only xmm0 and xmm1 */ + do_aesni_dec (ctx, outbuf, inbuf); + + asm volatile + ("movdqu %[outbuf], %%xmm0\n\t" + "pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */ + "movdqu %%xmm0, %[outbuf]\n\t" + "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */ + : /* No output */ + : [outbuf] "m" (*outbuf) + : "memory"); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + asm volatile + ("movdqu %%xmm5, %[iv]\n\t" /* store IV */ + : /* No output */ + : [iv] "m" (*iv) + : "memory"); + + aesni_cleanup (); + aesni_cleanup_2_6 (); + + burn_depth = 0; /* No stack usage. */ + } +#endif /*USE_AESNI*/ + else + { + unsigned char savebuf[BLOCKSIZE]; + + for ( ;nblocks; nblocks-- ) + { + /* INBUF is needed later and it may be identical to OUTBUF, so store + the intermediate result to SAVEBUF. */ + + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) - do_padlock (ctx, 1, outbuf, inbuf); - else + else if (ctx->use_padlock) + do_padlock (ctx, 1, savebuf, inbuf); #endif /*USE_PADLOCK*/ - do_decrypt (ctx, outbuf, inbuf); + else + do_decrypt (ctx, savebuf, inbuf); - for (ivp=iv, i=0; i < BLOCKSIZE; i++ ) - outbuf[i] ^= *ivp++; - memcpy (iv, savebuf, BLOCKSIZE); - inbuf += BLOCKSIZE; - outbuf += BLOCKSIZE; + buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE); + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + wipememory(savebuf, sizeof(savebuf)); } - _gcry_burn_stack (48 + 2*sizeof(int) + BLOCKSIZE + 4*sizeof (char*)); + if (burn_depth) + _gcry_burn_stack (burn_depth); } @@ -808,11 +2339,12 @@ static const char* selftest_basic_128 (void) { RIJNDAEL_context ctx; - unsigned char scratch[16]; + unsigned char scratch[16]; /* The test vectors are from the AES supplied ones; more or less randomly taken from ecb_tbl.txt (I=42,81,14) */ - static const unsigned char plaintext_128[16] = +#if 1 + static const unsigned char plaintext_128[16] = { 0x01,0x4B,0xAF,0x22,0x78,0xA6,0x9D,0x33, 0x1D,0x51,0x80,0x10,0x36,0x43,0xE9,0x9A @@ -827,7 +2359,28 @@ selftest_basic_128 (void) 0x67,0x43,0xC3,0xD1,0x51,0x9A,0xB4,0xF2, 0xCD,0x9A,0x78,0xAB,0x09,0xA5,0x11,0xBD }; - +#else + /* Test vectors from fips-197, appendix C. */ +# warning debug test vectors in use + static const unsigned char plaintext_128[16] = + { + 0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77, + 0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff + }; + static const unsigned char key_128[16] = + { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, + 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + /* 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, */ + /* 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c */ + }; + static const unsigned char ciphertext_128[16] = + { + 0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30, + 0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a + }; +#endif + rijndael_setkey (&ctx, key_128, sizeof (key_128)); rijndael_encrypt (&ctx, scratch, plaintext_128); if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128))) @@ -835,7 +2388,7 @@ selftest_basic_128 (void) rijndael_decrypt (&ctx, scratch, scratch); if (memcmp (scratch, plaintext_128, sizeof (plaintext_128))) return "AES-128 test decryption failed."; - + return NULL; } @@ -844,14 +2397,14 @@ static const char* selftest_basic_192 (void) { RIJNDAEL_context ctx; - unsigned char scratch[16]; - - static unsigned char plaintext_192[16] = + unsigned char scratch[16]; + + static unsigned char plaintext_192[16] = { 0x76,0x77,0x74,0x75,0xF1,0xF2,0xF3,0xF4, 0xF8,0xF9,0xE6,0xE7,0x77,0x70,0x71,0x72 }; - static unsigned char key_192[24] = + static unsigned char key_192[24] = { 0x04,0x05,0x06,0x07,0x09,0x0A,0x0B,0x0C, 0x0E,0x0F,0x10,0x11,0x13,0x14,0x15,0x16, @@ -862,7 +2415,7 @@ selftest_basic_192 (void) 0x5D,0x1E,0xF2,0x0D,0xCE,0xD6,0xBC,0xBC, 0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA }; - + rijndael_setkey (&ctx, key_192, sizeof(key_192)); rijndael_encrypt (&ctx, scratch, plaintext_192); if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192))) @@ -870,7 +2423,7 @@ selftest_basic_192 (void) rijndael_decrypt (&ctx, scratch, scratch); if (memcmp (scratch, plaintext_192, sizeof (plaintext_192))) return "AES-192 test decryption failed."; - + return NULL; } @@ -880,21 +2433,21 @@ static const char* selftest_basic_256 (void) { RIJNDAEL_context ctx; - unsigned char scratch[16]; + unsigned char scratch[16]; - static unsigned char plaintext_256[16] = + static unsigned char plaintext_256[16] = { 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 }; - static unsigned char key_256[32] = + static unsigned char key_256[32] = { 0x08,0x09,0x0A,0x0B,0x0D,0x0E,0x0F,0x10, 0x12,0x13,0x14,0x15,0x17,0x18,0x19,0x1A, 0x1C,0x1D,0x1E,0x1F,0x21,0x22,0x23,0x24, 0x26,0x27,0x28,0x29,0x2B,0x2C,0x2D,0x2E }; - static const unsigned char ciphertext_256[16] = + static const unsigned char ciphertext_256[16] = { 0x08,0x0E,0x95,0x17,0xEB,0x16,0x77,0x71, 0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3 @@ -907,10 +2460,56 @@ selftest_basic_256 (void) rijndael_decrypt (&ctx, scratch, scratch); if (memcmp (scratch, plaintext_256, sizeof (plaintext_256))) return "AES-256 test decryption failed."; - + return NULL; } + +/* Run the self-tests for AES-CTR-128, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char* +selftest_ctr_128 (void) +{ + const int nblocks = 8+1; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_ctr("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for AES-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_cbc("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_cbc_dec, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption. + Returns NULL on success. */ +static const char* +selftest_cfb_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_cfb("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize, + context_size); +} + + /* Run all the self-tests and return NULL on success. This function is used for the on-the-fly self-tests. */ static const char * @@ -923,6 +2522,15 @@ selftest (void) || (r = selftest_basic_256 ()) ) return r; + if ( (r = selftest_ctr_128 ()) ) + return r; + + if ( (r = selftest_cbc_128 ()) ) + return r; + + if ( (r = selftest_cfb_128 ()) ) + return r; + return r; } @@ -931,12 +2539,12 @@ selftest (void) static const char * selftest_fips_128_38a (int requested_mode) { - struct tv + static const struct tv { int mode; const unsigned char key[16]; const unsigned char iv[16]; - struct + struct { const unsigned char input[16]; const unsigned char output[16]; @@ -947,24 +2555,24 @@ selftest_fips_128_38a (int requested_mode) GCRY_CIPHER_MODE_CFB, /* F.3.13, CFB128-AES128 */ { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, - { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a }, { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20, 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } }, - + { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0xc8, 0xa6, 0x45, 0x37, 0xa0, 0xb3, 0xa9, 0x3f, 0xcd, 0xe3, 0xcd, 0xad, 0x9f, 0x1c, 0xe5, 0x8b } }, - - { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, + + { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x26, 0x75, 0x1f, 0x67, 0xa3, 0xcb, 0xb1, 0x40, 0xb1, 0x80, 0x8c, 0xf1, 0x87, 0xa4, 0xf4, 0xdf } }, - + { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 }, { 0xc0, 0x4b, 0x05, 0x35, 0x7c, 0x5d, 0x1c, 0x0e, @@ -975,7 +2583,7 @@ selftest_fips_128_38a (int requested_mode) GCRY_CIPHER_MODE_OFB, { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, - { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, @@ -987,7 +2595,7 @@ selftest_fips_128_38a (int requested_mode) 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0x77, 0x89, 0x50, 0x8d, 0x16, 0x91, 0x8f, 0x03, 0xf5, 0x3c, 0x52, 0xda, 0xc5, 0x4e, 0xd8, 0x25 } }, - + { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x97, 0x40, 0x05, 0x1e, 0x9c, 0x5f, 0xec, 0xf6, @@ -1057,7 +2665,7 @@ selftest_fips_128_38a (int requested_mode) #undef Fail _gcry_cipher_close (hdenc); - _gcry_cipher_close (hddec); + _gcry_cipher_close (hddec); return NULL; } @@ -1068,7 +2676,7 @@ selftest_fips_128 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; - + what = "low-level"; errtxt = selftest_basic_128 (); if (errtxt) @@ -1080,7 +2688,7 @@ selftest_fips_128 (int extended, selftest_report_func_t report) errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_CFB); if (errtxt) goto failed; - + what = "ofb"; errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_OFB); if (errtxt) @@ -1125,7 +2733,7 @@ selftest_fips_256 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; - + (void)extended; /* No extended tests available. */ what = "low-level"; @@ -1163,7 +2771,7 @@ run_selftests (int algo, int extended, selftest_report_func_t report) default: ec = GPG_ERR_CIPHER_ALGO; break; - + } return ec; } @@ -1190,14 +2798,15 @@ static gcry_cipher_oid_spec_t rijndael_oids[] = gcry_cipher_spec_t _gcry_cipher_spec_aes = { - "AES", rijndael_names, rijndael_oids, 16, 128, sizeof (RIJNDAEL_context), - rijndael_setkey, rijndael_encrypt, rijndael_decrypt - }; -cipher_extra_spec_t _gcry_cipher_extraspec_aes = - { + GCRY_CIPHER_AES, {0, 1}, + "AES", rijndael_names, rijndael_oids, 16, 128, + sizeof (RIJNDAEL_context), + rijndael_setkey, rijndael_encrypt, rijndael_decrypt, + NULL, NULL, run_selftests }; + static const char *rijndael192_names[] = { "RIJNDAEL192", @@ -1216,14 +2825,15 @@ static gcry_cipher_oid_spec_t rijndael192_oids[] = gcry_cipher_spec_t _gcry_cipher_spec_aes192 = { - "AES192", rijndael192_names, rijndael192_oids, 16, 192, sizeof (RIJNDAEL_context), - rijndael_setkey, rijndael_encrypt, rijndael_decrypt - }; -cipher_extra_spec_t _gcry_cipher_extraspec_aes192 = - { + GCRY_CIPHER_AES192, {0, 1}, + "AES192", rijndael192_names, rijndael192_oids, 16, 192, + sizeof (RIJNDAEL_context), + rijndael_setkey, rijndael_encrypt, rijndael_decrypt, + NULL, NULL, run_selftests }; + static const char *rijndael256_names[] = { "RIJNDAEL256", @@ -1242,12 +2852,10 @@ static gcry_cipher_oid_spec_t rijndael256_oids[] = gcry_cipher_spec_t _gcry_cipher_spec_aes256 = { + GCRY_CIPHER_AES256, {0, 1}, "AES256", rijndael256_names, rijndael256_oids, 16, 256, sizeof (RIJNDAEL_context), - rijndael_setkey, rijndael_encrypt, rijndael_decrypt - }; - -cipher_extra_spec_t _gcry_cipher_extraspec_aes256 = - { + rijndael_setkey, rijndael_encrypt, rijndael_decrypt, + NULL, NULL, run_selftests }; |