author | René Schümann <white06tiger@gmail.com> | 2015-03-20 12:32:29 +0000 |
---|---|---|
committer | René Schümann <white06tiger@gmail.com> | 2015-03-20 12:32:29 +0000 |
commit | 539705d58fc39a28388ff18c695dd406f4ffd1d9 (patch) | |
tree | 51db7a37a66c09f41734ba5573d972aae9f30d71 /plugins/MirOTR/Libgcrypt/cipher/rijndael.c | |
parent | 90171f125f36488dc08f5cfe0b0d4b78d995f08d (diff) | |
MirOTR: Libgcrypt and Libgpg-error update
Libgcrypt 1.4.6 => 1.6.3
Libgpg-error 1.9 => 1.18
git-svn-id: http://svn.miranda-ng.org/main/trunk@12449 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c
Diffstat (limited to 'plugins/MirOTR/Libgcrypt/cipher/rijndael.c')
-rw-r--r-- | plugins/MirOTR/Libgcrypt/cipher/rijndael.c | 2352 |
1 file changed, 1980 insertions, 372 deletions
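One change that recurs throughout the diff below is the replacement of plain `u32` pointer casts with a `u32_a_t` typedef carrying GCC's `may_alias` attribute, so that the word-wise accesses to the byte-oriented key schedule and state arrays stay legal under the strict-aliasing rules enforced since GCC 4.4. The following is a minimal standalone sketch of that pattern, not code taken from the file itself: the `xor_block` helper and its names are illustrative only, and it assumes 4-byte-aligned buffers, as the original aligned code paths do.

```c
#include <stdint.h>

/* GCC 4.4 and later enforce strict aliasing; the may_alias attribute marks
   this 32-bit type as one that may legally alias other types (here, the
   byte arrays holding the AES state and round keys).  Older or non-GCC
   compilers fall back to a plain typedef, as in the diff. */
#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
typedef uint32_t __attribute__ ((__may_alias__)) u32_a_t;
#else
typedef uint32_t u32_a_t;
#endif

/* Illustrative helper (hypothetical, not part of rijndael.c): XOR one
   16-byte block into another four words at a time, the way the generic C
   path combines the state with a round key.  Both pointers are assumed to
   be 4-byte aligned. */
static void
xor_block (unsigned char *dst, const unsigned char *src)
{
  *(u32_a_t *)(dst     ) ^= *(const u32_a_t *)(src     );
  *(u32_a_t *)(dst +  4) ^= *(const u32_a_t *)(src +  4);
  *(u32_a_t *)(dst +  8) ^= *(const u32_a_t *)(src +  8);
  *(u32_a_t *)(dst + 12) ^= *(const u32_a_t *)(src + 12);
}
```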
diff --git a/plugins/MirOTR/Libgcrypt/cipher/rijndael.c b/plugins/MirOTR/Libgcrypt/cipher/rijndael.c index d43b349b41..8019f0aad8 100644 --- a/plugins/MirOTR/Libgcrypt/cipher/rijndael.c +++ b/plugins/MirOTR/Libgcrypt/cipher/rijndael.c @@ -1,6 +1,6 @@ /* Rijndael (AES) for GnuPG * Copyright (C) 2000, 2001, 2002, 2003, 2007, - * 2008 Free Software Foundation, Inc. + * 2008, 2011, 2012 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * @@ -45,77 +45,434 @@ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" #define MAXKC (256/32) #define MAXROUNDS 14 #define BLOCKSIZE (128/8) +/* Helper macro to force alignment to 16 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) +#else +# define ATTR_ALIGNED_16 +#endif + + +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define USE_AMD64_ASM 1 +#endif + +/* USE_ARM_ASM indicates whether to use ARM assembly code. */ +#undef USE_ARM_ASM +#if defined(__ARMEL__) +# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS +# define USE_ARM_ASM 1 +# endif +#endif + /* USE_PADLOCK indicates whether to compile the padlock specific code. */ #undef USE_PADLOCK #ifdef ENABLE_PADLOCK_SUPPORT -# if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__) -# define USE_PADLOCK +# ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# if (defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__) +# define USE_PADLOCK 1 +# endif # endif #endif /*ENABLE_PADLOCK_SUPPORT*/ -static const char *selftest(void); +/* USE_AESNI inidicates whether to compile with Intel AES-NI code. We + need the vector-size attribute which seems to be available since + gcc 3. However, to be on the safe side we require at least gcc 4. */ +#undef USE_AESNI +#ifdef ENABLE_AESNI_SUPPORT +# if ((defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__)) +# if __GNUC__ >= 4 +# define USE_AESNI 1 +# endif +# endif +#endif /* ENABLE_AESNI_SUPPORT */ -typedef struct -{ - int ROUNDS; /* Key-length-dependent number of rounds. */ - int decryption_prepared; /* The decryption key schedule is available. */ -#ifdef USE_PADLOCK - int use_padlock; /* Padlock shall be used. */ - /* The key as passed to the padlock engine. */ - unsigned char padlock_key[16] __attribute__ ((aligned (16))); +#ifdef USE_AESNI + typedef struct u128_s { u32 a, b, c, d; } u128_t; +#endif /*USE_AESNI*/ + +/* Define an u32 variant for the sake of gcc 4.4's strict aliasing. */ +#if __GNUC__ > 4 || ( __GNUC__ == 4 && __GNUC_MINOR__ >= 4 ) +typedef u32 __attribute__ ((__may_alias__)) u32_a_t; +#else +typedef u32 u32_a_t; #endif + + +#ifdef USE_AMD64_ASM +/* AMD64 assembly implementations of AES */ +extern void _gcry_aes_amd64_encrypt_block(const void *keysched_enc, + unsigned char *out, + const unsigned char *in, + int rounds); + +extern void _gcry_aes_amd64_decrypt_block(const void *keysched_dec, + unsigned char *out, + const unsigned char *in, + int rounds); +#endif /*USE_AMD64_ASM*/ + +#ifdef USE_ARM_ASM +/* ARM assembly implementations of AES */ +extern void _gcry_aes_arm_encrypt_block(const void *keysched_enc, + unsigned char *out, + const unsigned char *in, + int rounds); + +extern void _gcry_aes_arm_decrypt_block(const void *keysched_dec, + unsigned char *out, + const unsigned char *in, + int rounds); +#endif /*USE_ARM_ASM*/ + + + +/* Our context object. 
*/ +typedef struct +{ + /* The first fields are the keyschedule arrays. This is so that + they are aligned on a 16 byte boundary if using gcc. This + alignment is required for the AES-NI code and a good idea in any + case. The alignment is guaranteed due to the way cipher.c + allocates the space for the context. The PROPERLY_ALIGNED_TYPE + hack is used to force a minimal alignment if not using gcc of if + the alignment requirement is higher that 16 bytes. */ union { PROPERLY_ALIGNED_TYPE dummy; byte keyschedule[MAXROUNDS+1][4][4]; +#ifdef USE_PADLOCK + /* The key as passed to the padlock engine. It is only used if + the padlock engine is used (USE_PADLOCK, below). */ + unsigned char padlock_key[16] __attribute__ ((aligned (16))); +#endif /*USE_PADLOCK*/ } u1; union { PROPERLY_ALIGNED_TYPE dummy; - byte keyschedule[MAXROUNDS+1][4][4]; + byte keyschedule[MAXROUNDS+1][4][4]; } u2; -} RIJNDAEL_context; + int rounds; /* Key-length-dependent number of rounds. */ + unsigned int decryption_prepared:1; /* The decryption key schedule is available. */ +#ifdef USE_PADLOCK + unsigned int use_padlock:1; /* Padlock shall be used. */ +#endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + unsigned int use_aesni:1; /* AES-NI shall be used. */ +#endif /*USE_AESNI*/ +} RIJNDAEL_context ATTR_ALIGNED_16; + +/* Macros defining alias for the keyschedules. */ +#define keyschenc u1.keyschedule +#define keyschdec u2.keyschedule +#define padlockkey u1.padlock_key + +/* Two macros to be called prior and after the use of AESNI + instructions. There should be no external function calls between + the use of these macros. There purpose is to make sure that the + SSE regsiters are cleared and won't reveal any information about + the key or the data. */ +#ifdef USE_AESNI +# define aesni_prepare() do { } while (0) +# define aesni_cleanup() \ + do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ + "pxor %%xmm1, %%xmm1\n" :: ); \ + } while (0) +# define aesni_cleanup_2_6() \ + do { asm volatile ("pxor %%xmm2, %%xmm2\n\t" \ + "pxor %%xmm3, %%xmm3\n" \ + "pxor %%xmm4, %%xmm4\n" \ + "pxor %%xmm5, %%xmm5\n" \ + "pxor %%xmm6, %%xmm6\n":: ); \ + } while (0) +#else +# define aesni_prepare() do { } while (0) +# define aesni_cleanup() do { } while (0) +#endif -#define keySched u1.keyschedule -#define keySched2 u2.keyschedule /* All the numbers. */ #include "rijndael-tables.h" -/* Perform the key setup. */ + +/* Function prototypes. */ +#if defined(__i386__) && defined(USE_AESNI) +/* We don't want to inline these functions on i386 to help gcc allocate enough + registers. 
*/ +static void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr, + unsigned char *b, const unsigned char *a) + __attribute__ ((__noinline__)); +static void do_aesni_ctr_4 (const RIJNDAEL_context *ctx, unsigned char *ctr, + unsigned char *b, const unsigned char *a) + __attribute__ ((__noinline__)); +#endif /*USE_AESNI*/ + +static const char *selftest(void); + + + +#ifdef USE_AESNI +static void +aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + aesni_prepare(); + + if (ctx->rounds < 12) + { + /* 128-bit key */ +#define AESKEYGENASSIST_xmm1_xmm2(imm8) \ + ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" +#define AESKEY_EXPAND128 \ + "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ + "movdqa %%xmm1, %%xmm3\n\t" \ + "pslldq $4, %%xmm3\n\t" \ + "pxor %%xmm3, %%xmm1\n\t" \ + "pslldq $4, %%xmm3\n\t" \ + "pxor %%xmm3, %%xmm1\n\t" \ + "pslldq $4, %%xmm3\n\t" \ + "pxor %%xmm3, %%xmm2\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" + + asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key */ + "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x01) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x02) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x04) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x08) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x10) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x20) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x40) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x80) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x1b) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x36) + AESKEY_EXPAND128 + "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ + : + : [key] "r" (key), [ksch] "r" (ctx->keyschenc) + : "cc", "memory" ); +#undef AESKEYGENASSIST_xmm1_xmm2 +#undef AESKEY_EXPAND128 + } + else if (ctx->rounds == 12) + { + /* 192-bit key */ +#define AESKEYGENASSIST_xmm3_xmm2(imm8) \ + ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" +#define AESKEY_EXPAND192 \ + "pshufd $0x55, %%xmm2, %%xmm2\n\t" \ + "movdqu %%xmm1, %%xmm4\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" \ + "pshufd $0xff, %%xmm1, %%xmm2\n\t" \ + "movdqu %%xmm3, %%xmm4\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm3\n\t" \ + "pxor %%xmm2, %%xmm3\n\t" + + asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ + "movq 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..23] */ + "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ + "movdqa %%xmm3, %%xmm5\n\t" + + AESKEYGENASSIST_xmm3_xmm2(0x01) + AESKEY_EXPAND192 + "shufpd $0, %%xmm1, %%xmm5\n\t" + "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5 */ + "movdqa %%xmm1, %%xmm6\n\t" + "shufpd $1, %%xmm3, %%xmm6\n\t" + "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6 */ + AESKEYGENASSIST_xmm3_xmm2(0x02) + AESKEY_EXPAND192 + "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ + "movdqa %%xmm3, 
%%xmm5\n\t" + + AESKEYGENASSIST_xmm3_xmm2(0x04) + AESKEY_EXPAND192 + "shufpd $0, %%xmm1, %%xmm5\n\t" + "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5 */ + "movdqa %%xmm1, %%xmm6\n\t" + "shufpd $1, %%xmm3, %%xmm6\n\t" + "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6 */ + AESKEYGENASSIST_xmm3_xmm2(0x08) + AESKEY_EXPAND192 + "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ + "movdqa %%xmm3, %%xmm5\n\t" + + AESKEYGENASSIST_xmm3_xmm2(0x10) + AESKEY_EXPAND192 + "shufpd $0, %%xmm1, %%xmm5\n\t" + "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5 */ + "movdqa %%xmm1, %%xmm6\n\t" + "shufpd $1, %%xmm3, %%xmm6\n\t" + "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6 */ + AESKEYGENASSIST_xmm3_xmm2(0x20) + AESKEY_EXPAND192 + "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ + "movdqa %%xmm3, %%xmm5\n\t" + + AESKEYGENASSIST_xmm3_xmm2(0x40) + AESKEY_EXPAND192 + "shufpd $0, %%xmm1, %%xmm5\n\t" + "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5 */ + "movdqa %%xmm1, %%xmm6\n\t" + "shufpd $1, %%xmm3, %%xmm6\n\t" + "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6 */ + AESKEYGENASSIST_xmm3_xmm2(0x80) + AESKEY_EXPAND192 + "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ + : + : [key] "r" (key), [ksch] "r" (ctx->keyschenc) + : "cc", "memory" ); +#undef AESKEYGENASSIST_xmm3_xmm2 +#undef AESKEY_EXPAND192 + } + else if (ctx->rounds > 12) + { + /* 256-bit key */ +#define AESKEYGENASSIST_xmm1_xmm2(imm8) \ + ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" +#define AESKEYGENASSIST_xmm3_xmm2(imm8) \ + ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" +#define AESKEY_EXPAND256_A \ + "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ + "movdqa %%xmm1, %%xmm4\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" +#define AESKEY_EXPAND256_B \ + "pshufd $0xaa, %%xmm2, %%xmm2\n\t" \ + "movdqa %%xmm3, %%xmm4\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm3\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm3\n\t" \ + "pslldq $4, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm3\n\t" \ + "pxor %%xmm2, %%xmm3\n\t" + + asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ + "movdqu 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..31] */ + "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ + "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x01) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x02) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x04) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x08) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x10) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + 
AESKEY_EXPAND256_B + "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x20) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ + AESKEYGENASSIST_xmm1_xmm2(0x00) + AESKEY_EXPAND256_B + "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3 */ + + AESKEYGENASSIST_xmm3_xmm2(0x40) + AESKEY_EXPAND256_A + "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1 */ + + : + : [key] "r" (key), [ksch] "r" (ctx->keyschenc) + : "cc", "memory" ); +#undef AESKEYGENASSIST_xmm1_xmm2 +#undef AESKEYGENASSIST_xmm3_xmm2 +#undef AESKEY_EXPAND256_A +#undef AESKEY_EXPAND256_B + } + + aesni_cleanup(); + aesni_cleanup_2_6(); +} +#endif /*USE_AESNI*/ + + + +/* Perform the key setup. */ static gcry_err_code_t do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) { static int initialized = 0; static const char *selftest_failed=0; - int ROUNDS; + int rounds; int i,j, r, t, rconpointer = 0; int KC; - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte k[MAXKC][4]; - } k; -#define k k.k - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte tk[MAXKC][4]; - } tk; -#define tk tk.tk +#if defined(USE_AESNI) || defined(USE_PADLOCK) + unsigned int hwfeatures; +#endif /* The on-the-fly self tests are only run in non-fips mode. In fips mode explicit self-tests are required. Actually the on-the-fly self-tests are not fully thread-safe and it might happen that a - failed self-test won't get noticed in another thread. + failed self-test won't get noticed in another thread. FIXME: We might want to have a central registry of succeeded self-tests. */ @@ -129,65 +486,115 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; +#if defined(USE_AESNI) || defined(USE_PADLOCK) + hwfeatures = _gcry_get_hw_features (); +#endif + ctx->decryption_prepared = 0; #ifdef USE_PADLOCK ctx->use_padlock = 0; #endif +#ifdef USE_AESNI + ctx->use_aesni = 0; +#endif if( keylen == 128/8 ) { - ROUNDS = 10; + rounds = 10; KC = 4; + + if (0) + { + ; + } #ifdef USE_PADLOCK - if ((_gcry_get_hw_features () & HWF_PADLOCK_AES)) + else if (hwfeatures & HWF_PADLOCK_AES) { ctx->use_padlock = 1; - memcpy (ctx->padlock_key, key, keylen); + memcpy (ctx->padlockkey, key, keylen); + } +#endif +#ifdef USE_AESNI + else if (hwfeatures & HWF_INTEL_AESNI) + { + ctx->use_aesni = 1; } #endif } else if ( keylen == 192/8 ) { - ROUNDS = 12; + rounds = 12; KC = 6; + + if (0) + { + ; + } +#ifdef USE_AESNI + else if (hwfeatures & HWF_INTEL_AESNI) + { + ctx->use_aesni = 1; + } +#endif } else if ( keylen == 256/8 ) { - ROUNDS = 14; + rounds = 14; KC = 8; + + if (0) + { + ; + } +#ifdef USE_AESNI + else if (hwfeatures & HWF_INTEL_AESNI) + { + ctx->use_aesni = 1; + } +#endif } else return GPG_ERR_INV_KEYLEN; - ctx->ROUNDS = ROUNDS; + ctx->rounds = rounds; -#ifdef USE_PADLOCK - if (ctx->use_padlock) + /* NB: We don't yet support Padlock hardware key generation. */ + + if (0) { - /* Nothing to do as we support only hardware key generation for - now. 
*/ + ; } +#ifdef USE_AESNI + else if (ctx->use_aesni) + aesni_do_setkey(ctx, key); +#endif else -#endif /*USE_PADLOCK*/ { -#define W (ctx->keySched) - for (i = 0; i < keylen; i++) + union { - k[i >> 2][i & 3] = key[i]; + PROPERLY_ALIGNED_TYPE dummy; + byte data[MAXKC][4]; + } k, tk; +#define k k.data +#define tk tk.data +#define W (ctx->keyschenc) + for (i = 0; i < keylen; i++) + { + k[i >> 2][i & 3] = key[i]; } - - for (j = KC-1; j >= 0; j--) + + for (j = KC-1; j >= 0; j--) { - *((u32*)tk[j]) = *((u32*)k[j]); + *((u32_a_t*)tk[j]) = *((u32_a_t*)k[j]); } r = 0; t = 0; /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < ROUNDS + 1); ) + for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { - *((u32*)W[r][t]) = *((u32*)tk[j]); + *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]); } if (t == 4) { @@ -195,8 +602,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) t = 0; } } - - while (r < ROUNDS + 1) + + while (r < rounds + 1) { /* While not enough round key material calculated calculate new values. */ @@ -205,19 +612,19 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) tk[0][2] ^= S[tk[KC-1][3]]; tk[0][3] ^= S[tk[KC-1][0]]; tk[0][0] ^= rcon[rconpointer++]; - + if (KC != 8) { - for (j = 1; j < KC; j++) + for (j = 1; j < KC; j++) { - *((u32*)tk[j]) ^= *((u32*)tk[j-1]); + *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]); } - } - else + } + else { for (j = 1; j < KC/2; j++) { - *((u32*)tk[j]) ^= *((u32*)tk[j-1]); + *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]); } tk[KC/2][0] ^= S[tk[KC/2 - 1][0]]; tk[KC/2][1] ^= S[tk[KC/2 - 1][1]]; @@ -225,16 +632,16 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) tk[KC/2][3] ^= S[tk[KC/2 - 1][3]]; for (j = KC/2 + 1; j < KC; j++) { - *((u32*)tk[j]) ^= *((u32*)tk[j-1]); + *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]); } } - + /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < ROUNDS + 1); ) + for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { - *((u32*)W[r][t]) = *((u32*)tk[j]); + *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]); } if (t == 4) { @@ -242,13 +649,15 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) t = 0; } } - } -#undef W + } +#undef W +#undef tk +#undef k + wipememory(&tk, sizeof(tk)); + wipememory(&t, sizeof(t)); } return 0; -#undef tk -#undef k } @@ -256,10 +665,7 @@ static gcry_err_code_t rijndael_setkey (void *context, const byte *key, const unsigned keylen) { RIJNDAEL_context *ctx = context; - - int rc = do_setkey (ctx, key, keylen); - _gcry_burn_stack ( 100 + 16*sizeof(int)); - return rc; + return do_setkey (ctx, key, keylen); } @@ -268,53 +674,113 @@ static void prepare_decryption( RIJNDAEL_context *ctx ) { int r; - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte *w; - } w; -#define w w.w - for (r=0; r < MAXROUNDS+1; r++ ) +#ifdef USE_AESNI + if (ctx->use_aesni) { - *((u32*)ctx->keySched2[r][0]) = *((u32*)ctx->keySched[r][0]); - *((u32*)ctx->keySched2[r][1]) = *((u32*)ctx->keySched[r][1]); - *((u32*)ctx->keySched2[r][2]) = *((u32*)ctx->keySched[r][2]); - *((u32*)ctx->keySched2[r][3]) = *((u32*)ctx->keySched[r][3]); + /* The AES-NI decrypt instructions use the Equivalent Inverse + Cipher, thus we can't use the the standard decrypt key + preparation. 
*/ + u128_t *ekey = (u128_t *)ctx->keyschenc; + u128_t *dkey = (u128_t *)ctx->keyschdec; + int rr; + + aesni_prepare(); + +#define DO_AESNI_AESIMC() \ + asm volatile ("movdqa %[ekey], %%xmm1\n\t" \ + /*"aesimc %%xmm1, %%xmm1\n\t"*/ \ + ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \ + "movdqa %%xmm1, %[dkey]" \ + : [dkey] "=m" (dkey[r]) \ + : [ekey] "m" (ekey[rr]) \ + : "memory") + + dkey[0] = ekey[ctx->rounds]; + r=1; + rr=ctx->rounds-1; + DO_AESNI_AESIMC(); r++; rr--; /* round 1 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 2 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 3 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 4 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 5 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 6 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 7 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 8 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 9 */ + if (ctx->rounds > 10) + { + DO_AESNI_AESIMC(); r++; rr--; /* round 10 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 11 */ + if (ctx->rounds > 12) + { + DO_AESNI_AESIMC(); r++; rr--; /* round 12 */ + DO_AESNI_AESIMC(); r++; rr--; /* round 13 */ + } + } + + dkey[r] = ekey[0]; + +#undef DO_AESNI_AESIMC + + aesni_cleanup(); } -#define W (ctx->keySched2) - for (r = 1; r < ctx->ROUNDS; r++) + else +#endif /*USE_AESNI*/ { - w = W[r][0]; - *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]]) - ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]); - - w = W[r][1]; - *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]]) - ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]); - - w = W[r][2]; - *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]]) - ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]); - - w = W[r][3]; - *((u32*)w) = *((u32*)U1[w[0]]) ^ *((u32*)U2[w[1]]) - ^ *((u32*)U3[w[2]]) ^ *((u32*)U4[w[3]]); - } + union + { + PROPERLY_ALIGNED_TYPE dummy; + byte *w; + } w; +#define w w.w + + for (r=0; r < MAXROUNDS+1; r++ ) + { + *((u32_a_t*)ctx->keyschdec[r][0]) = *((u32_a_t*)ctx->keyschenc[r][0]); + *((u32_a_t*)ctx->keyschdec[r][1]) = *((u32_a_t*)ctx->keyschenc[r][1]); + *((u32_a_t*)ctx->keyschdec[r][2]) = *((u32_a_t*)ctx->keyschenc[r][2]); + *((u32_a_t*)ctx->keyschdec[r][3]) = *((u32_a_t*)ctx->keyschenc[r][3]); + } +#define W (ctx->keyschdec) + for (r = 1; r < ctx->rounds; r++) + { + w = W[r][0]; + *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]]) + ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]); + + w = W[r][1]; + *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]]) + ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]); + + w = W[r][2]; + *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]]) + ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]); + + w = W[r][3]; + *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]]) + ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]); + } #undef W #undef w -} - + wipememory(&w, sizeof(w)); + } +} /* Encrypt one block. A and B need to be aligned on a 4 byte boundary. A and B may be the same. 
*/ static void -do_encrypt_aligned (const RIJNDAEL_context *ctx, +do_encrypt_aligned (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { -#define rk (ctx->keySched) - int ROUNDS = ctx->ROUNDS; +#ifdef USE_AMD64_ASM + _gcry_aes_amd64_encrypt_block(ctx->keyschenc, b, a, ctx->rounds); +#elif defined(USE_ARM_ASM) + _gcry_aes_arm_encrypt_block(ctx->keyschenc, b, a, ctx->rounds); +#else +#define rk (ctx->keyschenc) + int rounds = ctx->rounds; int r; union { @@ -322,57 +788,57 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx, byte temp[4][4]; } u; - *((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]); - *((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]); - *((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]); - *((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]); - *((u32*)(b )) = (*((u32*)T1[u.temp[0][0]]) - ^ *((u32*)T2[u.temp[1][1]]) - ^ *((u32*)T3[u.temp[2][2]]) - ^ *((u32*)T4[u.temp[3][3]])); - *((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]]) - ^ *((u32*)T2[u.temp[2][1]]) - ^ *((u32*)T3[u.temp[3][2]]) - ^ *((u32*)T4[u.temp[0][3]])); - *((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]]) - ^ *((u32*)T2[u.temp[3][1]]) - ^ *((u32*)T3[u.temp[0][2]]) - ^ *((u32*)T4[u.temp[1][3]])); - *((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]]) - ^ *((u32*)T2[u.temp[0][1]]) - ^ *((u32*)T3[u.temp[1][2]]) - ^ *((u32*)T4[u.temp[2][3]])); - - for (r = 1; r < ROUNDS-1; r++) + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[0][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[0][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[0][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[0][3]); + *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]]) + ^ *((u32_a_t*)T2[u.temp[1][1]]) + ^ *((u32_a_t*)T3[u.temp[2][2]]) + ^ *((u32_a_t*)T4[u.temp[3][3]])); + *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]]) + ^ *((u32_a_t*)T2[u.temp[2][1]]) + ^ *((u32_a_t*)T3[u.temp[3][2]]) + ^ *((u32_a_t*)T4[u.temp[0][3]])); + *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]]) + ^ *((u32_a_t*)T2[u.temp[3][1]]) + ^ *((u32_a_t*)T3[u.temp[0][2]]) + ^ *((u32_a_t*)T4[u.temp[1][3]])); + *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]]) + ^ *((u32_a_t*)T2[u.temp[0][1]]) + ^ *((u32_a_t*)T3[u.temp[1][2]]) + ^ *((u32_a_t*)T4[u.temp[2][3]])); + + for (r = 1; r < rounds-1; r++) { - *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]); - *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]); - *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]); - *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]); - - *((u32*)(b )) = (*((u32*)T1[u.temp[0][0]]) - ^ *((u32*)T2[u.temp[1][1]]) - ^ *((u32*)T3[u.temp[2][2]]) - ^ *((u32*)T4[u.temp[3][3]])); - *((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]]) - ^ *((u32*)T2[u.temp[2][1]]) - ^ *((u32*)T3[u.temp[3][2]]) - ^ *((u32*)T4[u.temp[0][3]])); - *((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]]) - ^ *((u32*)T2[u.temp[3][1]]) - ^ *((u32*)T3[u.temp[0][2]]) - ^ *((u32*)T4[u.temp[1][3]])); - *((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]]) - ^ *((u32*)T2[u.temp[0][1]]) - ^ *((u32*)T3[u.temp[1][2]]) - ^ *((u32*)T4[u.temp[2][3]])); + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]); + + *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]]) + ^ *((u32_a_t*)T2[u.temp[1][1]]) + ^ 
*((u32_a_t*)T3[u.temp[2][2]]) + ^ *((u32_a_t*)T4[u.temp[3][3]])); + *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]]) + ^ *((u32_a_t*)T2[u.temp[2][1]]) + ^ *((u32_a_t*)T3[u.temp[3][2]]) + ^ *((u32_a_t*)T4[u.temp[0][3]])); + *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]]) + ^ *((u32_a_t*)T2[u.temp[3][1]]) + ^ *((u32_a_t*)T3[u.temp[0][2]]) + ^ *((u32_a_t*)T4[u.temp[1][3]])); + *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]]) + ^ *((u32_a_t*)T2[u.temp[0][1]]) + ^ *((u32_a_t*)T3[u.temp[1][2]]) + ^ *((u32_a_t*)T4[u.temp[2][3]])); } - /* Last round is special. */ - *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]); - *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]); - *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]); - *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]); + /* Last round is special. */ + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[rounds-1][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[rounds-1][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[rounds-1][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[rounds-1][3]); b[ 0] = T1[u.temp[0][0]][1]; b[ 1] = T1[u.temp[1][1]][1]; b[ 2] = T1[u.temp[2][2]][1]; @@ -389,11 +855,12 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx, b[13] = T1[u.temp[0][1]][1]; b[14] = T1[u.temp[1][2]][1]; b[15] = T1[u.temp[2][3]][1]; - *((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]); - *((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]); - *((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]); - *((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]); + *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[rounds][0]); + *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[rounds][1]); + *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]); + *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]); #undef rk +#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/ } @@ -401,22 +868,31 @@ static void do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { - /* BX and AX are not necessary correctly aligned. Thus we need to - copy them here. */ - union - { - u32 dummy[4]; - byte a[16]; - } a; - union - { - u32 dummy[4]; - byte b[16]; - } b; +#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM) + /* BX and AX are not necessary correctly aligned. Thus we might + need to copy them here. We try to align to a 16 bytes. */ + if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f)) + { + union + { + u32 dummy[4]; + byte a[16] ATTR_ALIGNED_16; + } a; + union + { + u32 dummy[4]; + byte b[16] ATTR_ALIGNED_16; + } b; - memcpy (a.a, ax, 16); - do_encrypt_aligned (ctx, b.b, a.a); - memcpy (bx, b.b, 16); + buf_cpy (a.a, ax, 16); + do_encrypt_aligned (ctx, b.b, a.a); + buf_cpy (bx, b.b, 16); + } + else +#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/ + { + do_encrypt_aligned (ctx, bx, ax); + } } @@ -432,11 +908,12 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag, unsigned char a[16] __attribute__ ((aligned (16))); unsigned char b[16] __attribute__ ((aligned (16))); unsigned int cword[4] __attribute__ ((aligned (16))); + int blocks; /* The control word fields are: 127:12 11:10 9 8 7 6 5 4 3:0 RESERVED KSIZE CRYPT INTER KEYGN CIPHR ALIGN DGEST ROUND */ - cword[0] = (ctx->ROUNDS & 15); /* (The mask is just a safeguard.) */ + cword[0] = (ctx->rounds & 15); /* (The mask is just a safeguard.) 
*/ cword[1] = 0; cword[2] = 0; cword[3] = 0; @@ -444,18 +921,29 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag, cword[0] |= 0x00000200; memcpy (a, ax, 16); - - asm volatile - ("pushfl\n\t" /* Force key reload. */ + + blocks = 1; /* Init counter for just one block. */ +#ifdef __x86_64__ + asm volatile + ("pushfq\n\t" /* Force key reload. */ + "popfq\n\t" + ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */ + : /* No output */ + : "S" (a), "D" (b), "d" (cword), "b" (ctx->padlockkey), "c" (blocks) + : "cc", "memory" + ); +#else + asm volatile + ("pushfl\n\t" /* Force key reload. */ "popfl\n\t" "xchg %3, %%ebx\n\t" /* Load key. */ - "movl $1, %%ecx\n\t" /* Init counter for just one block. */ - ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XSTORE ECB. */ + ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */ "xchg %3, %%ebx\n" /* Restore GOT register. */ : /* No output */ - : "S" (a), "D" (b), "d" (cword), "r" (ctx->padlock_key) - : "%ecx", "cc", "memory" + : "S" (a), "D" (b), "d" (cword), "r" (ctx->padlockkey), "c" (blocks) + : "cc", "memory" ); +#endif memcpy (bx, b, 16); @@ -463,23 +951,721 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag, #endif /*USE_PADLOCK*/ +#ifdef USE_AESNI +/* Encrypt one block using the Intel AES-NI instructions. A and B may + be the same. + + Our problem here is that gcc does not allow the "x" constraint for + SSE registers in asm unless you compile with -msse. The common + wisdom is to use a separate file for SSE instructions and build it + separately. This would require a lot of extra build system stuff, + similar to what we do in mpi/ for the asm stuff. What we do + instead is to use standard registers and a bit more of plain asm + which copies the data and key stuff to the SSE registers and later + back. If we decide to implement some block modes with parallelized + AES instructions, it might indeed be better to use plain asm ala + mpi/. */ +static inline void +do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b, + const unsigned char *a) +{ +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" + /* Note: For now we relax the alignment requirement for A and B: It + does not make much difference because in many case we would need + to memcpy them to an extra buffer; using the movdqu is much faster + that memcpy and movdqa. For CFB we know that the IV is properly + aligned but that is a special case. We should better implement + CFB direct in asm. 
*/ + asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */ + "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + aesenclast_xmm1_xmm0 + "movdqu %%xmm0, %[dst]\n" + : [dst] "=m" (*b) + : [src] "m" (*a), + [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm1_xmm0 +} + + +static inline void +do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b, + const unsigned char *a) +{ +#define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t" +#define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t" + asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */ + "movdqa (%[key]), %%xmm1\n\t" + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesdec_xmm1_xmm0 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Ldeclast%=:\n\t" + aesdeclast_xmm1_xmm0 + "movdqu %%xmm0, %[dst]\n" + : [dst] "=m" (*b) + : [src] "m" (*a), + [key] "r" (ctx->keyschdec), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesdec_xmm1_xmm0 +#undef aesdeclast_xmm1_xmm0 +} + + +/* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input + * and output through SSE registers xmm1 to xmm4. 
*/ +static void +do_aesni_enc_vec4 (const RIJNDAEL_context *ctx) +{ +#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t" +#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t" +#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t" +#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t" +#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t" +#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t" +#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t" +#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t" + asm volatile ("movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x20(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x30(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x40(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x50(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x60(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x70(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x80(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x90(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xb0(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xd0(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + aesenclast_xmm0_xmm1 + aesenclast_xmm0_xmm2 + aesenclast_xmm0_xmm3 + aesenclast_xmm0_xmm4 + : /* no output */ + : [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesenc_xmm0_xmm1 +#undef aesenc_xmm0_xmm2 +#undef aesenc_xmm0_xmm3 +#undef aesenc_xmm0_xmm4 +#undef aesenclast_xmm0_xmm1 +#undef aesenclast_xmm0_xmm2 +#undef aesenclast_xmm0_xmm3 +#undef aesenclast_xmm0_xmm4 +} + + +/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input + * and output through SSE registers xmm1 to xmm4. 
*/ +static void +do_aesni_dec_vec4 (const RIJNDAEL_context *ctx) +{ +#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t" +#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t" +#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t" +#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t" +#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t" +#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t" +#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t" +#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t" + asm volatile ("movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x20(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x30(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x40(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x50(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x60(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x70(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x80(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0x90(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xb0(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xd0(%[key]), %%xmm0\n\t" + aesdec_xmm0_xmm1 + aesdec_xmm0_xmm2 + aesdec_xmm0_xmm3 + aesdec_xmm0_xmm4 + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + aesdeclast_xmm0_xmm1 + aesdeclast_xmm0_xmm2 + aesdeclast_xmm0_xmm3 + aesdeclast_xmm0_xmm4 + : /* no output */ + : [key] "r" (ctx->keyschdec), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesdec_xmm0_xmm1 +#undef aesdec_xmm0_xmm2 +#undef aesdec_xmm0_xmm3 +#undef aesdec_xmm0_xmm4 +#undef aesdeclast_xmm0_xmm1 +#undef aesdeclast_xmm0_xmm2 +#undef aesdeclast_xmm0_xmm3 +#undef aesdeclast_xmm0_xmm4 +} + + +/* Perform a CFB encryption or decryption round using the + initialization vector IV and the input block A. Write the result + to the output block B and update IV. IV needs to be 16 byte + aligned. 
*/ +static void +do_aesni_cfb (const RIJNDAEL_context *ctx, int decrypt_flag, + unsigned char *iv, unsigned char *b, const unsigned char *a) +{ +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" + asm volatile ("movdqa %[iv], %%xmm0\n\t" /* xmm0 := IV */ + "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + aesenclast_xmm1_xmm0 + "movdqu %[src], %%xmm1\n\t" /* Save input. */ + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 = input ^ IV */ + + "cmpl $1, %[decrypt]\n\t" + "jz .Ldecrypt_%=\n\t" + "movdqa %%xmm0, %[iv]\n\t" /* [encrypt] Store IV. */ + "jmp .Lleave_%=\n" + ".Ldecrypt_%=:\n\t" + "movdqa %%xmm1, %[iv]\n" /* [decrypt] Store IV. */ + ".Lleave_%=:\n\t" + "movdqu %%xmm0, %[dst]\n" /* Store output. */ + : [iv] "+m" (*iv), [dst] "=m" (*b) + : [src] "m" (*a), + [key] "r" (ctx->keyschenc), + [rounds] "g" (ctx->rounds), + [decrypt] "m" (decrypt_flag) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm1_xmm0 +} + +/* Perform a CTR encryption round using the counter CTR and the input + block A. Write the result to the output block B and update CTR. + CTR needs to be a 16 byte aligned little-endian value. */ +static void +do_aesni_ctr (const RIJNDAEL_context *ctx, + unsigned char *ctr, unsigned char *b, const unsigned char *a) +{ +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" + + asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ + "pcmpeqd %%xmm1, %%xmm1\n\t" + "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ + + "pshufb %%xmm6, %%xmm5\n\t" + "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ (big endian) */ + + /* detect if 64-bit carry handling is needed */ + "cmpl $0xffffffff, 8(%[ctr])\n\t" + "jne .Lno_carry%=\n\t" + "cmpl $0xffffffff, 12(%[ctr])\n\t" + "jne .Lno_carry%=\n\t" + + "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ + "psubq %%xmm1, %%xmm5\n\t" /* add carry to upper 64bits */ + + ".Lno_carry%=:\n\t" + + "pshufb %%xmm6, %%xmm5\n\t" + "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). 
*/ + + "pxor (%[key]), %%xmm0\n\t" /* xmm1 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + aesenclast_xmm1_xmm0 + "movdqu %[src], %%xmm1\n\t" /* xmm1 := input */ + "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */ + "movdqu %%xmm0, %[dst]" /* Store EncCTR. */ + + : [dst] "=m" (*b) + : [src] "m" (*a), + [ctr] "r" (ctr), + [key] "r" (ctx->keyschenc), + [rounds] "g" (ctx->rounds) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm1_xmm0 +} + + +/* Four blocks at a time variant of do_aesni_ctr. */ static void +do_aesni_ctr_4 (const RIJNDAEL_context *ctx, + unsigned char *ctr, unsigned char *b, const unsigned char *a) +{ +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenc_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t" +#define aesenc_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t" +#define aesenc_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" +#define aesenclast_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t" +#define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t" +#define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t" + + /* Register usage: + esi keyschedule + xmm0 CTR-0 + xmm1 temp / round key + xmm2 CTR-1 + xmm3 CTR-2 + xmm4 CTR-3 + xmm5 copy of *ctr + xmm6 endian swapping mask + */ + + asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ + "movdqa %%xmm0, %%xmm2\n\t" + "pcmpeqd %%xmm1, %%xmm1\n\t" + "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ + + "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ + "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ + "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ + "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ + "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ + "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ + "movdqa %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */ + "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ + + /* detect if 64-bit carry handling is needed */ + "cmpl $0xffffffff, 8(%[ctr])\n\t" + "jne .Lno_carry%=\n\t" + "movl 12(%[ctr]), %%esi\n\t" + "bswapl %%esi\n\t" + "cmpl $0xfffffffc, %%esi\n\t" + "jb .Lno_carry%=\n\t" /* no carry */ + + "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ + "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */ + "cmpl $0xfffffffe, %%esi\n\t" + "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ + "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ + /* esi == 0xffffffff */ + + "psubq %%xmm1, %%xmm2\n\t" + ".Lcarry_xmm3%=:\n\t" + "psubq %%xmm1, %%xmm3\n\t" + ".Lcarry_xmm4%=:\n\t" + "psubq %%xmm1, %%xmm4\n\t" + ".Lcarry_xmm5%=:\n\t" + "psubq %%xmm1, %%xmm5\n\t" + + ".Lno_carry%=:\n\t" + "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ + "movl %[rounds], %%esi\n\t" + 
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ + "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ + "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ + "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ + "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ + + "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "cmpl $10, %%esi\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "cmpl $12, %%esi\n\t" + "jz .Lenclast%=\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + aesenc_xmm1_xmm2 + aesenc_xmm1_xmm3 + aesenc_xmm1_xmm4 + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + aesenclast_xmm1_xmm0 + aesenclast_xmm1_xmm2 + aesenclast_xmm1_xmm3 + aesenclast_xmm1_xmm4 + + "movdqu (%[src]), %%xmm1\n\t" /* Get block 1. */ + "pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */ + "movdqu %%xmm0, (%[dst])\n\t" /* Store block 1 */ + + "movdqu 16(%[src]), %%xmm1\n\t" /* Get block 2. */ + "pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */ + "movdqu %%xmm2, 16(%[dst])\n\t" /* Store block 2. */ + + "movdqu 32(%[src]), %%xmm1\n\t" /* Get block 3. */ + "pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */ + "movdqu %%xmm3, 32(%[dst])\n\t" /* Store block 3. */ + + "movdqu 48(%[src]), %%xmm1\n\t" /* Get block 4. */ + "pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */ + "movdqu %%xmm4, 48(%[dst])" /* Store block 4. 
*/ + + : + : [ctr] "r" (ctr), + [src] "r" (a), + [dst] "r" (b), + [key] "r" (ctx->keyschenc), + [rounds] "g" (ctx->rounds) + : "%esi", "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenc_xmm1_xmm2 +#undef aesenc_xmm1_xmm3 +#undef aesenc_xmm1_xmm4 +#undef aesenclast_xmm1_xmm0 +#undef aesenclast_xmm1_xmm2 +#undef aesenclast_xmm1_xmm3 +#undef aesenclast_xmm1_xmm4 +} + +#endif /*USE_AESNI*/ + + +static unsigned int rijndael_encrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; + unsigned int burn_stack; + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) + else if (ctx->use_padlock) { do_padlock (ctx, 0, b, a); - _gcry_burn_stack (48 + 15 /* possible padding for alignment */); + burn_stack = (48 + 15 /* possible padding for alignment */); } - else #endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + aesni_prepare (); + do_aesni_enc (ctx, b, a); + aesni_cleanup (); + burn_stack = 0; + } +#endif /*USE_AESNI*/ + else { do_encrypt (ctx, b, a); - _gcry_burn_stack (48 + 2*sizeof(int)); + burn_stack = (56 + 2*sizeof(int)); } + + return burn_stack; } @@ -488,18 +1674,19 @@ rijndael_encrypt (void *context, byte *b, const byte *a) function is only intended for the bulk encryption feature of cipher.c. */ void -_gcry_aes_cfb_enc (void *context, unsigned char *iv, +_gcry_aes_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, - unsigned int nblocks) + size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; - unsigned char *ivp; - int i; + unsigned int burn_depth = 48 + 2*sizeof(int); + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) + else if (ctx->use_padlock) { /* Fixme: Let Padlock do the CFBing. */ for ( ;nblocks; nblocks-- ) @@ -507,24 +1694,42 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv, /* Encrypt the IV. */ do_padlock (ctx, 0, iv, iv); /* XOR the input with the IV and store input into IV. */ - for (ivp=iv,i=0; i < BLOCKSIZE; i++ ) - *outbuf++ = (*ivp++ ^= *inbuf++); + buf_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; } } +#endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + aesni_prepare (); + for ( ;nblocks; nblocks-- ) + { + do_aesni_cfb (ctx, 0, iv, outbuf, inbuf); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + aesni_cleanup (); + + burn_depth = 0; /* No stack usage. */ + } +#endif /*USE_AESNI*/ else -#endif /* USE_PADLOCK*/ { for ( ;nblocks; nblocks-- ) { /* Encrypt the IV. */ do_encrypt_aligned (ctx, iv, iv); /* XOR the input with the IV and store input into IV. */ - for (ivp=iv,i=0; i < BLOCKSIZE; i++ ) - *outbuf++ = (*ivp++ ^= *inbuf++); + buf_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; } } - _gcry_burn_stack (48 + 2*sizeof(int)); + if (burn_depth) + _gcry_burn_stack (burn_depth); } @@ -533,35 +1738,173 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv, function is only intended for the bulk encryption feature of cipher.c. 
*/ void -_gcry_aes_cbc_enc (void *context, unsigned char *iv, +_gcry_aes_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, - unsigned int nblocks, int cbc_mac) + size_t nblocks, int cbc_mac) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; - unsigned char *ivp; - int i; + unsigned char *last_iv; + unsigned int burn_depth = 48 + 2*sizeof(int); +#ifdef USE_AESNI + int use_aesni = ctx->use_aesni; +#endif + +#ifdef USE_AESNI + if (use_aesni) + aesni_prepare (); +#endif /*USE_AESNI*/ + + last_iv = iv; for ( ;nblocks; nblocks-- ) { - for (ivp=iv, i=0; i < BLOCKSIZE; i++ ) - outbuf[i] = inbuf[i] ^ *ivp++; + if (0) + ; +#ifdef USE_AESNI + else if (use_aesni) + { + /* ~35% speed up on Sandy-Bridge when doing xoring and copying with + SSE registers. */ + asm volatile ("movdqu %[iv], %%xmm0\n\t" + "movdqu %[inbuf], %%xmm1\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "movdqu %%xmm1, %[outbuf]\n\t" + : /* No output */ + : [iv] "m" (*last_iv), + [inbuf] "m" (*inbuf), + [outbuf] "m" (*outbuf) + : "memory" ); + + do_aesni_enc (ctx, outbuf, outbuf); + } +#endif /*USE_AESNI*/ + else + { + buf_xor(outbuf, inbuf, last_iv, BLOCKSIZE); + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) - do_padlock (ctx, 0, outbuf, outbuf); - else + else if (ctx->use_padlock) + do_padlock (ctx, 0, outbuf, outbuf); #endif /*USE_PADLOCK*/ - do_encrypt (ctx, outbuf, outbuf ); + else + do_encrypt (ctx, outbuf, outbuf ); + } - memcpy (iv, outbuf, BLOCKSIZE); + last_iv = outbuf; inbuf += BLOCKSIZE; if (!cbc_mac) outbuf += BLOCKSIZE; } - _gcry_burn_stack (48 + 2*sizeof(int)); + if (last_iv != iv) + { + if (0) + ; +#ifdef USE_AESNI + else if (use_aesni) + asm volatile ("movdqu %[last], %%xmm0\n\t" + "movdqu %%xmm0, %[iv]\n\t" + : /* No output */ + : [last] "m" (*last_iv), + [iv] "m" (*iv) + : "memory" ); +#endif /*USE_AESNI*/ + else + buf_cpy (iv, last_iv, BLOCKSIZE); + } + +#ifdef USE_AESNI + if (use_aesni) + { + aesni_cleanup (); + burn_depth = 0; /* No stack usage. */ + } +#endif /*USE_AESNI*/ + + if (burn_depth) + _gcry_burn_stack (burn_depth); +} + + +/* Bulk encryption of complete blocks in CTR mode. Caller needs to + make sure that CTR is aligned on a 16 byte boundary if AESNI; the + minimum alignment is for an u32. This function is only intended + for the bulk encryption feature of cipher.c. CTR is expected to be + of size BLOCKSIZE. */ +void +_gcry_aes_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned int burn_depth = 48 + 2*sizeof(int); + int i; + + if (0) + ; +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + + aesni_prepare (); + + asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */ + "movdqa %[ctr], %%xmm5\n\t" /* Preload CTR */ + : /* No output */ + : [mask] "m" (*be_mask), + [ctr] "m" (*ctr) + : "memory"); + + for ( ;nblocks > 3 ; nblocks -= 4 ) + { + do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf); + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } + for ( ;nblocks; nblocks-- ) + { + do_aesni_ctr (ctx, ctr, outbuf, inbuf); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + aesni_cleanup (); + aesni_cleanup_2_6 (); + + burn_depth = 0; /* No stack usage. 
*/ + } +#endif /*USE_AESNI*/ + else + { + union { unsigned char x1[16]; u32 x32[4]; } tmp; + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + do_encrypt_aligned (ctx, tmp.x1, ctr); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmp.x1, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + /* Increment the counter. */ + for (i = BLOCKSIZE; i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + } + + if (burn_depth) + _gcry_burn_stack (burn_depth); } @@ -570,70 +1913,75 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv, and the decryption must have been prepared. A and B may be the same. */ static void -do_decrypt_aligned (RIJNDAEL_context *ctx, +do_decrypt_aligned (RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { -#define rk (ctx->keySched2) - int ROUNDS = ctx->ROUNDS; +#ifdef USE_AMD64_ASM + _gcry_aes_amd64_decrypt_block(ctx->keyschdec, b, a, ctx->rounds); +#elif defined(USE_ARM_ASM) + _gcry_aes_arm_decrypt_block(ctx->keyschdec, b, a, ctx->rounds); +#else +#define rk (ctx->keyschdec) + int rounds = ctx->rounds; int r; - union + union { u32 tempu32[4]; /* Force correct alignment. */ byte temp[4][4]; } u; - *((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[ROUNDS][0]); - *((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[ROUNDS][1]); - *((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[ROUNDS][2]); - *((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[ROUNDS][3]); - - *((u32*)(b )) = (*((u32*)T5[u.temp[0][0]]) - ^ *((u32*)T6[u.temp[3][1]]) - ^ *((u32*)T7[u.temp[2][2]]) - ^ *((u32*)T8[u.temp[1][3]])); - *((u32*)(b+ 4)) = (*((u32*)T5[u.temp[1][0]]) - ^ *((u32*)T6[u.temp[0][1]]) - ^ *((u32*)T7[u.temp[3][2]]) - ^ *((u32*)T8[u.temp[2][3]])); - *((u32*)(b+ 8)) = (*((u32*)T5[u.temp[2][0]]) - ^ *((u32*)T6[u.temp[1][1]]) - ^ *((u32*)T7[u.temp[0][2]]) - ^ *((u32*)T8[u.temp[3][3]])); - *((u32*)(b+12)) = (*((u32*)T5[u.temp[3][0]]) - ^ *((u32*)T6[u.temp[2][1]]) - ^ *((u32*)T7[u.temp[1][2]]) - ^ *((u32*)T8[u.temp[0][3]])); - - for (r = ROUNDS-1; r > 1; r--) + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[rounds][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[rounds][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[rounds][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[rounds][3]); + + *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]]) + ^ *((u32_a_t*)T6[u.temp[3][1]]) + ^ *((u32_a_t*)T7[u.temp[2][2]]) + ^ *((u32_a_t*)T8[u.temp[1][3]])); + *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]]) + ^ *((u32_a_t*)T6[u.temp[0][1]]) + ^ *((u32_a_t*)T7[u.temp[3][2]]) + ^ *((u32_a_t*)T8[u.temp[2][3]])); + *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]]) + ^ *((u32_a_t*)T6[u.temp[1][1]]) + ^ *((u32_a_t*)T7[u.temp[0][2]]) + ^ *((u32_a_t*)T8[u.temp[3][3]])); + *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]]) + ^ *((u32_a_t*)T6[u.temp[2][1]]) + ^ *((u32_a_t*)T7[u.temp[1][2]]) + ^ *((u32_a_t*)T8[u.temp[0][3]])); + + for (r = rounds-1; r > 1; r--) { - *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]); - *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]); - *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]); - *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]); - *((u32*)(b )) = (*((u32*)T5[u.temp[0][0]]) - ^ *((u32*)T6[u.temp[3][1]]) - ^ *((u32*)T7[u.temp[2][2]]) - ^ *((u32*)T8[u.temp[1][3]])); - *((u32*)(b+ 4)) = (*((u32*)T5[u.temp[1][0]]) - ^ *((u32*)T6[u.temp[0][1]]) - ^ *((u32*)T7[u.temp[3][2]]) - ^ 
*((u32*)T8[u.temp[2][3]])); - *((u32*)(b+ 8)) = (*((u32*)T5[u.temp[2][0]]) - ^ *((u32*)T6[u.temp[1][1]]) - ^ *((u32*)T7[u.temp[0][2]]) - ^ *((u32*)T8[u.temp[3][3]])); - *((u32*)(b+12)) = (*((u32*)T5[u.temp[3][0]]) - ^ *((u32*)T6[u.temp[2][1]]) - ^ *((u32*)T7[u.temp[1][2]]) - ^ *((u32*)T8[u.temp[0][3]])); + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]); + *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]]) + ^ *((u32_a_t*)T6[u.temp[3][1]]) + ^ *((u32_a_t*)T7[u.temp[2][2]]) + ^ *((u32_a_t*)T8[u.temp[1][3]])); + *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]]) + ^ *((u32_a_t*)T6[u.temp[0][1]]) + ^ *((u32_a_t*)T7[u.temp[3][2]]) + ^ *((u32_a_t*)T8[u.temp[2][3]])); + *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]]) + ^ *((u32_a_t*)T6[u.temp[1][1]]) + ^ *((u32_a_t*)T7[u.temp[0][2]]) + ^ *((u32_a_t*)T8[u.temp[3][3]])); + *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]]) + ^ *((u32_a_t*)T6[u.temp[2][1]]) + ^ *((u32_a_t*)T7[u.temp[1][2]]) + ^ *((u32_a_t*)T8[u.temp[0][3]])); } - /* Last round is special. */ - *((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[1][0]); - *((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[1][1]); - *((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[1][2]); - *((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[1][3]); + /* Last round is special. */ + *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[1][0]); + *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[1][1]); + *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[1][2]); + *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[1][3]); b[ 0] = S5[u.temp[0][0]]; b[ 1] = S5[u.temp[3][1]]; b[ 2] = S5[u.temp[2][2]]; @@ -650,11 +1998,12 @@ do_decrypt_aligned (RIJNDAEL_context *ctx, b[13] = S5[u.temp[2][1]]; b[14] = S5[u.temp[1][2]]; b[15] = S5[u.temp[0][3]]; - *((u32*)(b )) ^= *((u32*)rk[0][0]); - *((u32*)(b+ 4)) ^= *((u32*)rk[0][1]); - *((u32*)(b+ 8)) ^= *((u32*)rk[0][2]); - *((u32*)(b+12)) ^= *((u32*)rk[0][3]); + *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[0][0]); + *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[0][1]); + *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]); + *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]); #undef rk +#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/ } @@ -662,102 +2011,189 @@ do_decrypt_aligned (RIJNDAEL_context *ctx, static void do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax) { - /* BX and AX are not necessary correctly aligned. Thus we need to - copy them here. */ - union - { - u32 dummy[4]; - byte a[16]; - } a; - union - { - u32 dummy[4]; - byte b[16]; - } b; +#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM) + /* BX and AX are not necessary correctly aligned. Thus we might + need to copy them here. We try to align to a 16 bytes. */ + if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f)) + { + union + { + u32 dummy[4]; + byte a[16] ATTR_ALIGNED_16; + } a; + union + { + u32 dummy[4]; + byte b[16] ATTR_ALIGNED_16; + } b; - if ( !ctx->decryption_prepared ) + buf_cpy (a.a, ax, 16); + do_decrypt_aligned (ctx, b.b, a.a); + buf_cpy (bx, b.b, 16); + } + else +#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/ + { + do_decrypt_aligned (ctx, bx, ax); + } +} + + +static inline void +check_decryption_preparation (RIJNDAEL_context *ctx) +{ + if (0) + ; +#ifdef USE_PADLOCK + else if (ctx->use_padlock) + { /* Padlock does not need decryption subkeys. 
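[Editor's sketch] do_decrypt() above only bounces through local scratch buffers when one of the caller's buffers is not 16-byte aligned; otherwise it hands the caller's memory straight to the aligned code path. A minimal sketch of that fallback, assuming a GCC-style aligned attribute just as the file itself does (decrypt_aligned and the other names are placeholders; the real code uses unions with a u32 member and buf_cpy rather than plain arrays and memcpy):

#include <stdint.h>
#include <string.h>

static void
decrypt_maybe_unaligned_sketch (void (*decrypt_aligned) (unsigned char *dst,
                                                         const unsigned char *src),
                                unsigned char *dst, const unsigned char *src)
{
  if (((uintptr_t)src & 0x0f) || ((uintptr_t)dst & 0x0f))
    {
      /* At least one buffer is misaligned: copy through 16-byte
         aligned scratch buffers so the aligned path can keep using
         wide 32-bit loads and stores.  */
      unsigned char a[16] __attribute__ ((aligned (16)));
      unsigned char b[16] __attribute__ ((aligned (16)));

      memcpy (a, src, 16);
      decrypt_aligned (b, a);
      memcpy (dst, b, 16);
    }
  else
    decrypt_aligned (dst, src);   /* both buffers already aligned */
}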
*/ } +#endif /*USE_PADLOCK*/ + else if ( !ctx->decryption_prepared ) { prepare_decryption ( ctx ); - _gcry_burn_stack (64); ctx->decryption_prepared = 1; } - - memcpy (a.a, ax, 16); - do_decrypt_aligned (ctx, b.b, a.a); - memcpy (bx, b.b, 16); -#undef rk } - - -static void +static unsigned int rijndael_decrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; + unsigned int burn_stack; + check_decryption_preparation (ctx); + + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) + else if (ctx->use_padlock) { do_padlock (ctx, 1, b, a); - _gcry_burn_stack (48 + 2*sizeof(int) /* FIXME */); + burn_stack = (48 + 2*sizeof(int) /* FIXME */); } - else #endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + aesni_prepare (); + do_aesni_dec (ctx, b, a); + aesni_cleanup (); + burn_stack = 0; + } +#endif /*USE_AESNI*/ + else { do_decrypt (ctx, b, a); - _gcry_burn_stack (48+2*sizeof(int)); + burn_stack = (56+2*sizeof(int)); } + + return burn_stack; } /* Bulk decryption of complete blocks in CFB mode. Caller needs to - make sure that IV is aligned on an unisgned lonhg boundary. This + make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. */ void -_gcry_aes_cfb_dec (void *context, unsigned char *iv, +_gcry_aes_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, - unsigned int nblocks) + size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; - unsigned char *ivp; - unsigned char temp; - int i; + unsigned int burn_depth = 48 + 2*sizeof(int); + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) + else if (ctx->use_padlock) { /* Fixme: Let Padlock do the CFBing. */ for ( ;nblocks; nblocks-- ) { do_padlock (ctx, 0, iv, iv); - for (ivp=iv,i=0; i < BLOCKSIZE; i++ ) - { - temp = *inbuf++; - *outbuf++ = *ivp ^ temp; - *ivp++ = temp; - } + buf_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; } } - else #endif /*USE_PADLOCK*/ +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + aesni_prepare (); + + /* CFB decryption can be parallelized */ + for ( ;nblocks >= 4; nblocks -= 4) + { + asm volatile + ("movdqu (%[iv]), %%xmm1\n\t" /* load input blocks */ + "movdqu 0*16(%[inbuf]), %%xmm2\n\t" + "movdqu 1*16(%[inbuf]), %%xmm3\n\t" + "movdqu 2*16(%[inbuf]), %%xmm4\n\t" + + "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */ + "movdqu %%xmm0, (%[iv])\n\t" + : /* No output */ + : [inbuf] "r" (inbuf), [iv] "r" (iv) + : "memory"); + + do_aesni_enc_vec4 (ctx); + + asm volatile + ("movdqu 0*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + + "movdqu 1*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + + "movdqu 2*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + + "movdqu 3*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf), + [outbuf] "r" (outbuf) + : "memory"); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } + + for ( ;nblocks; nblocks-- ) + { + do_aesni_cfb (ctx, 1, iv, outbuf, inbuf); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + aesni_cleanup (); + aesni_cleanup_2_6 (); + + burn_depth = 0; /* No stack usage. 
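[Editor's sketch] In the CFB-decryption loops of this function (the Padlock path above and the portable path that follows), the ciphertext block must be preserved as the next IV before the XOR consumes it; that is the single pass buf_xor_n_copy() performs. A standalone sketch of that step, assuming IV already holds the encrypted previous IV; the names are illustrative only:

#include <stddef.h>

#define SKETCH_BLOCKSIZE 16

static void
cfb_dec_xor_n_copy_sketch (unsigned char *out, unsigned char *iv,
                           const unsigned char *in)
{
  size_t i;

  for (i = 0; i < SKETCH_BLOCKSIZE; i++)
    {
      unsigned char c = in[i];   /* ciphertext byte, needed twice       */
      out[i] = iv[i] ^ c;        /* P = E_k (IV_prev) XOR C             */
      iv[i] = c;                 /* C feeds the cipher for next block   */
    }
}

Because the cipher inputs in CFB decryption are the ciphertext blocks themselves, several blocks can be processed in parallel, which is what the 4-way AES-NI loop above exploits.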
*/ + } +#endif /*USE_AESNI*/ + else { for ( ;nblocks; nblocks-- ) { do_encrypt_aligned (ctx, iv, iv); - for (ivp=iv,i=0; i < BLOCKSIZE; i++ ) - { - temp = *inbuf++; - *outbuf++ = *ivp ^ temp; - *ivp++ = temp; - } + buf_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE); + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; } } - _gcry_burn_stack (48 + 2*sizeof(int)); + if (burn_depth) + _gcry_burn_stack (burn_depth); } @@ -766,38 +2202,133 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv, function is only intended for the bulk encryption feature of cipher.c. */ void -_gcry_aes_cbc_dec (void *context, unsigned char *iv, +_gcry_aes_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, - unsigned int nblocks) + size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; - unsigned char *ivp; - int i; - unsigned char savebuf[BLOCKSIZE]; + unsigned int burn_depth = 48 + 2*sizeof(int) + 4*sizeof (char*); - for ( ;nblocks; nblocks-- ) + check_decryption_preparation (ctx); + + if (0) + ; +#ifdef USE_AESNI + else if (ctx->use_aesni) { - /* We need to save INBUF away because it may be identical to - OUTBUF. */ - memcpy (savebuf, inbuf, BLOCKSIZE); + aesni_prepare (); + + asm volatile + ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */ + : /* No output */ + : [iv] "m" (*iv) + : "memory"); + for ( ;nblocks > 3 ; nblocks -= 4 ) + { + asm volatile + ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ + "movdqu 1*16(%[inbuf]), %%xmm2\n\t" + "movdqu 2*16(%[inbuf]), %%xmm3\n\t" + "movdqu 3*16(%[inbuf]), %%xmm4\n\t" + : /* No output */ + : [inbuf] "r" (inbuf) + : "memory"); + + do_aesni_dec_vec4 (ctx); + + asm volatile + ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */ + "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + + "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */ + "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + + "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */ + "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + + "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */ + "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf), + [outbuf] "r" (outbuf) + : "memory"); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } + + for ( ;nblocks; nblocks-- ) + { + asm volatile + ("movdqu %[inbuf], %%xmm2\n\t" /* use xmm2 as savebuf */ + : /* No output */ + : [inbuf] "m" (*inbuf) + : "memory"); + + /* uses only xmm0 and xmm1 */ + do_aesni_dec (ctx, outbuf, inbuf); + + asm volatile + ("movdqu %[outbuf], %%xmm0\n\t" + "pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */ + "movdqu %%xmm0, %[outbuf]\n\t" + "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */ + : /* No output */ + : [outbuf] "m" (*outbuf) + : "memory"); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + asm volatile + ("movdqu %%xmm5, %[iv]\n\t" /* store IV */ + : /* No output */ + : [iv] "m" (*iv) + : "memory"); + + aesni_cleanup (); + aesni_cleanup_2_6 (); + + burn_depth = 0; /* No stack usage. */ + } +#endif /*USE_AESNI*/ + else + { + unsigned char savebuf[BLOCKSIZE]; + + for ( ;nblocks; nblocks-- ) + { + /* INBUF is needed later and it may be identical to OUTBUF, so store + the intermediate result to SAVEBUF. 
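[Editor's sketch] The non-AES-NI CBC-decryption loop that follows decrypts into a scratch buffer first, because OUTBUF may alias INBUF, and then XORs with the IV and refreshes the IV from the original ciphertext in one pass (bundled into buf_xor_n_copy_2()). A portable sketch of a single iteration, with placeholder names:

#include <stddef.h>

#define SKETCH_BLOCKSIZE 16

static void
cbc_dec_block_sketch (void (*block_decrypt) (unsigned char *dst,
                                             const unsigned char *src),
                      unsigned char *iv, unsigned char *out,
                      const unsigned char *in)
{
  unsigned char savebuf[SKETCH_BLOCKSIZE];
  size_t i;

  block_decrypt (savebuf, in);       /* D_k (C_n); IN may equal OUT       */
  for (i = 0; i < SKETCH_BLOCKSIZE; i++)
    {
      unsigned char c = in[i];       /* keep C_n before OUT overwrites it */
      out[i] = savebuf[i] ^ iv[i];   /* P_n = D_k (C_n) XOR C_{n-1}       */
      iv[i] = c;                     /* C_n becomes the next IV           */
    }
}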
*/ + + if (0) + ; #ifdef USE_PADLOCK - if (ctx->use_padlock) - do_padlock (ctx, 1, outbuf, inbuf); - else + else if (ctx->use_padlock) + do_padlock (ctx, 1, savebuf, inbuf); #endif /*USE_PADLOCK*/ - do_decrypt (ctx, outbuf, inbuf); + else + do_decrypt (ctx, savebuf, inbuf); - for (ivp=iv, i=0; i < BLOCKSIZE; i++ ) - outbuf[i] ^= *ivp++; - memcpy (iv, savebuf, BLOCKSIZE); - inbuf += BLOCKSIZE; - outbuf += BLOCKSIZE; + buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE); + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + wipememory(savebuf, sizeof(savebuf)); } - _gcry_burn_stack (48 + 2*sizeof(int) + BLOCKSIZE + 4*sizeof (char*)); + if (burn_depth) + _gcry_burn_stack (burn_depth); } @@ -808,11 +2339,12 @@ static const char* selftest_basic_128 (void) { RIJNDAEL_context ctx; - unsigned char scratch[16]; + unsigned char scratch[16]; /* The test vectors are from the AES supplied ones; more or less randomly taken from ecb_tbl.txt (I=42,81,14) */ - static const unsigned char plaintext_128[16] = +#if 1 + static const unsigned char plaintext_128[16] = { 0x01,0x4B,0xAF,0x22,0x78,0xA6,0x9D,0x33, 0x1D,0x51,0x80,0x10,0x36,0x43,0xE9,0x9A @@ -827,7 +2359,28 @@ selftest_basic_128 (void) 0x67,0x43,0xC3,0xD1,0x51,0x9A,0xB4,0xF2, 0xCD,0x9A,0x78,0xAB,0x09,0xA5,0x11,0xBD }; - +#else + /* Test vectors from fips-197, appendix C. */ +# warning debug test vectors in use + static const unsigned char plaintext_128[16] = + { + 0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77, + 0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff + }; + static const unsigned char key_128[16] = + { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, + 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + /* 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, */ + /* 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c */ + }; + static const unsigned char ciphertext_128[16] = + { + 0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30, + 0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a + }; +#endif + rijndael_setkey (&ctx, key_128, sizeof (key_128)); rijndael_encrypt (&ctx, scratch, plaintext_128); if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128))) @@ -835,7 +2388,7 @@ selftest_basic_128 (void) rijndael_decrypt (&ctx, scratch, scratch); if (memcmp (scratch, plaintext_128, sizeof (plaintext_128))) return "AES-128 test decryption failed."; - + return NULL; } @@ -844,14 +2397,14 @@ static const char* selftest_basic_192 (void) { RIJNDAEL_context ctx; - unsigned char scratch[16]; - - static unsigned char plaintext_192[16] = + unsigned char scratch[16]; + + static unsigned char plaintext_192[16] = { 0x76,0x77,0x74,0x75,0xF1,0xF2,0xF3,0xF4, 0xF8,0xF9,0xE6,0xE7,0x77,0x70,0x71,0x72 }; - static unsigned char key_192[24] = + static unsigned char key_192[24] = { 0x04,0x05,0x06,0x07,0x09,0x0A,0x0B,0x0C, 0x0E,0x0F,0x10,0x11,0x13,0x14,0x15,0x16, @@ -862,7 +2415,7 @@ selftest_basic_192 (void) 0x5D,0x1E,0xF2,0x0D,0xCE,0xD6,0xBC,0xBC, 0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA }; - + rijndael_setkey (&ctx, key_192, sizeof(key_192)); rijndael_encrypt (&ctx, scratch, plaintext_192); if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192))) @@ -870,7 +2423,7 @@ selftest_basic_192 (void) rijndael_decrypt (&ctx, scratch, scratch); if (memcmp (scratch, plaintext_192, sizeof (plaintext_192))) return "AES-192 test decryption failed."; - + return NULL; } @@ -880,21 +2433,21 @@ static const char* selftest_basic_256 (void) { RIJNDAEL_context ctx; - unsigned char scratch[16]; + unsigned char scratch[16]; - static unsigned char plaintext_256[16] = + static unsigned char plaintext_256[16] = { 
0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 }; - static unsigned char key_256[32] = + static unsigned char key_256[32] = { 0x08,0x09,0x0A,0x0B,0x0D,0x0E,0x0F,0x10, 0x12,0x13,0x14,0x15,0x17,0x18,0x19,0x1A, 0x1C,0x1D,0x1E,0x1F,0x21,0x22,0x23,0x24, 0x26,0x27,0x28,0x29,0x2B,0x2C,0x2D,0x2E }; - static const unsigned char ciphertext_256[16] = + static const unsigned char ciphertext_256[16] = { 0x08,0x0E,0x95,0x17,0xEB,0x16,0x77,0x71, 0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3 @@ -907,10 +2460,56 @@ selftest_basic_256 (void) rijndael_decrypt (&ctx, scratch, scratch); if (memcmp (scratch, plaintext_256, sizeof (plaintext_256))) return "AES-256 test decryption failed."; - + return NULL; } + +/* Run the self-tests for AES-CTR-128, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char* +selftest_ctr_128 (void) +{ + const int nblocks = 8+1; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_ctr("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for AES-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_cbc("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_cbc_dec, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption. + Returns NULL on success. */ +static const char* +selftest_cfb_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_cfb("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize, + context_size); +} + + /* Run all the self-tests and return NULL on success. This function is used for the on-the-fly self-tests. 
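[Editor's sketch] selftest_basic_128/192/256 above all follow the same known-answer pattern: set the key, encrypt a fixed plaintext, compare against the expected ciphertext, decrypt in place, and compare against the plaintext again. A condensed sketch of that pattern, with the cipher entry points passed in as placeholders (these are not the actual rijndael_* prototypes):

#include <string.h>

static const char *
basic_kat_sketch (void *ctx,
                  void (*setkey) (void *ctx, const unsigned char *key),
                  void (*encrypt) (void *ctx, unsigned char *out,
                                   const unsigned char *in),
                  void (*decrypt) (void *ctx, unsigned char *out,
                                   const unsigned char *in),
                  const unsigned char *key,
                  const unsigned char *plaintext,
                  const unsigned char *ciphertext)
{
  unsigned char scratch[16];

  setkey (ctx, key);
  encrypt (ctx, scratch, plaintext);
  if (memcmp (scratch, ciphertext, 16))
    return "test encryption failed";

  decrypt (ctx, scratch, scratch);          /* decrypt in place  */
  if (memcmp (scratch, plaintext, 16))
    return "test decryption failed";

  return NULL;                              /* NULL means success */
}

The selftest_ctr/cbc/cfb_128 helpers above extend this idea to the bulk functions by comparing their output against block-at-a-time encryption.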
*/ static const char * @@ -923,6 +2522,15 @@ selftest (void) || (r = selftest_basic_256 ()) ) return r; + if ( (r = selftest_ctr_128 ()) ) + return r; + + if ( (r = selftest_cbc_128 ()) ) + return r; + + if ( (r = selftest_cfb_128 ()) ) + return r; + return r; } @@ -931,12 +2539,12 @@ selftest (void) static const char * selftest_fips_128_38a (int requested_mode) { - struct tv + static const struct tv { int mode; const unsigned char key[16]; const unsigned char iv[16]; - struct + struct { const unsigned char input[16]; const unsigned char output[16]; @@ -947,24 +2555,24 @@ selftest_fips_128_38a (int requested_mode) GCRY_CIPHER_MODE_CFB, /* F.3.13, CFB128-AES128 */ { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, - { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a }, { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20, 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } }, - + { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0xc8, 0xa6, 0x45, 0x37, 0xa0, 0xb3, 0xa9, 0x3f, 0xcd, 0xe3, 0xcd, 0xad, 0x9f, 0x1c, 0xe5, 0x8b } }, - - { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, + + { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x26, 0x75, 0x1f, 0x67, 0xa3, 0xcb, 0xb1, 0x40, 0xb1, 0x80, 0x8c, 0xf1, 0x87, 0xa4, 0xf4, 0xdf } }, - + { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 }, { 0xc0, 0x4b, 0x05, 0x35, 0x7c, 0x5d, 0x1c, 0x0e, @@ -975,7 +2583,7 @@ selftest_fips_128_38a (int requested_mode) GCRY_CIPHER_MODE_OFB, { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, - { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, @@ -987,7 +2595,7 @@ selftest_fips_128_38a (int requested_mode) 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0x77, 0x89, 0x50, 0x8d, 0x16, 0x91, 0x8f, 0x03, 0xf5, 0x3c, 0x52, 0xda, 0xc5, 0x4e, 0xd8, 0x25 } }, - + { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x97, 0x40, 0x05, 0x1e, 0x9c, 0x5f, 0xec, 0xf6, @@ -1057,7 +2665,7 @@ selftest_fips_128_38a (int requested_mode) #undef Fail _gcry_cipher_close (hdenc); - _gcry_cipher_close (hddec); + _gcry_cipher_close (hddec); return NULL; } @@ -1068,7 +2676,7 @@ selftest_fips_128 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; - + what = "low-level"; errtxt = selftest_basic_128 (); if (errtxt) @@ -1080,7 +2688,7 @@ selftest_fips_128 (int extended, selftest_report_func_t report) errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_CFB); if (errtxt) goto failed; - + what = "ofb"; errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_OFB); if (errtxt) @@ -1125,7 +2733,7 @@ selftest_fips_256 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; - + (void)extended; /* No extended tests available. 
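[Editor's sketch] The SP 800-38A table above exercises both CFB and OFB; the two modes differ only in what is fed back into the cipher: CFB feeds the ciphertext back, OFB feeds the keystream (the encrypted IV) back, so OFB encryption and decryption are the same operation. A minimal sketch of one OFB step for comparison with the CFB sketches earlier (placeholder names; block_encrypt encrypts in place):

#include <stddef.h>

#define SKETCH_BLOCKSIZE 16

static void
ofb_block_sketch (void (*block_encrypt) (unsigned char *buf),
                  unsigned char *iv,
                  unsigned char *out, const unsigned char *in)
{
  size_t i;

  block_encrypt (iv);                /* keystream = E_k (IV); also next IV */
  for (i = 0; i < SKETCH_BLOCKSIZE; i++)
    out[i] = in[i] ^ iv[i];          /* ciphertext does NOT feed back      */
}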
*/ what = "low-level"; @@ -1163,7 +2771,7 @@ run_selftests (int algo, int extended, selftest_report_func_t report) default: ec = GPG_ERR_CIPHER_ALGO; break; - + } return ec; } @@ -1190,14 +2798,15 @@ static gcry_cipher_oid_spec_t rijndael_oids[] = gcry_cipher_spec_t _gcry_cipher_spec_aes = { - "AES", rijndael_names, rijndael_oids, 16, 128, sizeof (RIJNDAEL_context), - rijndael_setkey, rijndael_encrypt, rijndael_decrypt - }; -cipher_extra_spec_t _gcry_cipher_extraspec_aes = - { + GCRY_CIPHER_AES, {0, 1}, + "AES", rijndael_names, rijndael_oids, 16, 128, + sizeof (RIJNDAEL_context), + rijndael_setkey, rijndael_encrypt, rijndael_decrypt, + NULL, NULL, run_selftests }; + static const char *rijndael192_names[] = { "RIJNDAEL192", @@ -1216,14 +2825,15 @@ static gcry_cipher_oid_spec_t rijndael192_oids[] = gcry_cipher_spec_t _gcry_cipher_spec_aes192 = { - "AES192", rijndael192_names, rijndael192_oids, 16, 192, sizeof (RIJNDAEL_context), - rijndael_setkey, rijndael_encrypt, rijndael_decrypt - }; -cipher_extra_spec_t _gcry_cipher_extraspec_aes192 = - { + GCRY_CIPHER_AES192, {0, 1}, + "AES192", rijndael192_names, rijndael192_oids, 16, 192, + sizeof (RIJNDAEL_context), + rijndael_setkey, rijndael_encrypt, rijndael_decrypt, + NULL, NULL, run_selftests }; + static const char *rijndael256_names[] = { "RIJNDAEL256", @@ -1242,12 +2852,10 @@ static gcry_cipher_oid_spec_t rijndael256_oids[] = gcry_cipher_spec_t _gcry_cipher_spec_aes256 = { + GCRY_CIPHER_AES256, {0, 1}, "AES256", rijndael256_names, rijndael256_oids, 16, 256, sizeof (RIJNDAEL_context), - rijndael_setkey, rijndael_encrypt, rijndael_decrypt - }; - -cipher_extra_spec_t _gcry_cipher_extraspec_aes256 = - { + rijndael_setkey, rijndael_encrypt, rijndael_decrypt, + NULL, NULL, run_selftests }; |
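[Editor's usage note] The spec structures above are what register AES with libgcrypt's public cipher API, so a test vector such as the first F.3.13 CFB128-AES128 block listed in the self-test table can also be checked from an application. The following standalone program is a sketch against the public interface only (it is not part of this file) and keeps initialization and error handling to a minimum:

#include <gcrypt.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  static const unsigned char key[16] =
    { 0x2b,0x7e,0x15,0x16,0x28,0xae,0xd2,0xa6,
      0xab,0xf7,0x15,0x88,0x09,0xcf,0x4f,0x3c };
  static const unsigned char iv[16] =
    { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
      0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f };
  static const unsigned char plaintext[16] =
    { 0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96,
      0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a };
  static const unsigned char expected[16] =
    { 0x3b,0x3f,0xd9,0x2e,0xb7,0x2d,0xad,0x20,
      0x33,0x34,0x49,0xf8,0xe8,0x3c,0xfb,0x4a };
  unsigned char out[16];
  gcry_cipher_hd_t hd;
  int ok;

  gcry_check_version (NULL);            /* minimal library initialization */

  if (gcry_cipher_open (&hd, GCRY_CIPHER_AES, GCRY_CIPHER_MODE_CFB, 0))
    return 1;
  if (gcry_cipher_setkey (hd, key, sizeof key)
      || gcry_cipher_setiv (hd, iv, sizeof iv)
      || gcry_cipher_encrypt (hd, out, sizeof out, plaintext, sizeof plaintext))
    {
      gcry_cipher_close (hd);
      return 1;
    }
  ok = !memcmp (out, expected, sizeof expected);
  gcry_cipher_close (hd);

  printf ("CFB128-AES128, first block: %s\n", ok ? "ok" : "FAILED");
  return ok ? 0 : 1;
}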