Diffstat (limited to 'deps/openssl/openssl/crypto/aes')
40 files changed, 8225 insertions, 1013 deletions
diff --git a/deps/openssl/openssl/crypto/aes/Makefile b/deps/openssl/openssl/crypto/aes/Makefile
deleted file mode 100644
index 05e4a0149e..0000000000
--- a/deps/openssl/openssl/crypto/aes/Makefile
+++ /dev/null
@@ -1,171 +0,0 @@
-#
-# crypto/aes/Makefile
-#
-
-DIR= aes
-TOP= ../..
-CC= cc
-CPP= $(CC) -E
-INCLUDES=
-CFLAG=-g
-MAKEFILE= Makefile
-AR= ar r
-
-AES_ENC=aes_core.o aes_cbc.o
-
-CFLAGS= $(INCLUDES) $(CFLAG)
-ASFLAGS= $(INCLUDES) $(ASFLAG)
-AFLAGS= $(ASFLAGS)
-
-GENERAL=Makefile
-#TEST=aestest.c
-TEST=
-APPS=
-
-LIB=$(TOP)/libcrypto.a
-LIBSRC=aes_core.c aes_misc.c aes_ecb.c aes_cbc.c aes_cfb.c aes_ofb.c \
-	aes_ctr.c aes_ige.c aes_wrap.c
-LIBOBJ=aes_misc.o aes_ecb.o aes_cfb.o aes_ofb.o aes_ctr.o aes_ige.o aes_wrap.o \
-	$(AES_ENC)
-
-SRC= $(LIBSRC)
-
-EXHEADER= aes.h
-HEADER= aes_locl.h $(EXHEADER)
-
-ALL= $(GENERAL) $(SRC) $(HEADER)
-
-top:
-	(cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
-
-all: lib
-
-lib: $(LIBOBJ)
-	$(AR) $(LIB) $(LIBOBJ)
-	$(RANLIB) $(LIB) || echo Never mind.
-	@touch lib
-
-aes-ia64.s: asm/aes-ia64.S
-	$(CC) $(CFLAGS) -E asm/aes-ia64.S > $@
-
-aes-586.s: asm/aes-586.pl ../perlasm/x86asm.pl
-	$(PERL) asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-vpaes-x86.s: asm/vpaes-x86.pl ../perlasm/x86asm.pl
-	$(PERL) asm/vpaes-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-aesni-x86.s: asm/aesni-x86.pl ../perlasm/x86asm.pl
-	$(PERL) asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-
-aes-x86_64.s: asm/aes-x86_64.pl
-	$(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@
-vpaes-x86_64.s: asm/vpaes-x86_64.pl
-	$(PERL) asm/vpaes-x86_64.pl $(PERLASM_SCHEME) > $@
-bsaes-x86_64.s: asm/bsaes-x86_64.pl
-	$(PERL) asm/bsaes-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-x86_64.s: asm/aesni-x86_64.pl
-	$(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl
-	$(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-sha256-x86_64.s: asm/aesni-sha256-x86_64.pl
-	$(PERL) asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-mb-x86_64.s: asm/aesni-mb-x86_64.pl
-	$(PERL) asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME) > $@
-
-aes-sparcv9.s: asm/aes-sparcv9.pl
-	$(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
-aest4-sparcv9.s: asm/aest4-sparcv9.pl ../perlasm/sparcv9_modes.pl
-	$(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@
-
-aes-ppc.s: asm/aes-ppc.pl
-	$(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
-vpaes-ppc.s: asm/vpaes-ppc.pl
-	$(PERL) asm/vpaes-ppc.pl $(PERLASM_SCHEME) $@
-aesp8-ppc.s: asm/aesp8-ppc.pl
-	$(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@
-
-aes-parisc.s: asm/aes-parisc.pl
-	$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
-
-aes-mips.S: asm/aes-mips.pl
-	$(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@
-
-aesv8-armx.S: asm/aesv8-armx.pl
-	$(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@
-aesv8-armx.o: aesv8-armx.S
-
-# GNU make "catch all"
-aes-%.S: asm/aes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) > $@
-aes-armv4.o: aes-armv4.S
-bsaes-%.S: asm/bsaes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
-bsaes-armv7.o: bsaes-armv7.S
-
-files:
-	$(PERL) $(TOP)/util/files.pl "AES_ENC=$(AES_ENC)" Makefile >> $(TOP)/MINFO
-
-links:
-	@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
-	@$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
-	@$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
-
-install:
-	@[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
-	@headerlist="$(EXHEADER)"; for i in $$headerlist ; \
-	do  \
-	(cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
-	chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
-	done;
-
-tags:
-	ctags $(SRC)
-
-tests:
-
-lint:
-	lint -DLINT $(INCLUDES) $(SRC)>fluff
-
-update: depend
-
-depend:
-	@[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
-	$(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
-
-dclean:
-	$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
-	mv -f Makefile.new $(MAKEFILE)
-
-clean:
-	rm -f *.s *.S *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
-
-# DO NOT DELETE THIS LINE -- make depend depends on it.
-
-aes_cbc.o: ../../include/openssl/aes.h ../../include/openssl/modes.h
-aes_cbc.o: ../../include/openssl/opensslconf.h aes_cbc.c
-aes_cfb.o: ../../include/openssl/aes.h ../../include/openssl/modes.h
-aes_cfb.o: ../../include/openssl/opensslconf.h aes_cfb.c
-aes_core.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
-aes_core.o: ../../include/openssl/opensslconf.h aes_core.c aes_locl.h
-aes_ctr.o: ../../include/openssl/aes.h ../../include/openssl/modes.h
-aes_ctr.o: ../../include/openssl/opensslconf.h aes_ctr.c
-aes_ecb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
-aes_ecb.o: ../../include/openssl/opensslconf.h aes_ecb.c aes_locl.h
-aes_ige.o: ../../e_os.h ../../include/openssl/aes.h ../../include/openssl/bio.h
-aes_ige.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-aes_ige.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-aes_ige.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-aes_ige.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-aes_ige.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-aes_ige.o: ../../include/openssl/symhacks.h ../cryptlib.h aes_ige.c aes_locl.h
-aes_misc.o: ../../include/openssl/aes.h ../../include/openssl/crypto.h
-aes_misc.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h
-aes_misc.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-aes_misc.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-aes_misc.o: ../../include/openssl/symhacks.h aes_locl.h aes_misc.c
-aes_ofb.o: ../../include/openssl/aes.h ../../include/openssl/modes.h
-aes_ofb.o: ../../include/openssl/opensslconf.h aes_ofb.c
-aes_wrap.o: ../../e_os.h ../../include/openssl/aes.h
-aes_wrap.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
-aes_wrap.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-aes_wrap.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
-aes_wrap.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-aes_wrap.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-aes_wrap.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-aes_wrap.o: ../../include/openssl/symhacks.h ../cryptlib.h aes_wrap.c
diff --git a/deps/openssl/openssl/crypto/aes/README b/deps/openssl/openssl/crypto/aes/README
deleted file mode 100644
index 0f9620a80e..0000000000
--- a/deps/openssl/openssl/crypto/aes/README
+++ /dev/null
@@ -1,3 +0,0 @@
-This is an OpenSSL-compatible version of AES (also called Rijndael).
-aes_core.c is basically the same as rijndael-alg-fst.c but with an
-API that looks like the rest of the OpenSSL symmetric cipher suite.
diff --git a/deps/openssl/openssl/crypto/aes/aes.h b/deps/openssl/openssl/crypto/aes/aes.h
deleted file mode 100644
index faa66c4914..0000000000
--- a/deps/openssl/openssl/crypto/aes/aes.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* crypto/aes/aes.h */
-/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#ifndef HEADER_AES_H
-# define HEADER_AES_H
-
-# include <openssl/opensslconf.h>
-
-# ifdef OPENSSL_NO_AES
-#  error AES is disabled.
-# endif
-
-# include <stddef.h>
-
-# define AES_ENCRYPT 1
-# define AES_DECRYPT 0
-
-/*
- * Because array size can't be a const in C, the following two are macros.
- * Both sizes are in bytes.
- */
-# define AES_MAXNR 14
-# define AES_BLOCK_SIZE 16
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-/* This should be a hidden type, but EVP requires that the size be known */
-struct aes_key_st {
-# ifdef AES_LONG
-    unsigned long rd_key[4 * (AES_MAXNR + 1)];
-# else
-    unsigned int rd_key[4 * (AES_MAXNR + 1)];
-# endif
-    int rounds;
-};
-typedef struct aes_key_st AES_KEY;
-
-const char *AES_options(void);
-
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-                        AES_KEY *key);
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-                        AES_KEY *key);
-
-int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key);
-int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key);
-
-void AES_encrypt(const unsigned char *in, unsigned char *out,
-                 const AES_KEY *key);
-void AES_decrypt(const unsigned char *in, unsigned char *out,
-                 const AES_KEY *key);
-
-void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
-                     const AES_KEY *key, const int enc);
-void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
-                     size_t length, const AES_KEY *key,
-                     unsigned char *ivec, const int enc);
-void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        unsigned char *ivec, int *num, const int enc);
-void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
-                      size_t length, const AES_KEY *key,
-                      unsigned char *ivec, int *num, const int enc);
-void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
-                      size_t length, const AES_KEY *key,
-                      unsigned char *ivec, int *num, const int enc);
-void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        unsigned char *ivec, int *num);
-void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        unsigned char ivec[AES_BLOCK_SIZE],
-                        unsigned char ecount_buf[AES_BLOCK_SIZE],
-                        unsigned int *num);
-/* NB: the IV is _two_ blocks long */
-void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
-                     size_t length, const AES_KEY *key,
-                     unsigned char *ivec, const int enc);
-/* NB: the IV is _four_ blocks long */
-void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        const AES_KEY *key2, const unsigned char *ivec,
-                        const int enc);
-
-int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
-                 unsigned char *out,
-                 const unsigned char *in, unsigned int inlen);
-int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
-                   unsigned char *out,
-                   const unsigned char *in, unsigned int inlen);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* !HEADER_AES_H */
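The deleted file is the 1.0.2 copy of the public header; the same declarations live under include/openssl/aes.h in 1.1.0. For orientation, a minimal sketch of the core API it declares: expand a key schedule, then encrypt and decrypt one 16-byte block (key and plaintext bytes here are made-up illustration values):

    /* Single-block AES-128 roundtrip using the API declared above. */
    #include <stdio.h>
    #include <string.h>
    #include <openssl/aes.h>

    int main(void)
    {
        static const unsigned char key[16] = "0123456789abcdef";
        unsigned char in[AES_BLOCK_SIZE] = "hello, aes block";   /* exactly 16 bytes */
        unsigned char enc[AES_BLOCK_SIZE], dec[AES_BLOCK_SIZE];
        AES_KEY eks, dks;

        /* Both setters return 0 on success and a negative value on error. */
        if (AES_set_encrypt_key(key, 128, &eks) < 0 ||
            AES_set_decrypt_key(key, 128, &dks) < 0)
            return 1;

        AES_encrypt(in, enc, &eks);   /* one block, no chaining */
        AES_decrypt(enc, dec, &dks);

        printf("roundtrip %s\n", memcmp(in, dec, sizeof(in)) == 0 ? "ok" : "FAILED");
        return 0;
    }

AES_encrypt()/AES_decrypt() process exactly one block; the mode wrappers in the files below build everything else on top of them.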
diff --git a/deps/openssl/openssl/crypto/aes/aes_cbc.c b/deps/openssl/openssl/crypto/aes/aes_cbc.c
index 805d0e260a..342841fc4f 100644
--- a/deps/openssl/openssl/crypto/aes/aes_cbc.c
+++ b/deps/openssl/openssl/crypto/aes/aes_cbc.c
@@ -1,52 +1,10 @@
-/* crypto/aes/aes_cbc.c */
-/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
 #include <openssl/aes.h>
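aes_cbc.c itself only swaps license headers; the routine it provides, AES_cbc_encrypt(), is declared in the header above. A sketch of its use (buffer contents and key are illustrative) — note the function updates ivec in place, so keep a copy of the IV for the decrypt pass:

    /* CBC encrypt-then-decrypt sketch; len must be a multiple of 16 here. */
    #include <string.h>
    #include <openssl/aes.h>

    void cbc_roundtrip(const unsigned char key[16],
                       const unsigned char iv[AES_BLOCK_SIZE],
                       const unsigned char *pt, unsigned char *ct,
                       unsigned char *out, size_t len)
    {
        AES_KEY eks, dks;
        unsigned char ivec[AES_BLOCK_SIZE];

        AES_set_encrypt_key(key, 128, &eks);
        AES_set_decrypt_key(key, 128, &dks);

        memcpy(ivec, iv, sizeof(ivec));   /* AES_cbc_encrypt clobbers ivec */
        AES_cbc_encrypt(pt, ct, len, &eks, ivec, AES_ENCRYPT);

        memcpy(ivec, iv, sizeof(ivec));   /* fresh IV for the decrypt pass */
        AES_cbc_encrypt(ct, out, len, &dks, ivec, AES_DECRYPT);
    }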
diff --git a/deps/openssl/openssl/crypto/aes/aes_cfb.c b/deps/openssl/openssl/crypto/aes/aes_cfb.c
index 1225000963..f010e3c4ea 100644
--- a/deps/openssl/openssl/crypto/aes/aes_cfb.c
+++ b/deps/openssl/openssl/crypto/aes/aes_cfb.c
@@ -1,52 +1,10 @@
-/* crypto/aes/aes_cfb.c */
-/* ====================================================================
- * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
 #include <openssl/aes.h>
diff --git a/deps/openssl/openssl/crypto/aes/aes_core.c b/deps/openssl/openssl/crypto/aes/aes_core.c
index 7019b5d7aa..bd5c7793be 100644
--- a/deps/openssl/openssl/crypto/aes/aes_core.c
+++ b/deps/openssl/openssl/crypto/aes/aes_core.c
@@ -1,4 +1,12 @@
-/* crypto/aes/aes_core.c */
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
 /**
  * rijndael-alg-fst.c
  *
@@ -28,14 +36,10 @@
 /* Note: rewritten a little bit to provide error control and an OpenSSL-
    compatible API */
 
-#ifndef AES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
 #include <assert.h>
 
 #include <stdlib.h>
+#include <openssl/crypto.h>
 #include <openssl/aes.h>
 #include "aes_locl.h"
 
@@ -625,8 +629,8 @@ static const u32 rcon[] = {
 /**
  * Expand the cipher key into the encryption key schedule.
  */
-int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key)
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
 {
     u32 *rk;
@@ -640,9 +644,9 @@ int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 
     rk = key->rd_key;
 
-    if (bits==128)
+    if (bits == 128)
         key->rounds = 10;
-    else if (bits==192)
+    else if (bits == 192)
         key->rounds = 12;
     else
         key->rounds = 14;
@@ -727,8 +731,8 @@ int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 /**
  * Expand the cipher key into the decryption key schedule.
  */
-int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key)
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
 {
     u32 *rk;
@@ -736,7 +740,7 @@ int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
     u32 temp;
 
     /* first, start with an encryption schedule */
-    status = private_AES_set_encrypt_key(userKey, bits, key);
+    status = AES_set_encrypt_key(userKey, bits, key);
     if (status < 0)
         return status;
 
@@ -1204,11 +1208,11 @@ static const u32 rcon[] = {
 /**
  * Expand the cipher key into the encryption key schedule.
  */
-int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key)
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
 {
     u32 *rk;
-    int i = 0;
+    int i = 0;
     u32 temp;
 
     if (!userKey || !key)
@@ -1218,9 +1222,9 @@ int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 
     rk = key->rd_key;
 
-    if (bits==128)
+    if (bits == 128)
         key->rounds = 10;
-    else if (bits==192)
+    else if (bits == 192)
         key->rounds = 12;
     else
         key->rounds = 14;
@@ -1305,8 +1309,8 @@ int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 /**
  * Expand the cipher key into the decryption key schedule.
  */
-int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key)
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
 {
     u32 *rk;
@@ -1314,7 +1318,7 @@ int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
     u32 temp;
 
     /* first, start with an encryption schedule */
-    status = private_AES_set_encrypt_key(userKey, bits, key);
+    status = AES_set_encrypt_key(userKey, bits, key);
     if (status < 0)
         return status;
 
@@ -1351,7 +1355,7 @@ int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
             rk[j] = tpe ^ ROTATE(tpd,16) ^
                 ROTATE(tp9,24) ^ ROTATE(tpb,8);
 #else
-            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
                 (tp9 >> 8) ^ (tp9 << 24) ^
                 (tpb >> 24) ^ (tpb << 8);
 #endif
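The substantive change in aes_core.c is the removal of the private_AES_set_*_key indirection: with the FIPS wrappers gone (see aes_misc.c below), the key-schedule routines are exported under their public names directly. Code or assembly that referenced the old private_ symbols must follow the rename; a hypothetical source-level shim (not part of OpenSSL) for old callers would be:

    /* Hypothetical compatibility shim: lets source written against the
     * 1.0.2 private_ entry points build against 1.1.0, where only the
     * AES_ names exist. */
    #include <openssl/aes.h>

    #define private_AES_set_encrypt_key AES_set_encrypt_key
    #define private_AES_set_decrypt_key AES_set_decrypt_key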
diff --git a/deps/openssl/openssl/crypto/aes/aes_ctr.c b/deps/openssl/openssl/crypto/aes/aes_ctr.c
deleted file mode 100644
index 9e760c4b12..0000000000
--- a/deps/openssl/openssl/crypto/aes/aes_ctr.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/* crypto/aes/aes_ctr.c */
-/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/aes.h>
-#include <openssl/modes.h>
-
-void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        unsigned char ivec[AES_BLOCK_SIZE],
-                        unsigned char ecount_buf[AES_BLOCK_SIZE],
-                        unsigned int *num)
-{
-    CRYPTO_ctr128_encrypt(in, out, length, key, ivec, ecount_buf, num,
-                          (block128_f) AES_encrypt);
-}
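The deleted AES_ctr128_encrypt() wrapper was a single call into the generic modes code, so low-level callers can reproduce it verbatim in their own code (the EVP interface is the supported route in 1.1.0). A sketch mirroring the removed body above:

    /* Equivalent of the removed AES_ctr128_encrypt(), written as caller
     * code against the generic modes API. */
    #include <openssl/aes.h>
    #include <openssl/modes.h>

    void ctr128_encrypt(const unsigned char *in, unsigned char *out,
                        size_t length, const AES_KEY *key,
                        unsigned char ivec[16], unsigned char ecount_buf[16],
                        unsigned int *num)
    {
        CRYPTO_ctr128_encrypt(in, out, length, key, ivec, ecount_buf, num,
                              (block128_f)AES_encrypt);
    }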
diff --git a/deps/openssl/openssl/crypto/aes/aes_ecb.c b/deps/openssl/openssl/crypto/aes/aes_ecb.c
index 52151a5c70..29bfc1ad66 100644
--- a/deps/openssl/openssl/crypto/aes/aes_ecb.c
+++ b/deps/openssl/openssl/crypto/aes/aes_ecb.c
@@ -1,59 +1,12 @@
-/* crypto/aes/aes_ecb.c */
-/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
-#ifndef AES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
 #include <assert.h>
 
 #include <openssl/aes.h>
diff --git a/deps/openssl/openssl/crypto/aes/aes_ige.c b/deps/openssl/openssl/crypto/aes/aes_ige.c
index 8f2b770647..75f796cf3b 100644
--- a/deps/openssl/openssl/crypto/aes/aes_ige.c
+++ b/deps/openssl/openssl/crypto/aes/aes_ige.c
@@ -1,55 +1,13 @@
-/* crypto/aes/aes_ige.c */
-/* ====================================================================
- * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
-#include "cryptlib.h"
+#include "internal/cryptlib.h"
 #include <openssl/aes.h>
 #include "aes_locl.h"
 
@@ -83,6 +41,9 @@ void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
     size_t n;
     size_t len = length;
 
+    if (length == 0)
+        return;
+
     OPENSSL_assert(in && out && key && ivec);
     OPENSSL_assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
     OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== +/* + * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ #ifndef HEADER_AES_LOCL_H # define HEADER_AES_LOCL_H # include <openssl/e_os2.h> - -# ifdef OPENSSL_NO_AES -# error AES is disabled. -# endif - # include <stdio.h> # include <stdlib.h> # include <string.h> diff --git a/deps/openssl/openssl/crypto/aes/aes_misc.c b/deps/openssl/openssl/crypto/aes/aes_misc.c index fafad4d6f5..7403c84f82 100644 --- a/deps/openssl/openssl/crypto/aes/aes_misc.c +++ b/deps/openssl/openssl/crypto/aes/aes_misc.c @@ -1,61 +1,16 @@ -/* crypto/aes/aes_misc.c */ -/* ==================================================================== - * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. 
Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== +/* + * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ #include <openssl/opensslv.h> -#include <openssl/crypto.h> #include <openssl/aes.h> #include "aes_locl.h" -const char AES_version[] = "AES" OPENSSL_VERSION_PTEXT; - const char *AES_options(void) { #ifdef FULL_UNROLL @@ -64,23 +19,3 @@ const char *AES_options(void) return "aes(partial)"; #endif } - -/* FIPS wrapper functions to block low level AES calls in FIPS mode */ - -int AES_set_encrypt_key(const unsigned char *userKey, const int bits, - AES_KEY *key) -{ -#ifdef OPENSSL_FIPS - fips_cipher_abort(AES); -#endif - return private_AES_set_encrypt_key(userKey, bits, key); -} - -int AES_set_decrypt_key(const unsigned char *userKey, const int bits, - AES_KEY *key) -{ -#ifdef OPENSSL_FIPS - fips_cipher_abort(AES); -#endif - return private_AES_set_decrypt_key(userKey, bits, key); -} diff --git a/deps/openssl/openssl/crypto/aes/aes_ofb.c b/deps/openssl/openssl/crypto/aes/aes_ofb.c index 64a08caaec..215b53858e 100644 --- a/deps/openssl/openssl/crypto/aes/aes_ofb.c +++ b/deps/openssl/openssl/crypto/aes/aes_ofb.c @@ -1,52 +1,10 @@ -/* crypto/aes/aes_ofb.c */ -/* ==================================================================== - * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. 
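With AES_version and the FIPS wrappers removed, aes_misc.c is reduced to AES_options(), which reports the compile-time loop-unrolling choice — "aes(partial)" in the default build, as the hunk above shows. A trivial sketch:

    /* Print the AES build variant string. */
    #include <stdio.h>
    #include <openssl/aes.h>

    int main(void)
    {
        printf("%s\n", AES_options());   /* e.g. "aes(partial)" */
        return 0;
    }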
diff --git a/deps/openssl/openssl/crypto/aes/aes_ofb.c b/deps/openssl/openssl/crypto/aes/aes_ofb.c
index 64a08caaec..215b53858e 100644
--- a/deps/openssl/openssl/crypto/aes/aes_ofb.c
+++ b/deps/openssl/openssl/crypto/aes/aes_ofb.c
@@ -1,52 +1,10 @@
-/* crypto/aes/aes_ofb.c */
-/* ====================================================================
- * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
 #include <openssl/aes.h>
diff --git a/deps/openssl/openssl/crypto/aes/aes_wrap.c b/deps/openssl/openssl/crypto/aes/aes_wrap.c
index b7b64d57a4..cae0b21229 100644
--- a/deps/openssl/openssl/crypto/aes/aes_wrap.c
+++ b/deps/openssl/openssl/crypto/aes/aes_wrap.c
@@ -1,58 +1,13 @@
-/* crypto/aes/aes_wrap.c */
 /*
- * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
- * project.
- */
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
-#include "cryptlib.h"
+#include "internal/cryptlib.h"
 #include <openssl/aes.h>
 #include <openssl/modes.h>
diff --git a/deps/openssl/openssl/crypto/aes/aes_x86core.c b/deps/openssl/openssl/crypto/aes/aes_x86core.c
index b5dd697677..95b49bbabc 100644
--- a/deps/openssl/openssl/crypto/aes/aes_x86core.c
+++ b/deps/openssl/openssl/crypto/aes/aes_x86core.c
@@ -1,4 +1,12 @@
-/* crypto/aes/aes_core.c */
+/*
+ * Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
 /**
  * rijndael-alg-fst.c
  *
@@ -35,11 +43,6 @@
  */
 
-#ifndef AES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
 #include <assert.h>
 
 #include <stdlib.h>
@@ -618,7 +621,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
             rk[j] = tpe ^ ROTATE(tpd,16) ^
                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
 #else
-            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
                 (tp9 >> 24) ^ (tp9 << 8) ^
                 (tpb >> 8) ^ (tpb << 24);
 #endif
@@ -907,7 +910,7 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
         (u32)Td4[(s1 >> 16) & 0xff] << 16 ^
         (u32)Td4[(s0 >> 24)       ] << 24;
 
-    /* now do the linear transform using words */
+    /* now do the linear transform using words */
     {
         int i;
         u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
@@ -931,7 +934,7 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
             t[i] = tpe ^ ROTATE(tpd,16) ^
                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
 #else
-            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
                 (tp9 >> 24) ^ (tp9 << 8) ^
                 (tpb >> 8) ^ (tpb << 24);
 #endif
@@ -984,7 +987,7 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
         (u32)Td4[(s1 >> 16) & 0xff] << 16 ^
         (u32)Td4[(s0 >> 24)       ] << 24;
 
-    /* now do the linear transform using words */
+    /* now do the linear transform using words */
     {
         int i;
         u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
@@ -1008,7 +1011,7 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
             t[i] = tpe ^ ROTATE(tpd,16) ^
                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
 #else
-            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
                 (tp9 >> 24) ^ (tp9 << 8) ^
                 (tpb >> 8) ^ (tpb << 24);
 #endif
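The repeated aes_x86core.c hunks above only strip trailing whitespace, but they pass through the spot where the decryption key schedule applies the inverse MixColumns either with ROTATE() or with paired shifts. The two #if branches agree because, for 32-bit words, XORing the two disjoint shift halves is exactly a rotation — a self-contained check:

    /* Demonstrates the shift/rotate identity used in the hunks above. */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned n)   /* 0 < n < 32 */
    {
        return (x << n) | (x >> (32 - n));
    }

    int main(void)
    {
        uint32_t x = 0x01234567u;

        /* ROTATE(tpd,16) vs (tpd >> 16) ^ (tpd << 16) */
        assert(rotl32(x, 16) == ((x >> 16) ^ (x << 16)));
        /* ROTATE(tp9,8)  vs (tp9 >> 24) ^ (tp9 << 8)  */
        assert(rotl32(x, 8)  == ((x >> 24) ^ (x << 8)));
        /* ROTATE(tpb,24) vs (tpb >> 8)  ^ (tpb << 24) */
        assert(rotl32(x, 24) == ((x >> 8)  ^ (x << 24)));
        return 0;
    }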
diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-586.pl b/deps/openssl/openssl/crypto/aes/asm/aes-586.pl
index 60286ecb96..1ba356508a 100755
--- a/deps/openssl/openssl/crypto/aes/asm/aes-586.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/aes-586.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -191,6 +198,10 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output = pop;
+open OUT,">$output";
+*STDOUT=*OUT;
+
 &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
 
 &static_label("AES_Te");
 &static_label("AES_Td");
@@ -2861,12 +2872,12 @@ sub enckey()
 	&set_label("exit");
 &function_end("_x86_AES_set_encrypt_key");
 
-# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
-&function_begin_B("private_AES_set_encrypt_key");
+&function_begin_B("AES_set_encrypt_key");
 	&call	("_x86_AES_set_encrypt_key");
 	&ret	();
-&function_end_B("private_AES_set_encrypt_key");
+&function_end_B("AES_set_encrypt_key");
 
 sub deckey()
 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2923,9 +2934,9 @@ sub deckey()
 	&mov	(&DWP(4*$i,$key),$tp1);
 }
 
-# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
-&function_begin_B("private_AES_set_decrypt_key");
+&function_begin_B("AES_set_decrypt_key");
 	&call	("_x86_AES_set_encrypt_key");
 	&cmp	("eax",0);
 	&je	(&label("proceed"));
@@ -2981,7 +2992,9 @@ sub deckey()
 	&jb	(&label("permute"));
 
 	&xor	("eax","eax");			# return success
-&function_end("private_AES_set_decrypt_key");
+&function_end("AES_set_decrypt_key");
 
 &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
 &asm_finish();
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-armv4.pl b/deps/openssl/openssl/crypto/aes/asm/aes-armv4.pl
index c1b5e352d7..998158998e 100644
--- a/deps/openssl/openssl/crypto/aes/asm/aes-armv4.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/aes-armv4.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -32,8 +39,20 @@
 # Profiler-assisted and platform-specific optimization resulted in 16%
 # improvement on Cortex A8 core and ~21.5 cycles per byte.
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $s0="r0";
 $s1="r1";
@@ -58,15 +77,12 @@ $code=<<___;
 #endif
 
 .text
-#if __ARM_ARCH__<7
-.code	32
-#else
+#if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
-# ifdef __thumb2__
 .thumb
-# else
+#else
 .code	32
-# endif
+#undef __thumb2__
 #endif
 
 .type	AES_Te,%object
@@ -181,15 +197,19 @@ AES_Te:
 .type	AES_encrypt,%function
 .align	5
 AES_encrypt:
-#if __ARM_ARCH__<7
+#ifndef	__thumb2__
 	sub	r3,pc,#8		@ AES_encrypt
 #else
 	adr	r3,.
 #endif
 	stmdb	sp!,{r1,r4-r12,lr}
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	$tbl,AES_Te
+#else
+	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
+#endif
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
-	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
@@ -422,24 +442,24 @@ _armv4_AES_encrypt:
 	ldr	pc,[sp],#4		@ pop and return
 .size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
 
-.global private_AES_set_encrypt_key
-.type   private_AES_set_encrypt_key,%function
+.global AES_set_encrypt_key
+.type   AES_set_encrypt_key,%function
 .align	5
-private_AES_set_encrypt_key:
+AES_set_encrypt_key:
 _armv4_AES_set_encrypt_key:
-#if __ARM_ARCH__<7
+#ifndef	__thumb2__
 	sub	r3,pc,#8		@ AES_set_encrypt_key
 #else
 	adr	r3,.
 #endif
 	teq	r0,#0
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	moveq	r0,#-1
 	beq	.Labrt
 	teq	r2,#0
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	moveq	r0,#-1
@@ -450,19 +470,23 @@ _armv4_AES_set_encrypt_key:
 	teq	r1,#192
 	beq	.Lok
 	teq	r1,#256
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	ne			@ Thumb2 thing, sanity check in ARM
 #endif
 	movne	r0,#-1
 	bne	.Labrt
 
 .Lok:	stmdb	sp!,{r4-r12,lr}
-	sub	$tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
-
 	mov	$rounds,r0		@ inp
 	mov	lr,r1			@ bits
 	mov	$key,r2			@ key
 
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	$tbl,AES_Te+1024			@ Te4
+#else
+	sub	$tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
+#endif
+
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
@@ -607,7 +631,7 @@ _armv4_AES_set_encrypt_key:
 	str	$s2,[$key,#-16]
 	subs	$rounds,$rounds,#1
 	str	$s3,[$key,#-12]
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	subeq	r2,$key,#216
@@ -679,7 +703,7 @@ _armv4_AES_set_encrypt_key:
 	str	$s2,[$key,#-24]
 	subs	$rounds,$rounds,#1
 	str	$s3,[$key,#-20]
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	subeq	r2,$key,#256
@@ -722,12 +746,12 @@ _armv4_AES_set_encrypt_key:
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+.size	AES_set_encrypt_key,.-AES_set_encrypt_key
 
-.global private_AES_set_decrypt_key
-.type   private_AES_set_decrypt_key,%function
+.global AES_set_decrypt_key
+.type   AES_set_decrypt_key,%function
 .align	5
-private_AES_set_decrypt_key:
+AES_set_decrypt_key:
 	str	lr,[sp,#-4]!            @ push lr
 	bl	_armv4_AES_set_encrypt_key
 	teq	r0,#0
@@ -737,7 +761,7 @@ private_AES_set_decrypt_key:
 	mov	r0,r2			@ AES_set_encrypt_key preserves r2,
 	mov	r1,r2			@ which is AES_KEY *key
 	b	_armv4_AES_set_enc2dec_key
-.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.size	AES_set_decrypt_key,.-AES_set_decrypt_key
 
 @ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
 .global	AES_set_enc2dec_key
@@ -750,7 +774,7 @@ _armv4_AES_set_enc2dec_key:
 	ldr	$rounds,[r0,#240]
 	mov	$i1,r0			@ input
 	add	$i2,r0,$rounds,lsl#4
-	mov	$key,r1			@ ouput
+	mov	$key,r1			@ output
 	add	$tbl,r1,$rounds,lsl#4
 	str	$rounds,[r1,#240]
 
@@ -949,15 +973,19 @@ AES_Td:
 .type	AES_decrypt,%function
 .align	5
 AES_decrypt:
-#if __ARM_ARCH__<7
+#ifndef	__thumb2__
 	sub	r3,pc,#8		@ AES_decrypt
 #else
 	adr	r3,.
 #endif
 	stmdb	sp!,{r1,r4-r12,lr}
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	$tbl,AES_Td
+#else
+	sub	$tbl,r3,#AES_decrypt-AES_Td	@ Td
+#endif
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
-	sub	$tbl,r3,#AES_decrypt-AES_Td	@ Td
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-c64xplus.pl b/deps/openssl/openssl/crypto/aes/asm/aes-c64xplus.pl
new file mode 100644
index 0000000000..19d2cc176f
--- /dev/null
+++ b/deps/openssl/openssl/crypto/aes/asm/aes-c64xplus.pl
@@ -0,0 +1,1382 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x+.
+#
+# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned to be ~10 cycles per byte. Discrepancy
+# must be caused by limitations of L1D memory banking(*), see SPRU871
+# TI publication for further details. If any consolation it's still
+# ~20% faster than TI's linear assembly module anyway... Compared to
+# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this
+# code is 3.75x faster and almost 3x smaller (tables included).
+#
+# (*) This means that there might be subtle correlation between data
+#     and timing and one can wonder if it can be ... attacked:-(
+#     On the other hand this also means that *if* one chooses to
+#     implement *4* T-tables variant [instead of 1 T-table as in
+#     this implementation, or in addition to], then one ought to
+#     *interleave* them. Even though it complicates addressing,
+#     references to interleaved tables would be guaranteed not to
+#     clash. I reckon that it should be possible to break 8 cycles
+#     per byte "barrier," i.e. improve by ~20%, naturally at the
+#     cost of 8x increased pressure on L1D. 8x because you'd have
+#     to interleave both Te and Td tables...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($TEA,$TEB)=("A5","B5");
+($KPA,$KPB)=("A3","B1");
+@K=("A6","B6","A7","B7");
+@s=("A8","B8","A9","B9");
+@Te0=@Td0=("A16","B16","A17","B17");
+@Te1=@Td1=("A18","B18","A19","B19");
+@Te2=@Td2=("A20","B20","A21","B21");
+@Te3=@Td3=("A22","B22","A23","B23");
+
+$code=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	AES_encrypt,_AES_encrypt
+	.asg	AES_decrypt,_AES_decrypt
+	.asg	AES_set_encrypt_key,_AES_set_encrypt_key
+	.asg	AES_set_decrypt_key,_AES_set_decrypt_key
+	.asg	AES_ctr32_encrypt,_AES_ctr32_encrypt
+	.endif
+
+	.asg	B3,RA
+	.asg	A4,INP
+	.asg	B4,OUT
+	.asg	A6,KEY
+	.asg	A4,RET
+	.asg	B15,SP
+
+	.eval	24,EXT0
+	.eval	16,EXT1
+	.eval	8,EXT2
+	.eval	0,EXT3
+	.eval	8,TBL1
+	.eval	16,TBL2
+	.eval	24,TBL3
+
+	.if	.BIG_ENDIAN
+	.eval	24-EXT0,EXT0
+	.eval	24-EXT1,EXT1
+	.eval	24-EXT2,EXT2
+	.eval	24-EXT3,EXT3
+	.eval	32-TBL1,TBL1
+	.eval	32-TBL2,TBL2
+	.eval	32-TBL3,TBL3
+	.endif
+
+	.global	_AES_encrypt
+_AES_encrypt:
+	.asmfunc
+	MVK	1,B2
+__encrypt:
+	.if	__TI_EABI__
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||	ADDKPC	__encrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.else
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	(AES_Te-__encrypt),$TEA
+||	ADDKPC	__encrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	(AES_Te-__encrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.endif
+	LDW	*$KPA++[2],$Te0[0]		; zero round key
+||	LDW	*$KPB++[2],$Te0[1]
+||	MVK	60,A0
+||	ADD	B0,$TEA,$TEA			; AES_Te
+	LDW	*KEY[A0],B0			; rounds
+||	MVK	1024,A0				; sizeof(AES_Te)
+	LDW	*$KPA++[2],$Te0[2]
+||	LDW	*$KPB++[2],$Te0[3]
+||	MV	$TEA,$TEB
+	NOP
+	.if	.BIG_ENDIAN
+	MV	A9,$s[0]
+||	MV	A8,$s[1]
+||	MV	B9,$s[2]
+||	MV	B8,$s[3]
+	.else
+	MV	A8,$s[0]
+||	MV	A9,$s[1]
+||	MV	B8,$s[2]
+||	MV	B9,$s[3]
+	.endif
+	XOR	$Te0[0],$s[0],$s[0]
+||	XOR	$Te0[1],$s[1],$s[1]
+||	LDW	*$KPA++[2],$K[0]		; 1st round key
+||	LDW	*$KPB++[2],$K[1]
+	SUB	B0,2,B0
+
+	SPLOOPD	13
+||	MVC	B0,ILC
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+;;====================================================================
+	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[0],EXT3,24,$Te3[0]
+	LDW	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],   t0
+||	LDW	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],  t1
+||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[1],EXT3,24,$Te3[1]
+||	EXTU	$s[0],EXT1,24,$Te1[0]
+	LDW	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],  t2
+||	LDW	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],   t3
+||	EXTU	$s[2],EXT2,24,$Te2[2]
+||	EXTU	$s[3],EXT2,24,$Te2[3]
+	LDW	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],  t0
+||	LDW	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],  t1
+||	EXTU	$s[3],EXT3,24,$Te3[3]
+||	EXTU	$s[2],EXT1,24,$Te1[2]
+	LDW	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],  t0
+||	LDW	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],   t1
+||	EXTU	$s[0],EXT2,24,$Te2[0]
+||	EXTU	$s[1],EXT2,24,$Te2[1]
+	LDW	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],  t2
+||	LDW	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],  t3
+||	EXTU	$s[3],EXT1,24,$Te1[3]
+||	EXTU	$s[2],EXT3,24,$Te3[2]
+	LDW	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],   t2
+||	LDW	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],  t3
+||	ROTL	$Te1[1],TBL1,$Te3[0]		; t0
+||	ROTL	$Te3[0],TBL3,$Te1[1]		; t1
+||	EXTU	$s[0],EXT0,24,$Te0[0]
+||	EXTU	$s[1],EXT0,24,$Te0[1]
+	LDW	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],      t0
+||	LDW	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],      t1
+||	ROTL	$Te3[1],TBL3,$Te1[0]		; t2
+||	ROTL	$Te1[0],TBL1,$Te3[1]		; t3
+||	EXTU	$s[2],EXT0,24,$Te0[2]
+||	EXTU	$s[3],EXT0,24,$Te0[3]
+	LDW	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],      t2
+||	LDW	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],      t3
+||	ROTL	$Te2[2],TBL2,$Te2[2]		; t0
+||	ROTL	$Te2[3],TBL2,$Te2[3]		; t1
+||	XOR	$K[0],$Te3[0],$s[0]
+||	XOR	$K[1],$Te1[1],$s[1]
+	ROTL	$Te3[3],TBL3,$Te1[2]		; t0
+||	ROTL	$Te1[2],TBL1,$Te3[3]		; t1
+||	XOR	$K[2],$Te1[0],$s[2]
+||	XOR	$K[3],$Te3[1],$s[3]
+||	LDW	*$KPA++[2],$K[0]		; next round key
+||	LDW	*$KPB++[2],$K[1]
+	ROTL	$Te2[0],TBL2,$Te2[0]		; t2
+||	ROTL	$Te2[1],TBL2,$Te2[1]		; t3
+||	XOR	$s[0],$Te2[2],$s[0]
+||	XOR	$s[1],$Te2[3],$s[1]
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+	ROTL	$Te1[3],TBL1,$Te3[2]		; t2
+||	ROTL	$Te3[2],TBL3,$Te1[3]		; t3
+||	XOR	$s[0],$Te1[2],$s[0]
+||	XOR	$s[1],$Te3[3],$s[1]
+	XOR	$s[2],$Te2[0],$s[2]
+||	XOR	$s[3],$Te2[1],$s[3]
+||	XOR	$s[0],$Te0[0],$s[0]
+||	XOR	$s[1],$Te0[1],$s[1]
+	SPKERNEL
+||	XOR.L	$s[2],$Te3[2],$s[2]
+||	XOR.L	$s[3],$Te1[3],$s[3]
+;;====================================================================
+	ADD.D	${TEA},A0,${TEA}		; point to Te4
+||	ADD.D	${TEB},A0,${TEB}
+||	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[0],EXT3,24,$Te3[0]
+	LDBU	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],   t0
+||	LDBU	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],  t1
+||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[0],EXT0,24,$Te0[0]
+||	EXTU	$s[1],EXT0,24,$Te0[1]
+	LDBU	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],      t0
+||	LDBU	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],      t1
+||	EXTU	$s[3],EXT3,24,$Te3[3]
+||	EXTU	$s[2],EXT1,24,$Te1[2]
+	LDBU	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],  t0
+||	LDBU	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],   t1
+||	EXTU	$s[2],EXT2,24,$Te2[2]
+||	EXTU	$s[3],EXT2,24,$Te2[3]
+	LDBU	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],  t0
+||	LDBU	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],  t1
+||	EXTU	$s[1],EXT3,24,$Te3[1]
+||	EXTU	$s[0],EXT1,24,$Te1[0]
+	LDBU	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],  t2
+||	LDBU	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],   t3
+||	EXTU	$s[3],EXT1,24,$Te1[3]
+||	EXTU	$s[2],EXT3,24,$Te3[2]
+	LDBU	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],   t2
+||	LDBU	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],  t3
+||	EXTU	$s[2],EXT0,24,$Te0[2]
+||	EXTU	$s[3],EXT0,24,$Te0[3]
+	LDBU	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],      t2
+||	LDBU	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],      t3
+||	EXTU	$s[0],EXT2,24,$Te2[0]
+||	EXTU	$s[1],EXT2,24,$Te2[1]
+	LDBU	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],  t2
+||	LDBU	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],  t3
+
+	.if	.BIG_ENDIAN
+	PACK2	$Te0[0],$Te1[1],$Te0[0]
+||	PACK2	$Te0[1],$Te1[2],$Te0[1]
+	PACK2	$Te2[2],$Te3[3],$Te2[2]
+||	PACK2	$Te2[3],$Te3[0],$Te2[3]
+	PACKL4
$Te0[0],$Te2[2],$Te0[0] +|| PACKL4 $Te0[1],$Te2[3],$Te0[1] + XOR $K[0],$Te0[0],$Te0[0] ; s[0] +|| XOR $K[1],$Te0[1],$Te0[1] ; s[1] + + PACK2 $Te0[2],$Te1[3],$Te0[2] +|| PACK2 $Te0[3],$Te1[0],$Te0[3] + PACK2 $Te2[0],$Te3[1],$Te2[0] +|| PACK2 $Te2[1],$Te3[2],$Te2[1] +|| BNOP RA + PACKL4 $Te0[2],$Te2[0],$Te0[2] +|| PACKL4 $Te0[3],$Te2[1],$Te0[3] + XOR $K[2],$Te0[2],$Te0[2] ; s[2] +|| XOR $K[3],$Te0[3],$Te0[3] ; s[3] + + MV $Te0[0],A9 +|| MV $Te0[1],A8 + MV $Te0[2],B9 +|| MV $Te0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Te1[1],$Te0[0],$Te1[1] +|| PACK2 $Te1[2],$Te0[1],$Te1[2] + PACK2 $Te3[3],$Te2[2],$Te3[3] +|| PACK2 $Te3[0],$Te2[3],$Te3[0] + PACKL4 $Te3[3],$Te1[1],$Te1[1] +|| PACKL4 $Te3[0],$Te1[2],$Te1[2] + XOR $K[0],$Te1[1],$Te1[1] ; s[0] +|| XOR $K[1],$Te1[2],$Te1[2] ; s[1] + + PACK2 $Te1[3],$Te0[2],$Te1[3] +|| PACK2 $Te1[0],$Te0[3],$Te1[0] + PACK2 $Te3[1],$Te2[0],$Te3[1] +|| PACK2 $Te3[2],$Te2[1],$Te3[2] +|| BNOP RA + PACKL4 $Te3[1],$Te1[3],$Te1[3] +|| PACKL4 $Te3[2],$Te1[0],$Te1[0] + XOR $K[2],$Te1[3],$Te1[3] ; s[2] +|| XOR $K[3],$Te1[0],$Te1[0] ; s[3] + + MV $Te1[1],A8 +|| MV $Te1[2],A9 + MV $Te1[3],B8 +|| MV $Te1[0],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc + + .global _AES_decrypt +_AES_decrypt: + .asmfunc + MVK 1,B2 +__decrypt: + .if __TI_EABI__ + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL \$PCR_OFFSET(AES_Td,__decrypt),$TEA +|| ADDKPC __decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH \$PCR_OFFSET(AES_Td,__decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .else + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL (AES_Td-__decrypt),$TEA +|| ADDKPC __decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH (AES_Td-__decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .endif + LDW *$KPA++[2],$Td0[0] ; zero round key +|| LDW *$KPB++[2],$Td0[1] +|| MVK 60,A0 +|| ADD B0,$TEA,$TEA ; AES_Td + LDW *KEY[A0],B0 ; rounds +|| MVK 1024,A0 ; sizeof(AES_Td) + LDW *$KPA++[2],$Td0[2] +|| LDW *$KPB++[2],$Td0[3] +|| MV $TEA,$TEB + NOP + .if .BIG_ENDIAN + MV A9,$s[0] +|| MV A8,$s[1] +|| MV B9,$s[2] +|| MV B8,$s[3] + .else + MV A8,$s[0] +|| MV A9,$s[1] +|| MV B8,$s[2] +|| MV B9,$s[3] + .endif + XOR $Td0[0],$s[0],$s[0] +|| XOR $Td0[1],$s[1],$s[1] +|| LDW *$KPA++[2],$K[0] ; 1st round key +|| LDW *$KPB++[2],$K[1] + SUB B0,2,B0 + + SPLOOPD 13 +|| MVC B0,ILC +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] +;;==================================================================== + EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] + LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDW *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDW *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDW *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDW *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDW *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDW *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDW *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDW *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDW *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDW *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| ROTL 
$Td3[1],TBL3,$Td1[0] ; t0 +|| ROTL $Td1[0],TBL1,$Td3[1] ; t1 +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDW *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDW *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| ROTL $Td1[1],TBL1,$Td3[0] ; t2 +|| ROTL $Td3[0],TBL3,$Td1[1] ; t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDW *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 +|| ROTL $Td2[2],TBL2,$Td2[2] ; t0 +|| ROTL $Td2[3],TBL2,$Td2[3] ; t1 +|| XOR $K[0],$Td1[0],$s[0] +|| XOR $K[1],$Td3[1],$s[1] + ROTL $Td1[3],TBL1,$Td3[2] ; t0 +|| ROTL $Td3[2],TBL3,$Td1[3] ; t1 +|| XOR $K[2],$Td3[0],$s[2] +|| XOR $K[3],$Td1[1],$s[3] +|| LDW *$KPA++[2],$K[0] ; next round key +|| LDW *$KPB++[2],$K[1] + ROTL $Td2[0],TBL2,$Td2[0] ; t2 +|| ROTL $Td2[1],TBL2,$Td2[1] ; t3 +|| XOR $s[0],$Td2[2],$s[0] +|| XOR $s[1],$Td2[3],$s[1] +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] + ROTL $Td3[3],TBL3,$Td1[2] ; t2 +|| ROTL $Td1[2],TBL1,$Td3[3] ; t3 +|| XOR $s[0],$Td3[2],$s[0] +|| XOR $s[1],$Td1[3],$s[1] + XOR $s[2],$Td2[0],$s[2] +|| XOR $s[3],$Td2[1],$s[3] +|| XOR $s[0],$Td0[0],$s[0] +|| XOR $s[1],$Td0[1],$s[1] + SPKERNEL +|| XOR.L $s[2],$Td1[2],$s[2] +|| XOR.L $s[3],$Td3[3],$s[3] +;;==================================================================== + ADD.D ${TEA},A0,${TEA} ; point to Td4 +|| ADD.D ${TEB},A0,${TEB} +|| EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] + LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDBU *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDBU *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDBU *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDBU *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDBU *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDBU *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDBU *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDBU *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDBU *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDBU *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDBU *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDBU *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDBU *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDBU *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 + + .if .BIG_ENDIAN + PACK2 $Td0[0],$Td1[3],$Td0[0] +|| PACK2 $Td0[1],$Td1[0],$Td0[1] + PACK2 $Td2[2],$Td3[1],$Td2[2] +|| PACK2 $Td2[3],$Td3[2],$Td2[3] + PACKL4 $Td0[0],$Td2[2],$Td0[0] +|| PACKL4 $Td0[1],$Td2[3],$Td0[1] + XOR $K[0],$Td0[0],$Td0[0] ; s[0] +|| XOR $K[1],$Td0[1],$Td0[1] ; s[1] + + PACK2 $Td0[2],$Td1[1],$Td0[2] +|| PACK2 $Td0[3],$Td1[2],$Td0[3] + PACK2 $Td2[0],$Td3[3],$Td2[0] +|| PACK2 $Td2[1],$Td3[0],$Td2[1] +|| BNOP RA + PACKL4 $Td0[2],$Td2[0],$Td0[2] +|| PACKL4 $Td0[3],$Td2[1],$Td0[3] + XOR $K[2],$Td0[2],$Td0[2] ; s[2] +|| XOR $K[3],$Td0[3],$Td0[3] ; s[3] + + MV $Td0[0],A9 +|| MV $Td0[1],A8 + MV $Td0[2],B9 +|| MV $Td0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Td1[3],$Td0[0],$Td1[3] +|| PACK2 $Td1[0],$Td0[1],$Td1[0] + PACK2 
$Td3[1],$Td2[2],$Td3[1] +|| PACK2 $Td3[2],$Td2[3],$Td3[2] + PACKL4 $Td3[1],$Td1[3],$Td1[3] +|| PACKL4 $Td3[2],$Td1[0],$Td1[0] + XOR $K[0],$Td1[3],$Td1[3] ; s[0] +|| XOR $K[1],$Td1[0],$Td1[0] ; s[1] + + PACK2 $Td1[1],$Td0[2],$Td1[1] +|| PACK2 $Td1[2],$Td0[3],$Td1[2] + PACK2 $Td3[3],$Td2[0],$Td3[3] +|| PACK2 $Td3[0],$Td2[1],$Td3[0] +|| BNOP RA + PACKL4 $Td3[3],$Td1[1],$Td1[1] +|| PACKL4 $Td3[0],$Td1[2],$Td1[2] + XOR $K[2],$Td1[1],$Td1[1] ; s[2] +|| XOR $K[3],$Td1[2],$Td1[2] ; s[3] + + MV $Td1[3],A8 +|| MV $Td1[0],A9 + MV $Td1[1],B8 +|| MV $Td1[2],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc +___ +{ +my @K=(@K,@s); # extended key +my @Te4=map("B$_",(16..19)); + +my @Kx9=@Te0; # used in AES_set_decrypt_key +my @KxB=@Te1; +my @KxD=@Te2; +my @KxE=@Te3; + +$code.=<<___; + .asg OUT,BITS + + .global _AES_set_encrypt_key +_AES_set_encrypt_key: +__set_encrypt_key: + .asmfunc + MV INP,A0 +|| SHRU BITS,5,BITS ; 128-192-256 -> 4-6-8 +|| MV KEY,A1 + [!A0] B RA +||[!A0] MVK -1,RET +||[!A0] MVK 1,A1 ; only one B RA + [!A1] B RA +||[!A1] MVK -1,RET +||[!A1] MVK 0,A0 +|| MVK 0,B0 +|| MVK 0,A1 + [A0] LDNDW *INP++,A9:A8 +|| [A0] CMPEQ 4,BITS,B0 +|| [A0] CMPLT 3,BITS,A1 + [B0] B key128? +|| [A1] LDNDW *INP++,B9:B8 +|| [A0] CMPEQ 6,BITS,B0 +|| [A0] CMPLT 5,BITS,A1 + [B0] B key192? +|| [A1] LDNDW *INP++,B17:B16 +|| [A0] CMPEQ 8,BITS,B0 +|| [A0] CMPLT 7,BITS,A1 + [B0] B key256? +|| [A1] LDNDW *INP++,B19:B18 + + .if __TI_EABI__ + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA +|| [A0] ADDKPC __set_encrypt_key,B6 + [A0] MVKH \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + .else + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL (AES_Te4-__set_encrypt_key),$TEA +|| [A0] ADDKPC __set_encrypt_key,B6 + [A0] MVKH (AES_Te4-__set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + .endif + NOP + NOP + + BNOP RA,5 +|| MVK -2,RET ; unknown bit length +|| MVK 0,B0 ; redundant +;;==================================================================== +;;==================================================================== +key128?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$Te4[2] +|| MV B8,$K[3] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$Te4[2] +|| MV B9,$K[3] + .endif + + MVK 256,A0 +|| MVK 9,B0 + + SPLOOPD 14 +|| MVC B0,ILC +|| MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[2] +|| EXTU $K[3],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[3],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + SPKERNEL +;;==================================================================== + BNOP RA + MV $Te4[2],$K[2] +|| STW $K[0],*$KPA++[2] +|| STW 
$K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 10,B0 ; rounds + STW B0,*++${KPB}[15] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key192?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$Te4[2] +|| MV B16,$K[5] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$Te4[2] +|| MV B17,$K[5] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop192?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[4] +|| EXTU $K[5],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[5],A0 +|| EXTU $K[5],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[5],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + BDEC loop192?,B0 +|| XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + MV $Te4[2],$K[2] +|| XOR $K[3],$K[4],$Te4[2] ; K[4] + XOR $Te4[2],$K[5],$K[5] ; K[5] +;;==================================================================== + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 12,B0 ; rounds + STW B0,*++${KPB}[7] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key256?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$K[4] +|| MV B16,$K[5] +|| MV B19,$Te4[2] +|| MV B18,$K[7] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$K[4] +|| MV B17,$K[5] +|| MV B18,$Te4[2] +|| MV B19,$K[7] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop256?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[6] +|| EXTU $K[7],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[7],A0 +|| EXTU $K[7],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[7],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + STW $K[6],*$KPA++[2] +|| STW $K[7],*$KPB++[2] +|| XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] +||[!B0] B done256? + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] +||[!B0] B done256? 
+ .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + + MV $Te4[2],$K[2] +|| [B0] EXTU $K[3],EXT0,24,$Te4[0] +|| [B0] SUB B0,1,B0 + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT1,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT2,24,A0 +|| EXTU $K[3],EXT3,24,$Te4[3] + + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + NOP 3 + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + NOP 3 + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + + XOR $Te4[3],$K[4],$Te4[0] ; K[4] + XOR $Te4[0],$K[5],$K[5] ; K[5] + MV $Te4[0],$K[4] +|| XOR $K[5],$K[6],$Te4[2] ; K[6] + XOR $Te4[2],$K[7],$K[7] ; K[7] +;;==================================================================== +done256?: + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 14,B0 ; rounds + STW B0,*--${KPB}[1] + MVK 0,RET + .endasmfunc + + .global _AES_set_decrypt_key +_AES_set_decrypt_key: + .asmfunc + B __set_encrypt_key ; guarantee local call + MV KEY,B30 ; B30 is not modified + MV RA, B31 ; B31 is not modified + ADDKPC ret?,RA,2 +ret?: ; B0 holds rounds or zero + [!B0] BNOP B31 ; return if zero + [B0] SHL B0,4,A0 ; offset to last round key + [B0] SHRU B0,1,B1 + [B0] SUB B1,1,B1 + [B0] MVK 0x0000001B,B3 ; AES polynomial + [B0] MVKH 0x07000000,B3 + + SPLOOPD 9 ; flip round keys +|| MVC B1,ILC +|| MV B30,$KPA +|| ADD B30,A0,$KPB +|| MVK 16,A0 ; sizeof(round key) +;;==================================================================== + LDW *${KPA}[0],A16 +|| LDW *${KPB}[0],B16 + LDW *${KPA}[1],A17 +|| LDW *${KPB}[1],B17 + LDW *${KPA}[2],A18 +|| LDW *${KPB}[2],B18 + LDW *${KPA}[3],A19 +|| ADD $KPA,A0,$KPA +|| LDW *${KPB}[3],B19 +|| SUB $KPB,A0,$KPB + NOP + STW B16,*${KPA}[-4] +|| STW A16,*${KPB}[4] + STW B17,*${KPA}[-3] +|| STW A17,*${KPB}[5] + STW B18,*${KPA}[-2] +|| STW A18,*${KPB}[6] + STW B19,*${KPA}[-1] +|| STW A19,*${KPB}[7] + SPKERNEL +;;==================================================================== + SUB B0,1,B0 ; skip last round +|| ADD B30,A0,$KPA ; skip first round +|| ADD B30,A0,$KPB +|| MVC GFPGFR,B30 ; save GFPGFR + LDW *${KPA}[0],$K[0] +|| LDW *${KPB}[1],$K[1] +|| MVC B3,GFPGFR + LDW *${KPA}[2],$K[2] +|| LDW *${KPB}[3],$K[3] + MVK 0x00000909,A24 +|| MVK 0x00000B0B,B24 + MVKH 0x09090000,A24 +|| MVKH 0x0B0B0000,B24 + MVC B0,ILC +|| SUB B0,1,B0 + + GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| GMPY4 $K[1],A24,$Kx9[1] +|| MVK 0x00000D0D,A25 +|| MVK 0x00000E0E,B25 + GMPY4 $K[2],A24,$Kx9[2] +|| GMPY4 $K[3],A24,$Kx9[3] +|| MVKH 0x0D0D0000,A25 +|| MVKH 0x0E0E0000,B25 + + GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| GMPY4 $K[1],B24,$KxB[1] + GMPY4 $K[2],B24,$KxB[2] +|| GMPY4 $K[3],B24,$KxB[3] + + SPLOOP 11 ; InvMixColumns +;;==================================================================== + GMPY4 $K[0],A25,$KxD[0] ; ·0x0D +|| GMPY4 $K[1],A25,$KxD[1] +|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16 +|| SWAP2 $Kx9[1],$Kx9[1] +|| MV $K[0],$s[0] ; this or DINT +|| MV $K[1],$s[1] +|| [B0] LDW *${KPA}[4],$K[0] +|| [B0] LDW *${KPB}[5],$K[1] + GMPY4 $K[2],A25,$KxD[2] +|| GMPY4 $K[3],A25,$KxD[3] +|| SWAP2 $Kx9[2],$Kx9[2] +|| SWAP2 $Kx9[3],$Kx9[3] +|| MV $K[2],$s[2] +|| MV $K[3],$s[3] +|| [B0] LDW *${KPA}[6],$K[2] +|| [B0] LDW *${KPB}[7],$K[3] + + GMPY4 
$s[0],B25,$KxE[0] ; ·0x0E +|| GMPY4 $s[1],B25,$KxE[1] +|| XOR $Kx9[0],$KxB[0],$KxB[0] +|| XOR $Kx9[1],$KxB[1],$KxB[1] + GMPY4 $s[2],B25,$KxE[2] +|| GMPY4 $s[3],B25,$KxE[3] +|| XOR $Kx9[2],$KxB[2],$KxB[2] +|| XOR $Kx9[3],$KxB[3],$KxB[3] + + ROTL $KxB[0],TBL3,$KxB[0] +|| ROTL $KxB[1],TBL3,$KxB[1] +|| SWAP2 $KxD[0],$KxD[0] ; rotate by 16 +|| SWAP2 $KxD[1],$KxD[1] + ROTL $KxB[2],TBL3,$KxB[2] +|| ROTL $KxB[3],TBL3,$KxB[3] +|| SWAP2 $KxD[2],$KxD[2] +|| SWAP2 $KxD[3],$KxD[3] + + XOR $KxE[0],$KxD[0],$KxE[0] +|| XOR $KxE[1],$KxD[1],$KxE[1] +|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| [B0] GMPY4 $K[1],A24,$Kx9[1] +|| ADDAW $KPA,4,$KPA + XOR $KxE[2],$KxD[2],$KxE[2] +|| XOR $KxE[3],$KxD[3],$KxE[3] +|| [B0] GMPY4 $K[2],A24,$Kx9[2] +|| [B0] GMPY4 $K[3],A24,$Kx9[3] +|| ADDAW $KPB,4,$KPB + + XOR $KxB[0],$KxE[0],$KxE[0] +|| XOR $KxB[1],$KxE[1],$KxE[1] +|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| [B0] GMPY4 $K[1],B24,$KxB[1] + XOR $KxB[2],$KxE[2],$KxE[2] +|| XOR $KxB[3],$KxE[3],$KxE[3] +|| [B0] GMPY4 $K[2],B24,$KxB[2] +|| [B0] GMPY4 $K[3],B24,$KxB[3] +|| STW $KxE[0],*${KPA}[-4] +|| STW $KxE[1],*${KPB}[-3] + STW $KxE[2],*${KPA}[-2] +|| STW $KxE[3],*${KPB}[-1] +|| [B0] SUB B0,1,B0 + SPKERNEL +;;==================================================================== + BNOP B31,3 + MVC B30,GFPGFR ; restore GFPGFR(*) + MVK 0,RET + .endasmfunc +___ +# (*) Even though ABI doesn't specify GFPGFR as non-volatile, there +# are code samples out there that *assume* its default value. +} +{ +my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8"); +$code.=<<___; + .global _AES_ctr32_encrypt +_AES_ctr32_encrypt: + .asmfunc + LDNDW *${ivp}[0],A31:A30 ; load counter value +|| MV $blocks,A2 ; reassign $blocks +|| DMV RA,$key,B27:B26 ; reassign RA and $key + LDNDW *${ivp}[1],B31:B30 +|| MVK 0,B2 ; don't let __encrypt load input +|| MVK 0,A1 ; and postpone writing output + .if .BIG_ENDIAN + NOP + .else + NOP 4 + SWAP2 B31,B31 ; keep least significant 32 bits + SWAP4 B31,B31 ; in host byte order + .endif +ctr32_loop?: + [A2] BNOP __encrypt +|| [A1] XOR A29,A9,A9 ; input^Ek(counter) +|| [A1] XOR A28,A8,A8 +|| [A2] LDNDW *INP++,A29:A28 ; load input + [!A2] BNOP B27 ; return +|| [A1] XOR B29,B9,B9 +|| [A1] XOR B28,B8,B8 +|| [A2] LDNDW *INP++,B29:B28 + .if .BIG_ENDIAN + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] DMV A31,A30,A9:A8 ; pass counter value to __encrypt + [A1] STNDW B9:B8,*OUT++ +|| [A2] DMV B31,B30,B9:B8 +|| [A2] ADD B30,1,B30 ; counter++ + .else + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] DMV A31,A30,A9:A8 +|| [A2] SWAP2 B31,B0 +|| [A2] ADD B31,1,B31 ; counter++ + [A1] STNDW B9:B8,*OUT++ +|| [A2] MV B30,B8 +|| [A2] SWAP4 B0,B9 + .endif + [A2] ADDKPC ctr32_loop?,RA ; return to ctr32_loop? 
+|| [A2] MV B26,KEY ; pass $key +|| [A2] SUB A2,1,A2 ; $blocks-- +||[!A1] MVK 1,A1 + NOP + NOP + .endasmfunc +___ +} +# Tables are kept in endian-neutral manner +$code.=<<___; + .if __TI_EABI__ + .sect ".text:aes_asm.const" + .else + .sect ".const:aes_asm" + .endif + .align 128 +AES_Te: + .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 + .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d + .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd + .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 + .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 + .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d + .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 + .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a + .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d + .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 + .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb + .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b + .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 + .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea + .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 + .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b + .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c + .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a + .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 + .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f + .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 + .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 + .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 + .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f + .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 + .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e + .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 + .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 + .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 + .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d + .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 + .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f + .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e + .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e + .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 + .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb + .byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d + .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce + .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e + .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 + .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 + .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c + .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f + .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed + .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 + .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b + .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 + .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a + .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a + .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 + .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 + .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 + .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 + .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 + .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 + .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 + .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe + .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a + .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc + .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 + .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 + .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 + .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a + .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d + .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 + .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f + .byte 0xbe,0x5f,0x5f,0xe1, 
0x35,0x97,0x97,0xa2 + .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 + .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 + .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 + .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 + .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 + .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 + .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f + .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e + .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 + .byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 + .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c + .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 + .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 + .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 + .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e + .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a + .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 + .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e + .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 + .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 + .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b + .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 + .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 + .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 + .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 + .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa + .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 + .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e + .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 + .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 + .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 + .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 + .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 + .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c + .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 + .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc + .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 + .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 + .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa + .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 + .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 + .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f + .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 + .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 + .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 + .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 + .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 + .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 + .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 + .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 + .byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 + .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff + .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a + .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 + .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 + .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 + .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 + .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 + .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 + .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc + .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a +AES_Te4: + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 
0xe3, 0x2f, 0x84 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +rcon: + .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 + .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 + .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 + .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 + .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + .align 128 +AES_Td: + .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 + .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 + .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 + .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 + .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 + .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 + .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 + .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f + .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 + .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 + .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 + .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 + .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 + .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda + .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 + .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 + .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 + .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd + .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 + .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 + .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 + .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 + .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 + .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 + .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 + .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 + .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 + .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a + .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 + .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 + .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 + .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c + .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 + .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 + .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 + .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a + .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 + .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 + .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa + .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 + .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d + .byte 0xdd,0x3e,0x05,0xae, 
0x4d,0xe6,0xbd,0x46 + .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 + .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff + .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 + .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 + .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 + .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb + .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 + .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 + .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 + .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e + .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 + .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 + .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 + .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a + .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f + .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e + .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 + .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 + .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 + .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d + .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad + .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 + .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c + .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd + .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc + .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 + .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc + .byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 + .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 + .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 + .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 + .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d + .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 + .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 + .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 + .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 + .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a + .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef + .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 + .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 + .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 + .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 + .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d + .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 + .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 + .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 + .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c + .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 + .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 + .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b + .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 + .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 + .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e + .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 + .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce + .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 + .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 + .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 + .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 + .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 + .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 + .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f + .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d + .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf + .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b + .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f + .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d + .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e + .byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 + .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 + .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a + .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 + 
.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 + .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c + .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f + .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf + .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b + .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 + .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e + .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f + .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c + .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 + .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde + .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 + .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 + .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 +AES_Td4: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 +___ + +print $code; +close STDOUT; diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-ia64.S b/deps/openssl/openssl/crypto/aes/asm/aes-ia64.S index 7f6c4c3662..f7f1f63c9d 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-ia64.S +++ b/deps/openssl/openssl/crypto/aes/asm/aes-ia64.S @@ -1,3 +1,10 @@ +// Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// // ==================================================================== // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL // project. Rights for redistribution and usage in source and binary @@ -10,7 +17,7 @@ // 'and' which in turn can be assigned to M-port [there're double as // much M-ports as there're I-ports on Itanium 2]. 
By sacrificing few // registers for small constants (255, 24 and 16) to be used with -// 'shr' and 'and' instructions I can achieve better ILP, Intruction +// 'shr' and 'and' instructions I can achieve better ILP, Instruction // Level Parallelism, and performance. This code outperforms GCC 3.3 // generated code by over factor of 2 (two), GCC 3.4 - by 70% and // HP C - by 40%. Measured best-case scenario, i.e. aligned diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-mips.pl b/deps/openssl/openssl/crypto/aes/asm/aes-mips.pl index 4de3ee26bb..439578d9c2 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-mips.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-mips.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -57,6 +64,7 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { + $PTR_LA="dla"; $PTR_ADD="dadd"; # incidentally works even on n32 $PTR_SUB="dsub"; # incidentally works even on n32 $PTR_INS="dins"; @@ -65,6 +73,7 @@ if ($flavour =~ /64|n32/i) { $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { + $PTR_LA="la"; $PTR_ADD="add"; $PTR_SUB="sub"; $PTR_INS="ins"; @@ -81,13 +90,13 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; $big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); -for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; my ($MSB,$LSB)=(0,3); # automatically converted to little-endian @@ -110,7 +119,7 @@ ___ {{{ my $FRAMESIZE=16*$SZREG; -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000"; my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7); my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); @@ -646,7 +655,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Tbl,AES_Te # PIC-ified 'load address' + $PTR_LA $Tbl,AES_Te # PIC-ified 'load address' lwl $s0,0+$MSB($inp) lwl $s1,4+$MSB($inp) @@ -1217,7 +1226,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Tbl,AES_Td # PIC-ified 'load address' + $PTR_LA $Tbl,AES_Td # PIC-ified 'load address' lwl $s0,0+$MSB($inp) lwl $s1,4+$MSB($inp) @@ -1267,7 +1276,7 @@ ___ {{{ my $FRAMESIZE=8*$SZREG; -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0xc000f008" : "0xc0000000"; my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3); my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); @@ -1528,9 +1537,9 @@ _mips_AES_set_encrypt_key: nop .end _mips_AES_set_encrypt_key -.globl private_AES_set_encrypt_key -.ent private_AES_set_encrypt_key -private_AES_set_encrypt_key: +.globl AES_set_encrypt_key +.ent AES_set_encrypt_key +AES_set_encrypt_key: .frame $sp,$FRAMESIZE,$ra .mask $SAVED_REGS_MASK,-$SZREG .set noreorder @@ -1552,11 +1561,11 @@ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue ___ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification .cplocal $Tbl - .cpsetup $pf,$zero,private_AES_set_encrypt_key + .cpsetup $pf,$zero,AES_set_encrypt_key ___ $code.=<<___; .set reorder - la $Tbl,AES_Te4 # PIC-ified 'load address' + $PTR_LA $Tbl,AES_Te4 # PIC-ified 'load address' bal _mips_AES_set_encrypt_key @@ -1575,7 +1584,7 @@ ___ $code.=<<___; jr $ra $PTR_ADD $sp,$FRAMESIZE -.end private_AES_set_encrypt_key +.end AES_set_encrypt_key ___ my ($head,$tail)=($inp,$bits); @@ -1583,9 +1592,9 @@ my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2); $code.=<<___; .align 5 -.globl private_AES_set_decrypt_key -.ent private_AES_set_decrypt_key -private_AES_set_decrypt_key: +.globl AES_set_decrypt_key +.ent AES_set_decrypt_key +AES_set_decrypt_key: .frame $sp,$FRAMESIZE,$ra .mask $SAVED_REGS_MASK,-$SZREG .set noreorder @@ -1607,11 +1616,11 @@ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue ___ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification .cplocal $Tbl - .cpsetup $pf,$zero,private_AES_set_decrypt_key + .cpsetup $pf,$zero,AES_set_decrypt_key ___ $code.=<<___; .set reorder - la $Tbl,AES_Te4 # PIC-ified 'load address' + $PTR_LA $Tbl,AES_Te4 # PIC-ified 'load address' bal _mips_AES_set_encrypt_key @@ -1729,7 +1738,7 @@ ___ $code.=<<___; jr $ra $PTR_ADD $sp,$FRAMESIZE -.end private_AES_set_decrypt_key +.end AES_set_decrypt_key ___ }}} diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-parisc.pl b/deps/openssl/openssl/crypto/aes/asm/aes-parisc.pl index 714dcfbbe3..2c785bc56d 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-parisc.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-parisc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-ppc.pl b/deps/openssl/openssl/crypto/aes/asm/aes-ppc.pl index 5b83016efa..1558d8e454 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-ppc.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -19,7 +26,7 @@ # February 2010 # # Rescheduling instructions to favour Power6 pipeline gave 10% -# performance improvement on the platfrom in question (and marginal +# performance improvement on the platform in question (and marginal # improvement even on others). It should be noted that Power6 fails # to process byte in 18 cycles, only in 23, because it fails to issue # 4 load instructions in two cycles, only in 3. As result non-compact diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-s390x.pl b/deps/openssl/openssl/crypto/aes/asm/aes-s390x.pl index a8f4d29d1c..fd8a737166 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-s390x.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-s390x.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -92,7 +99,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $softonly=0; # allow hardware support @@ -779,10 +786,10 @@ ___ $code.=<<___; # void AES_set_encrypt_key(const unsigned char *in, int bits, # AES_KEY *key) { -.globl private_AES_set_encrypt_key -.type private_AES_set_encrypt_key,\@function +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,\@function .align 16 -private_AES_set_encrypt_key: +AES_set_encrypt_key: _s390x_AES_set_encrypt_key: lghi $t0,0 cl${g}r $inp,$t0 @@ -806,7 +813,7 @@ _s390x_AES_set_encrypt_key: .Lproceed: ___ $code.=<<___ if (!$softonly); - # convert bits to km code, [128,192,256]->[18,19,20] + # convert bits to km(c) code, [128,192,256]->[18,19,20] lhi %r5,-128 lhi %r0,18 ar %r5,$bits @@ -814,13 +821,10 @@ $code.=<<___ if (!$softonly); ar %r5,%r0 larl %r1,OPENSSL_s390xcap_P - lg %r0,0(%r1) - tmhl %r0,0x4000 # check for message-security assist - jz .Lekey_internal - llihh %r0,0x8000 srlg %r0,%r0,0(%r5) - ng %r0,48(%r1) # check kmc capability vector + ng %r0,32(%r1) # check availability of both km... + ng %r0,48(%r1) # ...and kmc support for given key length jz .Lekey_internal lmg %r0,%r1,0($inp) # just copy 128 bits... 
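The s390x capability logic above now requires both the KM and the KMC facility bits (offsets 32 and 48 into OPENSSL_s390xcap_P) before committing to the hardware path, and the bits-to-function-code conversion it performs with lhi/ar/srl is compact enough to restate in C. A sketch, assuming the documented CPACF function codes 18/19/20 for AES-128/192/256; `aes_km_code` is a hypothetical name:

    /* Map an AES key length in bits to the s390x KM/KMC function code:
     * 128 -> 18, 192 -> 19, 256 -> 20.  (bits - 128) is 0, 64 or 128,
     * so shifting right by 6 yields the 0, 1 or 2 added to the base. */
    static int aes_km_code(int bits)
    {
        return 18 + ((bits - 128) >> 6);
    }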
@@ -835,7 +839,7 @@ $code.=<<___ if (!$softonly); stg %r1,24($key) 1: st $bits,236($key) # save bits [for debugging purposes] lgr $t0,%r5 - st %r5,240($key) # save km code + st %r5,240($key) # save km(c) code lghi %r2,0 br %r14 ___ @@ -1059,14 +1063,14 @@ $code.=<<___; .Lminus1: lghi %r2,-1 br $ra -.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key +.size AES_set_encrypt_key,.-AES_set_encrypt_key # void AES_set_decrypt_key(const unsigned char *in, int bits, # AES_KEY *key) { -.globl private_AES_set_decrypt_key -.type private_AES_set_decrypt_key,\@function +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,\@function .align 16 -private_AES_set_decrypt_key: +AES_set_decrypt_key: #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key! bras $ra,_s390x_AES_set_encrypt_key @@ -1166,7 +1170,7 @@ $code.=<<___; lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size AES_set_decrypt_key,.-AES_set_decrypt_key ___ ######################################################################## @@ -1432,12 +1436,7 @@ $code.=<<___ if (!$softonly); .Lctr32_hw_switch: ___ -$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower - larl $s0,OPENSSL_s390xcap_P - lg $s0,8($s0) - tmhh $s0,0x0004 # check for message_security-assist-4 - jz .Lctr32_km_loop - +$code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower llgfr $s0,%r0 lgr $s1,%r1 larl %r1,OPENSSL_s390xcap_P @@ -1481,7 +1480,7 @@ $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower br $ra .align 16 ___ -$code.=<<___; +$code.=<<___ if (!$softonly); .Lctr32_km_loop: la $s2,16($sp) lgr $s3,$fp @@ -1568,8 +1567,8 @@ ___ } ######################################################################## -# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out, -# size_t len, const AES_KEY *key1, const AES_KEY *key2, +# void AES_xts_encrypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, # const unsigned char iv[16]); # { @@ -1937,8 +1936,8 @@ $code.=<<___; br $ra .size AES_xts_encrypt,.-AES_xts_encrypt ___ -# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, -# size_t len, const AES_KEY *key1, const AES_KEY *key2, +# void AES_xts_decrypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, # const unsigned char iv[16]); # $code.=<<___; @@ -2220,7 +2219,6 @@ ___ } $code.=<<___; .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" -.comm OPENSSL_s390xcap_P,80,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-sparcv9.pl b/deps/openssl/openssl/crypto/aes/asm/aes-sparcv9.pl index 403c4d1290..883fae820f 100755 --- a/deps/openssl/openssl/crypto/aes/asm/aes-sparcv9.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-sparcv9.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -30,10 +37,11 @@ # optimal decrypt procedure]. 
Compared to GNU C generated code both # procedures are more than 60% faster:-) -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } -if ($bits==64) { $bias=2047; $frame=192; } -else { $bias=0; $frame=112; } +$output = pop; +open STDOUT,">$output"; + +$frame="STACK_FRAME"; +$bias="STACK_BIAS"; $locals=16; $acc0="%l0"; @@ -74,11 +82,13 @@ sub _data_word() while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; } } -$code.=<<___ if ($bits==64); +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch -___ -$code.=<<___; +#endif .section ".text",#alloc,#execinstr .align 256 diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aes-x86_64.pl index 47f416375d..ce4ca30b1a 100755 --- a/deps/openssl/openssl/crypto/aes/asm/aes-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -37,7 +44,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $verticalspin=1; # unlike 32-bit version $verticalspin performs @@ -1282,13 +1289,13 @@ $code.=<<___; ___ } -# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, +# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, # AES_KEY *key) $code.=<<___; -.globl private_AES_set_encrypt_key -.type private_AES_set_encrypt_key,\@function,3 +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,\@function,3 .align 16 -private_AES_set_encrypt_key: +AES_set_encrypt_key: push %rbx push %rbp push %r12 # redundant, but allows to share @@ -1305,7 +1312,7 @@ private_AES_set_encrypt_key: add \$56,%rsp .Lenc_key_epilogue: ret -.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key +.size AES_set_encrypt_key,.-AES_set_encrypt_key .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent .align 16 @@ -1548,13 +1555,13 @@ $code.=<<___; ___ } -# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, +# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, # AES_KEY *key) $code.=<<___; -.globl private_AES_set_decrypt_key -.type private_AES_set_decrypt_key,\@function,3 +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,\@function,3 .align 16 -private_AES_set_decrypt_key: +AES_set_decrypt_key: push %rbx push %rbp push %r12 @@ -1623,7 +1630,7 @@ $code.=<<___; add \$56,%rsp .Ldec_key_epilogue: ret -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size AES_set_decrypt_key,.-AES_set_decrypt_key ___ # void AES_cbc_encrypt (const void char *inp, unsigned char *out, @@ -2770,13 +2777,13 @@ cbc_se_handler: .rva .LSEH_end_AES_decrypt .rva .LSEH_info_AES_decrypt - .rva .LSEH_begin_private_AES_set_encrypt_key - .rva .LSEH_end_private_AES_set_encrypt_key - .rva .LSEH_info_private_AES_set_encrypt_key + .rva 
.LSEH_begin_AES_set_encrypt_key + .rva .LSEH_end_AES_set_encrypt_key + .rva .LSEH_info_AES_set_encrypt_key - .rva .LSEH_begin_private_AES_set_decrypt_key - .rva .LSEH_end_private_AES_set_decrypt_key - .rva .LSEH_info_private_AES_set_decrypt_key + .rva .LSEH_begin_AES_set_decrypt_key + .rva .LSEH_end_AES_set_decrypt_key + .rva .LSEH_info_AES_set_decrypt_key .rva .LSEH_begin_AES_cbc_encrypt .rva .LSEH_end_AES_cbc_encrypt @@ -2792,11 +2799,11 @@ cbc_se_handler: .byte 9,0,0,0 .rva block_se_handler .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] -.LSEH_info_private_AES_set_encrypt_key: +.LSEH_info_AES_set_encrypt_key: .byte 9,0,0,0 .rva key_se_handler .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] -.LSEH_info_private_AES_set_decrypt_key: +.LSEH_info_AES_set_decrypt_key: .byte 9,0,0,0 .rva key_se_handler .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] diff --git a/deps/openssl/openssl/crypto/aes/asm/aesfx-sparcv9.pl b/deps/openssl/openssl/crypto/aes/asm/aesfx-sparcv9.pl new file mode 100644 index 0000000000..04b3cf7116 --- /dev/null +++ b/deps/openssl/openssl/crypto/aes/asm/aesfx-sparcv9.pl @@ -0,0 +1,1270 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# March 2016 +# +# Initial support for Fujitsu SPARC64 X/X+ comprises minimally +# required key setup and single-block procedures. +# +# April 2016 +# +# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means +# that parallelizeable nature of CBC decrypt and CTR is not utilized +# yet. CBC encrypt on the other hand is as good as it can possibly +# get processing one byte in 4.1 cycles with 128-bit key on SPARC64 X. +# This is ~6x faster than pure software implementation... +# +# July 2016 +# +# Switch from faligndata to fshiftorx, which allows to omit alignaddr +# instructions and improve single-block and short-input performance +# with misaligned data. + +$output = pop; +open STDOUT,">$output"; + +{ +my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5)); + +$code.=<<___; +#include "sparc_arch.h" + +#define LOCALS (STACK_BIAS+STACK_FRAME) + +.text + +.globl aes_fx_encrypt +.align 32 +aes_fx_encrypt: + and $inp, 7, $tmp ! is input aligned? + andn $inp, 7, $inp + ldd [$key + 0], %f6 ! round[0] + ldd [$key + 8], %f8 + mov %o7, %g1 + ld [$key + 240], $rounds + +1: call .+8 + add %o7, .Linp_align-1b, %o7 + + sll $tmp, 3, $tmp + ldd [$inp + 0], %f0 ! load input + brz,pt $tmp, .Lenc_inp_aligned + ldd [$inp + 8], %f2 + + ldd [%o7 + $tmp], %f14 ! shift left params + ldd [$inp + 16], %f4 + fshiftorx %f0, %f2, %f14, %f0 + fshiftorx %f2, %f4, %f14, %f2 + +.Lenc_inp_aligned: + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fxor %f0, %f6, %f0 ! ^=round[0] + fxor %f2, %f8, %f2 + ldd [$key + 32], %f6 ! 
round[2] + ldd [$key + 40], %f8 + add $key, 32, $key + sub $rounds, 4, $rounds + +.Loop_enc: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 + ldd [$key + 24], %f12 + add $key, 32, $key + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$key + 0], %f6 + ldd [$key + 8], %f8 + + brnz,a $rounds, .Loop_enc + sub $rounds, 2, $rounds + + andcc $out, 7, $tmp ! is output aligned? + andn $out, 7, $out + mov 0xff, $mask + srl $mask, $tmp, $mask + add %o7, 64, %o7 + sll $tmp, 3, $tmp + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [%o7 + $tmp], %f14 ! shift right params + + fmovd %f0, %f4 + faesenclx %f2, %f6, %f0 + faesenclx %f4, %f8, %f2 + + bnz,pn %icc, .Lenc_out_unaligned + mov %g1, %o7 + + std %f0, [$out + 0] + retl + std %f2, [$out + 8] + +.align 16 +.Lenc_out_unaligned: + add $out, 16, $inp + orn %g0, $mask, $tmp + fshiftorx %f0, %f0, %f14, %f4 + fshiftorx %f0, %f2, %f14, %f6 + fshiftorx %f2, %f2, %f14, %f8 + + stda %f4, [$out + $mask]0xc0 ! partial store + std %f6, [$out + 8] + stda %f8, [$inp + $tmp]0xc0 ! partial store + retl + nop +.type aes_fx_encrypt,#function +.size aes_fx_encrypt,.-aes_fx_encrypt + +.globl aes_fx_decrypt +.align 32 +aes_fx_decrypt: + and $inp, 7, $tmp ! is input aligned? + andn $inp, 7, $inp + ldd [$key + 0], %f6 ! round[0] + ldd [$key + 8], %f8 + mov %o7, %g1 + ld [$key + 240], $rounds + +1: call .+8 + add %o7, .Linp_align-1b, %o7 + + sll $tmp, 3, $tmp + ldd [$inp + 0], %f0 ! load input + brz,pt $tmp, .Ldec_inp_aligned + ldd [$inp + 8], %f2 + + ldd [%o7 + $tmp], %f14 ! shift left params + ldd [$inp + 16], %f4 + fshiftorx %f0, %f2, %f14, %f0 + fshiftorx %f2, %f4, %f14, %f2 + +.Ldec_inp_aligned: + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fxor %f0, %f6, %f0 ! ^=round[0] + fxor %f2, %f8, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + add $key, 32, $key + sub $rounds, 4, $rounds + +.Loop_dec: + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$key + 16], %f10 + ldd [$key + 24], %f12 + add $key, 32, $key + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + ldd [$key + 0], %f6 + ldd [$key + 8], %f8 + + brnz,a $rounds, .Loop_dec + sub $rounds, 2, $rounds + + andcc $out, 7, $tmp ! is output aligned? + andn $out, 7, $out + mov 0xff, $mask + srl $mask, $tmp, $mask + add %o7, 64, %o7 + sll $tmp, 3, $tmp + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [%o7 + $tmp], %f14 ! shift right params + + fmovd %f0, %f4 + faesdeclx %f2, %f6, %f0 + faesdeclx %f4, %f8, %f2 + + bnz,pn %icc, .Ldec_out_unaligned + mov %g1, %o7 + + std %f0, [$out + 0] + retl + std %f2, [$out + 8] + +.align 16 +.Ldec_out_unaligned: + add $out, 16, $inp + orn %g0, $mask, $tmp + fshiftorx %f0, %f0, %f14, %f4 + fshiftorx %f0, %f2, %f14, %f6 + fshiftorx %f2, %f2, %f14, %f8 + + stda %f4, [$out + $mask]0xc0 ! partial store + std %f6, [$out + 8] + stda %f8, [$inp + $tmp]0xc0 ! 
partial store + retl + nop +.type aes_fx_decrypt,#function +.size aes_fx_decrypt,.-aes_fx_decrypt +___ +} +{ +my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5)); +$code.=<<___; +.globl aes_fx_set_decrypt_key +.align 32 +aes_fx_set_decrypt_key: + b .Lset_encrypt_key + mov -1, $inc + retl + nop +.type aes_fx_set_decrypt_key,#function +.size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key + +.globl aes_fx_set_encrypt_key +.align 32 +aes_fx_set_encrypt_key: + mov 1, $inc + nop +.Lset_encrypt_key: + and $inp, 7, $tmp + andn $inp, 7, $inp + sll $tmp, 3, $tmp + mov %o7, %g1 + +1: call .+8 + add %o7, .Linp_align-1b, %o7 + + ldd [%o7 + $tmp], %f10 ! shift left params + mov %g1, %o7 + + cmp $bits, 192 + ldd [$inp + 0], %f0 + bl,pt %icc, .L128 + ldd [$inp + 8], %f2 + + be,pt %icc, .L192 + ldd [$inp + 16], %f4 + brz,pt $tmp, .L256aligned + ldd [$inp + 24], %f6 + + ldd [$inp + 32], %f8 + fshiftorx %f0, %f2, %f10, %f0 + fshiftorx %f2, %f4, %f10, %f2 + fshiftorx %f4, %f6, %f10, %f4 + fshiftorx %f6, %f8, %f10, %f6 + +.L256aligned: + mov 14, $bits + and $inc, `14*16`, $tmp + st $bits, [$out + 240] ! store rounds + add $out, $tmp, $out ! start or end of key schedule + sllx $inc, 4, $inc ! 16 or -16 +___ +for ($i=0; $i<6; $i++) { + $code.=<<___; + std %f0, [$out + 0] + faeskeyx %f6, `0x10+$i`, %f0 + std %f2, [$out + 8] + add $out, $inc, $out + faeskeyx %f0, 0x00, %f2 + std %f4, [$out + 0] + faeskeyx %f2, 0x01, %f4 + std %f6, [$out + 8] + add $out, $inc, $out + faeskeyx %f4, 0x00, %f6 +___ +} +$code.=<<___; + std %f0, [$out + 0] + faeskeyx %f6, `0x10+$i`, %f0 + std %f2, [$out + 8] + add $out, $inc, $out + faeskeyx %f0, 0x00, %f2 + std %f4,[$out + 0] + std %f6,[$out + 8] + add $out, $inc, $out + std %f0,[$out + 0] + std %f2,[$out + 8] + retl + xor %o0, %o0, %o0 ! return 0 + +.align 16 +.L192: + brz,pt $tmp, .L192aligned + nop + + ldd [$inp + 24], %f6 + fshiftorx %f0, %f2, %f10, %f0 + fshiftorx %f2, %f4, %f10, %f2 + fshiftorx %f4, %f6, %f10, %f4 + +.L192aligned: + mov 12, $bits + and $inc, `12*16`, $tmp + st $bits, [$out + 240] ! store rounds + add $out, $tmp, $out ! start or end of key schedule + sllx $inc, 4, $inc ! 16 or -16 +___ +for ($i=0; $i<8; $i+=2) { + $code.=<<___; + std %f0, [$out + 0] + faeskeyx %f4, `0x10+$i`, %f0 + std %f2, [$out + 8] + add $out, $inc, $out + faeskeyx %f0, 0x00, %f2 + std %f4, [$out + 0] + faeskeyx %f2, 0x00, %f4 + std %f0, [$out + 8] + add $out, $inc, $out + faeskeyx %f4, `0x10+$i+1`, %f0 + std %f2, [$out + 0] + faeskeyx %f0, 0x00, %f2 + std %f4, [$out + 8] + add $out, $inc, $out +___ +$code.=<<___ if ($i<6); + faeskeyx %f2, 0x00, %f4 +___ +} +$code.=<<___; + std %f0, [$out + 0] + std %f2, [$out + 8] + retl + xor %o0, %o0, %o0 ! return 0 + +.align 16 +.L128: + brz,pt $tmp, .L128aligned + nop + + ldd [$inp + 16], %f4 + fshiftorx %f0, %f2, %f10, %f0 + fshiftorx %f2, %f4, %f10, %f2 + +.L128aligned: + mov 10, $bits + and $inc, `10*16`, $tmp + st $bits, [$out + 240] ! store rounds + add $out, $tmp, $out ! start or end of key schedule + sllx $inc, 4, $inc ! 16 or -16 +___ +for ($i=0; $i<10; $i++) { + $code.=<<___; + std %f0, [$out + 0] + faeskeyx %f2, `0x10+$i`, %f0 + std %f2, [$out + 8] + add $out, $inc, $out + faeskeyx %f0, 0x00, %f2 +___ +} +$code.=<<___; + std %f0, [$out + 0] + std %f2, [$out + 8] + retl + xor %o0, %o0, %o0 ! 
return 0 +.type aes_fx_set_encrypt_key,#function +.size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key +___ +} +{ +my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5)); +my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7)); +my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift) + = map("%f$_",grep { !($_ & 1) } (16 .. 62)); +my ($ileft,$iright) = ($ialign,$oalign); + +$code.=<<___; +.globl aes_fx_cbc_encrypt +.align 32 +aes_fx_cbc_encrypt: + save %sp, -STACK_FRAME-16, %sp + srln $len, 4, $len + and $inp, 7, $ialign + andn $inp, 7, $inp + brz,pn $len, .Lcbc_no_data + sll $ialign, 3, $ileft + +1: call .+8 + add %o7, .Linp_align-1b, %o7 + + ld [$key + 240], $rounds + and $out, 7, $oalign + ld [$ivp + 0], %f0 ! load ivec + andn $out, 7, $out + ld [$ivp + 4], %f1 + sll $oalign, 3, $mask + ld [$ivp + 8], %f2 + ld [$ivp + 12], %f3 + + sll $rounds, 4, $rounds + add $rounds, $key, $end + ldd [$key + 0], $r0hi ! round[0] + ldd [$key + 8], $r0lo + + add $inp, 16, $inp + sub $len, 1, $len + ldd [$end + 0], $rlhi ! round[last] + ldd [$end + 8], $rllo + + mov 16, $inc + movrz $len, 0, $inc + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + ldd [%o7 + $ileft], $fshift ! shift left params + add %o7, 64, %o7 + ldd [$inp - 16], $in0 ! load input + ldd [$inp - 8], $in1 + ldda [$inp]0x82, $intail ! non-faulting load + brz $dir, .Lcbc_decrypt + add $inp, $inc, $inp ! inp+=16 + + fxor $r0hi, %f0, %f0 ! ivec^=round[0] + fxor $r0lo, %f2, %f2 + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + nop + +.Loop_cbc_enc: + fxor $in0, %f0, %f0 ! inp^ivec^round[0] + fxor $in1, %f2, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + add $key, 32, $end + sub $rounds, 16*6, $inner + +.Lcbc_enc: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lcbc_enc + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + movrz $len, 0, $inc + fmovd $intail, $in0 + ldd [$inp - 8], $in1 ! load next input block + ldda [$inp]0x82, $intail ! non-faulting load + add $inp, $inc, $inp ! inp+=16 + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fxor $r0hi, $in0, $in0 ! inp^=round[0] + fxor $r0lo, $in1, $in1 + + fmovd %f0, %f4 + faesenclx %f2, $rlhi, %f0 + faesenclx %f4, $rllo, %f2 + + brnz,pn $oalign, .Lcbc_enc_unaligned_out + nop + + std %f0, [$out + 0] + std %f2, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_cbc_enc + sub $len, 1, $len + + st %f0, [$ivp + 0] ! output ivec + st %f1, [$ivp + 4] + st %f2, [$ivp + 8] + st %f3, [$ivp + 12] + +.Lcbc_no_data: + ret + restore + +.align 32 +.Lcbc_enc_unaligned_out: + ldd [%o7 + $mask], $fshift ! shift right params + mov 0xff, $mask + srl $mask, $oalign, $mask + sub %g0, $ileft, $iright + + fshiftorx %f0, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + + stda %f6, [$out + $mask]0xc0 ! 
partial store + orn %g0, $mask, $mask + std %f8, [$out + 8] + add $out, 16, $out + brz $len, .Lcbc_enc_unaligned_out_done + sub $len, 1, $len + b .Loop_cbc_enc_unaligned_out + nop + +.align 32 +.Loop_cbc_enc_unaligned_out: + fmovd %f2, $outhead + fxor $in0, %f0, %f0 ! inp^ivec^round[0] + fxor $in1, %f2, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 48], %f10 ! round[3] + ldd [$key + 56], %f12 + + ldx [$inp - 16], %o0 + ldx [$inp - 8], %o1 + brz $ileft, .Lcbc_enc_aligned_inp + movrz $len, 0, $inc + + ldx [$inp], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 + +.Lcbc_enc_aligned_inp: + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$key + 64], %f6 ! round[4] + ldd [$key + 72], %f8 + add $key, 64, $end + sub $rounds, 16*8, $inner + + stx %o0, [%sp + LOCALS + 0] + stx %o1, [%sp + LOCALS + 8] + add $inp, $inc, $inp ! inp+=16 + nop + +.Lcbc_enc_unaligned: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lcbc_enc_unaligned + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + + ldd [%sp + LOCALS + 0], $in0 + ldd [%sp + LOCALS + 8], $in1 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fxor $r0hi, $in0, $in0 ! inp^=round[0] + fxor $r0lo, $in1, $in1 + + fmovd %f0, %f4 + faesenclx %f2, $rlhi, %f0 + faesenclx %f4, $rllo, %f2 + + fshiftorx $outhead, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + std %f6, [$out + 0] + std %f8, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_cbc_enc_unaligned_out + sub $len, 1, $len + +.Lcbc_enc_unaligned_out_done: + fshiftorx %f2, %f2, $fshift, %f8 + stda %f8, [$out + $mask]0xc0 ! partial store + + st %f0, [$ivp + 0] ! output ivec + st %f1, [$ivp + 4] + st %f2, [$ivp + 8] + st %f3, [$ivp + 12] + + ret + restore + +.align 32 +.Lcbc_decrypt: + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + fmovd %f0, $iv0 + fmovd %f2, $iv1 + +.Loop_cbc_dec: + fxor $in0, $r0hi, %f0 ! inp^round[0] + fxor $in1, $r0lo, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + add $key, 32, $end + sub $rounds, 16*6, $inner + +.Lcbc_dec: + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lcbc_dec + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + fxor $iv0, $rlhi, %f6 ! ivec^round[last] + fxor $iv1, $rllo, %f8 + fmovd $in0, $iv0 + fmovd $in1, $iv1 + + movrz $len, 0, $inc + fmovd $intail, $in0 + ldd [$inp - 8], $in1 ! load next input block + ldda [$inp]0x82, $intail ! non-faulting load + add $inp, $inc, $inp ! 
inp+=16 + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + + fmovd %f0, %f4 + faesdeclx %f2, %f6, %f0 + faesdeclx %f4, %f8, %f2 + + brnz,pn $oalign, .Lcbc_dec_unaligned_out + nop + + std %f0, [$out + 0] + std %f2, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_cbc_dec + sub $len, 1, $len + + st $iv0, [$ivp + 0] ! output ivec + st $iv0#lo, [$ivp + 4] + st $iv1, [$ivp + 8] + st $iv1#lo, [$ivp + 12] + + ret + restore + +.align 32 +.Lcbc_dec_unaligned_out: + ldd [%o7 + $mask], $fshift ! shift right params + mov 0xff, $mask + srl $mask, $oalign, $mask + sub %g0, $ileft, $iright + + fshiftorx %f0, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + + stda %f6, [$out + $mask]0xc0 ! partial store + orn %g0, $mask, $mask + std %f8, [$out + 8] + add $out, 16, $out + brz $len, .Lcbc_dec_unaligned_out_done + sub $len, 1, $len + b .Loop_cbc_dec_unaligned_out + nop + +.align 32 +.Loop_cbc_dec_unaligned_out: + fmovd %f2, $outhead + fxor $in0, $r0hi, %f0 ! inp^round[0] + fxor $in1, $r0lo, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$key + 48], %f10 ! round[3] + ldd [$key + 56], %f12 + + ldx [$inp - 16], %o0 + ldx [$inp - 8], %o1 + brz $ileft, .Lcbc_dec_aligned_inp + movrz $len, 0, $inc + + ldx [$inp], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 + +.Lcbc_dec_aligned_inp: + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + ldd [$key + 64], %f6 ! round[4] + ldd [$key + 72], %f8 + add $key, 64, $end + sub $rounds, 16*8, $inner + + stx %o0, [%sp + LOCALS + 0] + stx %o1, [%sp + LOCALS + 8] + add $inp, $inc, $inp ! inp+=16 + nop + +.Lcbc_dec_unaligned: + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lcbc_dec_unaligned + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + + fxor $iv0, $rlhi, %f6 ! ivec^round[last] + fxor $iv1, $rllo, %f8 + fmovd $in0, $iv0 + fmovd $in1, $iv1 + ldd [%sp + LOCALS + 0], $in0 + ldd [%sp + LOCALS + 8], $in1 + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fmovd %f0, %f4 + faesdeclx %f2, %f6, %f0 + faesdeclx %f4, %f8, %f2 + + fshiftorx $outhead, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + std %f6, [$out + 0] + std %f8, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_cbc_dec_unaligned_out + sub $len, 1, $len + +.Lcbc_dec_unaligned_out_done: + fshiftorx %f2, %f2, $fshift, %f8 + stda %f8, [$out + $mask]0xc0 ! partial store + + st $iv0, [$ivp + 0] ! 
output ivec + st $iv0#lo, [$ivp + 4] + st $iv1, [$ivp + 8] + st $iv1#lo, [$ivp + 12] + + ret + restore +.type aes_fx_cbc_encrypt,#function +.size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt +___ +} +{ +my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5)); +my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7)); +my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift) + = map("%f$_",grep { !($_ & 1) } (16 .. 62)); +my ($ileft,$iright) = ($ialign, $oalign); +my $one = "%f14"; + +$code.=<<___; +.globl aes_fx_ctr32_encrypt_blocks +.align 32 +aes_fx_ctr32_encrypt_blocks: + save %sp, -STACK_FRAME-16, %sp + srln $len, 0, $len + and $inp, 7, $ialign + andn $inp, 7, $inp + brz,pn $len, .Lctr32_no_data + sll $ialign, 3, $ileft + +.Lpic: call .+8 + add %o7, .Linp_align - .Lpic, %o7 + + ld [$key + 240], $rounds + and $out, 7, $oalign + ld [$ivp + 0], $ctr0 ! load counter + andn $out, 7, $out + ld [$ivp + 4], $ctr0#lo + sll $oalign, 3, $mask + ld [$ivp + 8], $ctr1 + ld [$ivp + 12], $ctr1#lo + ldd [%o7 + 128], $one + + sll $rounds, 4, $rounds + add $rounds, $key, $end + ldd [$key + 0], $r0hi ! round[0] + ldd [$key + 8], $r0lo + + add $inp, 16, $inp + sub $len, 1, $len + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + mov 16, $inc + movrz $len, 0, $inc + ldd [$end + 0], $rlhi ! round[last] + ldd [$end + 8], $rllo + + ldd [%o7 + $ileft], $fshift ! shiftleft params + add %o7, 64, %o7 + ldd [$inp - 16], $in0 ! load input + ldd [$inp - 8], $in1 + ldda [$inp]0x82, $intail ! non-faulting load + add $inp, $inc, $inp ! inp+=16 + + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + +.Loop_ctr32: + fxor $ctr0, $r0hi, %f0 ! counter^round[0] + fxor $ctr1, $r0lo, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + add $key, 32, $end + sub $rounds, 16*6, $inner + +.Lctr32_enc: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lctr32_enc + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + fxor $in0, $rlhi, %f6 ! inp^round[last] + fxor $in1, $rllo, %f8 + + movrz $len, 0, $inc + fmovd $intail, $in0 + ldd [$inp - 8], $in1 ! load next input block + ldda [$inp]0x82, $intail ! non-faulting load + add $inp, $inc, $inp ! inp+=16 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + fpadd32 $ctr1, $one, $ctr1 ! increment counter + + fmovd %f0, %f4 + faesenclx %f2, %f6, %f0 + faesenclx %f4, %f8, %f2 + + brnz,pn $oalign, .Lctr32_unaligned_out + nop + + std %f0, [$out + 0] + std %f2, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_ctr32 + sub $len, 1, $len + +.Lctr32_no_data: + ret + restore + +.align 32 +.Lctr32_unaligned_out: + ldd [%o7 + $mask], $fshift ! shift right params + mov 0xff, $mask + srl $mask, $oalign, $mask + sub %g0, $ileft, $iright + + fshiftorx %f0, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + + stda %f6, [$out + $mask]0xc0 ! 
partial store + orn %g0, $mask, $mask + std %f8, [$out + 8] + add $out, 16, $out + brz $len, .Lctr32_unaligned_out_done + sub $len, 1, $len + b .Loop_ctr32_unaligned_out + nop + +.align 32 +.Loop_ctr32_unaligned_out: + fmovd %f2, $outhead + fxor $ctr0, $r0hi, %f0 ! counter^round[0] + fxor $ctr1, $r0lo, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 48], %f10 ! round[3] + ldd [$key + 56], %f12 + + ldx [$inp - 16], %o0 + ldx [$inp - 8], %o1 + brz $ileft, .Lctr32_aligned_inp + movrz $len, 0, $inc + + ldx [$inp], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 + +.Lctr32_aligned_inp: + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$key + 64], %f6 ! round[4] + ldd [$key + 72], %f8 + add $key, 64, $end + sub $rounds, 16*8, $inner + + stx %o0, [%sp + LOCALS + 0] + stx %o1, [%sp + LOCALS + 8] + add $inp, $inc, $inp ! inp+=16 + nop + +.Lctr32_enc_unaligned: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lctr32_enc_unaligned + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + fpadd32 $ctr1, $one, $ctr1 ! increment counter + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + fxor $in0, $rlhi, %f6 ! inp^round[last] + fxor $in1, $rllo, %f8 + ldd [%sp + LOCALS + 0], $in0 + ldd [%sp + LOCALS + 8], $in1 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fmovd %f0, %f4 + faesenclx %f2, %f6, %f0 + faesenclx %f4, %f8, %f2 + + fshiftorx $outhead, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + std %f6, [$out + 0] + std %f8, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_ctr32_unaligned_out + sub $len, 1, $len + +.Lctr32_unaligned_out_done: + fshiftorx %f2, %f2, $fshift, %f8 + stda %f8, [$out + $mask]0xc0 ! partial store + + ret + restore +.type aes_fx_ctr32_encrypt_blocks,#function +.size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks + +.align 32 +.Linp_align: ! fshiftorx parameters for left shift toward %rs1 + .byte 0, 0, 64, 0, 0, 64, 0, -64 + .byte 0, 0, 56, 8, 0, 56, 8, -56 + .byte 0, 0, 48, 16, 0, 48, 16, -48 + .byte 0, 0, 40, 24, 0, 40, 24, -40 + .byte 0, 0, 32, 32, 0, 32, 32, -32 + .byte 0, 0, 24, 40, 0, 24, 40, -24 + .byte 0, 0, 16, 48, 0, 16, 48, -16 + .byte 0, 0, 8, 56, 0, 8, 56, -8 +.Lout_align: ! fshiftorx parameters for right shift toward %rs2 + .byte 0, 0, 0, 64, 0, 0, 64, 0 + .byte 0, 0, 8, 56, 0, 8, 56, -8 + .byte 0, 0, 16, 48, 0, 16, 48, -16 + .byte 0, 0, 24, 40, 0, 24, 40, -24 + .byte 0, 0, 32, 32, 0, 32, 32, -32 + .byte 0, 0, 40, 24, 0, 40, 24, -40 + .byte 0, 0, 48, 16, 0, 48, 16, -48 + .byte 0, 0, 56, 8, 0, 56, 8, -56 +.Lone: + .word 0, 1 +.asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ +} +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. 
+# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my ($ref,$opf); +my %visopf = ( "faligndata" => 0x048, + "bshuffle" => 0x04c, + "fpadd32" => 0x052, + "fxor" => 0x06c, + "fsrc2" => 0x078 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "alignaddr" => 0x018, + "bmask" => 0x019, + "alignaddrl" => 0x01a ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unfx { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my ($ref,$opf); +my %aesopf = ( "faesencx" => 0x90, + "faesdecx" => 0x91, + "faesenclx" => 0x92, + "faesdeclx" => 0x93, + "faeskeyx" => 0x94 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if (defined($opf=$aesopf{$mnemonic})) { + $rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2; + $rs2 = oct($rs2) if ($rs2 =~ /^0/); + + foreach ($rs1,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unfx3src { +my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_; +my ($ref,$opf); +my %aesopf = ( "fshiftorx" => 0x0b ); + + $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd"; + + if (defined($opf=$aesopf{$mnemonic})) { + foreach ($rs1,$rs2,$rs3,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge; + + s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/ + &unfx($1,$2,$3,$4) + /ge or + s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &unfx3src($1,$2,$3,$4,$5) + /ge or + s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &unvis($1,$2,$3,$4) + /ge or + s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge; + print $_,"\n"; +} + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-mb-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-mb-x86_64.pl index d7ad7882c4..aa2735e06a 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-mb-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-mb-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). 
You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -67,7 +74,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void aesni_multi_cbc_encrypt ( diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl index 7a30e893fb..33a7f0cf44 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -25,7 +32,10 @@ # Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%) # Ivy Bridge 5.05[+4.6] 9.65 5.54 +74% # Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%) +# Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%) # Bulldozer 5.77[+6.0] 11.72 6.37 +84% +# Ryzen(**) 2.71[+1.93] 4.64 2.74 +69% +# Goldmont(**) 3.82[+1.70] 5.52 4.20 +31% # # AES-192-CBC # Westmere 4.51 9.81 6.80 +44% @@ -39,12 +49,16 @@ # Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%) # Ivy Bridge 7.05 11.65 7.12 +64% # Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%) +# Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%) # Bulldozer 8.00 13.95 8.25 +69% +# Ryzen(**) 3.71 5.64 3.72 +52% +# Goldmont(**) 5.35 7.05 5.76 +22% # # (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for # background information. Above numbers in parentheses are SSSE3 # results collected on AVX-capable CPU, i.e. apply on OSes that # don't support AVX. +# (**) SHAEXT results. # # Needless to mention that it makes no sense to implement "stitched" # *decrypt* subroutine. 
Because *both* AESNI-CBC decrypt and SHA1 @@ -100,7 +114,7 @@ $shaext=1; ### set to zero if compiling for 1.0.1 $stitched_decrypt=0; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void aesni_cbc_sha1_enc(const void *inp, @@ -298,7 +312,7 @@ ___ $r++; unshift(@rndkey,pop(@rndkey)); }; -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1137,7 +1151,7 @@ ___ $r++; unshift(@rndkey,pop(@rndkey)); }; -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl index 588ade64ee..0e49f26faf 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -21,17 +28,21 @@ # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched # subroutine: # -# AES-128/-192/-256+SHA256 this(**)gain -# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% -# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% -# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% -# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% +# AES-128/-192/-256+SHA256 this(**) gain +# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% +# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% +# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% +# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% +# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% +# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54% +# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60% # -# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that +# (*) there are XOP, AVX1 and AVX2 code paths, meaning that # Westmere is omitted from loop, this is because gain was not # estimated high enough to justify the effort; # (**) these are EVP-free results, results obtained with 'speed # -evp aes-256-cbc-hmac-sha256' will vary by percent or two; +# (***) these are SHAEXT results; $flavour = shift; $output = shift; @@ -66,7 +77,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $shaext=$avx; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $func="aesni_cbc_sha256_enc"; diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-x86.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-x86.pl index 9b2e37aafb..ed1a47c30c 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-x86.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-x86.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. 
All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -43,16 +50,20 @@ # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. +# November 2015 +# +# Add aesni_ocb_[en|de]crypt. + ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # -# CBC en-/decrypt CTR XTS ECB +# CBC en-/decrypt CTR XTS ECB OCB # Westmere 3.77/1.37 1.37 1.52 1.27 -# * Bridge 5.07/0.98 0.99 1.09 0.91 -# Haswell 4.44/0.80 0.97 1.03 0.72 -# Silvermont 5.77/3.56 3.67 4.03 3.46 -# Bulldozer 5.80/0.98 1.05 1.24 0.93 +# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10 +# Haswell 4.44/0.80 0.97 1.03 0.72 0.76 +# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03 +# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for @@ -63,6 +74,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output = pop; +open OUT,">$output"; +*STDOUT=*OUT; + &asm_init($ARGV[0],$0); &external_label("OPENSSL_ia32cap_P"); @@ -1831,6 +1846,877 @@ if ($PREFIX eq "aesni") { &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp &function_end("aesni_xts_decrypt"); } + +###################################################################### +# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, +# const AES_KEY *key, unsigned int start_block_num, +# unsigned char offset_i[16], const unsigned char L_[][16], +# unsigned char checksum[16]); +# +{ +# offsets within stack frame +my $checksum = 16*6; +my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4)); + +# reassigned registers +my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out); +# $l_, $blocks, $inp, $key are permanently allocated in registers; +# remaining non-volatile ones are offloaded to stack, which even +# stay invariant after written to stack. 
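Before the two function bodies, a note on what offset_i, L_ and checksum are: this is OCB's block chaining, where the i-th offset advances by L_[ntz(i)] — the lea/bsf/shl triplets below compute exactly those table indices, bsf being ntz and the shl-by-4 scaling them to 16-byte entries — while the checksum accumulates the xor of the plaintext blocks, visible as the "# checksum" pxor's. A minimal Perl sketch of the offset recurrence, assuming 16-byte strings and invented helper names, not part of the diff:

    # number of trailing zero bits; callers guarantee $i >= 1
    sub ntz { my ($i) = @_; my $z = 0; $z++ until ($i >> $z) & 1; return $z; }

    # Offset_i = Offset_{i-1} xor L_[ntz(i)]; with two equal-length
    # byte strings, Perl's ^ operator xors them bytewise.
    sub ocb_advance_offset {
        my ($offset, $i, $L) = @_;       # $L: ref to array of 16-byte strings
        return $offset ^ $L->[ ntz($i) ];
    }

Each block is then processed as Offset_i xor AES(plaintext xor Offset_i), which is why every batch below is bracketed by pxor's with the stashed offsets before _aesni_encrypt6_enter and matching xorps/pxor's with the same values after it.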
+ +&function_begin("aesni_ocb_encrypt"); + &mov ($rounds,&wparam(5)); # &offset_i + &mov ($rounds_,&wparam(7)); # &checksum + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i + &mov ($block,&wparam(4)); # start_block_num + &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum + &mov ($l_,&wparam(6)); # L_ + + &mov ($rounds,"esp"); + &sub ("esp",$esp_off+4); # alloca + &and ("esp",-16); # align stack + + &sub ($out,$inp); + &shl ($len,4); + &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6 + &mov (&DWP($out_off,"esp"),$out); + &mov (&DWP($end_off,"esp"),$len); + &mov (&DWP($esp_off,"esp"),$rounds); + + &mov ($rounds,&DWP(240,$key)); + + &test ($block,1); + &jnz (&label("odd")); + + &bsf ($i3,$block); + &add ($block,1); + &shl ($i3,4); + &movdqu ($inout5,&QWP(0,$l_,$i3)); + &mov ($i3,$key); # put aside key + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16,$inp)); + + &pxor ($inout5,$rndkey0); # ^ last offset_i + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$inout5); # ^ offset_i + + &movdqa ($inout4,$rndkey1); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &xorps ($inout0,$inout5); # ^ offset_i + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movdqa ($rndkey1,$inout4); # pass the checksum + + &movups (&QWP(-16,$out,$inp),$inout0); # store output + + &mov ($rounds,&DWP(240,$i3)); + &mov ($key,$i3); # restore key + &mov ($len,&DWP($end_off,"esp")); + +&set_label("odd"); + &shl ($rounds,4); + &mov ($out,16); + &sub ($out,$rounds); # twisted rounds + &mov (&DWP($key_off,"esp"),$key); + &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule + &mov (&DWP($rounds_off,"esp"),$out); + + &cmp ($inp,$len); + &ja (&label("short")); + &jmp (&label("grandloop")); + +&set_label("grandloop",32); + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &lea ($i5,&DWP(5,$block)); + &add ($block,6); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &bsf ($i5,$i5); + &shl ($i1,4); + &shl ($i3,4); + &shl ($i5,4); + &movdqu ($inout0,&QWP(0,$l_)); + &movdqu ($inout1,&QWP(0,$l_,$i1)); + &mov ($rounds,&DWP($rounds_off,"esp")); + &movdqa ($inout2,$inout0); + &movdqu ($inout3,&QWP(0,$l_,$i3)); + &movdqa ($inout4,$inout0); + &movdqu ($inout5,&QWP(0,$l_,$i5)); + + &pxor ($inout0,$rndkey0); # ^ last offset_i + &pxor ($inout1,$inout0); + &movdqa (&QWP(16*0,"esp"),$inout0); + &pxor ($inout2,$inout1); + &movdqa (&QWP(16*1,"esp"),$inout1); + &pxor ($inout3,$inout2); + &movdqa (&QWP(16*2,"esp"),$inout2); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*3,"esp"),$inout3); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*4,"esp"),$inout4); + &movdqa (&QWP(16*5,"esp"),$inout5); + + &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &movdqu ($inout4,&QWP(16*4,$inp)); + &movdqu ($inout5,&QWP(16*5,$inp)); + &lea ($inp,&DWP(16*6,$inp)); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$rndkey0); # ^ roundkey[0] + &pxor ($rndkey1,$inout1); + &pxor ($inout1,$rndkey0); + &pxor ($rndkey1,$inout2); + &pxor ($inout2,$rndkey0); + &pxor ($rndkey1,$inout3); + &pxor ($inout3,$rndkey0); + &pxor ($rndkey1,$inout4); + &pxor ($inout4,$rndkey0); + &pxor ($rndkey1,$inout5); + &pxor ($inout5,$rndkey0); + &movdqa (&QWP($checksum,"esp"),$rndkey1); + + &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); + &pxor 
($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + &pxor ($inout5,&QWP(16*5,"esp")); + + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &aesenc ($inout0,$rndkey1); + &aesenc ($inout1,$rndkey1); + &aesenc ($inout2,$rndkey1); + &aesenc ($inout3,$rndkey1); + &aesenc ($inout4,$rndkey1); + &aesenc ($inout5,$rndkey1); + + &mov ($out,&DWP($out_off,"esp")); + &mov ($len,&DWP($end_off,"esp")); + &call ("_aesni_encrypt6_enter"); + + &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + &pxor ($inout5,$rndkey0); + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + + &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output + &movdqu (&QWP(-16*5,$out,$inp),$inout1); + &movdqu (&QWP(-16*4,$out,$inp),$inout2); + &movdqu (&QWP(-16*3,$out,$inp),$inout3); + &movdqu (&QWP(-16*2,$out,$inp),$inout4); + &movdqu (&QWP(-16*1,$out,$inp),$inout5); + &cmp ($inp,$len); # done yet? + &jb (&label("grandloop")); + +&set_label("short"); + &add ($len,16*6); + &sub ($len,$inp); + &jz (&label("done")); + + &cmp ($len,16*2); + &jb (&label("one")); + &je (&label("two")); + + &cmp ($len,16*4); + &jb (&label("three")); + &je (&label("four")); + + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &shl ($i1,4); + &shl ($i3,4); + &movdqu ($inout0,&QWP(0,$l_)); + &movdqu ($inout1,&QWP(0,$l_,$i1)); + &mov ($rounds,&DWP($rounds_off,"esp")); + &movdqa ($inout2,$inout0); + &movdqu ($inout3,&QWP(0,$l_,$i3)); + &movdqa ($inout4,$inout0); + + &pxor ($inout0,$rndkey0); # ^ last offset_i + &pxor ($inout1,$inout0); + &movdqa (&QWP(16*0,"esp"),$inout0); + &pxor ($inout2,$inout1); + &movdqa (&QWP(16*1,"esp"),$inout1); + &pxor ($inout3,$inout2); + &movdqa (&QWP(16*2,"esp"),$inout2); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*3,"esp"),$inout3); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*4,"esp"),$inout4); + + &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout5,$inout5); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$rndkey0); # ^ roundkey[0] + &pxor ($rndkey1,$inout1); + &pxor ($inout1,$rndkey0); + &pxor ($rndkey1,$inout2); + &pxor ($inout2,$rndkey0); + &pxor ($rndkey1,$inout3); + &pxor ($inout3,$rndkey0); + &pxor ($rndkey1,$inout4); + &pxor ($inout4,$rndkey0); + &movdqa (&QWP($checksum,"esp"),$rndkey1); + + &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &aesenc ($inout0,$rndkey1); + &aesenc ($inout1,$rndkey1); + &aesenc ($inout2,$rndkey1); + &aesenc ($inout3,$rndkey1); + &aesenc ($inout4,$rndkey1); + &aesenc ($inout5,$rndkey1); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_encrypt6_enter"); + + &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor 
($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,$rndkey0); + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + + &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output + &movdqu (&QWP(16*1,$out,$inp),$inout1); + &movdqu (&QWP(16*2,$out,$inp),$inout2); + &movdqu (&QWP(16*3,$out,$inp),$inout3); + &movdqu (&QWP(16*4,$out,$inp),$inout4); + + &jmp (&label("done")); + +&set_label("one",16); + &movdqu ($inout5,&QWP(0,$l_)); + &mov ($key,&DWP($key_off,"esp")); # restore key + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &mov ($rounds,&DWP(240,$key)); + + &pxor ($inout5,$rndkey0); # ^ last offset_i + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$inout5); # ^ offset_i + + &movdqa ($inout4,$rndkey1); + &mov ($out,&DWP($out_off,"esp")); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &xorps ($inout0,$inout5); # ^ offset_i + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movdqa ($rndkey1,$inout4); # pass the checksum + &movups (&QWP(0,$out,$inp),$inout0); + + &jmp (&label("done")); + +&set_label("two",16); + &lea ($i1,&DWP(1,$block)); + &mov ($key,&DWP($key_off,"esp")); # restore key + &bsf ($i1,$i1); + &shl ($i1,4); + &movdqu ($inout4,&QWP(0,$l_)); + &movdqu ($inout5,&QWP(0,$l_,$i1)); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &pxor ($inout4,$rndkey0); # ^ last offset_i + &pxor ($inout5,$inout4); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$inout4); # ^ offset_i + &pxor ($rndkey1,$inout1); + &pxor ($inout1,$inout5); + + &movdqa ($inout3,$rndkey1) + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_encrypt2"); + + &xorps ($inout0,$inout4); # ^ offset_i + &xorps ($inout1,$inout5); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movdqa ($rndkey1,$inout3); # pass the checksum + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &movups (&QWP(16*1,$out,$inp),$inout1); + + &jmp (&label("done")); + +&set_label("three",16); + &lea ($i1,&DWP(1,$block)); + &mov ($key,&DWP($key_off,"esp")); # restore key + &bsf ($i1,$i1); + &shl ($i1,4); + &movdqu ($inout3,&QWP(0,$l_)); + &movdqu ($inout4,&QWP(0,$l_,$i1)); + &movdqa ($inout5,$inout3); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &pxor ($inout3,$rndkey0); # ^ last offset_i + &pxor ($inout4,$inout3); + &pxor ($inout5,$inout4); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$inout3); # ^ offset_i + &pxor ($rndkey1,$inout1); + &pxor ($inout1,$inout4); + &pxor ($rndkey1,$inout2); + &pxor ($inout2,$inout5); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_encrypt3"); + + &xorps ($inout0,$inout3); # ^ offset_i + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &movups (&QWP(16*1,$out,$inp),$inout1); + &movups (&QWP(16*2,$out,$inp),$inout2); + + &jmp (&label("done")); + +&set_label("four",16); + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &mov ($key,&DWP($key_off,"esp")); # restore key + &shl ($i1,4); + &shl ($i3,4); + &movdqu ($inout2,&QWP(0,$l_)); + &movdqu ($inout3,&QWP(0,$l_,$i1)); + &movdqa ($inout4,$inout2); + &movdqu ($inout5,&QWP(0,$l_,$i3)); + + &pxor ($inout2,$rndkey0); # ^ 
last offset_i + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &pxor ($inout3,$inout2); + &movdqu ($inout1,&QWP(16*1,$inp)); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*0,"esp"),$inout2); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*1,"esp"),$inout3); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($rndkey1,$inout1); + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($rndkey1,$inout2); + &pxor ($inout2,$inout4); + &pxor ($rndkey1,$inout3); + &pxor ($inout3,$inout5); + + &movdqa (&QWP($checksum,"esp"),$rndkey1) + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_encrypt4"); + + &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout4); + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &xorps ($inout3,$inout5); + &movups (&QWP(16*1,$out,$inp),$inout1); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movups (&QWP(16*2,$out,$inp),$inout2); + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + &movups (&QWP(16*3,$out,$inp),$inout3); + +&set_label("done"); + &mov ($key,&DWP($esp_off,"esp")); + &pxor ($inout0,$inout0); # clear register bank + &pxor ($inout1,$inout1); + &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack + &pxor ($inout2,$inout2); + &movdqa (&QWP(16*1,"esp"),$inout0); + &pxor ($inout3,$inout3); + &movdqa (&QWP(16*2,"esp"),$inout0); + &pxor ($inout4,$inout4); + &movdqa (&QWP(16*3,"esp"),$inout0); + &pxor ($inout5,$inout5); + &movdqa (&QWP(16*4,"esp"),$inout0); + &movdqa (&QWP(16*5,"esp"),$inout0); + &movdqa (&QWP(16*6,"esp"),$inout0); + + &lea ("esp",&DWP(0,$key)); + &mov ($rounds,&wparam(5)); # &offset_i + &mov ($rounds_,&wparam(7)); # &checksum + &movdqu (&QWP(0,$rounds),$rndkey0); + &pxor ($rndkey0,$rndkey0); + &movdqu (&QWP(0,$rounds_),$rndkey1); + &pxor ($rndkey1,$rndkey1); +&function_end("aesni_ocb_encrypt"); + +&function_begin("aesni_ocb_decrypt"); + &mov ($rounds,&wparam(5)); # &offset_i + &mov ($rounds_,&wparam(7)); # &checksum + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i + &mov ($block,&wparam(4)); # start_block_num + &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum + &mov ($l_,&wparam(6)); # L_ + + &mov ($rounds,"esp"); + &sub ("esp",$esp_off+4); # alloca + &and ("esp",-16); # align stack + + &sub ($out,$inp); + &shl ($len,4); + &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6 + &mov (&DWP($out_off,"esp"),$out); + &mov (&DWP($end_off,"esp"),$len); + &mov (&DWP($esp_off,"esp"),$rounds); + + &mov ($rounds,&DWP(240,$key)); + + &test ($block,1); + &jnz (&label("odd")); + + &bsf ($i3,$block); + &add ($block,1); + &shl ($i3,4); + &movdqu ($inout5,&QWP(0,$l_,$i3)); + &mov ($i3,$key); # put aside key + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16,$inp)); + + &pxor ($inout5,$rndkey0); # ^ last offset_i + &pxor ($inout0,$inout5); # ^ offset_i + + &movdqa ($inout4,$rndkey1); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + + &xorps ($inout0,$inout5); # ^ offset_i + &movaps ($rndkey1,$inout4); # pass the checksum + &movdqa ($rndkey0,$inout5); # pass last offset_i + &xorps ($rndkey1,$inout0); # checksum + &movups (&QWP(-16,$out,$inp),$inout0); # store output + + &mov ($rounds,&DWP(240,$i3)); + &mov ($key,$i3); # restore key + &mov 
($len,&DWP($end_off,"esp")); + +&set_label("odd"); + &shl ($rounds,4); + &mov ($out,16); + &sub ($out,$rounds); # twisted rounds + &mov (&DWP($key_off,"esp"),$key); + &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule + &mov (&DWP($rounds_off,"esp"),$out); + + &cmp ($inp,$len); + &ja (&label("short")); + &jmp (&label("grandloop")); + +&set_label("grandloop",32); + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &lea ($i5,&DWP(5,$block)); + &add ($block,6); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &bsf ($i5,$i5); + &shl ($i1,4); + &shl ($i3,4); + &shl ($i5,4); + &movdqu ($inout0,&QWP(0,$l_)); + &movdqu ($inout1,&QWP(0,$l_,$i1)); + &mov ($rounds,&DWP($rounds_off,"esp")); + &movdqa ($inout2,$inout0); + &movdqu ($inout3,&QWP(0,$l_,$i3)); + &movdqa ($inout4,$inout0); + &movdqu ($inout5,&QWP(0,$l_,$i5)); + + &pxor ($inout0,$rndkey0); # ^ last offset_i + &pxor ($inout1,$inout0); + &movdqa (&QWP(16*0,"esp"),$inout0); + &pxor ($inout2,$inout1); + &movdqa (&QWP(16*1,"esp"),$inout1); + &pxor ($inout3,$inout2); + &movdqa (&QWP(16*2,"esp"),$inout2); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*3,"esp"),$inout3); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*4,"esp"),$inout4); + &movdqa (&QWP(16*5,"esp"),$inout5); + + &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &movdqu ($inout4,&QWP(16*4,$inp)); + &movdqu ($inout5,&QWP(16*5,$inp)); + &lea ($inp,&DWP(16*6,$inp)); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &pxor ($inout0,$rndkey0); # ^ roundkey[0] + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &pxor ($inout4,$rndkey0); + &pxor ($inout5,$rndkey0); + + &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + &pxor ($inout5,&QWP(16*5,"esp")); + + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &aesdec ($inout0,$rndkey1); + &aesdec ($inout1,$rndkey1); + &aesdec ($inout2,$rndkey1); + &aesdec ($inout3,$rndkey1); + &aesdec ($inout4,$rndkey1); + &aesdec ($inout5,$rndkey1); + + &mov ($out,&DWP($out_off,"esp")); + &mov ($len,&DWP($end_off,"esp")); + &call ("_aesni_decrypt6_enter"); + + &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &movdqa ($rndkey1,&QWP($checksum,"esp")); + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + &pxor ($inout5,$rndkey0); + + &pxor ($rndkey1,$inout0); # checksum + &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output + &pxor ($rndkey1,$inout1); + &movdqu (&QWP(-16*5,$out,$inp),$inout1); + &pxor ($rndkey1,$inout2); + &movdqu (&QWP(-16*4,$out,$inp),$inout2); + &pxor ($rndkey1,$inout3); + &movdqu (&QWP(-16*3,$out,$inp),$inout3); + &pxor ($rndkey1,$inout4); + &movdqu (&QWP(-16*2,$out,$inp),$inout4); + &pxor ($rndkey1,$inout5); + &movdqu (&QWP(-16*1,$out,$inp),$inout5); + &cmp ($inp,$len); # done yet? 
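	# [annotation, not part of the diff] as in the encrypt path above,
	# the decrypt grandloop consumes six blocks per iteration; $len here
	# holds the "end of input - 16*6" bound computed during setup, so
	# once fewer than six blocks remain this comparison falls through to
	# the "short" tail code instead of looping.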
+ &jb (&label("grandloop")); + +&set_label("short"); + &add ($len,16*6); + &sub ($len,$inp); + &jz (&label("done")); + + &cmp ($len,16*2); + &jb (&label("one")); + &je (&label("two")); + + &cmp ($len,16*4); + &jb (&label("three")); + &je (&label("four")); + + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &shl ($i1,4); + &shl ($i3,4); + &movdqu ($inout0,&QWP(0,$l_)); + &movdqu ($inout1,&QWP(0,$l_,$i1)); + &mov ($rounds,&DWP($rounds_off,"esp")); + &movdqa ($inout2,$inout0); + &movdqu ($inout3,&QWP(0,$l_,$i3)); + &movdqa ($inout4,$inout0); + + &pxor ($inout0,$rndkey0); # ^ last offset_i + &pxor ($inout1,$inout0); + &movdqa (&QWP(16*0,"esp"),$inout0); + &pxor ($inout2,$inout1); + &movdqa (&QWP(16*1,"esp"),$inout1); + &pxor ($inout3,$inout2); + &movdqa (&QWP(16*2,"esp"),$inout2); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*3,"esp"),$inout3); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*4,"esp"),$inout4); + + &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout5,$inout5); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &pxor ($inout0,$rndkey0); # ^ roundkey[0] + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &pxor ($inout4,$rndkey0); + + &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &aesdec ($inout0,$rndkey1); + &aesdec ($inout1,$rndkey1); + &aesdec ($inout2,$rndkey1); + &aesdec ($inout3,$rndkey1); + &aesdec ($inout4,$rndkey1); + &aesdec ($inout5,$rndkey1); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_decrypt6_enter"); + + &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &movdqa ($rndkey1,&QWP($checksum,"esp")); + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,$rndkey0); + + &pxor ($rndkey1,$inout0); # checksum + &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output + &pxor ($rndkey1,$inout1); + &movdqu (&QWP(16*1,$out,$inp),$inout1); + &pxor ($rndkey1,$inout2); + &movdqu (&QWP(16*2,$out,$inp),$inout2); + &pxor ($rndkey1,$inout3); + &movdqu (&QWP(16*3,$out,$inp),$inout3); + &pxor ($rndkey1,$inout4); + &movdqu (&QWP(16*4,$out,$inp),$inout4); + + &jmp (&label("done")); + +&set_label("one",16); + &movdqu ($inout5,&QWP(0,$l_)); + &mov ($key,&DWP($key_off,"esp")); # restore key + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &mov ($rounds,&DWP(240,$key)); + + &pxor ($inout5,$rndkey0); # ^ last offset_i + &pxor ($inout0,$inout5); # ^ offset_i + + &movdqa ($inout4,$rndkey1); + &mov ($out,&DWP($out_off,"esp")); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + + &xorps ($inout0,$inout5); # ^ offset_i + &movaps ($rndkey1,$inout4); # pass the checksum + &movdqa ($rndkey0,$inout5); # pass last offset_i + &xorps ($rndkey1,$inout0); # checksum + &movups (&QWP(0,$out,$inp),$inout0); + + &jmp (&label("done")); + +&set_label("two",16); + &lea ($i1,&DWP(1,$block)); + &mov ($key,&DWP($key_off,"esp")); # restore key + &bsf ($i1,$i1); + &shl ($i1,4); + &movdqu ($inout4,&QWP(0,$l_)); + &movdqu 
($inout5,&QWP(0,$l_,$i1)); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &movdqa ($inout3,$rndkey1); + &pxor ($inout4,$rndkey0); # ^ last offset_i + &pxor ($inout5,$inout4); + + &pxor ($inout0,$inout4); # ^ offset_i + &pxor ($inout1,$inout5); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_decrypt2"); + + &xorps ($inout0,$inout4); # ^ offset_i + &xorps ($inout1,$inout5); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &xorps ($inout3,$inout0); # checksum + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &xorps ($inout3,$inout1); + &movups (&QWP(16*1,$out,$inp),$inout1); + &movaps ($rndkey1,$inout3); # pass the checksum + + &jmp (&label("done")); + +&set_label("three",16); + &lea ($i1,&DWP(1,$block)); + &mov ($key,&DWP($key_off,"esp")); # restore key + &bsf ($i1,$i1); + &shl ($i1,4); + &movdqu ($inout3,&QWP(0,$l_)); + &movdqu ($inout4,&QWP(0,$l_,$i1)); + &movdqa ($inout5,$inout3); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &pxor ($inout3,$rndkey0); # ^ last offset_i + &pxor ($inout4,$inout3); + &pxor ($inout5,$inout4); + + &pxor ($inout0,$inout3); # ^ offset_i + &pxor ($inout1,$inout4); + &pxor ($inout2,$inout5); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_decrypt3"); + + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + &xorps ($inout0,$inout3); # ^ offset_i + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &pxor ($rndkey1,$inout0); # checksum + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movups (&QWP(16*1,$out,$inp),$inout1); + &pxor ($rndkey1,$inout1); + &movups (&QWP(16*2,$out,$inp),$inout2); + &pxor ($rndkey1,$inout2); + + &jmp (&label("done")); + +&set_label("four",16); + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &mov ($key,&DWP($key_off,"esp")); # restore key + &shl ($i1,4); + &shl ($i3,4); + &movdqu ($inout2,&QWP(0,$l_)); + &movdqu ($inout3,&QWP(0,$l_,$i1)); + &movdqa ($inout4,$inout2); + &movdqu ($inout5,&QWP(0,$l_,$i3)); + + &pxor ($inout2,$rndkey0); # ^ last offset_i + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &pxor ($inout3,$inout2); + &movdqu ($inout1,&QWP(16*1,$inp)); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*0,"esp"),$inout2); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*1,"esp"),$inout3); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,$inout4); + &pxor ($inout3,$inout5); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_decrypt4"); + + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout4); + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &pxor ($rndkey1,$inout0); # checksum + &xorps ($inout3,$inout5); + &movups (&QWP(16*1,$out,$inp),$inout1); + &pxor ($rndkey1,$inout1); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movups (&QWP(16*2,$out,$inp),$inout2); + &pxor ($rndkey1,$inout2); + &movups (&QWP(16*3,$out,$inp),$inout3); + &pxor ($rndkey1,$inout3); + +&set_label("done"); + &mov ($key,&DWP($esp_off,"esp")); + 
&pxor ($inout0,$inout0); # clear register bank + &pxor ($inout1,$inout1); + &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack + &pxor ($inout2,$inout2); + &movdqa (&QWP(16*1,"esp"),$inout0); + &pxor ($inout3,$inout3); + &movdqa (&QWP(16*2,"esp"),$inout0); + &pxor ($inout4,$inout4); + &movdqa (&QWP(16*3,"esp"),$inout0); + &pxor ($inout5,$inout5); + &movdqa (&QWP(16*4,"esp"),$inout0); + &movdqa (&QWP(16*5,"esp"),$inout0); + &movdqa (&QWP(16*6,"esp"),$inout0); + + &lea ("esp",&DWP(0,$key)); + &mov ($rounds,&wparam(5)); # &offset_i + &mov ($rounds_,&wparam(7)); # &checksum + &movdqu (&QWP(0,$rounds),$rndkey0); + &pxor ($rndkey0,$rndkey0); + &movdqu (&QWP(0,$rounds_),$rndkey1); + &pxor ($rndkey1,$rndkey1); +&function_end("aesni_ocb_decrypt"); +} } ###################################################################### @@ -2419,7 +3305,7 @@ if ($PREFIX eq "aesni") { &pxor ("xmm3","xmm3"); &aesenclast ("xmm2","xmm3"); - &movdqa ("xmm3","xmm1") + &movdqa ("xmm3","xmm1"); &pslldq ("xmm1",4); &pxor ("xmm3","xmm1"); &pslldq ("xmm1",4); @@ -2523,3 +3409,5 @@ if ($PREFIX eq "aesni") { &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-x86_64.pl index 25ca574f6a..98ca17991d 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -157,16 +164,23 @@ # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like # in CTR mode AES instruction interleave factor was chosen to be 6x. +# November 2015 +# +# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was +# chosen to be 6x. + ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # -# CBC en-/decrypt CTR XTS ECB +# CBC en-/decrypt CTR XTS ECB OCB # Westmere 3.77/1.25 1.25 1.25 1.26 -# * Bridge 5.07/0.74 0.75 0.90 0.85 -# Haswell 4.44/0.63 0.63 0.73 0.63 -# Silvermont 5.75/3.54 3.56 4.12 3.87(*) -# Bulldozer 5.77/0.70 0.72 0.90 0.70 +# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 +# Haswell 4.44/0.63 0.63 0.73 0.63 0.70 +# Skylake 2.62/0.63 0.63 0.63 0.63 +# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 +# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 +# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 # # (*) Atom Silvermont ECB result is suboptimal because of penalties # incurred by operations on %xmm8-15. As ECB is not considered @@ -187,7 +201,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $movkey = $PREFIX eq "aesni" ? 
"movups" : "movups"; @@ -2708,6 +2722,925 @@ $code.=<<___; ret .size aesni_xts_decrypt,.-aesni_xts_decrypt ___ +} + +###################################################################### +# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, +# const AES_KEY *key, unsigned int start_block_num, +# unsigned char offset_i[16], const unsigned char L_[][16], +# unsigned char checksum[16]); +# +{ +my @offset=map("%xmm$_",(10..15)); +my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); +my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments +my ($L_p,$checksum_p) = ("%rbx","%rbp"); +my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); +my $seventh_arg = $win64 ? 56 : 8; +my $blocks = $len; + +$code.=<<___; +.globl aesni_ocb_encrypt +.type aesni_ocb_encrypt,\@function,6 +.align 32 +aesni_ocb_encrypt: + lea (%rsp),%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp),%rsp + movaps %xmm6,0x00(%rsp) # offload everything + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,0x70(%rsp) + movaps %xmm14,0x80(%rsp) + movaps %xmm15,0x90(%rsp) +.Locb_enc_body: +___ +$code.=<<___; + mov $seventh_arg(%rax),$L_p # 7th argument + mov $seventh_arg+8(%rax),$checksum_p# 8th argument + + mov 240($key),$rnds_ + mov $key,$key_ + shl \$4,$rnds_ + $movkey ($key),$rndkey0l # round[0] + $movkey 16($key,$rnds_),$rndkey1 # round[last] + + movdqu ($offset_p),@offset[5] # load last offset_i + pxor $rndkey1,$rndkey0l # round[0] ^ round[last] + pxor $rndkey1,@offset[5] # offset_i ^ round[last] + + mov \$16+32,$rounds + lea 32($key_,$rnds_),$key + $movkey 16($key_),$rndkey1 # round[1] + sub %r10,%rax # twisted $rounds + mov %rax,%r10 # backup twisted $rounds + + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + movdqu ($checksum_p),$checksum # load checksum + + test \$1,$block_num # is first block number odd? 
+ jnz .Locb_enc_odd + + bsf $block_num,$i1 + add \$1,$block_num + shl \$4,$i1 + movdqu ($L_p,$i1),$inout5 # borrow + movdqu ($inp),$inout0 + lea 16($inp),$inp + + call __ocb_encrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,($out) + lea 16($out),$out + sub \$1,$blocks + jz .Locb_enc_done + +.Locb_enc_odd: + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + lea 6($block_num),$block_num + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + shl \$4,$i5 + + sub \$6,$blocks + jc .Locb_enc_short + jmp .Locb_enc_grandloop + +.align 32 +.Locb_enc_grandloop: + movdqu `16*0`($inp),$inout0 # load input + movdqu `16*1`($inp),$inout1 + movdqu `16*2`($inp),$inout2 + movdqu `16*3`($inp),$inout3 + movdqu `16*4`($inp),$inout4 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + + call __ocb_encrypt6 + + movups $inout0,`16*0`($out) # store output + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$6,$blocks + jnc .Locb_enc_grandloop + +.Locb_enc_short: + add \$6,$blocks + jz .Locb_enc_done + + movdqu `16*0`($inp),$inout0 + cmp \$2,$blocks + jb .Locb_enc_one + movdqu `16*1`($inp),$inout1 + je .Locb_enc_two + + movdqu `16*2`($inp),$inout2 + cmp \$4,$blocks + jb .Locb_enc_three + movdqu `16*3`($inp),$inout3 + je .Locb_enc_four + + movdqu `16*4`($inp),$inout4 + pxor $inout5,$inout5 + + call __ocb_encrypt6 + + movdqa @offset[4],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + movups $inout4,`16*4`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_one: + movdqa @offset[0],$inout5 # borrow + + call __ocb_encrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,`16*0`($out) + jmp .Locb_enc_done + +.align 16 +.Locb_enc_two: + pxor $inout2,$inout2 + pxor $inout3,$inout3 + + call __ocb_encrypt4 + + movdqa @offset[1],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_three: + pxor $inout3,$inout3 + + call __ocb_encrypt4 + + movdqa @offset[2],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_four: + call __ocb_encrypt4 + + movdqa @offset[3],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + +.Locb_enc_done: + pxor $rndkey0,@offset[5] # "remove" round[last] + movdqu $checksum,($checksum_p) # store checksum + movdqu @offset[5],($offset_p) # store last offset_i + + xorps %xmm0,%xmm0 # clear register bank + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 +___ +$code.=<<___ if ($win64); + movaps 0x00(%rsp),%xmm6 + movaps %xmm0,0x00(%rsp) # clear stack + movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) + movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) + movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) + movaps 0x40(%rsp),%xmm10 + movaps %xmm0,0x40(%rsp) + movaps 0x50(%rsp),%xmm11 + movaps %xmm0,0x50(%rsp) + movaps 0x60(%rsp),%xmm12 + movaps %xmm0,0x60(%rsp) + movaps 
0x70(%rsp),%xmm13 + movaps %xmm0,0x70(%rsp) + movaps 0x80(%rsp),%xmm14 + movaps %xmm0,0x80(%rsp) + movaps 0x90(%rsp),%xmm15 + movaps %xmm0,0x90(%rsp) + lea 0xa0+0x28(%rsp),%rax +.Locb_enc_pop: + lea 0xa0(%rsp),%rsp +___ +$code.=<<___; + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx +.Locb_enc_epilogue: + ret +.size aesni_ocb_encrypt,.-aesni_ocb_encrypt + +.type __ocb_encrypt6,\@abi-omnipotent +.align 32 +__ocb_encrypt6: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + movdqa @offset[0],@offset[4] + pxor @offset[5],@offset[0] + movdqu ($L_p,$i5),@offset[5] + pxor @offset[0],@offset[1] + pxor $inout0,$checksum # accumulate checksum + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor $inout1,$checksum + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor $inout2,$checksum + pxor @offset[2],$inout2 + pxor @offset[3],@offset[4] + pxor $inout3,$checksum + pxor @offset[3],$inout3 + pxor @offset[4],@offset[5] + pxor $inout4,$checksum + pxor @offset[4],$inout4 + pxor $inout5,$checksum + pxor @offset[5],$inout5 + $movkey 32($key_),$rndkey0 + + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + add \$6,$block_num + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + aesenc $rndkey1,$inout4 + pxor $rndkey0l,@offset[3] + pxor $rndkey0l,@offset[4] + aesenc $rndkey1,$inout5 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,@offset[5] + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + jmp .Locb_enc_loop6 + +.align 32 +.Locb_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop6 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + shl \$4,$i5 + + aesenclast @offset[0],$inout0 + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + mov %r10,%rax # restore twisted rounds + aesenclast @offset[1],$inout1 + aesenclast @offset[2],$inout2 + aesenclast @offset[3],$inout3 + aesenclast @offset[4],$inout4 + aesenclast @offset[5],$inout5 + ret +.size __ocb_encrypt6,.-__ocb_encrypt6 + +.type __ocb_encrypt4,\@abi-omnipotent +.align 32 +__ocb_encrypt4: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + pxor @offset[5],@offset[0] + pxor @offset[0],@offset[1] + pxor $inout0,$checksum # accumulate checksum + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor $inout1,$checksum + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor $inout2,$checksum + pxor @offset[2],$inout2 + pxor 
$inout3,$checksum + pxor @offset[3],$inout3 + $movkey 32($key_),$rndkey0 + + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + pxor $rndkey0l,@offset[3] + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey 48($key_),$rndkey1 + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + $movkey 64($key_),$rndkey0 + jmp .Locb_enc_loop4 + +.align 32 +.Locb_enc_loop4: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop4 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey 16($key_),$rndkey1 + mov %r10,%rax # restore twisted rounds + + aesenclast @offset[0],$inout0 + aesenclast @offset[1],$inout1 + aesenclast @offset[2],$inout2 + aesenclast @offset[3],$inout3 + ret +.size __ocb_encrypt4,.-__ocb_encrypt4 + +.type __ocb_encrypt1,\@abi-omnipotent +.align 32 +__ocb_encrypt1: + pxor @offset[5],$inout5 # offset_i + pxor $rndkey0l,$inout5 # offset_i ^ round[0] + pxor $inout0,$checksum # accumulate checksum + pxor $inout5,$inout0 # input ^ round[0] ^ offset_i + $movkey 32($key_),$rndkey0 + + aesenc $rndkey1,$inout0 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,$inout5 # offset_i ^ round[last] + + aesenc $rndkey0,$inout0 + $movkey 64($key_),$rndkey0 + jmp .Locb_enc_loop1 + +.align 32 +.Locb_enc_loop1: + aesenc $rndkey1,$inout0 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop1 + + aesenc $rndkey1,$inout0 + $movkey 16($key_),$rndkey1 # redundant in tail + mov %r10,%rax # restore twisted rounds + + aesenclast $inout5,$inout0 + ret +.size __ocb_encrypt1,.-__ocb_encrypt1 + +.globl aesni_ocb_decrypt +.type aesni_ocb_decrypt,\@function,6 +.align 32 +aesni_ocb_decrypt: + lea (%rsp),%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp),%rsp + movaps %xmm6,0x00(%rsp) # offload everything + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,0x70(%rsp) + movaps %xmm14,0x80(%rsp) + movaps %xmm15,0x90(%rsp) +.Locb_dec_body: +___ +$code.=<<___; + mov $seventh_arg(%rax),$L_p # 7th argument + mov $seventh_arg+8(%rax),$checksum_p# 8th argument + + mov 240($key),$rnds_ + mov $key,$key_ + shl \$4,$rnds_ + $movkey ($key),$rndkey0l # round[0] + $movkey 16($key,$rnds_),$rndkey1 # round[last] + + movdqu ($offset_p),@offset[5] # load last offset_i + pxor $rndkey1,$rndkey0l # round[0] ^ round[last] + pxor $rndkey1,@offset[5] # offset_i ^ round[last] + + mov \$16+32,$rounds + lea 32($key_,$rnds_),$key + $movkey 16($key_),$rndkey1 # round[1] + sub %r10,%rax # twisted $rounds + mov %rax,%r10 # backup twisted $rounds + + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + movdqu ($checksum_p),$checksum # load checksum + + test \$1,$block_num # is first block number odd? 
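+# Note that OCB's checksum is defined over the plaintext in both directions:
+# the decrypt paths below therefore fold each block into $checksum only after
+# decryption, whereas __ocb_encrypt6 accumulates the input before the cipher
+# rounds.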
+ jnz .Locb_dec_odd + + bsf $block_num,$i1 + add \$1,$block_num + shl \$4,$i1 + movdqu ($L_p,$i1),$inout5 # borrow + movdqu ($inp),$inout0 + lea 16($inp),$inp + + call __ocb_decrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,($out) + xorps $inout0,$checksum # accumulate checksum + lea 16($out),$out + sub \$1,$blocks + jz .Locb_dec_done + +.Locb_dec_odd: + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + lea 6($block_num),$block_num + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + shl \$4,$i5 + + sub \$6,$blocks + jc .Locb_dec_short + jmp .Locb_dec_grandloop + +.align 32 +.Locb_dec_grandloop: + movdqu `16*0`($inp),$inout0 # load input + movdqu `16*1`($inp),$inout1 + movdqu `16*2`($inp),$inout2 + movdqu `16*3`($inp),$inout3 + movdqu `16*4`($inp),$inout4 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + + call __ocb_decrypt6 + + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + movups $inout4,`16*4`($out) + pxor $inout4,$checksum + movups $inout5,`16*5`($out) + pxor $inout5,$checksum + lea `16*6`($out),$out + sub \$6,$blocks + jnc .Locb_dec_grandloop + +.Locb_dec_short: + add \$6,$blocks + jz .Locb_dec_done + + movdqu `16*0`($inp),$inout0 + cmp \$2,$blocks + jb .Locb_dec_one + movdqu `16*1`($inp),$inout1 + je .Locb_dec_two + + movdqu `16*2`($inp),$inout2 + cmp \$4,$blocks + jb .Locb_dec_three + movdqu `16*3`($inp),$inout3 + je .Locb_dec_four + + movdqu `16*4`($inp),$inout4 + pxor $inout5,$inout5 + + call __ocb_decrypt6 + + movdqa @offset[4],@offset[5] + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + movups $inout4,`16*4`($out) + pxor $inout4,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_one: + movdqa @offset[0],$inout5 # borrow + + call __ocb_decrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + jmp .Locb_dec_done + +.align 16 +.Locb_dec_two: + pxor $inout2,$inout2 + pxor $inout3,$inout3 + + call __ocb_decrypt4 + + movdqa @offset[1],@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + xorps $inout1,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_three: + pxor $inout3,$inout3 + + call __ocb_decrypt4 + + movdqa @offset[2],@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + xorps $inout1,$checksum + movups $inout2,`16*2`($out) + xorps $inout2,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_four: + call __ocb_decrypt4 + + movdqa @offset[3],@offset[5] + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + +.Locb_dec_done: + pxor $rndkey0,@offset[5] # "remove" round[last] + movdqu $checksum,($checksum_p) # store checksum + movdqu @offset[5],($offset_p) # store last offset_i + + xorps %xmm0,%xmm0 # 
clear register bank + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 +___ +$code.=<<___ if ($win64); + movaps 0x00(%rsp),%xmm6 + movaps %xmm0,0x00(%rsp) # clear stack + movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) + movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) + movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) + movaps 0x40(%rsp),%xmm10 + movaps %xmm0,0x40(%rsp) + movaps 0x50(%rsp),%xmm11 + movaps %xmm0,0x50(%rsp) + movaps 0x60(%rsp),%xmm12 + movaps %xmm0,0x60(%rsp) + movaps 0x70(%rsp),%xmm13 + movaps %xmm0,0x70(%rsp) + movaps 0x80(%rsp),%xmm14 + movaps %xmm0,0x80(%rsp) + movaps 0x90(%rsp),%xmm15 + movaps %xmm0,0x90(%rsp) + lea 0xa0+0x28(%rsp),%rax +.Locb_dec_pop: + lea 0xa0(%rsp),%rsp +___ +$code.=<<___; + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx +.Locb_dec_epilogue: + ret +.size aesni_ocb_decrypt,.-aesni_ocb_decrypt + +.type __ocb_decrypt6,\@abi-omnipotent +.align 32 +__ocb_decrypt6: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + movdqa @offset[0],@offset[4] + pxor @offset[5],@offset[0] + movdqu ($L_p,$i5),@offset[5] + pxor @offset[0],@offset[1] + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor @offset[2],$inout2 + pxor @offset[3],@offset[4] + pxor @offset[3],$inout3 + pxor @offset[4],@offset[5] + pxor @offset[4],$inout4 + pxor @offset[5],$inout5 + $movkey 32($key_),$rndkey0 + + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + add \$6,$block_num + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + aesdec $rndkey1,$inout4 + pxor $rndkey0l,@offset[3] + pxor $rndkey0l,@offset[4] + aesdec $rndkey1,$inout5 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,@offset[5] + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + jmp .Locb_dec_loop6 + +.align 32 +.Locb_dec_loop6: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop6 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + shl \$4,$i5 + + aesdeclast @offset[0],$inout0 + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + mov %r10,%rax # restore twisted rounds + aesdeclast @offset[1],$inout1 + aesdeclast @offset[2],$inout2 + aesdeclast @offset[3],$inout3 + aesdeclast @offset[4],$inout4 + aesdeclast 
@offset[5],$inout5 + ret +.size __ocb_decrypt6,.-__ocb_decrypt6 + +.type __ocb_decrypt4,\@abi-omnipotent +.align 32 +__ocb_decrypt4: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + pxor @offset[5],@offset[0] + pxor @offset[0],@offset[1] + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor @offset[2],$inout2 + pxor @offset[3],$inout3 + $movkey 32($key_),$rndkey0 + + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + pxor $rndkey0l,@offset[3] + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey 48($key_),$rndkey1 + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + $movkey 64($key_),$rndkey0 + jmp .Locb_dec_loop4 + +.align 32 +.Locb_dec_loop4: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop4 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey 16($key_),$rndkey1 + mov %r10,%rax # restore twisted rounds + + aesdeclast @offset[0],$inout0 + aesdeclast @offset[1],$inout1 + aesdeclast @offset[2],$inout2 + aesdeclast @offset[3],$inout3 + ret +.size __ocb_decrypt4,.-__ocb_decrypt4 + +.type __ocb_decrypt1,\@abi-omnipotent +.align 32 +__ocb_decrypt1: + pxor @offset[5],$inout5 # offset_i + pxor $rndkey0l,$inout5 # offset_i ^ round[0] + pxor $inout5,$inout0 # input ^ round[0] ^ offset_i + $movkey 32($key_),$rndkey0 + + aesdec $rndkey1,$inout0 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,$inout5 # offset_i ^ round[last] + + aesdec $rndkey0,$inout0 + $movkey 64($key_),$rndkey0 + jmp .Locb_dec_loop1 + +.align 32 +.Locb_dec_loop1: + aesdec $rndkey1,$inout0 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop1 + + aesdec $rndkey1,$inout0 + $movkey 16($key_),$rndkey1 # redundant in tail + mov %r10,%rax # restore twisted rounds + + aesdeclast $inout5,$inout0 + ret +.size __ocb_decrypt1,.-__ocb_decrypt1 +___ } }} ######################################################################## @@ -3307,7 +4240,7 @@ ___ # Vinodh Gopal <vinodh.gopal@intel.com> # Kahraman Akdemir # -# Agressively optimized in respect to aeskeygenassist's critical path +# Aggressively optimized in respect to aeskeygenassist's critical path # and is contained in %xmm0-5 to meet Win64 ABI requirement. 
# # int ${PREFIX}_set_encrypt_key(const unsigned char *inp, @@ -3819,6 +4752,65 @@ ctr_xts_se_handler: jmp .Lcommon_rbp_tail .size ctr_xts_se_handler,.-ctr_xts_se_handler + +.type ocb_se_handler,\@abi-omnipotent +.align 16 +ocb_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 + cmp %r10,%rbx # context->Rip>=pop label + jae .Locb_no_xmm + + mov 152($context),%rax # pull context->Rsp + + lea (%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xa0+0x28(%rax),%rax + +.Locb_no_xmm: + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + + jmp .Lcommon_seh_tail +.size ocb_se_handler,.-ocb_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent @@ -3932,6 +4924,14 @@ $code.=<<___ if ($PREFIX eq "aesni"); .rva .LSEH_begin_aesni_xts_decrypt .rva .LSEH_end_aesni_xts_decrypt .rva .LSEH_info_xts_dec + + .rva .LSEH_begin_aesni_ocb_encrypt + .rva .LSEH_end_aesni_ocb_encrypt + .rva .LSEH_info_ocb_enc + + .rva .LSEH_begin_aesni_ocb_decrypt + .rva .LSEH_end_aesni_ocb_decrypt + .rva .LSEH_info_ocb_dec ___ $code.=<<___; .rva .LSEH_begin_${PREFIX}_cbc_encrypt @@ -3973,6 +4973,18 @@ $code.=<<___ if ($PREFIX eq "aesni"); .byte 9,0,0,0 .rva ctr_xts_se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] +.LSEH_info_ocb_enc: + .byte 9,0,0,0 + .rva ocb_se_handler + .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[] + .rva .Locb_enc_pop + .long 0 +.LSEH_info_ocb_dec: + .byte 9,0,0,0 + .rva ocb_se_handler + .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[] + .rva .Locb_dec_pop + .long 0 ___ $code.=<<___; .LSEH_info_cbc: diff --git a/deps/openssl/openssl/crypto/aes/asm/aesp8-ppc.pl b/deps/openssl/openssl/crypto/aes/asm/aesp8-ppc.pl index a1891cc03c..b7e92f6538 100755 --- a/deps/openssl/openssl/crypto/aes/asm/aesp8-ppc.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesp8-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -20,6 +27,19 @@ # instructions are interleaved. It's reckoned that eventual # misalignment penalties at page boundaries are on average lower # than additional overhead in pure AltiVec approach.
+# +# May 2016 +# +# Add XTS subroutine; a 9x improvement was measured on little-endian and +# 12x on big-endian systems. +# +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS +# POWER8[le] 3.96/0.72 0.74 1.1 +# POWER8[be] 3.75/0.65 0.66 1.0 $flavour = shift; @@ -1887,6 +1907,1849 @@ Lctr32_enc8x_done: ___ }} }}} +######################################################################### +{{{ # XTS procedures # +# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, # +# const AES_KEY *key1, const AES_KEY *key2, # +# [const] unsigned char iv[16]); # +# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which # +# input tweak value is assumed to be encrypted already, and last tweak # +# value, one suitable for consecutive call on same chunk of data, is # +# written back to original buffer. In addition, in "tweak chaining" # +# mode only complete input blocks are processed. # + +my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2)); +my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7)); +my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12)); +my $taillen = $key2; + ($inp,$idx) = ($idx,$inp); # reassign +$code.=<<___; +.globl .${prefix}_xts_encrypt +.align 5 +.${prefix}_xts_encrypt: + mr $inp,r3 # reassign + li r3,-1 + ${UCMP}i $len,16 + bltlr- + + lis r0,0xfff0 + mfspr r12,256 # save vrsave + li r11,0 + mtspr 256,r0 + + vspltisb $seven,0x07 # 0x070707..07 + le?lvsl $leperm,r11,r11 + le?vspltisb $tmp,0x0f + le?vxor $leperm,$leperm,$seven + + li $idx,15 + lvx $tweak,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $tweak,$tweak,$inptail,$inpperm + + neg r11,$inp + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inout,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ${UCMP}i $key2,0 # key2==NULL? + beq Lxts_enc_no_key2 + + ?lvsl $keyperm,0,$key2 # prepare for unaligned key + lwz $rounds,240($key2) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + lvx $rndkey0,0,$key2 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + mtctr $rounds + +Ltweak_xts_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + bdnz Ltweak_xts_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $tweak,$tweak,$rndkey0 + + li $ivp,0 # don't chain the tweak + b Lxts_enc + +Lxts_enc_no_key2: + li $idx,-16 + and $len,$len,$idx # in "tweak chaining" + # mode only complete + # blocks are processed +Lxts_enc: + lvx $inptail,0,$inp + addi $inp,$inp,16 + + ?lvsl $keyperm,0,$key1 # prepare for unaligned key + lwz $rounds,240($key1) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + vslb $eighty7,$seven,$seven # 0x808080..80 + vor $eighty7,$eighty7,$seven # 0x878787..87 + vspltisb $tmp,1 # 0x010101..01 + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 + + ${UCMP}i $len,96 + bge _aesp8_xts_encrypt6x + + andi.
$taillen,$len,15 + subic r0,$len,32 + subi $taillen,$taillen,16 + subfe r0,r0,r0 + and r0,r0,$taillen + add $inp,$inp,r0 + + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + mtctr $rounds + b Loop_xts_enc + +.align 5 +Loop_xts_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak + vcipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + addi $out,$out,16 + + subic. $len,$len,16 + beq Lxts_enc_done + + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + + subic r0,$len,32 + subfe r0,r0,r0 + and r0,r0,$taillen + add $inp,$inp,r0 + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $output,$output,$rndkey0 # just in case $len<16 + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + mtctr $rounds + ${UCMP}i $len,16 + bge Loop_xts_enc + + vxor $output,$output,$tweak + lvsr $inpperm,0,$len # $inpperm is no longer needed + vxor $inptail,$inptail,$inptail # $inptail is no longer needed + vspltisb $tmp,-1 + vperm $inptail,$inptail,$tmp,$inpperm + vsel $inout,$inout,$output,$inptail + + subi r11,$out,17 + subi $out,$out,16 + mtctr $len + li $len,16 +Loop_xts_enc_steal: + lbzu r0,1(r11) + stb r0,16(r11) + bdnz Loop_xts_enc_steal + + mtctr $rounds + b Loop_xts_enc # one more time... + +Lxts_enc_done: + ${UCMP}i $ivp,0 + beq Lxts_enc_ret + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_enc_ret: + mtspr 256,r12 # restore vrsave + li r3,0 + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt + +.globl .${prefix}_xts_decrypt +.align 5 +.${prefix}_xts_decrypt: + mr $inp,r3 # reassign + li r3,-1 + ${UCMP}i $len,16 + bltlr- + + lis r0,0xfff8 + mfspr r12,256 # save vrsave + li r11,0 + mtspr 256,r0 + + andi. r0,$len,15 + neg r0,r0 + andi. r0,r0,16 + sub $len,$len,r0 + + vspltisb $seven,0x07 # 0x070707..07 + le?lvsl $leperm,r11,r11 + le?vspltisb $tmp,0x0f + le?vxor $leperm,$leperm,$seven + + li $idx,15 + lvx $tweak,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $tweak,$tweak,$inptail,$inpperm + + neg r11,$inp + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inout,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ${UCMP}i $key2,0 # key2==NULL? 
+ beq Lxts_dec_no_key2 + + ?lvsl $keyperm,0,$key2 # prepare for unaligned key + lwz $rounds,240($key2) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + lvx $rndkey0,0,$key2 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + mtctr $rounds + +Ltweak_xts_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + bdnz Ltweak_xts_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $tweak,$tweak,$rndkey0 + + li $ivp,0 # don't chain the tweak + b Lxts_dec + +Lxts_dec_no_key2: + neg $idx,$len + andi. $idx,$idx,15 + add $len,$len,$idx # in "tweak chaining" + # mode only complete + # blocks are processed +Lxts_dec: + lvx $inptail,0,$inp + addi $inp,$inp,16 + + ?lvsl $keyperm,0,$key1 # prepare for unaligned key + lwz $rounds,240($key1) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + vslb $eighty7,$seven,$seven # 0x808080..80 + vor $eighty7,$eighty7,$seven # 0x878787..87 + vspltisb $tmp,1 # 0x010101..01 + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 + + ${UCMP}i $len,96 + bge _aesp8_xts_decrypt6x + + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + mtctr $rounds + + ${UCMP}i $len,16 + blt Ltail_xts_dec + be?b Loop_xts_dec + +.align 5 +Loop_xts_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak + vncipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + addi $out,$out,16 + + subic. 
$len,$len,16 + beq Lxts_dec_done + + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + mtctr $rounds + ${UCMP}i $len,16 + bge Loop_xts_dec + +Ltail_xts_dec: + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak1,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak1,$tweak1,$tmp + + subi $inp,$inp,16 + add $inp,$inp,$len + + vxor $inout,$inout,$tweak # :-( + vxor $inout,$inout,$tweak1 # :-) + +Loop_xts_dec_short: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_dec_short + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak1 + vncipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + + vmr $inout,$inptail + lvx $inptail,0,$inp + #addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + + lvsr $inpperm,0,$len # $inpperm is no longer needed + vxor $inptail,$inptail,$inptail # $inptail is no longer needed + vspltisb $tmp,-1 + vperm $inptail,$inptail,$tmp,$inpperm + vsel $inout,$inout,$output,$inptail + + vxor $rndkey0,$rndkey0,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + subi r11,$out,1 + mtctr $len + li $len,16 +Loop_xts_dec_steal: + lbzu r0,1(r11) + stb r0,16(r11) + bdnz Loop_xts_dec_steal + + mtctr $rounds + b Loop_xts_dec # one more time... 
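+# The recurring vsrab/vaddubm/vsldoi/vand/vxor group commented "next tweak
+# value" multiplies the tweak by x in GF(2^128) with the XTS reduction
+# polynomial x^128+x^7+x^2+x+1, hence the 0x870101..01 constant in $eighty7.
+# A minimal Perl sketch on two 64-bit halves; 'xts_double', '$hi' and '$lo'
+# are illustrative names, not part of the generated code, and the masks
+# assume 64-bit integers:
+#
+#	sub xts_double {
+#	    my ($hi, $lo) = @_;		# 128-bit tweak, $lo least significant
+#	    my $carry = ($hi >> 63) & 1;	# bit 127 decides whether to reduce
+#	    $hi = (($hi << 1) | (($lo >> 63) & 1)) & 0xffffffffffffffff;
+#	    $lo = (($lo << 1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
+#	    return ($hi, $lo);
+#	}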
+ +Lxts_dec_done: + ${UCMP}i $ivp,0 + beq Lxts_dec_ret + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_dec_ret: + mtspr 256,r12 # restore vrsave + li r3,0 + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt +___ +######################################################################### +{{ # Optimized XTS procedures # +my $key_=$key2; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); +my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); +my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($keyperm)=($out0); # aliases with "caller", redundant assignment +my $taillen=$x70; + +$code.=<<___; +.align 5 +_aesp8_xts_encrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r11 + li r7,`$FRAME+8*16+15` + li r3,`$FRAME+8*16+31` + $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r3,$sp + addi r3,r3,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r3,$sp + addi r3,r3,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r3,$sp + addi r3,r3,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r3,$sp + addi r3,r3,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r3,$sp + addi r3,r3,32 + stvx v30,r7,$sp + stvx v31,r3,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_xts_enc_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_enc_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor 
$twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_enc6x + +.align 5 +Loop_xts_enc6x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + + subfe. 
r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out0,$out0,v27 + vcipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vcipher $out4,$out4,v27 + vcipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vxor $tweak,$tweak,$tmp + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v28 + vcipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vcipher $out0,$out0,v29 + vcipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vcipher $out4,$out4,v29 + vcipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vcipher $out0,$out0,v30 + vcipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v30 + vcipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vcipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vcipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vcipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vcipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vcipherlast $tmp,$out5,$in5 # last block might be needed + # in stealing mode + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$tmp,$tmp,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + le?stvx_u $out5,$x50,$out + be?stvx_u $tmp, $x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr 
$rounds + beq Loop_xts_enc6x # did $len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_enc6x_zero + cmpwi $len,0x20 + blt Lxts_enc6x_one + nop + beq Lxts_enc6x_two + cmpwi $len,0x40 + blt Lxts_enc6x_three + nop + beq Lxts_enc6x_four + +Lxts_enc6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $tmp,$out4,$twk5 # last block prep for stealing + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $tmp,$out3,$twk4 # last block prep for stealing + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $tmp,$out2,$twk3 # last block prep for stealing + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vxor $tmp,$out1,$twk2 # last block prep for stealing + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_enc1x: + vcipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc1x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + + lvsr $inpperm,0,$taillen + vcipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vcipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vcipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vcipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vcipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vxor $tmp,$out0,$twk1 # last block prep for stealing + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_zero: + cmpwi 
$taillen,0 + beq Lxts_enc6x_done + + add $inp,$inp,$taillen + subi $inp,$inp,16 + lvx_u $in0,0,$inp + lvsr $inpperm,0,$taillen # $in5 is no more + le?vperm $in0,$in0,$in0,$leperm + vperm $in0,$in0,$in0,$inpperm + vxor $tmp,$tmp,$twk0 +Lxts_enc6x_steal: + vxor $in0,$in0,$twk0 + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember? + + subi r30,$out,17 + subi $out,$out,16 + mtctr $taillen +Loop_xts_enc6x_steal: + lbzu r0,1(r30) + stb r0,16(r30) + bdnz Loop_xts_enc6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_enc1x # one more time... + +.align 4 +Lxts_enc6x_done: + ${UCMP}i $ivp,0 + beq Lxts_enc6x_ret + + vxor $tweak,$twk0,$rndkey0 + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_enc6x_ret: + mtlr r11 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_enc5x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_enc5x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + vcipher $out0,$out0,v26 + lvsr $inpperm,0,$taillen # $in5 is no more + vcipher $out1,$out1,v26 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vcipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vcipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vcipher $out1,$out1,v27 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vcipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vcipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vcipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vcipher $out1,$out1,v29 + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vcipher $out4,$out4,v29 + lvx v25,$x10,$key_ # 
re-pre-load round[2] + vxor $in4,$twk4,v31 + + vcipher $out0,$out0,v30 + vperm $in0,$in0,$in0,$inpperm + vcipher $out1,$out1,v30 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vcipher $out4,$out4,v30 + + vcipherlast $out0,$out0,$twk0 + vcipherlast $out1,$out1,$in1 + vcipherlast $out2,$out2,$in2 + vcipherlast $out3,$out3,$in3 + vcipherlast $out4,$out4,$in4 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.align 5 +_aesp8_xts_decrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r11 + li r7,`$FRAME+8*16+15` + li r3,`$FRAME+8*16+31` + $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r3,$sp + addi r3,r3,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r3,$sp + addi r3,r3,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r3,$sp + addi r3,r3,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r3,$sp + addi r3,r3,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r3,$sp + addi r3,r3,32 + stvx v30,r7,$sp + stvx v31,r3,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_xts_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_dec_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. 
$taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_dec6x + +.align 5 +Loop_xts_dec6x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + + subfe. 
r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vxor $tweak,$tweak,$tmp + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v30 + vncipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vncipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vncipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vncipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vncipherlast $out5,$out5,$in5 + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$out5,$out5,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + stvx_u $out5,$x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr $rounds + beq Loop_xts_dec6x # did 
$len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_dec6x_zero + cmpwi $len,0x20 + blt Lxts_dec6x_one + nop + beq Lxts_dec6x_two + cmpwi $len,0x40 + blt Lxts_dec6x_three + nop + beq Lxts_dec6x_four + +Lxts_dec6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + vxor $twk1,$tweak,$rndkey0 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk1 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + vmr $twk1,$twk5 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk5 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + vmr $twk1,$twk4 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk4 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vmr $twk1,$twk3 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk3 + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_dec1x: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec1x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + + andi. 
r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + mtctr $rounds + vncipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vmr $twk1,$twk2 + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + vxor $out0,$in0,$twk2 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_zero: + cmpwi $taillen,0 + beq Lxts_dec6x_done + + lvx_u $in0,0,$inp + le?vperm $in0,$in0,$in0,$leperm + vxor $out0,$in0,$twk1 +Lxts_dec6x_steal: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Lxts_dec6x_steal + + add $inp,$inp,$taillen + vncipher $out0,$out0,v24 + + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v26 + + lvsr $inpperm,0,$taillen # $in5 is no more + vncipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk1,$twk1,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vncipherlast $tmp,$out0,$twk1 + + le?vperm $out0,$tmp,$tmp,$leperm + le?stvx_u $out0,0,$out + be?stvx_u $tmp,0,$out + + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 + vxor $out0,$out0,$twk0 + + subi r30,$out,1 + mtctr $taillen +Loop_xts_dec6x_steal: + lbzu r0,1(r30) + stb r0,16(r30) + bdnz Loop_xts_dec6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_dec1x # one more time... 
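The sequence annotated "# next tweak value" above (vsrab, vaddubm, vsldoi, then vand against $eighty7 and vxor) is a branch-free multiplication of the 128-bit tweak by x in GF(2^128) modulo x^128+x^7+x^2+x+1, the lbzu/stb copies labelled Loop_xts_*_steal perform the byte shuffle of ciphertext stealing, and the subfe./and/add triple adjusts $inp without a branch so the final pass of the 6x loop reloads the last full blocks. A minimal Perl model of the tweak update and of encrypt-side stealing, assuming IEEE P1619 byte order (illustrative only, not part of the generated module; the sub names are invented):

    #!/usr/bin/env perl
    use strict;
    use warnings;

    # T <- T*x in GF(2^128): the scalar equivalent of one
    # vsrab/vaddubm/vsldoi/vand/vxor "next tweak value" group.
    sub xts_next_tweak {
        my @t = unpack("C16", $_[0]);   # byte 0 is least significant
        my $carry = 0;
        for my $i (0 .. 15) {           # shift the 128-bit value left by 1
            my $b = $t[$i];
            $t[$i] = (($b << 1) | $carry) & 0xff;
            $carry = $b >> 7;
        }
        $t[0] ^= 0x87 if $carry;        # reduce by x^128+x^7+x^2+x+1
        return pack("C16", @t);
    }

    # Encrypt-side ciphertext stealing: the partial tail is padded with
    # bytes stolen from the last full ciphertext block (and encrypted once
    # more), while the stolen prefix is emitted as the short final block --
    # the job of the 17-byte-offset lbzu/stb loop above.
    sub xts_steal {
        my ($last_ct, $tail) = @_;      # 16-byte block, 1..15-byte tail
        my $n = length($tail);
        my $short_out = substr($last_ct, 0, $n);
        my $refill    = $tail . substr($last_ct, $n);
        return ($refill, $short_out);   # $refill still needs encrypting
    }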
+ +.align 4 +Lxts_dec6x_done: + ${UCMP}i $ivp,0 + beq Lxts_dec6x_ret + + vxor $tweak,$twk0,$rndkey0 + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_dec6x_ret: + mtlr r11 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_dec5x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_dec5x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + + andi. 
r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vncipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vncipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $in4,$twk4,v31 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vncipher $out4,$out4,v30 + + vncipherlast $out0,$out0,$twk0 + vncipherlast $out1,$out1,$in1 + vncipherlast $out2,$out2,$in2 + vncipherlast $out3,$out3,$in3 + vncipherlast $out4,$out4,$in4 + mtctr $rounds + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +___ +}} }}} + my $consts=1; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; diff --git a/deps/openssl/openssl/crypto/aes/asm/aest4-sparcv9.pl b/deps/openssl/openssl/crypto/aes/asm/aest4-sparcv9.pl index 536f23b47c..bf479c60ae 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aest4-sparcv9.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aest4-sparcv9.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by David S. Miller <davem@devemloft.net> and Andy Polyakov @@ -68,7 +75,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "sparcv9_modes.pl"; -&asm_init(@ARGV); +$output = pop; +open STDOUT,">$output"; $::evp=1; # if $evp is set to 0, script generates module with # AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry @@ -83,12 +91,14 @@ $::evp=1; # if $evp is set to 0, script generates module with { my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5)); -$code.=<<___ if ($::abibits==64); +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch +#endif -___ -$code.=<<___; .text .globl aes_t4_encrypt diff --git a/deps/openssl/openssl/crypto/aes/asm/aesv8-armx.pl b/deps/openssl/openssl/crypto/aes/asm/aesv8-armx.pl index 95ebae3beb..1782d5b414 100755 --- a/deps/openssl/openssl/crypto/aes/asm/aesv8-armx.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesv8-armx.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -27,12 +34,21 @@ # Cortex-A53 1.32 1.29 1.46 # Cortex-A57(*) 1.95 0.85 0.93 # Denver 1.96 0.86 0.80 +# Mongoose 1.33 1.20 1.20 # # (*) original 3.64/1.34/1.32 results were for r0p0 revision # and are still same even for updated module; $flavour = shift; -open STDOUT,">".shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; $prefix="aes_v8"; @@ -43,9 +59,12 @@ $code=<<___; .text ___ $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); -$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/); - #^^^^^^ this is done to simplify adoption by not depending - # on latest binutils. +$code.=<<___ if ($flavour !~ /64/); +.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) +.fpu neon +.code 32 +#undef __thumb2__ +___ # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to @@ -60,7 +79,7 @@ my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= $code.=<<___; .align 5 -rcon: +.Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b @@ -89,7 +108,7 @@ $code.=<<___; tst $bits,#0x3f b.ne .Lenc_key_abort - adr $ptr,rcon + adr $ptr,.Lrcon cmp $bits,#192 veor $zero,$zero,$zero diff --git a/deps/openssl/openssl/crypto/aes/asm/bsaes-armv7.pl b/deps/openssl/openssl/crypto/aes/asm/bsaes-armv7.pl index ec66b0502a..7af38afcb6 100644 --- a/deps/openssl/openssl/crypto/aes/asm/bsaes-armv7.pl +++ b/deps/openssl/openssl/crypto/aes/asm/bsaes-armv7.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -47,8 +54,20 @@ # # <ard.biesheuvel@linaro.org> -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); my @XMM=map("q$_",(0..15)); @@ -702,7 +721,7 @@ $code.=<<___; # define BSAES_ASM_EXTENDED_KEY # define XTS_CHAIN_TWEAK # define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 #endif #ifdef __thumb__ @@ -715,10 +734,11 @@ $code.=<<___; .text .syntax unified @ ARMv7-capable assembler is expected to handle this -#ifdef __thumb2__ +#if defined(__thumb2__) && !defined(__APPLE__) .thumb #else .code 32 +# undef __thumb2__ #endif .type _bsaes_decrypt8,%function @@ -726,7 +746,11 @@ $code.=<<___; _bsaes_decrypt8: adr $const,. vldmia $key!, {@XMM[9]} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0ISR +#else add $const,$const,#.LM0ISR-_bsaes_decrypt8 +#endif vldmia $const!, {@XMM[8]} @ .LM0ISR veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key @@ -821,7 +845,11 @@ _bsaes_const: _bsaes_encrypt8: adr $const,. vldmia $key!, {@XMM[9]} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0SR +#else sub $const,$const,#_bsaes_encrypt8-.LM0SR +#endif vldmia $const!, {@XMM[8]} @ .LM0SR _bsaes_encrypt8_alt: @@ -925,7 +953,11 @@ $code.=<<___; _bsaes_key_convert: adr $const,. vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0 +#else sub $const,$const,#_bsaes_key_convert-.LM0 +#endif vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key vmov.i8 @XMM[8], #0x01 @ bit masks @@ -1392,7 +1424,12 @@ bsaes_ctr32_encrypt_blocks: vstmia r12, {@XMM[7]} @ save last round key vld1.8 {@XMM[0]}, [$ctr] @ load counter +#ifdef __APPLE__ + mov $ctr, #:lower16:(.LREVM0SR-.LM0) + add $ctr, $const, $ctr +#else add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr +#endif vldmia $keysched, {@XMM[4]} @ load round0 key #else ldr r12, [$key, #244] @@ -1449,7 +1486,12 @@ bsaes_ctr32_encrypt_blocks: vldmia $ctr, {@XMM[8]} @ .LREVM0SR mov r5, $rounds @ pass rounds vstmia $fp, {@XMM[10]} @ save next counter +#ifdef __APPLE__ + mov $const, #:lower16:(.LREVM0SR-.LSR) + sub $const, $ctr, $const +#else sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants +#endif bl _bsaes_encrypt8_alt @@ -1550,7 +1592,7 @@ bsaes_ctr32_encrypt_blocks: rev r8, r8 #endif sub sp, sp, #0x10 - vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value + vst1.8 {@XMM[1]}, [sp] @ copy counter value sub sp, sp, #0x10 .Lctr_enc_short_loop: @@ -1561,7 +1603,7 @@ bsaes_ctr32_encrypt_blocks: bl AES_encrypt vld1.8 {@XMM[0]}, [r4]! 
@ load input - vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter + vld1.8 {@XMM[1]}, [sp] @ load encrypted counter add r8, r8, #1 #ifdef __ARMEL__ rev r0, r8 @@ -2068,9 +2110,11 @@ bsaes_xts_decrypt: vld1.8 {@XMM[8]}, [r0] @ initial tweak adr $magic, .Lxts_magic +#ifndef XTS_CHAIN_TWEAK tst $len, #0xf @ if not multiple of 16 it ne @ Thumb2 thing, sanity check in ARM subne $len, #0x10 @ subtract another 16 bytes +#endif subs $len, #0x80 blo .Lxts_dec_short diff --git a/deps/openssl/openssl/crypto/aes/asm/bsaes-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/bsaes-x86_64.pl index 3f7d33c45b..921d870e98 100644 --- a/deps/openssl/openssl/crypto/aes/asm/bsaes-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/bsaes-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + ################################################################### ### AES-128 [originally in CTR mode] ### @@ -41,6 +48,7 @@ # Nehalem(**) 7.63 6.88 +11% # Atom 17.1 16.4 +4% # Silvermont - 12.9 +# Goldmont - 8.85 # # (*) Comparison is not completely fair, because "this" is ECB, # i.e. no extra processing such as counter values calculation @@ -80,6 +88,7 @@ # Nehalem 7.80 # Atom 17.9 # Silvermont 14.0 +# Goldmont 10.2 # # November 2011. # @@ -99,7 +108,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); diff --git a/deps/openssl/openssl/crypto/aes/asm/vpaes-armv8.pl b/deps/openssl/openssl/crypto/aes/asm/vpaes-armv8.pl new file mode 100755 index 0000000000..2e704a2124 --- /dev/null +++ b/deps/openssl/openssl/crypto/aes/asm/vpaes-armv8.pl @@ -0,0 +1,1259 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# ARMv8 NEON adaptation by <appro@openssl.org> +# +# Reason for undertaken effort is that there is at least one popular +# SoC based on Cortex-A53 that doesn't have crypto extensions. 
+# +# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] +# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] +# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] +# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] +# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] +# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] +# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] +# +# (*) ECB denotes approximate result for parallelizeable modes +# such as CBC decrypt, CTR, etc.; +# (**) these results are worse than scalar compiler-generated +# code, but it's constant-time and therefore preferred; +# (***) presented for reference/comparison purposes; + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$code.=<<___; +.text + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:// mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +.Lk_dipt: // decryption input transform + .quad 0x0F505B040B545F00, 0x154A411E114E451A + .quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo: // decryption sbox final output + .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D + .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9: // decryption sbox output *9*u, *9*t + .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 + .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: // decryption sbox output *D*u, *D*t + .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 + .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: // decryption sbox output *B*u, *B*t + .quad 0xD022649296B44200, 0x602646F6B0F2D404 + .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: // decryption sbox output *E*u, *E*t + .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 + .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // 
decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.asciz "Vector Permutaion AES for ARMv8, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 +___ + +{ +my ($inp,$out,$key) = map("x$_",(0..2)); + +my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); +my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); + +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adr x10, .Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adr x11, .Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // 
and \$0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [$inp] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_encrypt,.-vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adr x11, .Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {$iptlo}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {$ipthi}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {$sb1t}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {$sb1u}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {$sb2t}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {$sb2u}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {$sbot}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x + +.type _vpaes_decrypt_preheat,%function +.align 4 +_vpaes_decrypt_preheat: + adr x10, .Lk_inv + movi v17.16b, #0x0f + adr x11, .Lk_dipt + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + ret +.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat + +## +## Decryption core +## +## Same API as encryption core. 
+## +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 + eor x11, x11, #0x30 // xor \$0x30, %r11 + adr x10, .Lk_sr + and x11, x11, #0x30 // and \$0x30, %r11 + add x11, x11, x10 + adr x10, .Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Ldec_entry + +.align 4 +.Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub \$1,%rax # nr-- + +.Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 
# 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [$inp] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_decrypt,.-vpaes_decrypt + +// v14-v15 input, v0-v1 output +.type _vpaes_decrypt_2x,%function +.align 4 +_vpaes_decrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 + eor x11, x11, #0x30 // xor \$0x30, %r11 + adr x10, .Lk_sr + and x11, x11, #0x30 // and \$0x30, %r11 + add x11, x11, x10 + adr x10, .Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {$iptlo},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {$ipthi},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Ldec_2x_entry + +.align 4 +.Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {$sb9u}, v10.16b + tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {$sb9t}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {$sbdu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {$sbdt}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {$sbbu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, 
%xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {$sbbt}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {$sbeu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {$sbet}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub \$1,%rax # nr-- + +.Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {$sbot}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret +.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x +___ +} +{ +my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); +my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); + +$code.=<<___; +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr x10, .Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adr x11, .Lk_sb1 
+	movi	v17.16b, #0x0f			// .Lk_s0F
+	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
+	adr	x10, .Lk_dksd
+	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
+	adr	x11, .Lk_mc_forward
+	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
+	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
+	ld1	{v8.2d}, [x10]			// .Lk_rcon
+	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
+	ret
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [$inp],#16		// vmovdqu (%rdi), %xmm0	# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa %xmm0, %xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa %xmm0, %xmm7
+
+	adr	x10, .Lk_sr			// lea .Lk_sr(%rip),%r10
+	add	x8, x8, x10
+	cbnz	$dir, .Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [$out]			// vmovdqu %xmm0, (%rdx)
+	b	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	st1	{v3.2d}, [$out]			// vmovdqu %xmm3, (%rdx)
+	eor	x8, x8, #0x30			// xor \$0x30, %r8
+
+.Lschedule_go:
+	cmp	$bits, #192			// cmp \$192, %esi
+	b.hi	.Lschedule_256
+	b.eq	.Lschedule_192
+	// 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	$inp, #10			// mov \$10, %esi
+
+.Loop_schedule_128:
+	sub	$inp, $inp, #1			// dec %esi
+	bl	_vpaes_schedule_round
+	cbz	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b	.Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align	4
+.Lschedule_192:
+	sub	$inp, $inp, #8
+	ld1	{v0.16b}, [$inp]		// vmovdqu 8(%rdi),%xmm0	# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps %xmm4, %xmm6, %xmm6	# clobber low side with zeros
+	mov	$inp, #4			// mov \$4, %esi
+
+.Loop_schedule_192:
+	sub	$inp, $inp, #1			// dec %esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr \$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
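+##
+## Round-key accounting (a sketch added for orientation, not in the
+## original comments): AES-256 needs 15 round keys. Round key 0 was
+## already written by the common code above, and each of the 7 trips
+## through .Loop_schedule_256 below emits two more -- one low round,
+## one high round, the final high round leaving through
+## .Lschedule_mangle_last -- for 1 + 7*2 = 15 in total.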
+##
+.align	4
+.Lschedule_256:
+	ld1	{v0.16b}, [$inp]		// vmovdqu 16(%rdi),%xmm0	# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	$inp, #7			// mov \$7, %esi
+
+.Loop_schedule_256:
+	sub	$inp, $inp, #1			// dec %esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd \$0xFF, %xmm0, %xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa %xmm7, %xmm5
+	mov	v7.16b, v6.16b			// vmovdqa %xmm6, %xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa %xmm5, %xmm7
+
+	b	.Loop_schedule_256
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+##   when encrypting, outputs out(%xmm0) ^ 63
+##   when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align	4
+.Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adr	x11, .Lk_deskew			// lea .Lk_deskew(%rip),%r11	# prepare to deskew
+	cbnz	$dir, .Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10),%xmm1
+	adr	x11, .Lk_opt			// lea .Lk_opt(%rip), %r11	# prepare to output transform
+	add	$out, $out, #32			// add \$32, %rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0	# output permute
+
+.Lschedule_mangle_last_dec:
+	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
+	sub	$out, $out, #16			// add \$-16, %rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [$out]			// vmovdqu %xmm0, (%rdx)	# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor %xmm0, %xmm0, %xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor %xmm2, %xmm2, %xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor %xmm3, %xmm3, %xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor %xmm5, %xmm5, %xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor %xmm6, %xmm6, %xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor %xmm7, %xmm7, %xmm7
+	ldp	x29, x30, [sp],#16
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+##   %xmm7: high side, b a x y
+##   %xmm6: low side, d c 0 0
+##   %xmm13: 0
+##
+## Outputs:
+##   %xmm6: b+c+d b+c 0 0
+##   %xmm0: b+c+d b+c b a
+##
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]		// vpshufd \$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]		// vpshufd \$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b		// vpxor %xmm1, %xmm6, %xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
+	eor	v6.16b, v6.16b, v0.16b		// vpxor %xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b			// vmovdqa %xmm6, %xmm0
+	ins	v6.d[0], v1.d[0]		// vmovhlps %xmm1, %xmm6, %xmm6	# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
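+##
+## (In scalar terms -- an illustrative C fragment, not part of the
+## original, with a hypothetical Sbox[] table and a 16-byte round
+## key k[] -- one such round of the 128-bit schedule is the textbook
+## expansion step:
+##
+##     uint8_t t[4] = { k[13], k[14], k[15], k[12] };   /* RotWord */
+##     for (i = 0; i < 4; i++) t[i] = Sbox[t[i]];       /* SubWord */
+##     t[0] ^= rcon;
+##     for (i = 0; i < 16; i++)                         /* smear   */
+##         k[i] ^= (i < 4) ? t[i] : k[i - 4];
+##
+## everything below is that computation, carried out in the vpaes
+## basis without a byte-indexed table lookup.)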
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor %xmm4, %xmm4, %xmm4
+	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr \$15, %xmm8, %xmm4, %xmm1
+	ext	$rcon, $rcon, $rcon, #15	// vpalignr \$15, %xmm8, %xmm8, %xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd \$0xFF, %xmm0, %xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr \$1, %xmm0, %xmm0, %xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq \$4, %xmm7, %xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq \$8, %xmm7, %xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0	# 1 = i
+	eor	v7.16b, v7.16b, v4.16b		// vpxor %xmm4, %xmm7, %xmm7
+	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
+	eor	v7.16b, v7.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm7, %xmm7
+	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm3	# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm2	# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor %xmm1, %xmm3, %xmm3	# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor %xmm0, %xmm2, %xmm2	# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm13, %xmm4	# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm12, %xmm1	# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm1	# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor %xmm7, %xmm1, %xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa %xmm0, %xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
+					// vmovdqa (%r11), %xmm2	# lo
+	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
+					// vmovdqa 16(%r11), %xmm1	# hi
+	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
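+##
+## ("Basis-transformed" here means the round key is still in the
+## alternate field representation vpaes computes in; mangle folds
+## the constants below into it and permutes it back out. Roughly,
+## for the encrypt side, out = shiftrows(mc(y) ^ mc^2(y) ^ mc^3(y))
+## with y = x ^ 0x63, where mc() is the per-column byte rotation
+## that .Lk_mc_forward performs -- that is the 0,1,1,1 circulant
+## named below. This reading is inferred from the code, not stated
+## in the original.)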
+##
+## On encrypt,
+##   xor with 0x63
+##   multiply by circulant 0,1,1,1
+##   apply shiftrows transform
+##
+## On decrypt,
+##   xor with 0x63
+##   multiply by "inverse mixcolumns" circulant E,B,D,9
+##   deskew
+##   apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa %xmm0, %xmm4	# save xmm0 for later
+					// vmovdqa .Lk_mc_forward(%rip),%xmm5
+	cbnz	$dir, .Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm4
+	add	$out, $out, #16			// add \$16, %rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb %xmm5, %xmm1, %xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor %xmm1, %xmm4, %xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor %xmm4, %xmm3, %xmm3
+
+	b	.Lschedule_mangle_both
+.align	4
+.Lschedule_mangle_dec:
+	// inverse mix columns
+					// lea .Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb \$4, %xmm4, %xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand %xmm9, %xmm4, %xmm4	# 4 = lo
+
+					// vmovdqa 0x00(%r11), %xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
+					// vmovdqa 0x10(%r11), %xmm3
+	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
+
+					// vmovdqa 0x20(%r11), %xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
+					// vmovdqa 0x30(%r11), %xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
+
+					// vmovdqa 0x40(%r11), %xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
+					// vmovdqa 0x50(%r11), %xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
+
+					// vmovdqa 0x60(%r11), %xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
+					// vmovdqa 0x70(%r11), %xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb %xmm1, %xmm4, %xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm3
+
+	sub	$out, $out, #16			// add \$-16, %rdx
+
+.Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	add	x8, x8, #64-16			// add \$-16, %r8
+	and	x8, x8, #~(1<<6)		// and \$0x30, %r8
+	st1	{v3.2d}, [$out]			// vmovdqu %xmm3, (%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,%function
+.align	4
+vpaes_set_encrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
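+
+	// Worked arithmetic for the store below (an aside inferred from
+	// the code, not stated in it): nbits>>5 + 5 gives 9, 11 or 13
+	// for 128/192/256-bit keys, i.e. the count of "middle" rounds
+	// the encrypt/decrypt cores loop over; the final round is applied
+	// after their loop, giving the familiar 10/12/14 round totals.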
+
+	lsr	w9, $bits, #5		// shr \$5,%eax
+	add	w9, w9, #5		// \$5,%eax
+	str	w9, [$out,#240]		// mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	$dir, #0		// mov \$0,%ecx
+	mov	x8, #0x30		// mov \$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,%function
+.align	4
+vpaes_set_decrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, $bits, #5		// shr \$5,%eax
+	add	w9, w9, #5		// \$5,%eax
+	str	w9, [$out,#240]		// mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl \$4,%eax
+	add	$out, $out, #16		// lea 16(%rdx,%rax),%rdx
+	add	$out, $out, x9
+
+	mov	$dir, #1		// mov \$1,%ecx
+	lsr	w8, $bits, #1		// shr \$1,%r8d
+	and	x8, x8, #32		// and \$32,%r8d
+	eor	x8, x8, #32		// xor \$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+___
+}
+{
+my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
+
+$code.=<<___;
+.globl	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,%function
+.align	4
+vpaes_cbc_encrypt:
+	cbz	$len, .Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, $len		// reassign
+	mov	x2, $key		// reassign
+
+	ld1	{v0.16b}, [$ivec]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	.Lcbc_enc_loop
+
+.align	4
+.Lcbc_enc_loop:
+	ld1	{v7.16b}, [$inp],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out],#16	// save output
+	subs	x17, x17, #16
+	b.hi	.Lcbc_enc_loop
+
+	st1	{v0.16b}, [$ivec]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+.Lcbc_abort:
+	ret
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type	vpaes_cbc_decrypt,%function
+.align	4
+vpaes_cbc_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, $len		// reassign
+	mov	x2, $key		// reassign
+	ld1	{v6.16b}, [$ivec]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [$inp], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [$out], #16
+	subs	x17, x17, #16
+	b.ls	.Lcbc_dec_done
+
+.align	4
+.Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+	st1	{v6.16b}, [$ivec]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+___
+if (1) {
+$code.=<<___;
+.globl	vpaes_ecb_encrypt
+.type	vpaes_ecb_encrypt,%function
+.align	4
+vpaes_ecb_encrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
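+
+	// ECB blocks are independent, so the main loop below feeds pairs
+	// of blocks to _vpaes_encrypt_2x, which runs two copies of the
+	// round function side by side (the v0... and v8... register
+	// banks), presumably to hide tbl/eor latency; an odd leading
+	// block goes through the single-block core first.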
+
+	mov	x17, $len
+	mov	x2, $key
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #16
+	b.eq	.Lecb_enc_loop
+
+	ld1	{v7.16b}, [$inp],#16
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out],#16
+	subs	x17, x17, #16
+	b.ls	.Lecb_enc_done
+
+.align	4
+.Lecb_enc_loop:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_encrypt_2x
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lecb_enc_loop
+
+.Lecb_enc_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+.globl	vpaes_ecb_decrypt
+.type	vpaes_ecb_decrypt,%function
+.align	4
+vpaes_ecb_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, $len
+	mov	x2, $key
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lecb_dec_loop
+
+	ld1	{v7.16b}, [$inp],#16
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [$out],#16
+	subs	x17, x17, #16
+	b.ls	.Lecb_dec_done
+
+.align	4
+.Lecb_dec_loop:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_decrypt_2x
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lecb_dec_loop
+
+.Lecb_dec_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
+___
+} }
+print $code;
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/aes/asm/vpaes-ppc.pl b/deps/openssl/openssl/crypto/aes/asm/vpaes-ppc.pl
index 1759ae9dcf..bb38fbe60c 100644
--- a/deps/openssl/openssl/crypto/aes/asm/vpaes-ppc.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/vpaes-ppc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -14,7 +21,8 @@
 # 128-bit key.
 #
 #			aes-ppc.pl		this
-# G4e			35.5/52.1/(23.8)	11.9(*)/15.4
+# PPC74x0/G4e		35.5/52.1/(23.8)	11.9(*)/15.4
+# PPC970/G5		37.9/55.0/(28.5)	22.2/28.5
 # POWER6		42.7/54.3/(28.2)	63.0/92.8(**)
 # POWER7		32.3/42.9/(18.4)	18.5/23.3
 #
diff --git a/deps/openssl/openssl/crypto/aes/asm/vpaes-x86.pl b/deps/openssl/openssl/crypto/aes/asm/vpaes-x86.pl
index 2ba149c3f9..47615c0795 100644
--- a/deps/openssl/openssl/crypto/aes/asm/vpaes-x86.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/vpaes-x86.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -51,6 +58,10 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
+$output = pop;
+open OUT,">$output";
+*STDOUT=*OUT;
+
 &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
 
 $PREFIX="vpaes";
@@ -901,3 +912,5 @@
 $k_dsbo=0x2c0;	# decryption sbox final output
 &function_end("${PREFIX}_cbc_encrypt");
 &asm_finish();
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/aes/asm/vpaes-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/vpaes-x86_64.pl
index f2ef318fae..422e8ee442 100644
--- a/deps/openssl/openssl/crypto/aes/asm/vpaes-x86_64.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/vpaes-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -31,6 +38,7 @@
 # Nehalem	29.6/40.3/14.6	10.0/11.8
 # Atom		57.3/74.2/32.1	60.9/77.2(***)
 # Silvermont	52.7/64.0/19.5	48.8/60.8(***)
+# Goldmont	38.9/49.0/17.8	10.6/12.6
 #
 # (*)	"Hyper-threading" in the context refers rather to cache shared
 #	among multiple cores, than to specifically Intel HTT. As vast
@@ -57,7 +65,7 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 $PREFIX="vpaes";
diff --git a/deps/openssl/openssl/crypto/aes/build.info b/deps/openssl/openssl/crypto/aes/build.info
new file mode 100644
index 0000000000..cf6cb5ec25
--- /dev/null
+++ b/deps/openssl/openssl/crypto/aes/build.info
@@ -0,0 +1,57 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+        aes_misc.c aes_ecb.c aes_cfb.c aes_ofb.c \
+        aes_ige.c aes_wrap.c {- $target{aes_asm_src} -}
+
+GENERATE[aes-ia64.s]=asm/aes-ia64.S
+
+GENERATE[aes-586.s]=asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[aes-586.s]=../perlasm/x86asm.pl
+GENERATE[vpaes-x86.s]=asm/vpaes-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[vpaes-586.s]=../perlasm/x86asm.pl
+GENERATE[aesni-x86.s]=asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[aesni-586.s]=../perlasm/x86asm.pl
+
+GENERATE[aes-x86_64.s]=asm/aes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[vpaes-x86_64.s]=asm/vpaes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[bsaes-x86_64.s]=asm/bsaes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-x86_64.s]=asm/aesni-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-sha1-x86_64.s]=asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-sha256-x86_64.s]=asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-mb-x86_64.s]=asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-sparcv9.S]=asm/aes-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aes-sparcv9.o]=..
+GENERATE[aest4-sparcv9.S]=asm/aest4-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aest4-sparcv9.o]=..
+DEPEND[aest4-sparcv9.S]=../perlasm/sparcv9_modes.pl
+GENERATE[aesfx-sparcv9.S]=asm/aesfx-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aesfx-sparcv9.o]=..
+
+GENERATE[aes-ppc.s]=asm/aes-ppc.pl $(PERLASM_SCHEME)
+GENERATE[vpaes-ppc.s]=asm/vpaes-ppc.pl $(PERLASM_SCHEME)
+GENERATE[aesp8-ppc.s]=asm/aesp8-ppc.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-parisc.s]=asm/aes-parisc.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-mips.S]=asm/aes-mips.pl $(PERLASM_SCHEME)
+
+GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl $(PERLASM_SCHEME)
+INCLUDE[aesv8-armx.o]=..
+GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-armv4.S]=asm/aes-armv4.pl $(PERLASM_SCHEME)
+INCLUDE[aes-armv4.o]=..
+GENERATE[bsaes-armv7.S]=asm/bsaes-armv7.pl $(PERLASM_SCHEME)
+INCLUDE[bsaes-armv7.o]=..
+
+BEGINRAW[Makefile]
+##### AES assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/aes-%.S:	{- $sourcedir -}/asm/aes-%.pl
+	CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+{- $builddir -}/bsaes-%.S:	{- $sourcedir -}/asm/bsaes-%.pl
+	CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+
+ENDRAW[Makefile]
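
For readers following the generated AArch64 code above: the register comments map ($inp,$bits,$out) and ($inp,$out,$len,$key,$ivec,$dir) onto x0..x5, so the exported vpaes routines take the same arguments as the classic AES_* functions. A minimal C sketch of how glue code would drive them follows; the prototypes are assumptions based on that register mapping and on how OpenSSL's C glue declares them, not something this diff itself shows.

    /* Illustrative only: encrypt a buffer with AES-128-CBC via the
     * vpaes entry points generated above (assumed prototypes). */
    #include <stddef.h>
    #include <openssl/aes.h>

    int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
                               AES_KEY *key);
    void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
                           size_t length, const AES_KEY *key,
                           unsigned char *ivec, int enc);

    int encrypt_cbc(unsigned char *buf, size_t len /* multiple of 16 */,
                    const unsigned char k[16], unsigned char iv[16])
    {
        AES_KEY ks;
        /* rounds field at offset 240 is set to nbits/32+5 by the asm */
        if (vpaes_set_encrypt_key(k, 128, &ks) != 0)
            return -1;
        /* enc != 0 selects the encrypt path (w5 checked on entry) */
        vpaes_cbc_encrypt(buf, buf, len, &ks, iv, 1);
        return 0;
    }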