Diffstat (limited to 'deps/openssl/openssl/crypto/aes')
40 files changed, 8225 insertions, 1013 deletions
diff --git a/deps/openssl/openssl/crypto/aes/Makefile b/deps/openssl/openssl/crypto/aes/Makefile
deleted file mode 100644
index 05e4a0149e..0000000000
--- a/deps/openssl/openssl/crypto/aes/Makefile
+++ /dev/null
@@ -1,171 +0,0 @@
-#
-# crypto/aes/Makefile
-#
-
-DIR= aes
-TOP= ../..
-CC= cc
-CPP= $(CC) -E
-INCLUDES=
-CFLAG=-g
-MAKEFILE= Makefile
-AR= ar r
-
-AES_ENC=aes_core.o aes_cbc.o
-
-CFLAGS= $(INCLUDES) $(CFLAG)
-ASFLAGS= $(INCLUDES) $(ASFLAG)
-AFLAGS= $(ASFLAGS)
-
-GENERAL=Makefile
-#TEST=aestest.c
-TEST=
-APPS=
-
-LIB=$(TOP)/libcrypto.a
-LIBSRC=aes_core.c aes_misc.c aes_ecb.c aes_cbc.c aes_cfb.c aes_ofb.c \
-	aes_ctr.c aes_ige.c aes_wrap.c
-LIBOBJ=aes_misc.o aes_ecb.o aes_cfb.o aes_ofb.o aes_ctr.o aes_ige.o aes_wrap.o \
-	$(AES_ENC)
-
-SRC= $(LIBSRC)
-
-EXHEADER= aes.h
-HEADER= aes_locl.h $(EXHEADER)
-
-ALL= $(GENERAL) $(SRC) $(HEADER)
-
-top:
-	(cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
-
-all: lib
-
-lib: $(LIBOBJ)
-	$(AR) $(LIB) $(LIBOBJ)
-	$(RANLIB) $(LIB) || echo Never mind.
-	@touch lib
-
-aes-ia64.s: asm/aes-ia64.S
-	$(CC) $(CFLAGS) -E asm/aes-ia64.S > $@
-
-aes-586.s: asm/aes-586.pl ../perlasm/x86asm.pl
-	$(PERL) asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-vpaes-x86.s: asm/vpaes-x86.pl ../perlasm/x86asm.pl
-	$(PERL) asm/vpaes-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-aesni-x86.s: asm/aesni-x86.pl ../perlasm/x86asm.pl
-	$(PERL) asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-
-aes-x86_64.s: asm/aes-x86_64.pl
-	$(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@
-vpaes-x86_64.s: asm/vpaes-x86_64.pl
-	$(PERL) asm/vpaes-x86_64.pl $(PERLASM_SCHEME) > $@
-bsaes-x86_64.s: asm/bsaes-x86_64.pl
-	$(PERL) asm/bsaes-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-x86_64.s: asm/aesni-x86_64.pl
-	$(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl
-	$(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-sha256-x86_64.s: asm/aesni-sha256-x86_64.pl
-	$(PERL) asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME) > $@
-aesni-mb-x86_64.s: asm/aesni-mb-x86_64.pl
-	$(PERL) asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME) > $@
-
-aes-sparcv9.s: asm/aes-sparcv9.pl
-	$(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
-aest4-sparcv9.s: asm/aest4-sparcv9.pl ../perlasm/sparcv9_modes.pl
-	$(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@
-
-aes-ppc.s: asm/aes-ppc.pl
-	$(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
-vpaes-ppc.s: asm/vpaes-ppc.pl
-	$(PERL) asm/vpaes-ppc.pl $(PERLASM_SCHEME) $@
-aesp8-ppc.s: asm/aesp8-ppc.pl
-	$(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@
-
-aes-parisc.s: asm/aes-parisc.pl
-	$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
-
-aes-mips.S: asm/aes-mips.pl
-	$(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@
-
-aesv8-armx.S: asm/aesv8-armx.pl
-	$(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@
-aesv8-armx.o: aesv8-armx.S
-
-# GNU make "catch all"
-aes-%.S: asm/aes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) > $@
-aes-armv4.o: aes-armv4.S
-bsaes-%.S: asm/bsaes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
-bsaes-armv7.o: bsaes-armv7.S
-
-files:
-	$(PERL) $(TOP)/util/files.pl "AES_ENC=$(AES_ENC)" Makefile >> $(TOP)/MINFO
-
-links:
-	@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
-	@$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
-	@$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
-
-install:
-	@[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
-	@headerlist="$(EXHEADER)"; for i in $$headerlist ; \
-	do  \
-	(cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
-	chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
-	done;
-
-tags:
-	ctags $(SRC)
-
-tests:
-
-lint:
-	lint -DLINT $(INCLUDES) $(SRC)>fluff
-
-update: depend
-
-depend:
-	@[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
-	$(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
-
-dclean:
-	$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
-	mv -f Makefile.new $(MAKEFILE)
-
-clean:
-	rm -f *.s *.S *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
-
-# DO NOT DELETE THIS LINE -- make depend depends on it.
-
-aes_cbc.o: ../../include/openssl/aes.h ../../include/openssl/modes.h
-aes_cbc.o: ../../include/openssl/opensslconf.h aes_cbc.c
-aes_cfb.o: ../../include/openssl/aes.h ../../include/openssl/modes.h
-aes_cfb.o: ../../include/openssl/opensslconf.h aes_cfb.c
-aes_core.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
-aes_core.o: ../../include/openssl/opensslconf.h aes_core.c aes_locl.h
-aes_ctr.o: ../../include/openssl/aes.h ../../include/openssl/modes.h
-aes_ctr.o: ../../include/openssl/opensslconf.h aes_ctr.c
-aes_ecb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
-aes_ecb.o: ../../include/openssl/opensslconf.h aes_ecb.c aes_locl.h
-aes_ige.o: ../../e_os.h ../../include/openssl/aes.h ../../include/openssl/bio.h
-aes_ige.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-aes_ige.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-aes_ige.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-aes_ige.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-aes_ige.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-aes_ige.o: ../../include/openssl/symhacks.h ../cryptlib.h aes_ige.c aes_locl.h
-aes_misc.o: ../../include/openssl/aes.h ../../include/openssl/crypto.h
-aes_misc.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h
-aes_misc.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-aes_misc.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-aes_misc.o: ../../include/openssl/symhacks.h aes_locl.h aes_misc.c
-aes_ofb.o: ../../include/openssl/aes.h ../../include/openssl/modes.h
-aes_ofb.o: ../../include/openssl/opensslconf.h aes_ofb.c
-aes_wrap.o: ../../e_os.h ../../include/openssl/aes.h
-aes_wrap.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h
-aes_wrap.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-aes_wrap.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
-aes_wrap.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
-aes_wrap.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-aes_wrap.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-aes_wrap.o: ../../include/openssl/symhacks.h ../cryptlib.h aes_wrap.c
diff --git a/deps/openssl/openssl/crypto/aes/README b/deps/openssl/openssl/crypto/aes/README
deleted file mode 100644
index 0f9620a80e..0000000000
--- a/deps/openssl/openssl/crypto/aes/README
+++ /dev/null
@@ -1,3 +0,0 @@
-This is an OpenSSL-compatible version of AES (also called Rijndael).
-aes_core.c is basically the same as rijndael-alg-fst.c but with an
-API that looks like the rest of the OpenSSL symmetric cipher suite.
diff --git a/deps/openssl/openssl/crypto/aes/aes.h b/deps/openssl/openssl/crypto/aes/aes.h
deleted file mode 100644
index faa66c4914..0000000000
--- a/deps/openssl/openssl/crypto/aes/aes.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* crypto/aes/aes.h */
-/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#ifndef HEADER_AES_H
-# define HEADER_AES_H
-
-# include <openssl/opensslconf.h>
-
-# ifdef OPENSSL_NO_AES
-#  error AES is disabled.
-# endif
-
-# include <stddef.h>
-
-# define AES_ENCRYPT 1
-# define AES_DECRYPT 0
-
-/*
- * Because array size can't be a const in C, the following two are macros.
- * Both sizes are in bytes.
- */
-# define AES_MAXNR 14
-# define AES_BLOCK_SIZE 16
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-/* This should be a hidden type, but EVP requires that the size be known */
-struct aes_key_st {
-# ifdef AES_LONG
-    unsigned long rd_key[4 * (AES_MAXNR + 1)];
-# else
-    unsigned int rd_key[4 * (AES_MAXNR + 1)];
-# endif
-    int rounds;
-};
-typedef struct aes_key_st AES_KEY;
-
-const char *AES_options(void);
-
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-                        AES_KEY *key);
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-                        AES_KEY *key);
-
-int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key);
-int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key);
-
-void AES_encrypt(const unsigned char *in, unsigned char *out,
-                 const AES_KEY *key);
-void AES_decrypt(const unsigned char *in, unsigned char *out,
-                 const AES_KEY *key);
-
-void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
-                     const AES_KEY *key, const int enc);
-void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
-                     size_t length, const AES_KEY *key,
-                     unsigned char *ivec, const int enc);
-void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        unsigned char *ivec, int *num, const int enc);
-void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
-                      size_t length, const AES_KEY *key,
-                      unsigned char *ivec, int *num, const int enc);
-void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
-                      size_t length, const AES_KEY *key,
-                      unsigned char *ivec, int *num, const int enc);
-void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        unsigned char *ivec, int *num);
-void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        unsigned char ivec[AES_BLOCK_SIZE],
-                        unsigned char ecount_buf[AES_BLOCK_SIZE],
-                        unsigned int *num);
-/* NB: the IV is _two_ blocks long */
-void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
-                     size_t length, const AES_KEY *key,
-                     unsigned char *ivec, const int enc);
-/* NB: the IV is _four_ blocks long */
-void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        const AES_KEY *key2, const unsigned char *ivec,
-                        const int enc);
-
-int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
-                 unsigned char *out,
-                 const unsigned char *in, unsigned int inlen);
-int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
-                   unsigned char *out,
-                   const unsigned char *in, unsigned int inlen);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* !HEADER_AES_H */
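The deleted file is the 1.0.2 copy of the public header; the same declarations live under include/openssl/aes.h in 1.1.0. For orientation, a minimal sketch of the core API it declares: expand a key schedule, then encrypt and decrypt one 16-byte block (key and plaintext bytes here are made-up illustration values):

    /* Single-block AES-128 roundtrip using the API declared above. */
    #include <stdio.h>
    #include <string.h>
    #include <openssl/aes.h>

    int main(void)
    {
        static const unsigned char key[16] = "0123456789abcdef";
        unsigned char in[AES_BLOCK_SIZE] = "hello, aes block";   /* exactly 16 bytes */
        unsigned char enc[AES_BLOCK_SIZE], dec[AES_BLOCK_SIZE];
        AES_KEY eks, dks;

        /* Both setters return 0 on success and a negative value on error. */
        if (AES_set_encrypt_key(key, 128, &eks) < 0 ||
            AES_set_decrypt_key(key, 128, &dks) < 0)
            return 1;

        AES_encrypt(in, enc, &eks);   /* one block, no chaining */
        AES_decrypt(enc, dec, &dks);

        printf("roundtrip %s\n", memcmp(in, dec, sizeof(in)) == 0 ? "ok" : "FAILED");
        return 0;
    }

AES_encrypt()/AES_decrypt() process exactly one block; the mode wrappers in the files below build everything else on top of them.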
diff --git a/deps/openssl/openssl/crypto/aes/aes_cbc.c b/deps/openssl/openssl/crypto/aes/aes_cbc.c
index 805d0e260a..342841fc4f 100644
--- a/deps/openssl/openssl/crypto/aes/aes_cbc.c
+++ b/deps/openssl/openssl/crypto/aes/aes_cbc.c
@@ -1,52 +1,10 @@
-/* crypto/aes/aes_cbc.c */
-/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
 #include <openssl/aes.h>
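aes_cbc.c itself only swaps license headers; the routine it provides, AES_cbc_encrypt(), is declared in the header above. A sketch of its use (buffer contents and key are illustrative) — note the function updates ivec in place, so keep a copy of the IV for the decrypt pass:

    /* CBC encrypt-then-decrypt sketch; len must be a multiple of 16 here. */
    #include <string.h>
    #include <openssl/aes.h>

    void cbc_roundtrip(const unsigned char key[16],
                       const unsigned char iv[AES_BLOCK_SIZE],
                       const unsigned char *pt, unsigned char *ct,
                       unsigned char *out, size_t len)
    {
        AES_KEY eks, dks;
        unsigned char ivec[AES_BLOCK_SIZE];

        AES_set_encrypt_key(key, 128, &eks);
        AES_set_decrypt_key(key, 128, &dks);

        memcpy(ivec, iv, sizeof(ivec));   /* AES_cbc_encrypt clobbers ivec */
        AES_cbc_encrypt(pt, ct, len, &eks, ivec, AES_ENCRYPT);

        memcpy(ivec, iv, sizeof(ivec));   /* fresh IV for the decrypt pass */
        AES_cbc_encrypt(ct, out, len, &dks, ivec, AES_DECRYPT);
    }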
diff --git a/deps/openssl/openssl/crypto/aes/aes_cfb.c b/deps/openssl/openssl/crypto/aes/aes_cfb.c
index 1225000963..f010e3c4ea 100644
--- a/deps/openssl/openssl/crypto/aes/aes_cfb.c
+++ b/deps/openssl/openssl/crypto/aes/aes_cfb.c
@@ -1,52 +1,10 @@
-/* crypto/aes/aes_cfb.c */
-/* ====================================================================
- * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
 #include <openssl/aes.h>
diff --git a/deps/openssl/openssl/crypto/aes/aes_core.c b/deps/openssl/openssl/crypto/aes/aes_core.c
index 7019b5d7aa..bd5c7793be 100644
--- a/deps/openssl/openssl/crypto/aes/aes_core.c
+++ b/deps/openssl/openssl/crypto/aes/aes_core.c
@@ -1,4 +1,12 @@
-/* crypto/aes/aes_core.c */
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
 /**
  * rijndael-alg-fst.c
  *
@@ -28,14 +36,10 @@
 /* Note: rewritten a little bit to provide error control and an OpenSSL-
    compatible API */
 
-#ifndef AES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
 #include <assert.h>
 
 #include <stdlib.h>
+#include <openssl/crypto.h>
 #include <openssl/aes.h>
 #include "aes_locl.h"
 
@@ -625,8 +629,8 @@ static const u32 rcon[] = {
 /**
  * Expand the cipher key into the encryption key schedule.
  */
-int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key)
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
 {
     u32 *rk;
@@ -640,9 +644,9 @@ int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 
     rk = key->rd_key;
 
-    if (bits==128)
+    if (bits == 128)
         key->rounds = 10;
-    else if (bits==192)
+    else if (bits == 192)
         key->rounds = 12;
     else
         key->rounds = 14;
@@ -727,8 +731,8 @@ int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 /**
  * Expand the cipher key into the decryption key schedule.
  */
-int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key)
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
 {
     u32 *rk;
@@ -736,7 +740,7 @@ int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
     u32 temp;
 
     /* first, start with an encryption schedule */
-    status = private_AES_set_encrypt_key(userKey, bits, key);
+    status = AES_set_encrypt_key(userKey, bits, key);
     if (status < 0)
         return status;
 
@@ -1204,11 +1208,11 @@ static const u32 rcon[] = {
 /**
  * Expand the cipher key into the encryption key schedule.
  */
-int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key)
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
 {
     u32 *rk;
-    int i = 0;
+    int i = 0;
     u32 temp;
 
     if (!userKey || !key)
@@ -1218,9 +1222,9 @@ int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 
     rk = key->rd_key;
 
-    if (bits==128)
+    if (bits == 128)
         key->rounds = 10;
-    else if (bits==192)
+    else if (bits == 192)
         key->rounds = 12;
     else
         key->rounds = 14;
@@ -1305,8 +1309,8 @@ int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 /**
  * Expand the cipher key into the decryption key schedule.
  */
-int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-                                AES_KEY *key)
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
 {
     u32 *rk;
@@ -1314,7 +1318,7 @@ int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
     u32 temp;
 
     /* first, start with an encryption schedule */
-    status = private_AES_set_encrypt_key(userKey, bits, key);
+    status = AES_set_encrypt_key(userKey, bits, key);
     if (status < 0)
         return status;
 
@@ -1351,7 +1355,7 @@ int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
             rk[j] = tpe ^ ROTATE(tpd,16) ^
                 ROTATE(tp9,24) ^ ROTATE(tpb,8);
 #else
-            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
                 (tp9 >> 8) ^ (tp9 << 24) ^
                 (tpb >> 24) ^ (tpb << 8);
 #endif
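The substantive change in aes_core.c is the removal of the private_AES_set_*_key indirection: with the FIPS wrappers gone (see aes_misc.c below), the key-schedule routines are exported under their public names directly. Code or assembly that referenced the old private_ symbols must follow the rename; a hypothetical source-level shim (not part of OpenSSL) for old callers would be:

    /* Hypothetical compatibility shim: lets source written against the
     * 1.0.2 private_ entry points build against 1.1.0, where only the
     * AES_ names exist. */
    #include <openssl/aes.h>

    #define private_AES_set_encrypt_key AES_set_encrypt_key
    #define private_AES_set_decrypt_key AES_set_decrypt_key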
diff --git a/deps/openssl/openssl/crypto/aes/aes_ctr.c b/deps/openssl/openssl/crypto/aes/aes_ctr.c
deleted file mode 100644
index 9e760c4b12..0000000000
--- a/deps/openssl/openssl/crypto/aes/aes_ctr.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/* crypto/aes/aes_ctr.c */
-/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/aes.h>
-#include <openssl/modes.h>
-
-void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-                        size_t length, const AES_KEY *key,
-                        unsigned char ivec[AES_BLOCK_SIZE],
-                        unsigned char ecount_buf[AES_BLOCK_SIZE],
-                        unsigned int *num)
-{
-    CRYPTO_ctr128_encrypt(in, out, length, key, ivec, ecount_buf, num,
-                          (block128_f) AES_encrypt);
-}
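The deleted AES_ctr128_encrypt() wrapper was a single call into the generic modes code, so low-level callers can reproduce it verbatim in their own code (the EVP interface is the supported route in 1.1.0). A sketch mirroring the removed body above:

    /* Equivalent of the removed AES_ctr128_encrypt(), written as caller
     * code against the generic modes API. */
    #include <openssl/aes.h>
    #include <openssl/modes.h>

    void ctr128_encrypt(const unsigned char *in, unsigned char *out,
                        size_t length, const AES_KEY *key,
                        unsigned char ivec[16], unsigned char ecount_buf[16],
                        unsigned int *num)
    {
        CRYPTO_ctr128_encrypt(in, out, length, key, ivec, ecount_buf, num,
                              (block128_f)AES_encrypt);
    }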
diff --git a/deps/openssl/openssl/crypto/aes/aes_ecb.c b/deps/openssl/openssl/crypto/aes/aes_ecb.c
index 52151a5c70..29bfc1ad66 100644
--- a/deps/openssl/openssl/crypto/aes/aes_ecb.c
+++ b/deps/openssl/openssl/crypto/aes/aes_ecb.c
@@ -1,59 +1,12 @@
-/* crypto/aes/aes_ecb.c */
-/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
-#ifndef AES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
 #include <assert.h>
 
 #include <openssl/aes.h>
diff --git a/deps/openssl/openssl/crypto/aes/aes_ige.c b/deps/openssl/openssl/crypto/aes/aes_ige.c
index 8f2b770647..75f796cf3b 100644
--- a/deps/openssl/openssl/crypto/aes/aes_ige.c
+++ b/deps/openssl/openssl/crypto/aes/aes_ige.c
@@ -1,55 +1,13 @@
-/* crypto/aes/aes_ige.c */
-/* ====================================================================
- * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
-#include "cryptlib.h"
+#include "internal/cryptlib.h"
 #include <openssl/aes.h>
 #include "aes_locl.h"
 
@@ -83,6 +41,9 @@ void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
     size_t n;
     size_t len = length;
 
+    if (length == 0)
+        return;
+
     OPENSSL_assert(in && out && key && ivec);
     OPENSSL_assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
     OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== +/* + * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ #ifndef HEADER_AES_LOCL_H # define HEADER_AES_LOCL_H # include <openssl/e_os2.h> - -# ifdef OPENSSL_NO_AES -# error AES is disabled. -# endif - # include <stdio.h> # include <stdlib.h> # include <string.h> diff --git a/deps/openssl/openssl/crypto/aes/aes_misc.c b/deps/openssl/openssl/crypto/aes/aes_misc.c index fafad4d6f5..7403c84f82 100644 --- a/deps/openssl/openssl/crypto/aes/aes_misc.c +++ b/deps/openssl/openssl/crypto/aes/aes_misc.c @@ -1,61 +1,16 @@ -/* crypto/aes/aes_misc.c */ -/* ==================================================================== - * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. 
Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== +/* + * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ #include <openssl/opensslv.h> -#include <openssl/crypto.h> #include <openssl/aes.h> #include "aes_locl.h" -const char AES_version[] = "AES" OPENSSL_VERSION_PTEXT; - const char *AES_options(void) { #ifdef FULL_UNROLL @@ -64,23 +19,3 @@ const char *AES_options(void) return "aes(partial)"; #endif } - -/* FIPS wrapper functions to block low level AES calls in FIPS mode */ - -int AES_set_encrypt_key(const unsigned char *userKey, const int bits, - AES_KEY *key) -{ -#ifdef OPENSSL_FIPS - fips_cipher_abort(AES); -#endif - return private_AES_set_encrypt_key(userKey, bits, key); -} - -int AES_set_decrypt_key(const unsigned char *userKey, const int bits, - AES_KEY *key) -{ -#ifdef OPENSSL_FIPS - fips_cipher_abort(AES); -#endif - return private_AES_set_decrypt_key(userKey, bits, key); -} diff --git a/deps/openssl/openssl/crypto/aes/aes_ofb.c b/deps/openssl/openssl/crypto/aes/aes_ofb.c index 64a08caaec..215b53858e 100644 --- a/deps/openssl/openssl/crypto/aes/aes_ofb.c +++ b/deps/openssl/openssl/crypto/aes/aes_ofb.c @@ -1,52 +1,10 @@ -/* crypto/aes/aes_ofb.c */ -/* ==================================================================== - * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. 
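With AES_version and the FIPS wrappers removed, aes_misc.c is reduced to AES_options(), which reports the compile-time loop-unrolling choice — "aes(partial)" in the default build, as the hunk above shows. A trivial sketch:

    /* Print the AES build variant string. */
    #include <stdio.h>
    #include <openssl/aes.h>

    int main(void)
    {
        printf("%s\n", AES_options());   /* e.g. "aes(partial)" */
        return 0;
    }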
diff --git a/deps/openssl/openssl/crypto/aes/aes_ofb.c b/deps/openssl/openssl/crypto/aes/aes_ofb.c
index 64a08caaec..215b53858e 100644
--- a/deps/openssl/openssl/crypto/aes/aes_ofb.c
+++ b/deps/openssl/openssl/crypto/aes/aes_ofb.c
@@ -1,52 +1,10 @@
-/* crypto/aes/aes_ofb.c */
-/* ====================================================================
- * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
 #include <openssl/aes.h>
diff --git a/deps/openssl/openssl/crypto/aes/aes_wrap.c b/deps/openssl/openssl/crypto/aes/aes_wrap.c
index b7b64d57a4..cae0b21229 100644
--- a/deps/openssl/openssl/crypto/aes/aes_wrap.c
+++ b/deps/openssl/openssl/crypto/aes/aes_wrap.c
@@ -1,58 +1,13 @@
-/* crypto/aes/aes_wrap.c */
 /*
- * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
- * project.
- */
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  */
 
-#include "cryptlib.h"
+#include "internal/cryptlib.h"
 #include <openssl/aes.h>
 #include <openssl/modes.h>
diff --git a/deps/openssl/openssl/crypto/aes/aes_x86core.c b/deps/openssl/openssl/crypto/aes/aes_x86core.c
index b5dd697677..95b49bbabc 100644
--- a/deps/openssl/openssl/crypto/aes/aes_x86core.c
+++ b/deps/openssl/openssl/crypto/aes/aes_x86core.c
@@ -1,4 +1,12 @@
-/* crypto/aes/aes_core.c */
+/*
+ * Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
 /**
  * rijndael-alg-fst.c
  *
@@ -35,11 +43,6 @@
  */
 
-#ifndef AES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
 #include <assert.h>
 
 #include <stdlib.h>
@@ -618,7 +621,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
             rk[j] = tpe ^ ROTATE(tpd,16) ^
                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
 #else
-            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
                 (tp9 >> 24) ^ (tp9 << 8) ^
                 (tpb >> 8) ^ (tpb << 24);
 #endif
@@ -907,7 +910,7 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
         (u32)Td4[(s1 >> 16) & 0xff] << 16 ^
         (u32)Td4[(s0 >> 24)       ] << 24;
 
-    /* now do the linear transform using words */
+    /* now do the linear transform using words */
     {
         int i;
         u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
@@ -931,7 +934,7 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
             t[i] = tpe ^ ROTATE(tpd,16) ^
                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
 #else
-            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
                 (tp9 >> 24) ^ (tp9 << 8) ^
                 (tpb >> 8) ^ (tpb << 24);
 #endif
@@ -984,7 +987,7 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
         (u32)Td4[(s1 >> 16) & 0xff] << 16 ^
         (u32)Td4[(s0 >> 24)       ] << 24;
 
-    /* now do the linear transform using words */
+    /* now do the linear transform using words */
     {
         int i;
         u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
@@ -1008,7 +1011,7 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
             t[i] = tpe ^ ROTATE(tpd,16) ^
                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
 #else
-            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
                 (tp9 >> 24) ^ (tp9 << 8) ^
                 (tpb >> 8) ^ (tpb << 24);
 #endif
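The repeated aes_x86core.c hunks above only strip trailing whitespace, but they pass through the spot where the decryption key schedule applies the inverse MixColumns either with ROTATE() or with paired shifts. The two #if branches agree because, for 32-bit words, XORing the two disjoint shift halves is exactly a rotation — a self-contained check:

    /* Demonstrates the shift/rotate identity used in the hunks above. */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned n)   /* 0 < n < 32 */
    {
        return (x << n) | (x >> (32 - n));
    }

    int main(void)
    {
        uint32_t x = 0x01234567u;

        /* ROTATE(tpd,16) vs (tpd >> 16) ^ (tpd << 16) */
        assert(rotl32(x, 16) == ((x >> 16) ^ (x << 16)));
        /* ROTATE(tp9,8)  vs (tp9 >> 24) ^ (tp9 << 8)  */
        assert(rotl32(x, 8)  == ((x >> 24) ^ (x << 8)));
        /* ROTATE(tpb,24) vs (tpb >> 8)  ^ (tpb << 24) */
        assert(rotl32(x, 24) == ((x >> 8)  ^ (x << 24)));
        return 0;
    }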
diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-586.pl b/deps/openssl/openssl/crypto/aes/asm/aes-586.pl
index 60286ecb96..1ba356508a 100755
--- a/deps/openssl/openssl/crypto/aes/asm/aes-586.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/aes-586.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -191,6 +198,10 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output = pop;
+open OUT,">$output";
+*STDOUT=*OUT;
+
 &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
 
 &static_label("AES_Te");
 &static_label("AES_Td");
@@ -2861,12 +2872,12 @@ sub enckey()
 	&set_label("exit");
 &function_end("_x86_AES_set_encrypt_key");
 
-# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
-&function_begin_B("private_AES_set_encrypt_key");
+&function_begin_B("AES_set_encrypt_key");
 	&call	("_x86_AES_set_encrypt_key");
 	&ret	();
-&function_end_B("private_AES_set_encrypt_key");
+&function_end_B("AES_set_encrypt_key");
 
 sub deckey()
 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2923,9 +2934,9 @@ sub deckey()
 	&mov	(&DWP(4*$i,$key),$tp1);
 }
 
-# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
-&function_begin_B("private_AES_set_decrypt_key");
+&function_begin_B("AES_set_decrypt_key");
 	&call	("_x86_AES_set_encrypt_key");
 	&cmp	("eax",0);
 	&je	(&label("proceed"));
@@ -2981,7 +2992,9 @@ sub deckey()
 	&jb	(&label("permute"));
 
 	&xor	("eax","eax");			# return success
-&function_end("private_AES_set_decrypt_key");
+&function_end("AES_set_decrypt_key");
 
 &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
 &asm_finish();
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-armv4.pl b/deps/openssl/openssl/crypto/aes/asm/aes-armv4.pl
index c1b5e352d7..998158998e 100644
--- a/deps/openssl/openssl/crypto/aes/asm/aes-armv4.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/aes-armv4.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -32,8 +39,20 @@
 # Profiler-assisted and platform-specific optimization resulted in 16%
 # improvement on Cortex A8 core and ~21.5 cycles per byte.
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $s0="r0";
 $s1="r1";
@@ -58,15 +77,12 @@ $code=<<___;
 #endif
 
 .text
-#if __ARM_ARCH__<7
-.code	32
-#else
+#if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
-# ifdef __thumb2__
 .thumb
-# else
+#else
 .code	32
-# endif
+#undef __thumb2__
 #endif
 
 .type	AES_Te,%object
@@ -181,15 +197,19 @@ AES_Te:
 .type	AES_encrypt,%function
 .align	5
 AES_encrypt:
-#if __ARM_ARCH__<7
+#ifndef	__thumb2__
 	sub	r3,pc,#8		@ AES_encrypt
 #else
 	adr	r3,.
 #endif
 	stmdb	sp!,{r1,r4-r12,lr}
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	$tbl,AES_Te
+#else
+	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
+#endif
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
-	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
@@ -422,24 +442,24 @@ _armv4_AES_encrypt:
 	ldr	pc,[sp],#4		@ pop and return
 .size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
 
-.global private_AES_set_encrypt_key
-.type   private_AES_set_encrypt_key,%function
+.global AES_set_encrypt_key
+.type   AES_set_encrypt_key,%function
 .align	5
-private_AES_set_encrypt_key:
+AES_set_encrypt_key:
 _armv4_AES_set_encrypt_key:
-#if __ARM_ARCH__<7
+#ifndef	__thumb2__
 	sub	r3,pc,#8		@ AES_set_encrypt_key
 #else
 	adr	r3,.
 #endif
 	teq	r0,#0
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	moveq	r0,#-1
 	beq	.Labrt
 	teq	r2,#0
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	moveq	r0,#-1
@@ -450,19 +470,23 @@ _armv4_AES_set_encrypt_key:
 	teq	r1,#192
 	beq	.Lok
 	teq	r1,#256
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	ne			@ Thumb2 thing, sanity check in ARM
 #endif
 	movne	r0,#-1
 	bne	.Labrt
 
 .Lok:	stmdb	sp!,{r4-r12,lr}
-	sub	$tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
-
 	mov	$rounds,r0		@ inp
 	mov	lr,r1			@ bits
 	mov	$key,r2			@ key
 
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	$tbl,AES_Te+1024			@ Te4
+#else
+	sub	$tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
+#endif
+
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
@@ -607,7 +631,7 @@ _armv4_AES_set_encrypt_key:
 	str	$s2,[$key,#-16]
 	subs	$rounds,$rounds,#1
 	str	$s3,[$key,#-12]
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	subeq	r2,$key,#216
@@ -679,7 +703,7 @@ _armv4_AES_set_encrypt_key:
 	str	$s2,[$key,#-24]
 	subs	$rounds,$rounds,#1
 	str	$s3,[$key,#-20]
-#if __ARM_ARCH__>=7
+#ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	subeq	r2,$key,#256
@@ -722,12 +746,12 @@ _armv4_AES_set_encrypt_key:
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+.size	AES_set_encrypt_key,.-AES_set_encrypt_key
 
-.global private_AES_set_decrypt_key
-.type   private_AES_set_decrypt_key,%function
+.global AES_set_decrypt_key
+.type   AES_set_decrypt_key,%function
 .align	5
-private_AES_set_decrypt_key:
+AES_set_decrypt_key:
 	str	lr,[sp,#-4]!            @ push lr
 	bl	_armv4_AES_set_encrypt_key
 	teq	r0,#0
@@ -737,7 +761,7 @@ private_AES_set_decrypt_key:
 	mov	r0,r2			@ AES_set_encrypt_key preserves r2,
 	mov	r1,r2			@ which is AES_KEY *key
 	b	_armv4_AES_set_enc2dec_key
-.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.size	AES_set_decrypt_key,.-AES_set_decrypt_key
 
 @ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
 .global	AES_set_enc2dec_key
@@ -750,7 +774,7 @@ _armv4_AES_set_enc2dec_key:
 	ldr	$rounds,[r0,#240]
 	mov	$i1,r0			@ input
 	add	$i2,r0,$rounds,lsl#4
-	mov	$key,r1			@ ouput
+	mov	$key,r1			@ output
 	add	$tbl,r1,$rounds,lsl#4
 	str	$rounds,[r1,#240]
 
@@ -949,15 +973,19 @@ AES_Td:
 .type	AES_decrypt,%function
 .align	5
 AES_decrypt:
-#if __ARM_ARCH__<7
+#ifndef	__thumb2__
 	sub	r3,pc,#8		@ AES_decrypt
 #else
 	adr	r3,.
 #endif
 	stmdb	sp!,{r1,r4-r12,lr}
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	$tbl,AES_Td
+#else
+	sub	$tbl,r3,#AES_decrypt-AES_Td	@ Td
+#endif
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
-	sub	$tbl,r3,#AES_decrypt-AES_Td	@ Td
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-c64xplus.pl b/deps/openssl/openssl/crypto/aes/asm/aes-c64xplus.pl
new file mode 100644
index 0000000000..19d2cc176f
--- /dev/null
+++ b/deps/openssl/openssl/crypto/aes/asm/aes-c64xplus.pl
@@ -0,0 +1,1382 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x+.
+#
+# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned to be ~10 cycles per byte. Discrepancy
+# must be caused by limitations of L1D memory banking(*), see SPRU871
+# TI publication for further details. If any consolation it's still
+# ~20% faster than TI's linear assembly module anyway... Compared to
+# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this
+# code is 3.75x faster and almost 3x smaller (tables included).
+#
+# (*) This means that there might be subtle correlation between data
+#     and timing and one can wonder if it can be ... attacked:-(
+#     On the other hand this also means that *if* one chooses to
+#     implement *4* T-tables variant [instead of 1 T-table as in
+#     this implementation, or in addition to], then one ought to
+#     *interleave* them. Even though it complicates addressing,
+#     references to interleaved tables would be guaranteed not to
+#     clash. I reckon that it should be possible to break 8 cycles
+#     per byte "barrier," i.e. improve by ~20%, naturally at the
+#     cost of 8x increased pressure on L1D. 8x because you'd have
+#     to interleave both Te and Td tables...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($TEA,$TEB)=("A5","B5");
+($KPA,$KPB)=("A3","B1");
+@K=("A6","B6","A7","B7");
+@s=("A8","B8","A9","B9");
+@Te0=@Td0=("A16","B16","A17","B17");
+@Te1=@Td1=("A18","B18","A19","B19");
+@Te2=@Td2=("A20","B20","A21","B21");
+@Te3=@Td3=("A22","B22","A23","B23");
+
+$code=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	AES_encrypt,_AES_encrypt
+	.asg	AES_decrypt,_AES_decrypt
+	.asg	AES_set_encrypt_key,_AES_set_encrypt_key
+	.asg	AES_set_decrypt_key,_AES_set_decrypt_key
+	.asg	AES_ctr32_encrypt,_AES_ctr32_encrypt
+	.endif
+
+	.asg	B3,RA
+	.asg	A4,INP
+	.asg	B4,OUT
+	.asg	A6,KEY
+	.asg	A4,RET
+	.asg	B15,SP
+
+	.eval	24,EXT0
+	.eval	16,EXT1
+	.eval	8,EXT2
+	.eval	0,EXT3
+	.eval	8,TBL1
+	.eval	16,TBL2
+	.eval	24,TBL3
+
+	.if	.BIG_ENDIAN
+	.eval	24-EXT0,EXT0
+	.eval	24-EXT1,EXT1
+	.eval	24-EXT2,EXT2
+	.eval	24-EXT3,EXT3
+	.eval	32-TBL1,TBL1
+	.eval	32-TBL2,TBL2
+	.eval	32-TBL3,TBL3
+	.endif
+
+	.global	_AES_encrypt
+_AES_encrypt:
+	.asmfunc
+	MVK	1,B2
+__encrypt:
+	.if	__TI_EABI__
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||	ADDKPC	__encrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.else
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	(AES_Te-__encrypt),$TEA
+||	ADDKPC	__encrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	(AES_Te-__encrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.endif
+	LDW	*$KPA++[2],$Te0[0]		; zero round key
+||	LDW	*$KPB++[2],$Te0[1]
+||	MVK	60,A0
+||	ADD	B0,$TEA,$TEA			; AES_Te
+	LDW	*KEY[A0],B0			; rounds
+||	MVK	1024,A0				; sizeof(AES_Te)
+	LDW	*$KPA++[2],$Te0[2]
+||	LDW	*$KPB++[2],$Te0[3]
+||	MV	$TEA,$TEB
+	NOP
+	.if	.BIG_ENDIAN
+	MV	A9,$s[0]
+||	MV	A8,$s[1]
+||	MV	B9,$s[2]
+||	MV	B8,$s[3]
+	.else
+	MV	A8,$s[0]
+||	MV	A9,$s[1]
+||	MV	B8,$s[2]
+||	MV	B9,$s[3]
+	.endif
+	XOR	$Te0[0],$s[0],$s[0]
+||	XOR	$Te0[1],$s[1],$s[1]
+||	LDW	*$KPA++[2],$K[0]		; 1st round key
+||	LDW	*$KPB++[2],$K[1]
+	SUB	B0,2,B0
+
+	SPLOOPD	13
+||	MVC	B0,ILC
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+;;====================================================================
+	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[0],EXT3,24,$Te3[0]
+	LDW	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],   t0
+||	LDW	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],  t1
+||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[1],EXT3,24,$Te3[1]
+||	EXTU	$s[0],EXT1,24,$Te1[0]
+	LDW	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],  t2
+||	LDW	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],   t3
+||	EXTU	$s[2],EXT2,24,$Te2[2]
+||	EXTU	$s[3],EXT2,24,$Te2[3]
+	LDW	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],  t0
+||	LDW	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],  t1
+||	EXTU	$s[3],EXT3,24,$Te3[3]
+||	EXTU	$s[2],EXT1,24,$Te1[2]
+	LDW	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],  t0
+||	LDW	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],   t1
+||	EXTU	$s[0],EXT2,24,$Te2[0]
+||	EXTU	$s[1],EXT2,24,$Te2[1]
+	LDW	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],  t2
+||	LDW	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],  t3
+||	EXTU	$s[3],EXT1,24,$Te1[3]
+||	EXTU	$s[2],EXT3,24,$Te3[2]
+	LDW	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],   t2
+||	LDW	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],  t3
+||	ROTL	$Te1[1],TBL1,$Te3[0]		; t0
+||	ROTL	$Te3[0],TBL3,$Te1[1]		; t1
+||	EXTU	$s[0],EXT0,24,$Te0[0]
+||	EXTU	$s[1],EXT0,24,$Te0[1]
+	LDW	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],      t0
+||	LDW	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],      t1
+||	ROTL	$Te3[1],TBL3,$Te1[0]		; t2
+||	ROTL	$Te1[0],TBL1,$Te3[1]		; t3
+||	EXTU	$s[2],EXT0,24,$Te0[2]
+||	EXTU	$s[3],EXT0,24,$Te0[3]
+	LDW	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],      t2
+||	LDW	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],      t3
+||	ROTL	$Te2[2],TBL2,$Te2[2]		; t0
+||	ROTL	$Te2[3],TBL2,$Te2[3]		; t1
+||	XOR	$K[0],$Te3[0],$s[0]
+||	XOR	$K[1],$Te1[1],$s[1]
+	ROTL	$Te3[3],TBL3,$Te1[2]		; t0
+||	ROTL	$Te1[2],TBL1,$Te3[3]		; t1
+||	XOR	$K[2],$Te1[0],$s[2]
+||	XOR	$K[3],$Te3[1],$s[3]
+||	LDW	*$KPA++[2],$K[0]		; next round key
+||	LDW	*$KPB++[2],$K[1]
+	ROTL	$Te2[0],TBL2,$Te2[0]		; t2
+||	ROTL	$Te2[1],TBL2,$Te2[1]		; t3
+||	XOR	$s[0],$Te2[2],$s[0]
+||	XOR	$s[1],$Te2[3],$s[1]
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+	ROTL	$Te1[3],TBL1,$Te3[2]		; t2
+||	ROTL	$Te3[2],TBL3,$Te1[3]		; t3
+||	XOR	$s[0],$Te1[2],$s[0]
+||	XOR	$s[1],$Te3[3],$s[1]
+	XOR	$s[2],$Te2[0],$s[2]
+||	XOR	$s[3],$Te2[1],$s[3]
+||	XOR	$s[0],$Te0[0],$s[0]
+||	XOR	$s[1],$Te0[1],$s[1]
+	SPKERNEL
+||	XOR.L	$s[2],$Te3[2],$s[2]
+||	XOR.L	$s[3],$Te1[3],$s[3]
+;;====================================================================
+	ADD.D	${TEA},A0,${TEA}		; point to Te4
+||	ADD.D	${TEB},A0,${TEB}
+||	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[0],EXT3,24,$Te3[0]
+	LDBU	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],   t0
+||	LDBU	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],  t1
+||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[0],EXT0,24,$Te0[0]
+||	EXTU	$s[1],EXT0,24,$Te0[1]
+	LDBU	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],      t0
+||	LDBU	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],      t1
+||	EXTU	$s[3],EXT3,24,$Te3[3]
+||	EXTU	$s[2],EXT1,24,$Te1[2]
+	LDBU	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],  t0
+||	LDBU	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],   t1
+||	EXTU	$s[2],EXT2,24,$Te2[2]
+||	EXTU	$s[3],EXT2,24,$Te2[3]
+	LDBU	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],  t0
+||	LDBU	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],  t1
+||	EXTU	$s[1],EXT3,24,$Te3[1]
+||	EXTU	$s[0],EXT1,24,$Te1[0]
+	LDBU	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],  t2
+||	LDBU	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],   t3
+||	EXTU	$s[3],EXT1,24,$Te1[3]
+||	EXTU	$s[2],EXT3,24,$Te3[2]
+	LDBU	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],   t2
+||	LDBU	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],  t3
+||	EXTU	$s[2],EXT0,24,$Te0[2]
+||	EXTU	$s[3],EXT0,24,$Te0[3]
+	LDBU	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],      t2
+||	LDBU	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],      t3
+||	EXTU	$s[0],EXT2,24,$Te2[0]
+||	EXTU	$s[1],EXT2,24,$Te2[1]
+	LDBU	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],  t2
+||	LDBU	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],  t3
+
+	.if	.BIG_ENDIAN
+	PACK2	$Te0[0],$Te1[1],$Te0[0]
+||	PACK2	$Te0[1],$Te1[2],$Te0[1]
+	PACK2	$Te2[2],$Te3[3],$Te2[2]
+||	PACK2	$Te2[3],$Te3[0],$Te2[3]
+	PACKL4
$Te0[0],$Te2[2],$Te0[0] +|| PACKL4 $Te0[1],$Te2[3],$Te0[1] + XOR $K[0],$Te0[0],$Te0[0] ; s[0] +|| XOR $K[1],$Te0[1],$Te0[1] ; s[1] + + PACK2 $Te0[2],$Te1[3],$Te0[2] +|| PACK2 $Te0[3],$Te1[0],$Te0[3] + PACK2 $Te2[0],$Te3[1],$Te2[0] +|| PACK2 $Te2[1],$Te3[2],$Te2[1] +|| BNOP RA + PACKL4 $Te0[2],$Te2[0],$Te0[2] +|| PACKL4 $Te0[3],$Te2[1],$Te0[3] + XOR $K[2],$Te0[2],$Te0[2] ; s[2] +|| XOR $K[3],$Te0[3],$Te0[3] ; s[3] + + MV $Te0[0],A9 +|| MV $Te0[1],A8 + MV $Te0[2],B9 +|| MV $Te0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Te1[1],$Te0[0],$Te1[1] +|| PACK2 $Te1[2],$Te0[1],$Te1[2] + PACK2 $Te3[3],$Te2[2],$Te3[3] +|| PACK2 $Te3[0],$Te2[3],$Te3[0] + PACKL4 $Te3[3],$Te1[1],$Te1[1] +|| PACKL4 $Te3[0],$Te1[2],$Te1[2] + XOR $K[0],$Te1[1],$Te1[1] ; s[0] +|| XOR $K[1],$Te1[2],$Te1[2] ; s[1] + + PACK2 $Te1[3],$Te0[2],$Te1[3] +|| PACK2 $Te1[0],$Te0[3],$Te1[0] + PACK2 $Te3[1],$Te2[0],$Te3[1] +|| PACK2 $Te3[2],$Te2[1],$Te3[2] +|| BNOP RA + PACKL4 $Te3[1],$Te1[3],$Te1[3] +|| PACKL4 $Te3[2],$Te1[0],$Te1[0] + XOR $K[2],$Te1[3],$Te1[3] ; s[2] +|| XOR $K[3],$Te1[0],$Te1[0] ; s[3] + + MV $Te1[1],A8 +|| MV $Te1[2],A9 + MV $Te1[3],B8 +|| MV $Te1[0],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc + + .global _AES_decrypt +_AES_decrypt: + .asmfunc + MVK 1,B2 +__decrypt: + .if __TI_EABI__ + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL \$PCR_OFFSET(AES_Td,__decrypt),$TEA +|| ADDKPC __decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH \$PCR_OFFSET(AES_Td,__decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .else + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL (AES_Td-__decrypt),$TEA +|| ADDKPC __decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH (AES_Td-__decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .endif + LDW *$KPA++[2],$Td0[0] ; zero round key +|| LDW *$KPB++[2],$Td0[1] +|| MVK 60,A0 +|| ADD B0,$TEA,$TEA ; AES_Td + LDW *KEY[A0],B0 ; rounds +|| MVK 1024,A0 ; sizeof(AES_Td) + LDW *$KPA++[2],$Td0[2] +|| LDW *$KPB++[2],$Td0[3] +|| MV $TEA,$TEB + NOP + .if .BIG_ENDIAN + MV A9,$s[0] +|| MV A8,$s[1] +|| MV B9,$s[2] +|| MV B8,$s[3] + .else + MV A8,$s[0] +|| MV A9,$s[1] +|| MV B8,$s[2] +|| MV B9,$s[3] + .endif + XOR $Td0[0],$s[0],$s[0] +|| XOR $Td0[1],$s[1],$s[1] +|| LDW *$KPA++[2],$K[0] ; 1st round key +|| LDW *$KPB++[2],$K[1] + SUB B0,2,B0 + + SPLOOPD 13 +|| MVC B0,ILC +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] +;;==================================================================== + EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] + LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDW *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDW *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDW *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDW *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDW *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDW *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDW *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDW *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDW *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDW *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| ROTL 
$Td3[1],TBL3,$Td1[0] ; t0 +|| ROTL $Td1[0],TBL1,$Td3[1] ; t1 +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDW *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDW *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| ROTL $Td1[1],TBL1,$Td3[0] ; t2 +|| ROTL $Td3[0],TBL3,$Td1[1] ; t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDW *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 +|| ROTL $Td2[2],TBL2,$Td2[2] ; t0 +|| ROTL $Td2[3],TBL2,$Td2[3] ; t1 +|| XOR $K[0],$Td1[0],$s[0] +|| XOR $K[1],$Td3[1],$s[1] + ROTL $Td1[3],TBL1,$Td3[2] ; t0 +|| ROTL $Td3[2],TBL3,$Td1[3] ; t1 +|| XOR $K[2],$Td3[0],$s[2] +|| XOR $K[3],$Td1[1],$s[3] +|| LDW *$KPA++[2],$K[0] ; next round key +|| LDW *$KPB++[2],$K[1] + ROTL $Td2[0],TBL2,$Td2[0] ; t2 +|| ROTL $Td2[1],TBL2,$Td2[1] ; t3 +|| XOR $s[0],$Td2[2],$s[0] +|| XOR $s[1],$Td2[3],$s[1] +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] + ROTL $Td3[3],TBL3,$Td1[2] ; t2 +|| ROTL $Td1[2],TBL1,$Td3[3] ; t3 +|| XOR $s[0],$Td3[2],$s[0] +|| XOR $s[1],$Td1[3],$s[1] + XOR $s[2],$Td2[0],$s[2] +|| XOR $s[3],$Td2[1],$s[3] +|| XOR $s[0],$Td0[0],$s[0] +|| XOR $s[1],$Td0[1],$s[1] + SPKERNEL +|| XOR.L $s[2],$Td1[2],$s[2] +|| XOR.L $s[3],$Td3[3],$s[3] +;;==================================================================== + ADD.D ${TEA},A0,${TEA} ; point to Td4 +|| ADD.D ${TEB},A0,${TEB} +|| EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] + LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDBU *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDBU *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDBU *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDBU *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDBU *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDBU *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDBU *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDBU *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDBU *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDBU *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDBU *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDBU *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDBU *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDBU *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 + + .if .BIG_ENDIAN + PACK2 $Td0[0],$Td1[3],$Td0[0] +|| PACK2 $Td0[1],$Td1[0],$Td0[1] + PACK2 $Td2[2],$Td3[1],$Td2[2] +|| PACK2 $Td2[3],$Td3[2],$Td2[3] + PACKL4 $Td0[0],$Td2[2],$Td0[0] +|| PACKL4 $Td0[1],$Td2[3],$Td0[1] + XOR $K[0],$Td0[0],$Td0[0] ; s[0] +|| XOR $K[1],$Td0[1],$Td0[1] ; s[1] + + PACK2 $Td0[2],$Td1[1],$Td0[2] +|| PACK2 $Td0[3],$Td1[2],$Td0[3] + PACK2 $Td2[0],$Td3[3],$Td2[0] +|| PACK2 $Td2[1],$Td3[0],$Td2[1] +|| BNOP RA + PACKL4 $Td0[2],$Td2[0],$Td0[2] +|| PACKL4 $Td0[3],$Td2[1],$Td0[3] + XOR $K[2],$Td0[2],$Td0[2] ; s[2] +|| XOR $K[3],$Td0[3],$Td0[3] ; s[3] + + MV $Td0[0],A9 +|| MV $Td0[1],A8 + MV $Td0[2],B9 +|| MV $Td0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Td1[3],$Td0[0],$Td1[3] +|| PACK2 $Td1[0],$Td0[1],$Td1[0] + PACK2 
$Td3[1],$Td2[2],$Td3[1] +|| PACK2 $Td3[2],$Td2[3],$Td3[2] + PACKL4 $Td3[1],$Td1[3],$Td1[3] +|| PACKL4 $Td3[2],$Td1[0],$Td1[0] + XOR $K[0],$Td1[3],$Td1[3] ; s[0] +|| XOR $K[1],$Td1[0],$Td1[0] ; s[1] + + PACK2 $Td1[1],$Td0[2],$Td1[1] +|| PACK2 $Td1[2],$Td0[3],$Td1[2] + PACK2 $Td3[3],$Td2[0],$Td3[3] +|| PACK2 $Td3[0],$Td2[1],$Td3[0] +|| BNOP RA + PACKL4 $Td3[3],$Td1[1],$Td1[1] +|| PACKL4 $Td3[0],$Td1[2],$Td1[2] + XOR $K[2],$Td1[1],$Td1[1] ; s[2] +|| XOR $K[3],$Td1[2],$Td1[2] ; s[3] + + MV $Td1[3],A8 +|| MV $Td1[0],A9 + MV $Td1[1],B8 +|| MV $Td1[2],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc +___ +{ +my @K=(@K,@s); # extended key +my @Te4=map("B$_",(16..19)); + +my @Kx9=@Te0; # used in AES_set_decrypt_key +my @KxB=@Te1; +my @KxD=@Te2; +my @KxE=@Te3; + +$code.=<<___; + .asg OUT,BITS + + .global _AES_set_encrypt_key +_AES_set_encrypt_key: +__set_encrypt_key: + .asmfunc + MV INP,A0 +|| SHRU BITS,5,BITS ; 128-192-256 -> 4-6-8 +|| MV KEY,A1 + [!A0] B RA +||[!A0] MVK -1,RET +||[!A0] MVK 1,A1 ; only one B RA + [!A1] B RA +||[!A1] MVK -1,RET +||[!A1] MVK 0,A0 +|| MVK 0,B0 +|| MVK 0,A1 + [A0] LDNDW *INP++,A9:A8 +|| [A0] CMPEQ 4,BITS,B0 +|| [A0] CMPLT 3,BITS,A1 + [B0] B key128? +|| [A1] LDNDW *INP++,B9:B8 +|| [A0] CMPEQ 6,BITS,B0 +|| [A0] CMPLT 5,BITS,A1 + [B0] B key192? +|| [A1] LDNDW *INP++,B17:B16 +|| [A0] CMPEQ 8,BITS,B0 +|| [A0] CMPLT 7,BITS,A1 + [B0] B key256? +|| [A1] LDNDW *INP++,B19:B18 + + .if __TI_EABI__ + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA +|| [A0] ADDKPC __set_encrypt_key,B6 + [A0] MVKH \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + .else + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL (AES_Te4-__set_encrypt_key),$TEA +|| [A0] ADDKPC __set_encrypt_key,B6 + [A0] MVKH (AES_Te4-__set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + .endif + NOP + NOP + + BNOP RA,5 +|| MVK -2,RET ; unknown bit length +|| MVK 0,B0 ; redundant +;;==================================================================== +;;==================================================================== +key128?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$Te4[2] +|| MV B8,$K[3] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$Te4[2] +|| MV B9,$K[3] + .endif + + MVK 256,A0 +|| MVK 9,B0 + + SPLOOPD 14 +|| MVC B0,ILC +|| MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[2] +|| EXTU $K[3],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[3],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + SPKERNEL +;;==================================================================== + BNOP RA + MV $Te4[2],$K[2] +|| STW $K[0],*$KPA++[2] +|| STW 
$K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 10,B0 ; rounds + STW B0,*++${KPB}[15] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key192?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$Te4[2] +|| MV B16,$K[5] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$Te4[2] +|| MV B17,$K[5] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop192?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[4] +|| EXTU $K[5],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[5],A0 +|| EXTU $K[5],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[5],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + BDEC loop192?,B0 +|| XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + MV $Te4[2],$K[2] +|| XOR $K[3],$K[4],$Te4[2] ; K[4] + XOR $Te4[2],$K[5],$K[5] ; K[5] +;;==================================================================== + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 12,B0 ; rounds + STW B0,*++${KPB}[7] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key256?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$K[4] +|| MV B16,$K[5] +|| MV B19,$Te4[2] +|| MV B18,$K[7] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$K[4] +|| MV B17,$K[5] +|| MV B18,$Te4[2] +|| MV B19,$K[7] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop256?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[6] +|| EXTU $K[7],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[7],A0 +|| EXTU $K[7],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[7],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + STW $K[6],*$KPA++[2] +|| STW $K[7],*$KPB++[2] +|| XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] +||[!B0] B done256? + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] +||[!B0] B done256? 
+ .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + + MV $Te4[2],$K[2] +|| [B0] EXTU $K[3],EXT0,24,$Te4[0] +|| [B0] SUB B0,1,B0 + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT1,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT2,24,A0 +|| EXTU $K[3],EXT3,24,$Te4[3] + + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + NOP 3 + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + NOP 3 + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + + XOR $Te4[3],$K[4],$Te4[0] ; K[4] + XOR $Te4[0],$K[5],$K[5] ; K[5] + MV $Te4[0],$K[4] +|| XOR $K[5],$K[6],$Te4[2] ; K[6] + XOR $Te4[2],$K[7],$K[7] ; K[7] +;;==================================================================== +done256?: + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 14,B0 ; rounds + STW B0,*--${KPB}[1] + MVK 0,RET + .endasmfunc + + .global _AES_set_decrypt_key +_AES_set_decrypt_key: + .asmfunc + B __set_encrypt_key ; guarantee local call + MV KEY,B30 ; B30 is not modified + MV RA, B31 ; B31 is not modified + ADDKPC ret?,RA,2 +ret?: ; B0 holds rounds or zero + [!B0] BNOP B31 ; return if zero + [B0] SHL B0,4,A0 ; offset to last round key + [B0] SHRU B0,1,B1 + [B0] SUB B1,1,B1 + [B0] MVK 0x0000001B,B3 ; AES polynomial + [B0] MVKH 0x07000000,B3 + + SPLOOPD 9 ; flip round keys +|| MVC B1,ILC +|| MV B30,$KPA +|| ADD B30,A0,$KPB +|| MVK 16,A0 ; sizeof(round key) +;;==================================================================== + LDW *${KPA}[0],A16 +|| LDW *${KPB}[0],B16 + LDW *${KPA}[1],A17 +|| LDW *${KPB}[1],B17 + LDW *${KPA}[2],A18 +|| LDW *${KPB}[2],B18 + LDW *${KPA}[3],A19 +|| ADD $KPA,A0,$KPA +|| LDW *${KPB}[3],B19 +|| SUB $KPB,A0,$KPB + NOP + STW B16,*${KPA}[-4] +|| STW A16,*${KPB}[4] + STW B17,*${KPA}[-3] +|| STW A17,*${KPB}[5] + STW B18,*${KPA}[-2] +|| STW A18,*${KPB}[6] + STW B19,*${KPA}[-1] +|| STW A19,*${KPB}[7] + SPKERNEL +;;==================================================================== + SUB B0,1,B0 ; skip last round +|| ADD B30,A0,$KPA ; skip first round +|| ADD B30,A0,$KPB +|| MVC GFPGFR,B30 ; save GFPGFR + LDW *${KPA}[0],$K[0] +|| LDW *${KPB}[1],$K[1] +|| MVC B3,GFPGFR + LDW *${KPA}[2],$K[2] +|| LDW *${KPB}[3],$K[3] + MVK 0x00000909,A24 +|| MVK 0x00000B0B,B24 + MVKH 0x09090000,A24 +|| MVKH 0x0B0B0000,B24 + MVC B0,ILC +|| SUB B0,1,B0 + + GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| GMPY4 $K[1],A24,$Kx9[1] +|| MVK 0x00000D0D,A25 +|| MVK 0x00000E0E,B25 + GMPY4 $K[2],A24,$Kx9[2] +|| GMPY4 $K[3],A24,$Kx9[3] +|| MVKH 0x0D0D0000,A25 +|| MVKH 0x0E0E0000,B25 + + GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| GMPY4 $K[1],B24,$KxB[1] + GMPY4 $K[2],B24,$KxB[2] +|| GMPY4 $K[3],B24,$KxB[3] + + SPLOOP 11 ; InvMixColumns +;;==================================================================== + GMPY4 $K[0],A25,$KxD[0] ; ·0x0D +|| GMPY4 $K[1],A25,$KxD[1] +|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16 +|| SWAP2 $Kx9[1],$Kx9[1] +|| MV $K[0],$s[0] ; this or DINT +|| MV $K[1],$s[1] +|| [B0] LDW *${KPA}[4],$K[0] +|| [B0] LDW *${KPB}[5],$K[1] + GMPY4 $K[2],A25,$KxD[2] +|| GMPY4 $K[3],A25,$KxD[3] +|| SWAP2 $Kx9[2],$Kx9[2] +|| SWAP2 $Kx9[3],$Kx9[3] +|| MV $K[2],$s[2] +|| MV $K[3],$s[3] +|| [B0] LDW *${KPA}[6],$K[2] +|| [B0] LDW *${KPB}[7],$K[3] + + GMPY4 
$s[0],B25,$KxE[0] ; ·0x0E +|| GMPY4 $s[1],B25,$KxE[1] +|| XOR $Kx9[0],$KxB[0],$KxB[0] +|| XOR $Kx9[1],$KxB[1],$KxB[1] + GMPY4 $s[2],B25,$KxE[2] +|| GMPY4 $s[3],B25,$KxE[3] +|| XOR $Kx9[2],$KxB[2],$KxB[2] +|| XOR $Kx9[3],$KxB[3],$KxB[3] + + ROTL $KxB[0],TBL3,$KxB[0] +|| ROTL $KxB[1],TBL3,$KxB[1] +|| SWAP2 $KxD[0],$KxD[0] ; rotate by 16 +|| SWAP2 $KxD[1],$KxD[1] + ROTL $KxB[2],TBL3,$KxB[2] +|| ROTL $KxB[3],TBL3,$KxB[3] +|| SWAP2 $KxD[2],$KxD[2] +|| SWAP2 $KxD[3],$KxD[3] + + XOR $KxE[0],$KxD[0],$KxE[0] +|| XOR $KxE[1],$KxD[1],$KxE[1] +|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| [B0] GMPY4 $K[1],A24,$Kx9[1] +|| ADDAW $KPA,4,$KPA + XOR $KxE[2],$KxD[2],$KxE[2] +|| XOR $KxE[3],$KxD[3],$KxE[3] +|| [B0] GMPY4 $K[2],A24,$Kx9[2] +|| [B0] GMPY4 $K[3],A24,$Kx9[3] +|| ADDAW $KPB,4,$KPB + + XOR $KxB[0],$KxE[0],$KxE[0] +|| XOR $KxB[1],$KxE[1],$KxE[1] +|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| [B0] GMPY4 $K[1],B24,$KxB[1] + XOR $KxB[2],$KxE[2],$KxE[2] +|| XOR $KxB[3],$KxE[3],$KxE[3] +|| [B0] GMPY4 $K[2],B24,$KxB[2] +|| [B0] GMPY4 $K[3],B24,$KxB[3] +|| STW $KxE[0],*${KPA}[-4] +|| STW $KxE[1],*${KPB}[-3] + STW $KxE[2],*${KPA}[-2] +|| STW $KxE[3],*${KPB}[-1] +|| [B0] SUB B0,1,B0 + SPKERNEL +;;==================================================================== + BNOP B31,3 + MVC B30,GFPGFR ; restore GFPGFR(*) + MVK 0,RET + .endasmfunc +___ +# (*) Even though ABI doesn't specify GFPGFR as non-volatile, there +# are code samples out there that *assume* its default value. +} +{ +my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8"); +$code.=<<___; + .global _AES_ctr32_encrypt +_AES_ctr32_encrypt: + .asmfunc + LDNDW *${ivp}[0],A31:A30 ; load counter value +|| MV $blocks,A2 ; reassign $blocks +|| DMV RA,$key,B27:B26 ; reassign RA and $key + LDNDW *${ivp}[1],B31:B30 +|| MVK 0,B2 ; don't let __encrypt load input +|| MVK 0,A1 ; and postpone writing output + .if .BIG_ENDIAN + NOP + .else + NOP 4 + SWAP2 B31,B31 ; keep least significant 32 bits + SWAP4 B31,B31 ; in host byte order + .endif +ctr32_loop?: + [A2] BNOP __encrypt +|| [A1] XOR A29,A9,A9 ; input^Ek(counter) +|| [A1] XOR A28,A8,A8 +|| [A2] LDNDW *INP++,A29:A28 ; load input + [!A2] BNOP B27 ; return +|| [A1] XOR B29,B9,B9 +|| [A1] XOR B28,B8,B8 +|| [A2] LDNDW *INP++,B29:B28 + .if .BIG_ENDIAN + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] DMV A31,A30,A9:A8 ; pass counter value to __encrypt + [A1] STNDW B9:B8,*OUT++ +|| [A2] DMV B31,B30,B9:B8 +|| [A2] ADD B30,1,B30 ; counter++ + .else + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] DMV A31,A30,A9:A8 +|| [A2] SWAP2 B31,B0 +|| [A2] ADD B31,1,B31 ; counter++ + [A1] STNDW B9:B8,*OUT++ +|| [A2] MV B30,B8 +|| [A2] SWAP4 B0,B9 + .endif + [A2] ADDKPC ctr32_loop?,RA ; return to ctr32_loop? 
+|| [A2] MV B26,KEY ; pass $key +|| [A2] SUB A2,1,A2 ; $blocks-- +||[!A1] MVK 1,A1 + NOP + NOP + .endasmfunc +___ +} +# Tables are kept in endian-neutral manner +$code.=<<___; + .if __TI_EABI__ + .sect ".text:aes_asm.const" + .else + .sect ".const:aes_asm" + .endif + .align 128 +AES_Te: + .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 + .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d + .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd + .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 + .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 + .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d + .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 + .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a + .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d + .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 + .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb + .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b + .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 + .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea + .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 + .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b + .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c + .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a + .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 + .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f + .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 + .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 + .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 + .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f + .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 + .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e + .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 + .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 + .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 + .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d + .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 + .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f + .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e + .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e + .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 + .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb + .byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d + .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce + .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e + .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 + .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 + .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c + .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f + .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed + .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 + .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b + .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 + .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a + .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a + .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 + .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 + .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 + .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 + .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 + .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 + .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 + .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe + .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a + .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc + .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 + .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 + .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 + .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a + .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d + .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 + .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f + .byte 0xbe,0x5f,0x5f,0xe1, 
0x35,0x97,0x97,0xa2 + .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 + .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 + .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 + .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 + .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 + .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 + .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f + .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e + .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 + .byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 + .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c + .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 + .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 + .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 + .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e + .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a + .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 + .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e + .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 + .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 + .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b + .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 + .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 + .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 + .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 + .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa + .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 + .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e + .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 + .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 + .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 + .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 + .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 + .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c + .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 + .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc + .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 + .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 + .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa + .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 + .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 + .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f + .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 + .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 + .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 + .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 + .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 + .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 + .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 + .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 + .byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 + .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff + .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a + .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 + .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 + .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 + .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 + .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 + .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 + .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc + .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a +AES_Te4: + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 
0xe3, 0x2f, 0x84 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +rcon: + .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 + .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 + .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 + .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 + .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + .align 128 +AES_Td: + .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 + .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 + .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 + .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 + .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 + .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 + .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 + .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f + .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 + .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 + .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 + .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 + .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 + .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda + .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 + .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 + .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 + .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd + .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 + .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 + .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 + .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 + .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 + .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 + .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 + .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 + .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 + .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a + .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 + .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 + .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 + .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c + .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 + .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 + .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 + .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a + .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 + .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 + .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa + .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 + .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d + .byte 0xdd,0x3e,0x05,0xae, 
0x4d,0xe6,0xbd,0x46 + .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 + .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff + .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 + .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 + .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 + .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb + .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 + .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 + .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 + .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e + .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 + .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 + .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 + .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a + .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f + .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e + .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 + .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 + .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 + .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d + .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad + .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 + .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c + .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd + .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc + .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 + .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc + .byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 + .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 + .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 + .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 + .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d + .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 + .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 + .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 + .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 + .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a + .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef + .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 + .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 + .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 + .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 + .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d + .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 + .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 + .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 + .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c + .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 + .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 + .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b + .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 + .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 + .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e + .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 + .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce + .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 + .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 + .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 + .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 + .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 + .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 + .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f + .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d + .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf + .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b + .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f + .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d + .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e + .byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 + .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 + .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a + .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 + 
.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 + .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c + .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f + .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf + .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b + .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 + .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e + .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f + .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c + .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 + .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde + .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 + .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 + .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 +AES_Td4: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 +___ + +print $code; +close STDOUT; diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-ia64.S b/deps/openssl/openssl/crypto/aes/asm/aes-ia64.S index 7f6c4c3662..f7f1f63c9d 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-ia64.S +++ b/deps/openssl/openssl/crypto/aes/asm/aes-ia64.S @@ -1,3 +1,10 @@ +// Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// // ==================================================================== // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL // project. Rights for redistribution and usage in source and binary @@ -10,7 +17,7 @@ // 'and' which in turn can be assigned to M-port [there're double as // much M-ports as there're I-ports on Itanium 2]. 
By sacrificing few // registers for small constants (255, 24 and 16) to be used with -// 'shr' and 'and' instructions I can achieve better ILP, Intruction +// 'shr' and 'and' instructions I can achieve better ILP, Instruction // Level Parallelism, and performance. This code outperforms GCC 3.3 // generated code by over factor of 2 (two), GCC 3.4 - by 70% and // HP C - by 40%. Measured best-case scenario, i.e. aligned diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-mips.pl b/deps/openssl/openssl/crypto/aes/asm/aes-mips.pl index 4de3ee26bb..439578d9c2 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-mips.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-mips.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -57,6 +64,7 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { + $PTR_LA="dla"; $PTR_ADD="dadd"; # incidentally works even on n32 $PTR_SUB="dsub"; # incidentally works even on n32 $PTR_INS="dins"; @@ -65,6 +73,7 @@ if ($flavour =~ /64|n32/i) { $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { + $PTR_LA="la"; $PTR_ADD="add"; $PTR_SUB="sub"; $PTR_INS="ins"; @@ -81,13 +90,13 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; $big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); -for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; my ($MSB,$LSB)=(0,3); # automatically converted to little-endian @@ -110,7 +119,7 @@ ___ {{{ my $FRAMESIZE=16*$SZREG; -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000"; my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7); my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); @@ -646,7 +655,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Tbl,AES_Te # PIC-ified 'load address' + $PTR_LA $Tbl,AES_Te # PIC-ified 'load address' lwl $s0,0+$MSB($inp) lwl $s1,4+$MSB($inp) @@ -1217,7 +1226,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Tbl,AES_Td # PIC-ified 'load address' + $PTR_LA $Tbl,AES_Td # PIC-ified 'load address' lwl $s0,0+$MSB($inp) lwl $s1,4+$MSB($inp) @@ -1267,7 +1276,7 @@ ___ {{{ my $FRAMESIZE=8*$SZREG; -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0xc000f008" : "0xc0000000"; my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3); my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); @@ -1528,9 +1537,9 @@ _mips_AES_set_encrypt_key: nop .end _mips_AES_set_encrypt_key -.globl private_AES_set_encrypt_key -.ent private_AES_set_encrypt_key -private_AES_set_encrypt_key: +.globl AES_set_encrypt_key +.ent AES_set_encrypt_key +AES_set_encrypt_key: .frame $sp,$FRAMESIZE,$ra .mask $SAVED_REGS_MASK,-$SZREG .set noreorder @@ -1552,11 +1561,11 @@ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue ___ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification .cplocal $Tbl - .cpsetup $pf,$zero,private_AES_set_encrypt_key + .cpsetup $pf,$zero,AES_set_encrypt_key ___ $code.=<<___; .set reorder - la $Tbl,AES_Te4 # PIC-ified 'load address' + $PTR_LA $Tbl,AES_Te4 # PIC-ified 'load address' bal _mips_AES_set_encrypt_key @@ -1575,7 +1584,7 @@ ___ $code.=<<___; jr $ra $PTR_ADD $sp,$FRAMESIZE -.end private_AES_set_encrypt_key +.end AES_set_encrypt_key ___ my ($head,$tail)=($inp,$bits); @@ -1583,9 +1592,9 @@ my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2); $code.=<<___; .align 5 -.globl private_AES_set_decrypt_key -.ent private_AES_set_decrypt_key -private_AES_set_decrypt_key: +.globl AES_set_decrypt_key +.ent AES_set_decrypt_key +AES_set_decrypt_key: .frame $sp,$FRAMESIZE,$ra .mask $SAVED_REGS_MASK,-$SZREG .set noreorder @@ -1607,11 +1616,11 @@ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue ___ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification .cplocal $Tbl - .cpsetup $pf,$zero,private_AES_set_decrypt_key + .cpsetup $pf,$zero,AES_set_decrypt_key ___ $code.=<<___; .set reorder - la $Tbl,AES_Te4 # PIC-ified 'load address' + $PTR_LA $Tbl,AES_Te4 # PIC-ified 'load address' bal _mips_AES_set_encrypt_key @@ -1729,7 +1738,7 @@ ___ $code.=<<___; jr $ra $PTR_ADD $sp,$FRAMESIZE -.end private_AES_set_decrypt_key +.end AES_set_decrypt_key ___ }}} diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-parisc.pl b/deps/openssl/openssl/crypto/aes/asm/aes-parisc.pl index 714dcfbbe3..2c785bc56d 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-parisc.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-parisc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-ppc.pl b/deps/openssl/openssl/crypto/aes/asm/aes-ppc.pl index 5b83016efa..1558d8e454 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-ppc.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -19,7 +26,7 @@ # February 2010 # # Rescheduling instructions to favour Power6 pipeline gave 10% -# performance improvement on the platfrom in question (and marginal +# performance improvement on the platform in question (and marginal # improvement even on others). It should be noted that Power6 fails # to process byte in 18 cycles, only in 23, because it fails to issue # 4 load instructions in two cycles, only in 3. As result non-compact diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-s390x.pl b/deps/openssl/openssl/crypto/aes/asm/aes-s390x.pl index a8f4d29d1c..fd8a737166 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aes-s390x.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-s390x.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -92,7 +99,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $softonly=0; # allow hardware support @@ -779,10 +786,10 @@ ___ $code.=<<___; # void AES_set_encrypt_key(const unsigned char *in, int bits, # AES_KEY *key) { -.globl private_AES_set_encrypt_key -.type private_AES_set_encrypt_key,\@function +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,\@function .align 16 -private_AES_set_encrypt_key: +AES_set_encrypt_key: _s390x_AES_set_encrypt_key: lghi $t0,0 cl${g}r $inp,$t0 @@ -806,7 +813,7 @@ _s390x_AES_set_encrypt_key: .Lproceed: ___ $code.=<<___ if (!$softonly); - # convert bits to km code, [128,192,256]->[18,19,20] + # convert bits to km(c) code, [128,192,256]->[18,19,20] lhi %r5,-128 lhi %r0,18 ar %r5,$bits @@ -814,13 +821,10 @@ $code.=<<___ if (!$softonly); ar %r5,%r0 larl %r1,OPENSSL_s390xcap_P - lg %r0,0(%r1) - tmhl %r0,0x4000 # check for message-security assist - jz .Lekey_internal - llihh %r0,0x8000 srlg %r0,%r0,0(%r5) - ng %r0,48(%r1) # check kmc capability vector + ng %r0,32(%r1) # check availability of both km... + ng %r0,48(%r1) # ...and kmc support for given key length jz .Lekey_internal lmg %r0,%r1,0($inp) # just copy 128 bits... 
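The s390x capability logic above now requires both the KM and the KMC facility bits (offsets 32 and 48 into OPENSSL_s390xcap_P) before committing to the hardware path, and the bits-to-function-code conversion it performs with lhi/ar/srl is compact enough to restate in C. A sketch, assuming the documented CPACF function codes 18/19/20 for AES-128/192/256; `aes_km_code` is a hypothetical name:

    /* Map an AES key length in bits to the s390x KM/KMC function code:
     * 128 -> 18, 192 -> 19, 256 -> 20.  (bits - 128) is 0, 64 or 128,
     * so shifting right by 6 yields the 0, 1 or 2 added to the base. */
    static int aes_km_code(int bits)
    {
        return 18 + ((bits - 128) >> 6);
    }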
@@ -835,7 +839,7 @@ $code.=<<___ if (!$softonly); stg %r1,24($key) 1: st $bits,236($key) # save bits [for debugging purposes] lgr $t0,%r5 - st %r5,240($key) # save km code + st %r5,240($key) # save km(c) code lghi %r2,0 br %r14 ___ @@ -1059,14 +1063,14 @@ $code.=<<___; .Lminus1: lghi %r2,-1 br $ra -.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key +.size AES_set_encrypt_key,.-AES_set_encrypt_key # void AES_set_decrypt_key(const unsigned char *in, int bits, # AES_KEY *key) { -.globl private_AES_set_decrypt_key -.type private_AES_set_decrypt_key,\@function +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,\@function .align 16 -private_AES_set_decrypt_key: +AES_set_decrypt_key: #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key! bras $ra,_s390x_AES_set_encrypt_key @@ -1166,7 +1170,7 @@ $code.=<<___; lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size AES_set_decrypt_key,.-AES_set_decrypt_key ___ ######################################################################## @@ -1432,12 +1436,7 @@ $code.=<<___ if (!$softonly); .Lctr32_hw_switch: ___ -$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower - larl $s0,OPENSSL_s390xcap_P - lg $s0,8($s0) - tmhh $s0,0x0004 # check for message_security-assist-4 - jz .Lctr32_km_loop - +$code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower llgfr $s0,%r0 lgr $s1,%r1 larl %r1,OPENSSL_s390xcap_P @@ -1481,7 +1480,7 @@ $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower br $ra .align 16 ___ -$code.=<<___; +$code.=<<___ if (!$softonly); .Lctr32_km_loop: la $s2,16($sp) lgr $s3,$fp @@ -1568,8 +1567,8 @@ ___ } ######################################################################## -# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out, -# size_t len, const AES_KEY *key1, const AES_KEY *key2, +# void AES_xts_encrypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, # const unsigned char iv[16]); # { @@ -1937,8 +1936,8 @@ $code.=<<___; br $ra .size AES_xts_encrypt,.-AES_xts_encrypt ___ -# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, -# size_t len, const AES_KEY *key1, const AES_KEY *key2, +# void AES_xts_decrypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, # const unsigned char iv[16]); # $code.=<<___; @@ -2220,7 +2219,6 @@ ___ } $code.=<<___; .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" -.comm OPENSSL_s390xcap_P,80,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-sparcv9.pl b/deps/openssl/openssl/crypto/aes/asm/aes-sparcv9.pl index 403c4d1290..883fae820f 100755 --- a/deps/openssl/openssl/crypto/aes/asm/aes-sparcv9.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-sparcv9.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -30,10 +37,11 @@ # optimal decrypt procedure]. 
Compared to GNU C generated code both # procedures are more than 60% faster:-) -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } -if ($bits==64) { $bias=2047; $frame=192; } -else { $bias=0; $frame=112; } +$output = pop; +open STDOUT,">$output"; + +$frame="STACK_FRAME"; +$bias="STACK_BIAS"; $locals=16; $acc0="%l0"; @@ -74,11 +82,13 @@ sub _data_word() while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; } } -$code.=<<___ if ($bits==64); +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch -___ -$code.=<<___; +#endif .section ".text",#alloc,#execinstr .align 256 diff --git a/deps/openssl/openssl/crypto/aes/asm/aes-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aes-x86_64.pl index 47f416375d..ce4ca30b1a 100755 --- a/deps/openssl/openssl/crypto/aes/asm/aes-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aes-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -37,7 +44,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $verticalspin=1; # unlike 32-bit version $verticalspin performs @@ -1282,13 +1289,13 @@ $code.=<<___; ___ } -# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, +# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, # AES_KEY *key) $code.=<<___; -.globl private_AES_set_encrypt_key -.type private_AES_set_encrypt_key,\@function,3 +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,\@function,3 .align 16 -private_AES_set_encrypt_key: +AES_set_encrypt_key: push %rbx push %rbp push %r12 # redundant, but allows to share @@ -1305,7 +1312,7 @@ private_AES_set_encrypt_key: add \$56,%rsp .Lenc_key_epilogue: ret -.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key +.size AES_set_encrypt_key,.-AES_set_encrypt_key .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent .align 16 @@ -1548,13 +1555,13 @@ $code.=<<___; ___ } -# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, +# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, # AES_KEY *key) $code.=<<___; -.globl private_AES_set_decrypt_key -.type private_AES_set_decrypt_key,\@function,3 +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,\@function,3 .align 16 -private_AES_set_decrypt_key: +AES_set_decrypt_key: push %rbx push %rbp push %r12 @@ -1623,7 +1630,7 @@ $code.=<<___; add \$56,%rsp .Ldec_key_epilogue: ret -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size AES_set_decrypt_key,.-AES_set_decrypt_key ___ # void AES_cbc_encrypt (const void char *inp, unsigned char *out, @@ -2770,13 +2777,13 @@ cbc_se_handler: .rva .LSEH_end_AES_decrypt .rva .LSEH_info_AES_decrypt - .rva .LSEH_begin_private_AES_set_encrypt_key - .rva .LSEH_end_private_AES_set_encrypt_key - .rva .LSEH_info_private_AES_set_encrypt_key + .rva 
.LSEH_begin_AES_set_encrypt_key + .rva .LSEH_end_AES_set_encrypt_key + .rva .LSEH_info_AES_set_encrypt_key - .rva .LSEH_begin_private_AES_set_decrypt_key - .rva .LSEH_end_private_AES_set_decrypt_key - .rva .LSEH_info_private_AES_set_decrypt_key + .rva .LSEH_begin_AES_set_decrypt_key + .rva .LSEH_end_AES_set_decrypt_key + .rva .LSEH_info_AES_set_decrypt_key .rva .LSEH_begin_AES_cbc_encrypt .rva .LSEH_end_AES_cbc_encrypt @@ -2792,11 +2799,11 @@ cbc_se_handler: .byte 9,0,0,0 .rva block_se_handler .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] -.LSEH_info_private_AES_set_encrypt_key: +.LSEH_info_AES_set_encrypt_key: .byte 9,0,0,0 .rva key_se_handler .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] -.LSEH_info_private_AES_set_decrypt_key: +.LSEH_info_AES_set_decrypt_key: .byte 9,0,0,0 .rva key_se_handler .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] diff --git a/deps/openssl/openssl/crypto/aes/asm/aesfx-sparcv9.pl b/deps/openssl/openssl/crypto/aes/asm/aesfx-sparcv9.pl new file mode 100644 index 0000000000..04b3cf7116 --- /dev/null +++ b/deps/openssl/openssl/crypto/aes/asm/aesfx-sparcv9.pl @@ -0,0 +1,1270 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# March 2016 +# +# Initial support for Fujitsu SPARC64 X/X+ comprises minimally +# required key setup and single-block procedures. +# +# April 2016 +# +# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means +# that parallelizeable nature of CBC decrypt and CTR is not utilized +# yet. CBC encrypt on the other hand is as good as it can possibly +# get processing one byte in 4.1 cycles with 128-bit key on SPARC64 X. +# This is ~6x faster than pure software implementation... +# +# July 2016 +# +# Switch from faligndata to fshiftorx, which allows to omit alignaddr +# instructions and improve single-block and short-input performance +# with misaligned data. + +$output = pop; +open STDOUT,">$output"; + +{ +my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5)); + +$code.=<<___; +#include "sparc_arch.h" + +#define LOCALS (STACK_BIAS+STACK_FRAME) + +.text + +.globl aes_fx_encrypt +.align 32 +aes_fx_encrypt: + and $inp, 7, $tmp ! is input aligned? + andn $inp, 7, $inp + ldd [$key + 0], %f6 ! round[0] + ldd [$key + 8], %f8 + mov %o7, %g1 + ld [$key + 240], $rounds + +1: call .+8 + add %o7, .Linp_align-1b, %o7 + + sll $tmp, 3, $tmp + ldd [$inp + 0], %f0 ! load input + brz,pt $tmp, .Lenc_inp_aligned + ldd [$inp + 8], %f2 + + ldd [%o7 + $tmp], %f14 ! shift left params + ldd [$inp + 16], %f4 + fshiftorx %f0, %f2, %f14, %f0 + fshiftorx %f2, %f4, %f14, %f2 + +.Lenc_inp_aligned: + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fxor %f0, %f6, %f0 ! ^=round[0] + fxor %f2, %f8, %f2 + ldd [$key + 32], %f6 ! 
round[2] + ldd [$key + 40], %f8 + add $key, 32, $key + sub $rounds, 4, $rounds + +.Loop_enc: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 + ldd [$key + 24], %f12 + add $key, 32, $key + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$key + 0], %f6 + ldd [$key + 8], %f8 + + brnz,a $rounds, .Loop_enc + sub $rounds, 2, $rounds + + andcc $out, 7, $tmp ! is output aligned? + andn $out, 7, $out + mov 0xff, $mask + srl $mask, $tmp, $mask + add %o7, 64, %o7 + sll $tmp, 3, $tmp + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [%o7 + $tmp], %f14 ! shift right params + + fmovd %f0, %f4 + faesenclx %f2, %f6, %f0 + faesenclx %f4, %f8, %f2 + + bnz,pn %icc, .Lenc_out_unaligned + mov %g1, %o7 + + std %f0, [$out + 0] + retl + std %f2, [$out + 8] + +.align 16 +.Lenc_out_unaligned: + add $out, 16, $inp + orn %g0, $mask, $tmp + fshiftorx %f0, %f0, %f14, %f4 + fshiftorx %f0, %f2, %f14, %f6 + fshiftorx %f2, %f2, %f14, %f8 + + stda %f4, [$out + $mask]0xc0 ! partial store + std %f6, [$out + 8] + stda %f8, [$inp + $tmp]0xc0 ! partial store + retl + nop +.type aes_fx_encrypt,#function +.size aes_fx_encrypt,.-aes_fx_encrypt + +.globl aes_fx_decrypt +.align 32 +aes_fx_decrypt: + and $inp, 7, $tmp ! is input aligned? + andn $inp, 7, $inp + ldd [$key + 0], %f6 ! round[0] + ldd [$key + 8], %f8 + mov %o7, %g1 + ld [$key + 240], $rounds + +1: call .+8 + add %o7, .Linp_align-1b, %o7 + + sll $tmp, 3, $tmp + ldd [$inp + 0], %f0 ! load input + brz,pt $tmp, .Ldec_inp_aligned + ldd [$inp + 8], %f2 + + ldd [%o7 + $tmp], %f14 ! shift left params + ldd [$inp + 16], %f4 + fshiftorx %f0, %f2, %f14, %f0 + fshiftorx %f2, %f4, %f14, %f2 + +.Ldec_inp_aligned: + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fxor %f0, %f6, %f0 ! ^=round[0] + fxor %f2, %f8, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + add $key, 32, $key + sub $rounds, 4, $rounds + +.Loop_dec: + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$key + 16], %f10 + ldd [$key + 24], %f12 + add $key, 32, $key + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + ldd [$key + 0], %f6 + ldd [$key + 8], %f8 + + brnz,a $rounds, .Loop_dec + sub $rounds, 2, $rounds + + andcc $out, 7, $tmp ! is output aligned? + andn $out, 7, $out + mov 0xff, $mask + srl $mask, $tmp, $mask + add %o7, 64, %o7 + sll $tmp, 3, $tmp + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [%o7 + $tmp], %f14 ! shift right params + + fmovd %f0, %f4 + faesdeclx %f2, %f6, %f0 + faesdeclx %f4, %f8, %f2 + + bnz,pn %icc, .Ldec_out_unaligned + mov %g1, %o7 + + std %f0, [$out + 0] + retl + std %f2, [$out + 8] + +.align 16 +.Ldec_out_unaligned: + add $out, 16, $inp + orn %g0, $mask, $tmp + fshiftorx %f0, %f0, %f14, %f4 + fshiftorx %f0, %f2, %f14, %f6 + fshiftorx %f2, %f2, %f14, %f8 + + stda %f4, [$out + $mask]0xc0 ! partial store + std %f6, [$out + 8] + stda %f8, [$inp + $tmp]0xc0 ! 
partial store + retl + nop +.type aes_fx_decrypt,#function +.size aes_fx_decrypt,.-aes_fx_decrypt +___ +} +{ +my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5)); +$code.=<<___; +.globl aes_fx_set_decrypt_key +.align 32 +aes_fx_set_decrypt_key: + b .Lset_encrypt_key + mov -1, $inc + retl + nop +.type aes_fx_set_decrypt_key,#function +.size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key + +.globl aes_fx_set_encrypt_key +.align 32 +aes_fx_set_encrypt_key: + mov 1, $inc + nop +.Lset_encrypt_key: + and $inp, 7, $tmp + andn $inp, 7, $inp + sll $tmp, 3, $tmp + mov %o7, %g1 + +1: call .+8 + add %o7, .Linp_align-1b, %o7 + + ldd [%o7 + $tmp], %f10 ! shift left params + mov %g1, %o7 + + cmp $bits, 192 + ldd [$inp + 0], %f0 + bl,pt %icc, .L128 + ldd [$inp + 8], %f2 + + be,pt %icc, .L192 + ldd [$inp + 16], %f4 + brz,pt $tmp, .L256aligned + ldd [$inp + 24], %f6 + + ldd [$inp + 32], %f8 + fshiftorx %f0, %f2, %f10, %f0 + fshiftorx %f2, %f4, %f10, %f2 + fshiftorx %f4, %f6, %f10, %f4 + fshiftorx %f6, %f8, %f10, %f6 + +.L256aligned: + mov 14, $bits + and $inc, `14*16`, $tmp + st $bits, [$out + 240] ! store rounds + add $out, $tmp, $out ! start or end of key schedule + sllx $inc, 4, $inc ! 16 or -16 +___ +for ($i=0; $i<6; $i++) { + $code.=<<___; + std %f0, [$out + 0] + faeskeyx %f6, `0x10+$i`, %f0 + std %f2, [$out + 8] + add $out, $inc, $out + faeskeyx %f0, 0x00, %f2 + std %f4, [$out + 0] + faeskeyx %f2, 0x01, %f4 + std %f6, [$out + 8] + add $out, $inc, $out + faeskeyx %f4, 0x00, %f6 +___ +} +$code.=<<___; + std %f0, [$out + 0] + faeskeyx %f6, `0x10+$i`, %f0 + std %f2, [$out + 8] + add $out, $inc, $out + faeskeyx %f0, 0x00, %f2 + std %f4,[$out + 0] + std %f6,[$out + 8] + add $out, $inc, $out + std %f0,[$out + 0] + std %f2,[$out + 8] + retl + xor %o0, %o0, %o0 ! return 0 + +.align 16 +.L192: + brz,pt $tmp, .L192aligned + nop + + ldd [$inp + 24], %f6 + fshiftorx %f0, %f2, %f10, %f0 + fshiftorx %f2, %f4, %f10, %f2 + fshiftorx %f4, %f6, %f10, %f4 + +.L192aligned: + mov 12, $bits + and $inc, `12*16`, $tmp + st $bits, [$out + 240] ! store rounds + add $out, $tmp, $out ! start or end of key schedule + sllx $inc, 4, $inc ! 16 or -16 +___ +for ($i=0; $i<8; $i+=2) { + $code.=<<___; + std %f0, [$out + 0] + faeskeyx %f4, `0x10+$i`, %f0 + std %f2, [$out + 8] + add $out, $inc, $out + faeskeyx %f0, 0x00, %f2 + std %f4, [$out + 0] + faeskeyx %f2, 0x00, %f4 + std %f0, [$out + 8] + add $out, $inc, $out + faeskeyx %f4, `0x10+$i+1`, %f0 + std %f2, [$out + 0] + faeskeyx %f0, 0x00, %f2 + std %f4, [$out + 8] + add $out, $inc, $out +___ +$code.=<<___ if ($i<6); + faeskeyx %f2, 0x00, %f4 +___ +} +$code.=<<___; + std %f0, [$out + 0] + std %f2, [$out + 8] + retl + xor %o0, %o0, %o0 ! return 0 + +.align 16 +.L128: + brz,pt $tmp, .L128aligned + nop + + ldd [$inp + 16], %f4 + fshiftorx %f0, %f2, %f10, %f0 + fshiftorx %f2, %f4, %f10, %f2 + +.L128aligned: + mov 10, $bits + and $inc, `10*16`, $tmp + st $bits, [$out + 240] ! store rounds + add $out, $tmp, $out ! start or end of key schedule + sllx $inc, 4, $inc ! 16 or -16 +___ +for ($i=0; $i<10; $i++) { + $code.=<<___; + std %f0, [$out + 0] + faeskeyx %f2, `0x10+$i`, %f0 + std %f2, [$out + 8] + add $out, $inc, $out + faeskeyx %f0, 0x00, %f2 +___ +} +$code.=<<___; + std %f0, [$out + 0] + std %f2, [$out + 8] + retl + xor %o0, %o0, %o0 ! 
return 0 +.type aes_fx_set_encrypt_key,#function +.size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key +___ +} +{ +my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5)); +my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7)); +my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift) + = map("%f$_",grep { !($_ & 1) } (16 .. 62)); +my ($ileft,$iright) = ($ialign,$oalign); + +$code.=<<___; +.globl aes_fx_cbc_encrypt +.align 32 +aes_fx_cbc_encrypt: + save %sp, -STACK_FRAME-16, %sp + srln $len, 4, $len + and $inp, 7, $ialign + andn $inp, 7, $inp + brz,pn $len, .Lcbc_no_data + sll $ialign, 3, $ileft + +1: call .+8 + add %o7, .Linp_align-1b, %o7 + + ld [$key + 240], $rounds + and $out, 7, $oalign + ld [$ivp + 0], %f0 ! load ivec + andn $out, 7, $out + ld [$ivp + 4], %f1 + sll $oalign, 3, $mask + ld [$ivp + 8], %f2 + ld [$ivp + 12], %f3 + + sll $rounds, 4, $rounds + add $rounds, $key, $end + ldd [$key + 0], $r0hi ! round[0] + ldd [$key + 8], $r0lo + + add $inp, 16, $inp + sub $len, 1, $len + ldd [$end + 0], $rlhi ! round[last] + ldd [$end + 8], $rllo + + mov 16, $inc + movrz $len, 0, $inc + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + ldd [%o7 + $ileft], $fshift ! shift left params + add %o7, 64, %o7 + ldd [$inp - 16], $in0 ! load input + ldd [$inp - 8], $in1 + ldda [$inp]0x82, $intail ! non-faulting load + brz $dir, .Lcbc_decrypt + add $inp, $inc, $inp ! inp+=16 + + fxor $r0hi, %f0, %f0 ! ivec^=round[0] + fxor $r0lo, %f2, %f2 + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + nop + +.Loop_cbc_enc: + fxor $in0, %f0, %f0 ! inp^ivec^round[0] + fxor $in1, %f2, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + add $key, 32, $end + sub $rounds, 16*6, $inner + +.Lcbc_enc: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lcbc_enc + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + movrz $len, 0, $inc + fmovd $intail, $in0 + ldd [$inp - 8], $in1 ! load next input block + ldda [$inp]0x82, $intail ! non-faulting load + add $inp, $inc, $inp ! inp+=16 + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fxor $r0hi, $in0, $in0 ! inp^=round[0] + fxor $r0lo, $in1, $in1 + + fmovd %f0, %f4 + faesenclx %f2, $rlhi, %f0 + faesenclx %f4, $rllo, %f2 + + brnz,pn $oalign, .Lcbc_enc_unaligned_out + nop + + std %f0, [$out + 0] + std %f2, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_cbc_enc + sub $len, 1, $len + + st %f0, [$ivp + 0] ! output ivec + st %f1, [$ivp + 4] + st %f2, [$ivp + 8] + st %f3, [$ivp + 12] + +.Lcbc_no_data: + ret + restore + +.align 32 +.Lcbc_enc_unaligned_out: + ldd [%o7 + $mask], $fshift ! shift right params + mov 0xff, $mask + srl $mask, $oalign, $mask + sub %g0, $ileft, $iright + + fshiftorx %f0, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + + stda %f6, [$out + $mask]0xc0 ! 
partial store + orn %g0, $mask, $mask + std %f8, [$out + 8] + add $out, 16, $out + brz $len, .Lcbc_enc_unaligned_out_done + sub $len, 1, $len + b .Loop_cbc_enc_unaligned_out + nop + +.align 32 +.Loop_cbc_enc_unaligned_out: + fmovd %f2, $outhead + fxor $in0, %f0, %f0 ! inp^ivec^round[0] + fxor $in1, %f2, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 48], %f10 ! round[3] + ldd [$key + 56], %f12 + + ldx [$inp - 16], %o0 + ldx [$inp - 8], %o1 + brz $ileft, .Lcbc_enc_aligned_inp + movrz $len, 0, $inc + + ldx [$inp], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 + +.Lcbc_enc_aligned_inp: + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$key + 64], %f6 ! round[4] + ldd [$key + 72], %f8 + add $key, 64, $end + sub $rounds, 16*8, $inner + + stx %o0, [%sp + LOCALS + 0] + stx %o1, [%sp + LOCALS + 8] + add $inp, $inc, $inp ! inp+=16 + nop + +.Lcbc_enc_unaligned: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lcbc_enc_unaligned + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + + ldd [%sp + LOCALS + 0], $in0 + ldd [%sp + LOCALS + 8], $in1 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fxor $r0hi, $in0, $in0 ! inp^=round[0] + fxor $r0lo, $in1, $in1 + + fmovd %f0, %f4 + faesenclx %f2, $rlhi, %f0 + faesenclx %f4, $rllo, %f2 + + fshiftorx $outhead, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + std %f6, [$out + 0] + std %f8, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_cbc_enc_unaligned_out + sub $len, 1, $len + +.Lcbc_enc_unaligned_out_done: + fshiftorx %f2, %f2, $fshift, %f8 + stda %f8, [$out + $mask]0xc0 ! partial store + + st %f0, [$ivp + 0] ! output ivec + st %f1, [$ivp + 4] + st %f2, [$ivp + 8] + st %f3, [$ivp + 12] + + ret + restore + +.align 32 +.Lcbc_decrypt: + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + fmovd %f0, $iv0 + fmovd %f2, $iv1 + +.Loop_cbc_dec: + fxor $in0, $r0hi, %f0 ! inp^round[0] + fxor $in1, $r0lo, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + add $key, 32, $end + sub $rounds, 16*6, $inner + +.Lcbc_dec: + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lcbc_dec + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + fxor $iv0, $rlhi, %f6 ! ivec^round[last] + fxor $iv1, $rllo, %f8 + fmovd $in0, $iv0 + fmovd $in1, $iv1 + + movrz $len, 0, $inc + fmovd $intail, $in0 + ldd [$inp - 8], $in1 ! load next input block + ldda [$inp]0x82, $intail ! non-faulting load + add $inp, $inc, $inp ! 
inp+=16 + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + + fmovd %f0, %f4 + faesdeclx %f2, %f6, %f0 + faesdeclx %f4, %f8, %f2 + + brnz,pn $oalign, .Lcbc_dec_unaligned_out + nop + + std %f0, [$out + 0] + std %f2, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_cbc_dec + sub $len, 1, $len + + st $iv0, [$ivp + 0] ! output ivec + st $iv0#lo, [$ivp + 4] + st $iv1, [$ivp + 8] + st $iv1#lo, [$ivp + 12] + + ret + restore + +.align 32 +.Lcbc_dec_unaligned_out: + ldd [%o7 + $mask], $fshift ! shift right params + mov 0xff, $mask + srl $mask, $oalign, $mask + sub %g0, $ileft, $iright + + fshiftorx %f0, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + + stda %f6, [$out + $mask]0xc0 ! partial store + orn %g0, $mask, $mask + std %f8, [$out + 8] + add $out, 16, $out + brz $len, .Lcbc_dec_unaligned_out_done + sub $len, 1, $len + b .Loop_cbc_dec_unaligned_out + nop + +.align 32 +.Loop_cbc_dec_unaligned_out: + fmovd %f2, $outhead + fxor $in0, $r0hi, %f0 ! inp^round[0] + fxor $in1, $r0lo, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$key + 48], %f10 ! round[3] + ldd [$key + 56], %f12 + + ldx [$inp - 16], %o0 + ldx [$inp - 8], %o1 + brz $ileft, .Lcbc_dec_aligned_inp + movrz $len, 0, $inc + + ldx [$inp], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 + +.Lcbc_dec_aligned_inp: + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + ldd [$key + 64], %f6 ! round[4] + ldd [$key + 72], %f8 + add $key, 64, $end + sub $rounds, 16*8, $inner + + stx %o0, [%sp + LOCALS + 0] + stx %o1, [%sp + LOCALS + 8] + add $inp, $inc, $inp ! inp+=16 + nop + +.Lcbc_dec_unaligned: + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lcbc_dec_unaligned + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + fmovd %f0, %f4 + faesdecx %f2, %f6, %f0 + faesdecx %f4, %f8, %f2 + + fxor $iv0, $rlhi, %f6 ! ivec^round[last] + fxor $iv1, $rllo, %f8 + fmovd $in0, $iv0 + fmovd $in1, $iv1 + ldd [%sp + LOCALS + 0], $in0 + ldd [%sp + LOCALS + 8], $in1 + + fmovd %f0, %f4 + faesdecx %f2, %f10, %f0 + faesdecx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fmovd %f0, %f4 + faesdeclx %f2, %f6, %f0 + faesdeclx %f4, %f8, %f2 + + fshiftorx $outhead, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + std %f6, [$out + 0] + std %f8, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_cbc_dec_unaligned_out + sub $len, 1, $len + +.Lcbc_dec_unaligned_out_done: + fshiftorx %f2, %f2, $fshift, %f8 + stda %f8, [$out + $mask]0xc0 ! partial store + + st $iv0, [$ivp + 0] ! 
output ivec + st $iv0#lo, [$ivp + 4] + st $iv1, [$ivp + 8] + st $iv1#lo, [$ivp + 12] + + ret + restore +.type aes_fx_cbc_encrypt,#function +.size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt +___ +} +{ +my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5)); +my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7)); +my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift) + = map("%f$_",grep { !($_ & 1) } (16 .. 62)); +my ($ileft,$iright) = ($ialign, $oalign); +my $one = "%f14"; + +$code.=<<___; +.globl aes_fx_ctr32_encrypt_blocks +.align 32 +aes_fx_ctr32_encrypt_blocks: + save %sp, -STACK_FRAME-16, %sp + srln $len, 0, $len + and $inp, 7, $ialign + andn $inp, 7, $inp + brz,pn $len, .Lctr32_no_data + sll $ialign, 3, $ileft + +.Lpic: call .+8 + add %o7, .Linp_align - .Lpic, %o7 + + ld [$key + 240], $rounds + and $out, 7, $oalign + ld [$ivp + 0], $ctr0 ! load counter + andn $out, 7, $out + ld [$ivp + 4], $ctr0#lo + sll $oalign, 3, $mask + ld [$ivp + 8], $ctr1 + ld [$ivp + 12], $ctr1#lo + ldd [%o7 + 128], $one + + sll $rounds, 4, $rounds + add $rounds, $key, $end + ldd [$key + 0], $r0hi ! round[0] + ldd [$key + 8], $r0lo + + add $inp, 16, $inp + sub $len, 1, $len + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + mov 16, $inc + movrz $len, 0, $inc + ldd [$end + 0], $rlhi ! round[last] + ldd [$end + 8], $rllo + + ldd [%o7 + $ileft], $fshift ! shiftleft params + add %o7, 64, %o7 + ldd [$inp - 16], $in0 ! load input + ldd [$inp - 8], $in1 + ldda [$inp]0x82, $intail ! non-faulting load + add $inp, $inc, $inp ! inp+=16 + + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + +.Loop_ctr32: + fxor $ctr0, $r0hi, %f0 ! counter^round[0] + fxor $ctr1, $r0lo, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + add $key, 32, $end + sub $rounds, 16*6, $inner + +.Lctr32_enc: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lctr32_enc + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + fxor $in0, $rlhi, %f6 ! inp^round[last] + fxor $in1, $rllo, %f8 + + movrz $len, 0, $inc + fmovd $intail, $in0 + ldd [$inp - 8], $in1 ! load next input block + ldda [$inp]0x82, $intail ! non-faulting load + add $inp, $inc, $inp ! inp+=16 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fshiftorx $in0, $in1, $fshift, $in0 + fshiftorx $in1, $intail, $fshift, $in1 + fpadd32 $ctr1, $one, $ctr1 ! increment counter + + fmovd %f0, %f4 + faesenclx %f2, %f6, %f0 + faesenclx %f4, %f8, %f2 + + brnz,pn $oalign, .Lctr32_unaligned_out + nop + + std %f0, [$out + 0] + std %f2, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_ctr32 + sub $len, 1, $len + +.Lctr32_no_data: + ret + restore + +.align 32 +.Lctr32_unaligned_out: + ldd [%o7 + $mask], $fshift ! shift right params + mov 0xff, $mask + srl $mask, $oalign, $mask + sub %g0, $ileft, $iright + + fshiftorx %f0, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + + stda %f6, [$out + $mask]0xc0 ! 
partial store + orn %g0, $mask, $mask + std %f8, [$out + 8] + add $out, 16, $out + brz $len, .Lctr32_unaligned_out_done + sub $len, 1, $len + b .Loop_ctr32_unaligned_out + nop + +.align 32 +.Loop_ctr32_unaligned_out: + fmovd %f2, $outhead + fxor $ctr0, $r0hi, %f0 ! counter^round[0] + fxor $ctr1, $r0lo, %f2 + ldd [$key + 32], %f6 ! round[2] + ldd [$key + 40], %f8 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 48], %f10 ! round[3] + ldd [$key + 56], %f12 + + ldx [$inp - 16], %o0 + ldx [$inp - 8], %o1 + brz $ileft, .Lctr32_aligned_inp + movrz $len, 0, $inc + + ldx [$inp], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 + +.Lctr32_aligned_inp: + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$key + 64], %f6 ! round[4] + ldd [$key + 72], %f8 + add $key, 64, $end + sub $rounds, 16*8, $inner + + stx %o0, [%sp + LOCALS + 0] + stx %o1, [%sp + LOCALS + 8] + add $inp, $inc, $inp ! inp+=16 + nop + +.Lctr32_enc_unaligned: + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 + ldd [$end + 24], %f12 + add $end, 32, $end + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + ldd [$end + 0], %f6 + ldd [$end + 8], %f8 + + brnz,a $inner, .Lctr32_enc_unaligned + sub $inner, 16*2, $inner + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$end + 16], %f10 ! round[last-1] + ldd [$end + 24], %f12 + fpadd32 $ctr1, $one, $ctr1 ! increment counter + + fmovd %f0, %f4 + faesencx %f2, %f6, %f0 + faesencx %f4, %f8, %f2 + fxor $in0, $rlhi, %f6 ! inp^round[last] + fxor $in1, $rllo, %f8 + ldd [%sp + LOCALS + 0], $in0 + ldd [%sp + LOCALS + 8], $in1 + + fmovd %f0, %f4 + faesencx %f2, %f10, %f0 + faesencx %f4, %f12, %f2 + ldd [$key + 16], %f10 ! round[1] + ldd [$key + 24], %f12 + + fmovd %f0, %f4 + faesenclx %f2, %f6, %f0 + faesenclx %f4, %f8, %f2 + + fshiftorx $outhead, %f0, $fshift, %f6 + fshiftorx %f0, %f2, $fshift, %f8 + std %f6, [$out + 0] + std %f8, [$out + 8] + add $out, 16, $out + + brnz,a $len, .Loop_ctr32_unaligned_out + sub $len, 1, $len + +.Lctr32_unaligned_out_done: + fshiftorx %f2, %f2, $fshift, %f8 + stda %f8, [$out + $mask]0xc0 ! partial store + + ret + restore +.type aes_fx_ctr32_encrypt_blocks,#function +.size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks + +.align 32 +.Linp_align: ! fshiftorx parameters for left shift toward %rs1 + .byte 0, 0, 64, 0, 0, 64, 0, -64 + .byte 0, 0, 56, 8, 0, 56, 8, -56 + .byte 0, 0, 48, 16, 0, 48, 16, -48 + .byte 0, 0, 40, 24, 0, 40, 24, -40 + .byte 0, 0, 32, 32, 0, 32, 32, -32 + .byte 0, 0, 24, 40, 0, 24, 40, -24 + .byte 0, 0, 16, 48, 0, 16, 48, -16 + .byte 0, 0, 8, 56, 0, 8, 56, -8 +.Lout_align: ! fshiftorx parameters for right shift toward %rs2 + .byte 0, 0, 0, 64, 0, 0, 64, 0 + .byte 0, 0, 8, 56, 0, 8, 56, -8 + .byte 0, 0, 16, 48, 0, 16, 48, -16 + .byte 0, 0, 24, 40, 0, 24, 40, -24 + .byte 0, 0, 32, 32, 0, 32, 32, -32 + .byte 0, 0, 40, 24, 0, 40, 24, -40 + .byte 0, 0, 48, 16, 0, 48, 16, -48 + .byte 0, 0, 56, 8, 0, 56, 8, -56 +.Lone: + .word 0, 1 +.asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ +} +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. 
+# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my ($ref,$opf); +my %visopf = ( "faligndata" => 0x048, + "bshuffle" => 0x04c, + "fpadd32" => 0x052, + "fxor" => 0x06c, + "fsrc2" => 0x078 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "alignaddr" => 0x018, + "bmask" => 0x019, + "alignaddrl" => 0x01a ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unfx { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my ($ref,$opf); +my %aesopf = ( "faesencx" => 0x90, + "faesdecx" => 0x91, + "faesenclx" => 0x92, + "faesdeclx" => 0x93, + "faeskeyx" => 0x94 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if (defined($opf=$aesopf{$mnemonic})) { + $rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2; + $rs2 = oct($rs2) if ($rs2 =~ /^0/); + + foreach ($rs1,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unfx3src { +my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_; +my ($ref,$opf); +my %aesopf = ( "fshiftorx" => 0x0b ); + + $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd"; + + if (defined($opf=$aesopf{$mnemonic})) { + foreach ($rs1,$rs2,$rs3,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge; + + s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/ + &unfx($1,$2,$3,$4) + /ge or + s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &unfx3src($1,$2,$3,$4,$5) + /ge or + s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &unvis($1,$2,$3,$4) + /ge or + s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge; + print $_,"\n"; +} + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-mb-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-mb-x86_64.pl index d7ad7882c4..aa2735e06a 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-mb-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-mb-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). 
You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -67,7 +74,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void aesni_multi_cbc_encrypt ( diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl index 7a30e893fb..33a7f0cf44 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -25,7 +32,10 @@ # Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%) # Ivy Bridge 5.05[+4.6] 9.65 5.54 +74% # Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%) +# Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%) # Bulldozer 5.77[+6.0] 11.72 6.37 +84% +# Ryzen(**) 2.71[+1.93] 4.64 2.74 +69% +# Goldmont(**) 3.82[+1.70] 5.52 4.20 +31% # # AES-192-CBC # Westmere 4.51 9.81 6.80 +44% @@ -39,12 +49,16 @@ # Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%) # Ivy Bridge 7.05 11.65 7.12 +64% # Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%) +# Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%) # Bulldozer 8.00 13.95 8.25 +69% +# Ryzen(**) 3.71 5.64 3.72 +52% +# Goldmont(**) 5.35 7.05 5.76 +22% # # (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for # background information. Above numbers in parentheses are SSSE3 # results collected on AVX-capable CPU, i.e. apply on OSes that # don't support AVX. +# (**) SHAEXT results. # # Needless to mention that it makes no sense to implement "stitched" # *decrypt* subroutine. 
Because *both* AESNI-CBC decrypt and SHA1 @@ -100,7 +114,7 @@ $shaext=1; ### set to zero if compiling for 1.0.1 $stitched_decrypt=0; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void aesni_cbc_sha1_enc(const void *inp, @@ -298,7 +312,7 @@ ___ $r++; unshift(@rndkey,pop(@rndkey)); }; -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1137,7 +1151,7 @@ ___ $r++; unshift(@rndkey,pop(@rndkey)); }; -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl index 588ade64ee..0e49f26faf 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -21,17 +28,21 @@ # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched # subroutine: # -# AES-128/-192/-256+SHA256 this(**)gain -# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% -# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% -# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% -# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% +# AES-128/-192/-256+SHA256 this(**) gain +# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% +# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% +# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% +# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% +# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% +# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54% +# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60% # -# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that +# (*) there are XOP, AVX1 and AVX2 code paths, meaning that # Westmere is omitted from loop, this is because gain was not # estimated high enough to justify the effort; # (**) these are EVP-free results, results obtained with 'speed # -evp aes-256-cbc-hmac-sha256' will vary by percent or two; +# (***) these are SHAEXT results; $flavour = shift; $output = shift; @@ -66,7 +77,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $shaext=$avx; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $func="aesni_cbc_sha256_enc"; diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-x86.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-x86.pl index 9b2e37aafb..ed1a47c30c 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-x86.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-x86.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. 
All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -43,16 +50,20 @@ # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. +# November 2015 +# +# Add aesni_ocb_[en|de]crypt. + ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # -# CBC en-/decrypt CTR XTS ECB +# CBC en-/decrypt CTR XTS ECB OCB # Westmere 3.77/1.37 1.37 1.52 1.27 -# * Bridge 5.07/0.98 0.99 1.09 0.91 -# Haswell 4.44/0.80 0.97 1.03 0.72 -# Silvermont 5.77/3.56 3.67 4.03 3.46 -# Bulldozer 5.80/0.98 1.05 1.24 0.93 +# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10 +# Haswell 4.44/0.80 0.97 1.03 0.72 0.76 +# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03 +# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for @@ -63,6 +74,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output = pop; +open OUT,">$output"; +*STDOUT=*OUT; + &asm_init($ARGV[0],$0); &external_label("OPENSSL_ia32cap_P"); @@ -1831,6 +1846,877 @@ if ($PREFIX eq "aesni") { &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp &function_end("aesni_xts_decrypt"); } + +###################################################################### +# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, +# const AES_KEY *key, unsigned int start_block_num, +# unsigned char offset_i[16], const unsigned char L_[][16], +# unsigned char checksum[16]); +# +{ +# offsets within stack frame +my $checksum = 16*6; +my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4)); + +# reassigned registers +my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out); +# $l_, $blocks, $inp, $key are permanently allocated in registers; +# remaining non-volatile ones are offloaded to stack, which even +# stay invariant after written to stack. 
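Before the two function bodies, a note on what offset_i, L_ and checksum are: this is OCB's block chaining, where the i-th offset advances by L_[ntz(i)] — the lea/bsf/shl triplets below compute exactly those table indices, bsf being ntz and the shl-by-4 scaling them to 16-byte entries — while the checksum accumulates the xor of the plaintext blocks, visible as the "# checksum" pxor's. A minimal Perl sketch of the offset recurrence, assuming 16-byte strings and invented helper names, not part of the diff:

    # number of trailing zero bits; callers guarantee $i >= 1
    sub ntz { my ($i) = @_; my $z = 0; $z++ until ($i >> $z) & 1; return $z; }

    # Offset_i = Offset_{i-1} xor L_[ntz(i)]; with two equal-length
    # byte strings, Perl's ^ operator xors them bytewise.
    sub ocb_advance_offset {
        my ($offset, $i, $L) = @_;       # $L: ref to array of 16-byte strings
        return $offset ^ $L->[ ntz($i) ];
    }

Each block is then processed as Offset_i xor AES(plaintext xor Offset_i), which is why every batch below is bracketed by pxor's with the stashed offsets before _aesni_encrypt6_enter and matching xorps/pxor's with the same values after it.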
+ +&function_begin("aesni_ocb_encrypt"); + &mov ($rounds,&wparam(5)); # &offset_i + &mov ($rounds_,&wparam(7)); # &checksum + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i + &mov ($block,&wparam(4)); # start_block_num + &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum + &mov ($l_,&wparam(6)); # L_ + + &mov ($rounds,"esp"); + &sub ("esp",$esp_off+4); # alloca + &and ("esp",-16); # align stack + + &sub ($out,$inp); + &shl ($len,4); + &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6 + &mov (&DWP($out_off,"esp"),$out); + &mov (&DWP($end_off,"esp"),$len); + &mov (&DWP($esp_off,"esp"),$rounds); + + &mov ($rounds,&DWP(240,$key)); + + &test ($block,1); + &jnz (&label("odd")); + + &bsf ($i3,$block); + &add ($block,1); + &shl ($i3,4); + &movdqu ($inout5,&QWP(0,$l_,$i3)); + &mov ($i3,$key); # put aside key + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16,$inp)); + + &pxor ($inout5,$rndkey0); # ^ last offset_i + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$inout5); # ^ offset_i + + &movdqa ($inout4,$rndkey1); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &xorps ($inout0,$inout5); # ^ offset_i + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movdqa ($rndkey1,$inout4); # pass the checksum + + &movups (&QWP(-16,$out,$inp),$inout0); # store output + + &mov ($rounds,&DWP(240,$i3)); + &mov ($key,$i3); # restore key + &mov ($len,&DWP($end_off,"esp")); + +&set_label("odd"); + &shl ($rounds,4); + &mov ($out,16); + &sub ($out,$rounds); # twisted rounds + &mov (&DWP($key_off,"esp"),$key); + &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule + &mov (&DWP($rounds_off,"esp"),$out); + + &cmp ($inp,$len); + &ja (&label("short")); + &jmp (&label("grandloop")); + +&set_label("grandloop",32); + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &lea ($i5,&DWP(5,$block)); + &add ($block,6); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &bsf ($i5,$i5); + &shl ($i1,4); + &shl ($i3,4); + &shl ($i5,4); + &movdqu ($inout0,&QWP(0,$l_)); + &movdqu ($inout1,&QWP(0,$l_,$i1)); + &mov ($rounds,&DWP($rounds_off,"esp")); + &movdqa ($inout2,$inout0); + &movdqu ($inout3,&QWP(0,$l_,$i3)); + &movdqa ($inout4,$inout0); + &movdqu ($inout5,&QWP(0,$l_,$i5)); + + &pxor ($inout0,$rndkey0); # ^ last offset_i + &pxor ($inout1,$inout0); + &movdqa (&QWP(16*0,"esp"),$inout0); + &pxor ($inout2,$inout1); + &movdqa (&QWP(16*1,"esp"),$inout1); + &pxor ($inout3,$inout2); + &movdqa (&QWP(16*2,"esp"),$inout2); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*3,"esp"),$inout3); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*4,"esp"),$inout4); + &movdqa (&QWP(16*5,"esp"),$inout5); + + &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &movdqu ($inout4,&QWP(16*4,$inp)); + &movdqu ($inout5,&QWP(16*5,$inp)); + &lea ($inp,&DWP(16*6,$inp)); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$rndkey0); # ^ roundkey[0] + &pxor ($rndkey1,$inout1); + &pxor ($inout1,$rndkey0); + &pxor ($rndkey1,$inout2); + &pxor ($inout2,$rndkey0); + &pxor ($rndkey1,$inout3); + &pxor ($inout3,$rndkey0); + &pxor ($rndkey1,$inout4); + &pxor ($inout4,$rndkey0); + &pxor ($rndkey1,$inout5); + &pxor ($inout5,$rndkey0); + &movdqa (&QWP($checksum,"esp"),$rndkey1); + + &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); + &pxor 
($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + &pxor ($inout5,&QWP(16*5,"esp")); + + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &aesenc ($inout0,$rndkey1); + &aesenc ($inout1,$rndkey1); + &aesenc ($inout2,$rndkey1); + &aesenc ($inout3,$rndkey1); + &aesenc ($inout4,$rndkey1); + &aesenc ($inout5,$rndkey1); + + &mov ($out,&DWP($out_off,"esp")); + &mov ($len,&DWP($end_off,"esp")); + &call ("_aesni_encrypt6_enter"); + + &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + &pxor ($inout5,$rndkey0); + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + + &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output + &movdqu (&QWP(-16*5,$out,$inp),$inout1); + &movdqu (&QWP(-16*4,$out,$inp),$inout2); + &movdqu (&QWP(-16*3,$out,$inp),$inout3); + &movdqu (&QWP(-16*2,$out,$inp),$inout4); + &movdqu (&QWP(-16*1,$out,$inp),$inout5); + &cmp ($inp,$len); # done yet? + &jb (&label("grandloop")); + +&set_label("short"); + &add ($len,16*6); + &sub ($len,$inp); + &jz (&label("done")); + + &cmp ($len,16*2); + &jb (&label("one")); + &je (&label("two")); + + &cmp ($len,16*4); + &jb (&label("three")); + &je (&label("four")); + + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &shl ($i1,4); + &shl ($i3,4); + &movdqu ($inout0,&QWP(0,$l_)); + &movdqu ($inout1,&QWP(0,$l_,$i1)); + &mov ($rounds,&DWP($rounds_off,"esp")); + &movdqa ($inout2,$inout0); + &movdqu ($inout3,&QWP(0,$l_,$i3)); + &movdqa ($inout4,$inout0); + + &pxor ($inout0,$rndkey0); # ^ last offset_i + &pxor ($inout1,$inout0); + &movdqa (&QWP(16*0,"esp"),$inout0); + &pxor ($inout2,$inout1); + &movdqa (&QWP(16*1,"esp"),$inout1); + &pxor ($inout3,$inout2); + &movdqa (&QWP(16*2,"esp"),$inout2); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*3,"esp"),$inout3); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*4,"esp"),$inout4); + + &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout5,$inout5); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$rndkey0); # ^ roundkey[0] + &pxor ($rndkey1,$inout1); + &pxor ($inout1,$rndkey0); + &pxor ($rndkey1,$inout2); + &pxor ($inout2,$rndkey0); + &pxor ($rndkey1,$inout3); + &pxor ($inout3,$rndkey0); + &pxor ($rndkey1,$inout4); + &pxor ($inout4,$rndkey0); + &movdqa (&QWP($checksum,"esp"),$rndkey1); + + &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &aesenc ($inout0,$rndkey1); + &aesenc ($inout1,$rndkey1); + &aesenc ($inout2,$rndkey1); + &aesenc ($inout3,$rndkey1); + &aesenc ($inout4,$rndkey1); + &aesenc ($inout5,$rndkey1); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_encrypt6_enter"); + + &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor 
($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,$rndkey0); + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + + &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output + &movdqu (&QWP(16*1,$out,$inp),$inout1); + &movdqu (&QWP(16*2,$out,$inp),$inout2); + &movdqu (&QWP(16*3,$out,$inp),$inout3); + &movdqu (&QWP(16*4,$out,$inp),$inout4); + + &jmp (&label("done")); + +&set_label("one",16); + &movdqu ($inout5,&QWP(0,$l_)); + &mov ($key,&DWP($key_off,"esp")); # restore key + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &mov ($rounds,&DWP(240,$key)); + + &pxor ($inout5,$rndkey0); # ^ last offset_i + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$inout5); # ^ offset_i + + &movdqa ($inout4,$rndkey1); + &mov ($out,&DWP($out_off,"esp")); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &xorps ($inout0,$inout5); # ^ offset_i + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movdqa ($rndkey1,$inout4); # pass the checksum + &movups (&QWP(0,$out,$inp),$inout0); + + &jmp (&label("done")); + +&set_label("two",16); + &lea ($i1,&DWP(1,$block)); + &mov ($key,&DWP($key_off,"esp")); # restore key + &bsf ($i1,$i1); + &shl ($i1,4); + &movdqu ($inout4,&QWP(0,$l_)); + &movdqu ($inout5,&QWP(0,$l_,$i1)); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &pxor ($inout4,$rndkey0); # ^ last offset_i + &pxor ($inout5,$inout4); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$inout4); # ^ offset_i + &pxor ($rndkey1,$inout1); + &pxor ($inout1,$inout5); + + &movdqa ($inout3,$rndkey1) + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_encrypt2"); + + &xorps ($inout0,$inout4); # ^ offset_i + &xorps ($inout1,$inout5); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movdqa ($rndkey1,$inout3); # pass the checksum + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &movups (&QWP(16*1,$out,$inp),$inout1); + + &jmp (&label("done")); + +&set_label("three",16); + &lea ($i1,&DWP(1,$block)); + &mov ($key,&DWP($key_off,"esp")); # restore key + &bsf ($i1,$i1); + &shl ($i1,4); + &movdqu ($inout3,&QWP(0,$l_)); + &movdqu ($inout4,&QWP(0,$l_,$i1)); + &movdqa ($inout5,$inout3); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &pxor ($inout3,$rndkey0); # ^ last offset_i + &pxor ($inout4,$inout3); + &pxor ($inout5,$inout4); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,$inout3); # ^ offset_i + &pxor ($rndkey1,$inout1); + &pxor ($inout1,$inout4); + &pxor ($rndkey1,$inout2); + &pxor ($inout2,$inout5); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_encrypt3"); + + &xorps ($inout0,$inout3); # ^ offset_i + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &movups (&QWP(16*1,$out,$inp),$inout1); + &movups (&QWP(16*2,$out,$inp),$inout2); + + &jmp (&label("done")); + +&set_label("four",16); + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &mov ($key,&DWP($key_off,"esp")); # restore key + &shl ($i1,4); + &shl ($i3,4); + &movdqu ($inout2,&QWP(0,$l_)); + &movdqu ($inout3,&QWP(0,$l_,$i1)); + &movdqa ($inout4,$inout2); + &movdqu ($inout5,&QWP(0,$l_,$i3)); + + &pxor ($inout2,$rndkey0); # ^ 
last offset_i + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &pxor ($inout3,$inout2); + &movdqu ($inout1,&QWP(16*1,$inp)); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*0,"esp"),$inout2); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*1,"esp"),$inout3); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &pxor ($rndkey1,$inout0); # checksum + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($rndkey1,$inout1); + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($rndkey1,$inout2); + &pxor ($inout2,$inout4); + &pxor ($rndkey1,$inout3); + &pxor ($inout3,$inout5); + + &movdqa (&QWP($checksum,"esp"),$rndkey1) + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_encrypt4"); + + &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout4); + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &xorps ($inout3,$inout5); + &movups (&QWP(16*1,$out,$inp),$inout1); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movups (&QWP(16*2,$out,$inp),$inout2); + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + &movups (&QWP(16*3,$out,$inp),$inout3); + +&set_label("done"); + &mov ($key,&DWP($esp_off,"esp")); + &pxor ($inout0,$inout0); # clear register bank + &pxor ($inout1,$inout1); + &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack + &pxor ($inout2,$inout2); + &movdqa (&QWP(16*1,"esp"),$inout0); + &pxor ($inout3,$inout3); + &movdqa (&QWP(16*2,"esp"),$inout0); + &pxor ($inout4,$inout4); + &movdqa (&QWP(16*3,"esp"),$inout0); + &pxor ($inout5,$inout5); + &movdqa (&QWP(16*4,"esp"),$inout0); + &movdqa (&QWP(16*5,"esp"),$inout0); + &movdqa (&QWP(16*6,"esp"),$inout0); + + &lea ("esp",&DWP(0,$key)); + &mov ($rounds,&wparam(5)); # &offset_i + &mov ($rounds_,&wparam(7)); # &checksum + &movdqu (&QWP(0,$rounds),$rndkey0); + &pxor ($rndkey0,$rndkey0); + &movdqu (&QWP(0,$rounds_),$rndkey1); + &pxor ($rndkey1,$rndkey1); +&function_end("aesni_ocb_encrypt"); + +&function_begin("aesni_ocb_decrypt"); + &mov ($rounds,&wparam(5)); # &offset_i + &mov ($rounds_,&wparam(7)); # &checksum + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i + &mov ($block,&wparam(4)); # start_block_num + &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum + &mov ($l_,&wparam(6)); # L_ + + &mov ($rounds,"esp"); + &sub ("esp",$esp_off+4); # alloca + &and ("esp",-16); # align stack + + &sub ($out,$inp); + &shl ($len,4); + &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6 + &mov (&DWP($out_off,"esp"),$out); + &mov (&DWP($end_off,"esp"),$len); + &mov (&DWP($esp_off,"esp"),$rounds); + + &mov ($rounds,&DWP(240,$key)); + + &test ($block,1); + &jnz (&label("odd")); + + &bsf ($i3,$block); + &add ($block,1); + &shl ($i3,4); + &movdqu ($inout5,&QWP(0,$l_,$i3)); + &mov ($i3,$key); # put aside key + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16,$inp)); + + &pxor ($inout5,$rndkey0); # ^ last offset_i + &pxor ($inout0,$inout5); # ^ offset_i + + &movdqa ($inout4,$rndkey1); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + + &xorps ($inout0,$inout5); # ^ offset_i + &movaps ($rndkey1,$inout4); # pass the checksum + &movdqa ($rndkey0,$inout5); # pass last offset_i + &xorps ($rndkey1,$inout0); # checksum + &movups (&QWP(-16,$out,$inp),$inout0); # store output + + &mov ($rounds,&DWP(240,$i3)); + &mov ($key,$i3); # restore key + &mov 
($len,&DWP($end_off,"esp")); + +&set_label("odd"); + &shl ($rounds,4); + &mov ($out,16); + &sub ($out,$rounds); # twisted rounds + &mov (&DWP($key_off,"esp"),$key); + &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule + &mov (&DWP($rounds_off,"esp"),$out); + + &cmp ($inp,$len); + &ja (&label("short")); + &jmp (&label("grandloop")); + +&set_label("grandloop",32); + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &lea ($i5,&DWP(5,$block)); + &add ($block,6); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &bsf ($i5,$i5); + &shl ($i1,4); + &shl ($i3,4); + &shl ($i5,4); + &movdqu ($inout0,&QWP(0,$l_)); + &movdqu ($inout1,&QWP(0,$l_,$i1)); + &mov ($rounds,&DWP($rounds_off,"esp")); + &movdqa ($inout2,$inout0); + &movdqu ($inout3,&QWP(0,$l_,$i3)); + &movdqa ($inout4,$inout0); + &movdqu ($inout5,&QWP(0,$l_,$i5)); + + &pxor ($inout0,$rndkey0); # ^ last offset_i + &pxor ($inout1,$inout0); + &movdqa (&QWP(16*0,"esp"),$inout0); + &pxor ($inout2,$inout1); + &movdqa (&QWP(16*1,"esp"),$inout1); + &pxor ($inout3,$inout2); + &movdqa (&QWP(16*2,"esp"),$inout2); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*3,"esp"),$inout3); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*4,"esp"),$inout4); + &movdqa (&QWP(16*5,"esp"),$inout5); + + &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &movdqu ($inout4,&QWP(16*4,$inp)); + &movdqu ($inout5,&QWP(16*5,$inp)); + &lea ($inp,&DWP(16*6,$inp)); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &pxor ($inout0,$rndkey0); # ^ roundkey[0] + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &pxor ($inout4,$rndkey0); + &pxor ($inout5,$rndkey0); + + &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + &pxor ($inout5,&QWP(16*5,"esp")); + + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &aesdec ($inout0,$rndkey1); + &aesdec ($inout1,$rndkey1); + &aesdec ($inout2,$rndkey1); + &aesdec ($inout3,$rndkey1); + &aesdec ($inout4,$rndkey1); + &aesdec ($inout5,$rndkey1); + + &mov ($out,&DWP($out_off,"esp")); + &mov ($len,&DWP($end_off,"esp")); + &call ("_aesni_decrypt6_enter"); + + &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &movdqa ($rndkey1,&QWP($checksum,"esp")); + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + &pxor ($inout5,$rndkey0); + + &pxor ($rndkey1,$inout0); # checksum + &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output + &pxor ($rndkey1,$inout1); + &movdqu (&QWP(-16*5,$out,$inp),$inout1); + &pxor ($rndkey1,$inout2); + &movdqu (&QWP(-16*4,$out,$inp),$inout2); + &pxor ($rndkey1,$inout3); + &movdqu (&QWP(-16*3,$out,$inp),$inout3); + &pxor ($rndkey1,$inout4); + &movdqu (&QWP(-16*2,$out,$inp),$inout4); + &pxor ($rndkey1,$inout5); + &movdqu (&QWP(-16*1,$out,$inp),$inout5); + &cmp ($inp,$len); # done yet? 
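	# [annotation, not part of the diff] as in the encrypt path above,
	# the decrypt grandloop consumes six blocks per iteration; $len here
	# holds the "end of input - 16*6" bound computed during setup, so
	# once fewer than six blocks remain this comparison falls through to
	# the "short" tail code instead of looping.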
+ &jb (&label("grandloop")); + +&set_label("short"); + &add ($len,16*6); + &sub ($len,$inp); + &jz (&label("done")); + + &cmp ($len,16*2); + &jb (&label("one")); + &je (&label("two")); + + &cmp ($len,16*4); + &jb (&label("three")); + &je (&label("four")); + + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &shl ($i1,4); + &shl ($i3,4); + &movdqu ($inout0,&QWP(0,$l_)); + &movdqu ($inout1,&QWP(0,$l_,$i1)); + &mov ($rounds,&DWP($rounds_off,"esp")); + &movdqa ($inout2,$inout0); + &movdqu ($inout3,&QWP(0,$l_,$i3)); + &movdqa ($inout4,$inout0); + + &pxor ($inout0,$rndkey0); # ^ last offset_i + &pxor ($inout1,$inout0); + &movdqa (&QWP(16*0,"esp"),$inout0); + &pxor ($inout2,$inout1); + &movdqa (&QWP(16*1,"esp"),$inout1); + &pxor ($inout3,$inout2); + &movdqa (&QWP(16*2,"esp"),$inout2); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*3,"esp"),$inout3); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*4,"esp"),$inout4); + + &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout5,$inout5); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &pxor ($inout0,$rndkey0); # ^ roundkey[0] + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &pxor ($inout4,$rndkey0); + + &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,&QWP(16*4,"esp")); + + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &aesdec ($inout0,$rndkey1); + &aesdec ($inout1,$rndkey1); + &aesdec ($inout2,$rndkey1); + &aesdec ($inout3,$rndkey1); + &aesdec ($inout4,$rndkey1); + &aesdec ($inout5,$rndkey1); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_decrypt6_enter"); + + &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &movdqa ($rndkey1,&QWP($checksum,"esp")); + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,&QWP(16*2,"esp")); + &pxor ($inout3,&QWP(16*3,"esp")); + &pxor ($inout4,$rndkey0); + + &pxor ($rndkey1,$inout0); # checksum + &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output + &pxor ($rndkey1,$inout1); + &movdqu (&QWP(16*1,$out,$inp),$inout1); + &pxor ($rndkey1,$inout2); + &movdqu (&QWP(16*2,$out,$inp),$inout2); + &pxor ($rndkey1,$inout3); + &movdqu (&QWP(16*3,$out,$inp),$inout3); + &pxor ($rndkey1,$inout4); + &movdqu (&QWP(16*4,$out,$inp),$inout4); + + &jmp (&label("done")); + +&set_label("one",16); + &movdqu ($inout5,&QWP(0,$l_)); + &mov ($key,&DWP($key_off,"esp")); # restore key + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &mov ($rounds,&DWP(240,$key)); + + &pxor ($inout5,$rndkey0); # ^ last offset_i + &pxor ($inout0,$inout5); # ^ offset_i + + &movdqa ($inout4,$rndkey1); + &mov ($out,&DWP($out_off,"esp")); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + + &xorps ($inout0,$inout5); # ^ offset_i + &movaps ($rndkey1,$inout4); # pass the checksum + &movdqa ($rndkey0,$inout5); # pass last offset_i + &xorps ($rndkey1,$inout0); # checksum + &movups (&QWP(0,$out,$inp),$inout0); + + &jmp (&label("done")); + +&set_label("two",16); + &lea ($i1,&DWP(1,$block)); + &mov ($key,&DWP($key_off,"esp")); # restore key + &bsf ($i1,$i1); + &shl ($i1,4); + &movdqu ($inout4,&QWP(0,$l_)); + &movdqu 
($inout5,&QWP(0,$l_,$i1)); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &movdqa ($inout3,$rndkey1); + &pxor ($inout4,$rndkey0); # ^ last offset_i + &pxor ($inout5,$inout4); + + &pxor ($inout0,$inout4); # ^ offset_i + &pxor ($inout1,$inout5); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_decrypt2"); + + &xorps ($inout0,$inout4); # ^ offset_i + &xorps ($inout1,$inout5); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &xorps ($inout3,$inout0); # checksum + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &xorps ($inout3,$inout1); + &movups (&QWP(16*1,$out,$inp),$inout1); + &movaps ($rndkey1,$inout3); # pass the checksum + + &jmp (&label("done")); + +&set_label("three",16); + &lea ($i1,&DWP(1,$block)); + &mov ($key,&DWP($key_off,"esp")); # restore key + &bsf ($i1,$i1); + &shl ($i1,4); + &movdqu ($inout3,&QWP(0,$l_)); + &movdqu ($inout4,&QWP(0,$l_,$i1)); + &movdqa ($inout5,$inout3); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &pxor ($inout3,$rndkey0); # ^ last offset_i + &pxor ($inout4,$inout3); + &pxor ($inout5,$inout4); + + &pxor ($inout0,$inout3); # ^ offset_i + &pxor ($inout1,$inout4); + &pxor ($inout2,$inout5); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_decrypt3"); + + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + &xorps ($inout0,$inout3); # ^ offset_i + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &pxor ($rndkey1,$inout0); # checksum + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movups (&QWP(16*1,$out,$inp),$inout1); + &pxor ($rndkey1,$inout1); + &movups (&QWP(16*2,$out,$inp),$inout2); + &pxor ($rndkey1,$inout2); + + &jmp (&label("done")); + +&set_label("four",16); + &lea ($i1,&DWP(1,$block)); + &lea ($i3,&DWP(3,$block)); + &bsf ($i1,$i1); + &bsf ($i3,$i3); + &mov ($key,&DWP($key_off,"esp")); # restore key + &shl ($i1,4); + &shl ($i3,4); + &movdqu ($inout2,&QWP(0,$l_)); + &movdqu ($inout3,&QWP(0,$l_,$i1)); + &movdqa ($inout4,$inout2); + &movdqu ($inout5,&QWP(0,$l_,$i3)); + + &pxor ($inout2,$rndkey0); # ^ last offset_i + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &pxor ($inout3,$inout2); + &movdqu ($inout1,&QWP(16*1,$inp)); + &pxor ($inout4,$inout3); + &movdqa (&QWP(16*0,"esp"),$inout2); + &pxor ($inout5,$inout4); + &movdqa (&QWP(16*1,"esp"),$inout3); + &movdqu ($inout2,&QWP(16*2,$inp)); + &movdqu ($inout3,&QWP(16*3,$inp)); + &mov ($rounds,&DWP(240,$key)); + + &movdqa (&QWP($checksum,"esp"),$rndkey1); + &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &pxor ($inout1,&QWP(16*1,"esp")); + &pxor ($inout2,$inout4); + &pxor ($inout3,$inout5); + + &mov ($out,&DWP($out_off,"esp")); + &call ("_aesni_decrypt4"); + + &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum + &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout4); + &movups (&QWP(16*0,$out,$inp),$inout0); # store output + &pxor ($rndkey1,$inout0); # checksum + &xorps ($inout3,$inout5); + &movups (&QWP(16*1,$out,$inp),$inout1); + &pxor ($rndkey1,$inout1); + &movdqa ($rndkey0,$inout5); # pass last offset_i + &movups (&QWP(16*2,$out,$inp),$inout2); + &pxor ($rndkey1,$inout2); + &movups (&QWP(16*3,$out,$inp),$inout3); + &pxor ($rndkey1,$inout3); + +&set_label("done"); + &mov ($key,&DWP($esp_off,"esp")); + 
&pxor ($inout0,$inout0); # clear register bank + &pxor ($inout1,$inout1); + &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack + &pxor ($inout2,$inout2); + &movdqa (&QWP(16*1,"esp"),$inout0); + &pxor ($inout3,$inout3); + &movdqa (&QWP(16*2,"esp"),$inout0); + &pxor ($inout4,$inout4); + &movdqa (&QWP(16*3,"esp"),$inout0); + &pxor ($inout5,$inout5); + &movdqa (&QWP(16*4,"esp"),$inout0); + &movdqa (&QWP(16*5,"esp"),$inout0); + &movdqa (&QWP(16*6,"esp"),$inout0); + + &lea ("esp",&DWP(0,$key)); + &mov ($rounds,&wparam(5)); # &offset_i + &mov ($rounds_,&wparam(7)); # &checksum + &movdqu (&QWP(0,$rounds),$rndkey0); + &pxor ($rndkey0,$rndkey0); + &movdqu (&QWP(0,$rounds_),$rndkey1); + &pxor ($rndkey1,$rndkey1); +&function_end("aesni_ocb_decrypt"); +} } ###################################################################### @@ -2419,7 +3305,7 @@ if ($PREFIX eq "aesni") { &pxor ("xmm3","xmm3"); &aesenclast ("xmm2","xmm3"); - &movdqa ("xmm3","xmm1") + &movdqa ("xmm3","xmm1"); &pslldq ("xmm1",4); &pxor ("xmm3","xmm1"); &pslldq ("xmm1",4); @@ -2523,3 +3409,5 @@ if ($PREFIX eq "aesni") { &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/aes/asm/aesni-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/aesni-x86_64.pl index 25ca574f6a..98ca17991d 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aesni-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesni-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -157,16 +164,23 @@ # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like # in CTR mode AES instruction interleave factor was chosen to be 6x. +# November 2015 +# +# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was +# chosen to be 6x. + ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # -# CBC en-/decrypt CTR XTS ECB +# CBC en-/decrypt CTR XTS ECB OCB # Westmere 3.77/1.25 1.25 1.25 1.26 -# * Bridge 5.07/0.74 0.75 0.90 0.85 -# Haswell 4.44/0.63 0.63 0.73 0.63 -# Silvermont 5.75/3.54 3.56 4.12 3.87(*) -# Bulldozer 5.77/0.70 0.72 0.90 0.70 +# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 +# Haswell 4.44/0.63 0.63 0.73 0.63 0.70 +# Skylake 2.62/0.63 0.63 0.63 0.63 +# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 +# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 +# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 # # (*) Atom Silvermont ECB result is suboptimal because of penalties # incurred by operations on %xmm8-15. As ECB is not considered @@ -187,7 +201,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $movkey = $PREFIX eq "aesni" ? 
"movups" : "movups"; @@ -2708,6 +2722,925 @@ $code.=<<___; ret .size aesni_xts_decrypt,.-aesni_xts_decrypt ___ +} + +###################################################################### +# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, +# const AES_KEY *key, unsigned int start_block_num, +# unsigned char offset_i[16], const unsigned char L_[][16], +# unsigned char checksum[16]); +# +{ +my @offset=map("%xmm$_",(10..15)); +my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); +my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments +my ($L_p,$checksum_p) = ("%rbx","%rbp"); +my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); +my $seventh_arg = $win64 ? 56 : 8; +my $blocks = $len; + +$code.=<<___; +.globl aesni_ocb_encrypt +.type aesni_ocb_encrypt,\@function,6 +.align 32 +aesni_ocb_encrypt: + lea (%rsp),%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp),%rsp + movaps %xmm6,0x00(%rsp) # offload everything + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,0x70(%rsp) + movaps %xmm14,0x80(%rsp) + movaps %xmm15,0x90(%rsp) +.Locb_enc_body: +___ +$code.=<<___; + mov $seventh_arg(%rax),$L_p # 7th argument + mov $seventh_arg+8(%rax),$checksum_p# 8th argument + + mov 240($key),$rnds_ + mov $key,$key_ + shl \$4,$rnds_ + $movkey ($key),$rndkey0l # round[0] + $movkey 16($key,$rnds_),$rndkey1 # round[last] + + movdqu ($offset_p),@offset[5] # load last offset_i + pxor $rndkey1,$rndkey0l # round[0] ^ round[last] + pxor $rndkey1,@offset[5] # offset_i ^ round[last] + + mov \$16+32,$rounds + lea 32($key_,$rnds_),$key + $movkey 16($key_),$rndkey1 # round[1] + sub %r10,%rax # twisted $rounds + mov %rax,%r10 # backup twisted $rounds + + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + movdqu ($checksum_p),$checksum # load checksum + + test \$1,$block_num # is first block number odd? 
+ jnz .Locb_enc_odd + + bsf $block_num,$i1 + add \$1,$block_num + shl \$4,$i1 + movdqu ($L_p,$i1),$inout5 # borrow + movdqu ($inp),$inout0 + lea 16($inp),$inp + + call __ocb_encrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,($out) + lea 16($out),$out + sub \$1,$blocks + jz .Locb_enc_done + +.Locb_enc_odd: + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + lea 6($block_num),$block_num + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + shl \$4,$i5 + + sub \$6,$blocks + jc .Locb_enc_short + jmp .Locb_enc_grandloop + +.align 32 +.Locb_enc_grandloop: + movdqu `16*0`($inp),$inout0 # load input + movdqu `16*1`($inp),$inout1 + movdqu `16*2`($inp),$inout2 + movdqu `16*3`($inp),$inout3 + movdqu `16*4`($inp),$inout4 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + + call __ocb_encrypt6 + + movups $inout0,`16*0`($out) # store output + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$6,$blocks + jnc .Locb_enc_grandloop + +.Locb_enc_short: + add \$6,$blocks + jz .Locb_enc_done + + movdqu `16*0`($inp),$inout0 + cmp \$2,$blocks + jb .Locb_enc_one + movdqu `16*1`($inp),$inout1 + je .Locb_enc_two + + movdqu `16*2`($inp),$inout2 + cmp \$4,$blocks + jb .Locb_enc_three + movdqu `16*3`($inp),$inout3 + je .Locb_enc_four + + movdqu `16*4`($inp),$inout4 + pxor $inout5,$inout5 + + call __ocb_encrypt6 + + movdqa @offset[4],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + movups $inout4,`16*4`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_one: + movdqa @offset[0],$inout5 # borrow + + call __ocb_encrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,`16*0`($out) + jmp .Locb_enc_done + +.align 16 +.Locb_enc_two: + pxor $inout2,$inout2 + pxor $inout3,$inout3 + + call __ocb_encrypt4 + + movdqa @offset[1],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_three: + pxor $inout3,$inout3 + + call __ocb_encrypt4 + + movdqa @offset[2],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_four: + call __ocb_encrypt4 + + movdqa @offset[3],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + +.Locb_enc_done: + pxor $rndkey0,@offset[5] # "remove" round[last] + movdqu $checksum,($checksum_p) # store checksum + movdqu @offset[5],($offset_p) # store last offset_i + + xorps %xmm0,%xmm0 # clear register bank + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 +___ +$code.=<<___ if ($win64); + movaps 0x00(%rsp),%xmm6 + movaps %xmm0,0x00(%rsp) # clear stack + movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) + movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) + movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) + movaps 0x40(%rsp),%xmm10 + movaps %xmm0,0x40(%rsp) + movaps 0x50(%rsp),%xmm11 + movaps %xmm0,0x50(%rsp) + movaps 0x60(%rsp),%xmm12 + movaps %xmm0,0x60(%rsp) + movaps 
0x70(%rsp),%xmm13 + movaps %xmm0,0x70(%rsp) + movaps 0x80(%rsp),%xmm14 + movaps %xmm0,0x80(%rsp) + movaps 0x90(%rsp),%xmm15 + movaps %xmm0,0x90(%rsp) + lea 0xa0+0x28(%rsp),%rax +.Locb_enc_pop: + lea 0xa0(%rsp),%rsp +___ +$code.=<<___; + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx +.Locb_enc_epilogue: + ret +.size aesni_ocb_encrypt,.-aesni_ocb_encrypt + +.type __ocb_encrypt6,\@abi-omnipotent +.align 32 +__ocb_encrypt6: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + movdqa @offset[0],@offset[4] + pxor @offset[5],@offset[0] + movdqu ($L_p,$i5),@offset[5] + pxor @offset[0],@offset[1] + pxor $inout0,$checksum # accumulate checksum + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor $inout1,$checksum + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor $inout2,$checksum + pxor @offset[2],$inout2 + pxor @offset[3],@offset[4] + pxor $inout3,$checksum + pxor @offset[3],$inout3 + pxor @offset[4],@offset[5] + pxor $inout4,$checksum + pxor @offset[4],$inout4 + pxor $inout5,$checksum + pxor @offset[5],$inout5 + $movkey 32($key_),$rndkey0 + + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + add \$6,$block_num + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + aesenc $rndkey1,$inout4 + pxor $rndkey0l,@offset[3] + pxor $rndkey0l,@offset[4] + aesenc $rndkey1,$inout5 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,@offset[5] + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + jmp .Locb_enc_loop6 + +.align 32 +.Locb_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop6 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + shl \$4,$i5 + + aesenclast @offset[0],$inout0 + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + mov %r10,%rax # restore twisted rounds + aesenclast @offset[1],$inout1 + aesenclast @offset[2],$inout2 + aesenclast @offset[3],$inout3 + aesenclast @offset[4],$inout4 + aesenclast @offset[5],$inout5 + ret +.size __ocb_encrypt6,.-__ocb_encrypt6 + +.type __ocb_encrypt4,\@abi-omnipotent +.align 32 +__ocb_encrypt4: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + pxor @offset[5],@offset[0] + pxor @offset[0],@offset[1] + pxor $inout0,$checksum # accumulate checksum + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor $inout1,$checksum + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor $inout2,$checksum + pxor @offset[2],$inout2 + pxor 
$inout3,$checksum + pxor @offset[3],$inout3 + $movkey 32($key_),$rndkey0 + + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + pxor $rndkey0l,@offset[3] + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey 48($key_),$rndkey1 + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + $movkey 64($key_),$rndkey0 + jmp .Locb_enc_loop4 + +.align 32 +.Locb_enc_loop4: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop4 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey 16($key_),$rndkey1 + mov %r10,%rax # restore twisted rounds + + aesenclast @offset[0],$inout0 + aesenclast @offset[1],$inout1 + aesenclast @offset[2],$inout2 + aesenclast @offset[3],$inout3 + ret +.size __ocb_encrypt4,.-__ocb_encrypt4 + +.type __ocb_encrypt1,\@abi-omnipotent +.align 32 +__ocb_encrypt1: + pxor @offset[5],$inout5 # offset_i + pxor $rndkey0l,$inout5 # offset_i ^ round[0] + pxor $inout0,$checksum # accumulate checksum + pxor $inout5,$inout0 # input ^ round[0] ^ offset_i + $movkey 32($key_),$rndkey0 + + aesenc $rndkey1,$inout0 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,$inout5 # offset_i ^ round[last] + + aesenc $rndkey0,$inout0 + $movkey 64($key_),$rndkey0 + jmp .Locb_enc_loop1 + +.align 32 +.Locb_enc_loop1: + aesenc $rndkey1,$inout0 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop1 + + aesenc $rndkey1,$inout0 + $movkey 16($key_),$rndkey1 # redundant in tail + mov %r10,%rax # restore twisted rounds + + aesenclast $inout5,$inout0 + ret +.size __ocb_encrypt1,.-__ocb_encrypt1 + +.globl aesni_ocb_decrypt +.type aesni_ocb_decrypt,\@function,6 +.align 32 +aesni_ocb_decrypt: + lea (%rsp),%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp),%rsp + movaps %xmm6,0x00(%rsp) # offload everything + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,0x70(%rsp) + movaps %xmm14,0x80(%rsp) + movaps %xmm15,0x90(%rsp) +.Locb_dec_body: +___ +$code.=<<___; + mov $seventh_arg(%rax),$L_p # 7th argument + mov $seventh_arg+8(%rax),$checksum_p# 8th argument + + mov 240($key),$rnds_ + mov $key,$key_ + shl \$4,$rnds_ + $movkey ($key),$rndkey0l # round[0] + $movkey 16($key,$rnds_),$rndkey1 # round[last] + + movdqu ($offset_p),@offset[5] # load last offset_i + pxor $rndkey1,$rndkey0l # round[0] ^ round[last] + pxor $rndkey1,@offset[5] # offset_i ^ round[last] + + mov \$16+32,$rounds + lea 32($key_,$rnds_),$key + $movkey 16($key_),$rndkey1 # round[1] + sub %r10,%rax # twisted $rounds + mov %rax,%r10 # backup twisted $rounds + + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + movdqu ($checksum_p),$checksum # load checksum + + test \$1,$block_num # is first block number odd? 
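+# Note that OCB's checksum is defined over the plaintext in both directions:
+# the decrypt paths below therefore fold each block into $checksum only after
+# decryption, whereas __ocb_encrypt6 accumulates the input before the cipher
+# rounds.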
+ jnz .Locb_dec_odd + + bsf $block_num,$i1 + add \$1,$block_num + shl \$4,$i1 + movdqu ($L_p,$i1),$inout5 # borrow + movdqu ($inp),$inout0 + lea 16($inp),$inp + + call __ocb_decrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,($out) + xorps $inout0,$checksum # accumulate checksum + lea 16($out),$out + sub \$1,$blocks + jz .Locb_dec_done + +.Locb_dec_odd: + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + lea 6($block_num),$block_num + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + shl \$4,$i5 + + sub \$6,$blocks + jc .Locb_dec_short + jmp .Locb_dec_grandloop + +.align 32 +.Locb_dec_grandloop: + movdqu `16*0`($inp),$inout0 # load input + movdqu `16*1`($inp),$inout1 + movdqu `16*2`($inp),$inout2 + movdqu `16*3`($inp),$inout3 + movdqu `16*4`($inp),$inout4 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + + call __ocb_decrypt6 + + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + movups $inout4,`16*4`($out) + pxor $inout4,$checksum + movups $inout5,`16*5`($out) + pxor $inout5,$checksum + lea `16*6`($out),$out + sub \$6,$blocks + jnc .Locb_dec_grandloop + +.Locb_dec_short: + add \$6,$blocks + jz .Locb_dec_done + + movdqu `16*0`($inp),$inout0 + cmp \$2,$blocks + jb .Locb_dec_one + movdqu `16*1`($inp),$inout1 + je .Locb_dec_two + + movdqu `16*2`($inp),$inout2 + cmp \$4,$blocks + jb .Locb_dec_three + movdqu `16*3`($inp),$inout3 + je .Locb_dec_four + + movdqu `16*4`($inp),$inout4 + pxor $inout5,$inout5 + + call __ocb_decrypt6 + + movdqa @offset[4],@offset[5] + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + movups $inout4,`16*4`($out) + pxor $inout4,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_one: + movdqa @offset[0],$inout5 # borrow + + call __ocb_decrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + jmp .Locb_dec_done + +.align 16 +.Locb_dec_two: + pxor $inout2,$inout2 + pxor $inout3,$inout3 + + call __ocb_decrypt4 + + movdqa @offset[1],@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + xorps $inout1,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_three: + pxor $inout3,$inout3 + + call __ocb_decrypt4 + + movdqa @offset[2],@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + xorps $inout1,$checksum + movups $inout2,`16*2`($out) + xorps $inout2,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_four: + call __ocb_decrypt4 + + movdqa @offset[3],@offset[5] + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + +.Locb_dec_done: + pxor $rndkey0,@offset[5] # "remove" round[last] + movdqu $checksum,($checksum_p) # store checksum + movdqu @offset[5],($offset_p) # store last offset_i + + xorps %xmm0,%xmm0 # 
clear register bank + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 +___ +$code.=<<___ if ($win64); + movaps 0x00(%rsp),%xmm6 + movaps %xmm0,0x00(%rsp) # clear stack + movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) + movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) + movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) + movaps 0x40(%rsp),%xmm10 + movaps %xmm0,0x40(%rsp) + movaps 0x50(%rsp),%xmm11 + movaps %xmm0,0x50(%rsp) + movaps 0x60(%rsp),%xmm12 + movaps %xmm0,0x60(%rsp) + movaps 0x70(%rsp),%xmm13 + movaps %xmm0,0x70(%rsp) + movaps 0x80(%rsp),%xmm14 + movaps %xmm0,0x80(%rsp) + movaps 0x90(%rsp),%xmm15 + movaps %xmm0,0x90(%rsp) + lea 0xa0+0x28(%rsp),%rax +.Locb_dec_pop: + lea 0xa0(%rsp),%rsp +___ +$code.=<<___; + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx +.Locb_dec_epilogue: + ret +.size aesni_ocb_decrypt,.-aesni_ocb_decrypt + +.type __ocb_decrypt6,\@abi-omnipotent +.align 32 +__ocb_decrypt6: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + movdqa @offset[0],@offset[4] + pxor @offset[5],@offset[0] + movdqu ($L_p,$i5),@offset[5] + pxor @offset[0],@offset[1] + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor @offset[2],$inout2 + pxor @offset[3],@offset[4] + pxor @offset[3],$inout3 + pxor @offset[4],@offset[5] + pxor @offset[4],$inout4 + pxor @offset[5],$inout5 + $movkey 32($key_),$rndkey0 + + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + add \$6,$block_num + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + aesdec $rndkey1,$inout4 + pxor $rndkey0l,@offset[3] + pxor $rndkey0l,@offset[4] + aesdec $rndkey1,$inout5 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,@offset[5] + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + jmp .Locb_dec_loop6 + +.align 32 +.Locb_dec_loop6: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop6 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + shl \$4,$i5 + + aesdeclast @offset[0],$inout0 + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + mov %r10,%rax # restore twisted rounds + aesdeclast @offset[1],$inout1 + aesdeclast @offset[2],$inout2 + aesdeclast @offset[3],$inout3 + aesdeclast @offset[4],$inout4 + aesdeclast 
@offset[5],$inout5 + ret +.size __ocb_decrypt6,.-__ocb_decrypt6 + +.type __ocb_decrypt4,\@abi-omnipotent +.align 32 +__ocb_decrypt4: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + pxor @offset[5],@offset[0] + pxor @offset[0],@offset[1] + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor @offset[2],$inout2 + pxor @offset[3],$inout3 + $movkey 32($key_),$rndkey0 + + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + pxor $rndkey0l,@offset[3] + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey 48($key_),$rndkey1 + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + $movkey 64($key_),$rndkey0 + jmp .Locb_dec_loop4 + +.align 32 +.Locb_dec_loop4: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop4 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey 16($key_),$rndkey1 + mov %r10,%rax # restore twisted rounds + + aesdeclast @offset[0],$inout0 + aesdeclast @offset[1],$inout1 + aesdeclast @offset[2],$inout2 + aesdeclast @offset[3],$inout3 + ret +.size __ocb_decrypt4,.-__ocb_decrypt4 + +.type __ocb_decrypt1,\@abi-omnipotent +.align 32 +__ocb_decrypt1: + pxor @offset[5],$inout5 # offset_i + pxor $rndkey0l,$inout5 # offset_i ^ round[0] + pxor $inout5,$inout0 # input ^ round[0] ^ offset_i + $movkey 32($key_),$rndkey0 + + aesdec $rndkey1,$inout0 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,$inout5 # offset_i ^ round[last] + + aesdec $rndkey0,$inout0 + $movkey 64($key_),$rndkey0 + jmp .Locb_dec_loop1 + +.align 32 +.Locb_dec_loop1: + aesdec $rndkey1,$inout0 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop1 + + aesdec $rndkey1,$inout0 + $movkey 16($key_),$rndkey1 # redundant in tail + mov %r10,%rax # restore twisted rounds + + aesdeclast $inout5,$inout0 + ret +.size __ocb_decrypt1,.-__ocb_decrypt1 +___ } }} ######################################################################## @@ -3307,7 +4240,7 @@ ___ # Vinodh Gopal <vinodh.gopal@intel.com> # Kahraman Akdemir # -# Agressively optimized in respect to aeskeygenassist's critical path +# Aggressively optimized in respect to aeskeygenassist's critical path # and is contained in %xmm0-5 to meet Win64 ABI requirement. 
# # int ${PREFIX}_set_encrypt_key(const unsigned char *inp, @@ -3819,6 +4752,65 @@ ctr_xts_se_handler: jmp .Lcommon_rbp_tail .size ctr_xts_se_handler,.-ctr_xts_se_handler + +.type ocb_se_handler,\@abi-omnipotent +.align 16 +ocb_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 + cmp %r10,%rbx # context->Rip>=pop label + jae .Locb_no_xmm + + mov 152($context),%rax # pull context->Rsp + + lea (%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xa0+0x28(%rax),%rax + +.Locb_no_xmm: + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + + jmp .Lcommon_seh_tail +.size ocb_se_handler,.-ocb_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent @@ -3932,6 +4924,14 @@ $code.=<<___ if ($PREFIX eq "aesni"); .rva .LSEH_begin_aesni_xts_decrypt .rva .LSEH_end_aesni_xts_decrypt .rva .LSEH_info_xts_dec + + .rva .LSEH_begin_aesni_ocb_encrypt + .rva .LSEH_end_aesni_ocb_encrypt + .rva .LSEH_info_ocb_enc + + .rva .LSEH_begin_aesni_ocb_decrypt + .rva .LSEH_end_aesni_ocb_decrypt + .rva .LSEH_info_ocb_dec ___ $code.=<<___; .rva .LSEH_begin_${PREFIX}_cbc_encrypt @@ -3973,6 +4973,18 @@ $code.=<<___ if ($PREFIX eq "aesni"); .byte 9,0,0,0 .rva ctr_xts_se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] +.LSEH_info_ocb_enc: + .byte 9,0,0,0 + .rva ocb_se_handler + .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[] + .rva .Locb_enc_pop + .long 0 +.LSEH_info_ocb_dec: + .byte 9,0,0,0 + .rva ocb_se_handler + .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[] + .rva .Locb_dec_pop + .long 0 ___ $code.=<<___; .LSEH_info_cbc: diff --git a/deps/openssl/openssl/crypto/aes/asm/aesp8-ppc.pl b/deps/openssl/openssl/crypto/aes/asm/aesp8-ppc.pl index a1891cc03c..b7e92f6538 100755 --- a/deps/openssl/openssl/crypto/aes/asm/aesp8-ppc.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesp8-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -20,6 +27,19 @@ # instructions are interleaved. It's reckoned that eventual # misalignment penalties at page boundaries are on average lower # than additional overhead in pure AltiVec approach.
+# +# May 2016 +# +# Add XTS subroutine; a 9x improvement was measured on little-endian and +# 12x on big-endian systems. +# +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS +# POWER8[le] 3.96/0.72 0.74 1.1 +# POWER8[be] 3.75/0.65 0.66 1.0 $flavour = shift; @@ -1887,6 +1907,1849 @@ Lctr32_enc8x_done: ___ }} }}} +######################################################################### +{{{ # XTS procedures # +# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, # +# const AES_KEY *key1, const AES_KEY *key2, # +# [const] unsigned char iv[16]); # +# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which # +# input tweak value is assumed to be encrypted already, and last tweak # +# value, one suitable for consecutive call on same chunk of data, is # +# written back to original buffer. In addition, in "tweak chaining" # +# mode only complete input blocks are processed. # + +my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2)); +my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7)); +my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12)); +my $taillen = $key2; + ($inp,$idx) = ($idx,$inp); # reassign +$code.=<<___; +.globl .${prefix}_xts_encrypt +.align 5 +.${prefix}_xts_encrypt: + mr $inp,r3 # reassign + li r3,-1 + ${UCMP}i $len,16 + bltlr- + + lis r0,0xfff0 + mfspr r12,256 # save vrsave + li r11,0 + mtspr 256,r0 + + vspltisb $seven,0x07 # 0x070707..07 + le?lvsl $leperm,r11,r11 + le?vspltisb $tmp,0x0f + le?vxor $leperm,$leperm,$seven + + li $idx,15 + lvx $tweak,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $tweak,$tweak,$inptail,$inpperm + + neg r11,$inp + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inout,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ${UCMP}i $key2,0 # key2==NULL? + beq Lxts_enc_no_key2 + + ?lvsl $keyperm,0,$key2 # prepare for unaligned key + lwz $rounds,240($key2) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + lvx $rndkey0,0,$key2 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + mtctr $rounds + +Ltweak_xts_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + bdnz Ltweak_xts_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $tweak,$tweak,$rndkey0 + + li $ivp,0 # don't chain the tweak + b Lxts_enc + +Lxts_enc_no_key2: + li $idx,-16 + and $len,$len,$idx # in "tweak chaining" + # mode only complete + # blocks are processed +Lxts_enc: + lvx $inptail,0,$inp + addi $inp,$inp,16 + + ?lvsl $keyperm,0,$key1 # prepare for unaligned key + lwz $rounds,240($key1) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + vslb $eighty7,$seven,$seven # 0x808080..80 + vor $eighty7,$eighty7,$seven # 0x878787..87 + vspltisb $tmp,1 # 0x010101..01 + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 + + ${UCMP}i $len,96 + bge _aesp8_xts_encrypt6x + + andi.
$taillen,$len,15 + subic r0,$len,32 + subi $taillen,$taillen,16 + subfe r0,r0,r0 + and r0,r0,$taillen + add $inp,$inp,r0 + + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + mtctr $rounds + b Loop_xts_enc + +.align 5 +Loop_xts_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak + vcipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + addi $out,$out,16 + + subic. $len,$len,16 + beq Lxts_enc_done + + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + + subic r0,$len,32 + subfe r0,r0,r0 + and r0,r0,$taillen + add $inp,$inp,r0 + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $output,$output,$rndkey0 # just in case $len<16 + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + mtctr $rounds + ${UCMP}i $len,16 + bge Loop_xts_enc + + vxor $output,$output,$tweak + lvsr $inpperm,0,$len # $inpperm is no longer needed + vxor $inptail,$inptail,$inptail # $inptail is no longer needed + vspltisb $tmp,-1 + vperm $inptail,$inptail,$tmp,$inpperm + vsel $inout,$inout,$output,$inptail + + subi r11,$out,17 + subi $out,$out,16 + mtctr $len + li $len,16 +Loop_xts_enc_steal: + lbzu r0,1(r11) + stb r0,16(r11) + bdnz Loop_xts_enc_steal + + mtctr $rounds + b Loop_xts_enc # one more time... + +Lxts_enc_done: + ${UCMP}i $ivp,0 + beq Lxts_enc_ret + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_enc_ret: + mtspr 256,r12 # restore vrsave + li r3,0 + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt + +.globl .${prefix}_xts_decrypt +.align 5 +.${prefix}_xts_decrypt: + mr $inp,r3 # reassign + li r3,-1 + ${UCMP}i $len,16 + bltlr- + + lis r0,0xfff8 + mfspr r12,256 # save vrsave + li r11,0 + mtspr 256,r0 + + andi. r0,$len,15 + neg r0,r0 + andi. r0,r0,16 + sub $len,$len,r0 + + vspltisb $seven,0x07 # 0x070707..07 + le?lvsl $leperm,r11,r11 + le?vspltisb $tmp,0x0f + le?vxor $leperm,$leperm,$seven + + li $idx,15 + lvx $tweak,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $tweak,$tweak,$inptail,$inpperm + + neg r11,$inp + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inout,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ${UCMP}i $key2,0 # key2==NULL? 
+ beq Lxts_dec_no_key2 + + ?lvsl $keyperm,0,$key2 # prepare for unaligned key + lwz $rounds,240($key2) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + lvx $rndkey0,0,$key2 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + mtctr $rounds + +Ltweak_xts_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + bdnz Ltweak_xts_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $tweak,$tweak,$rndkey0 + + li $ivp,0 # don't chain the tweak + b Lxts_dec + +Lxts_dec_no_key2: + neg $idx,$len + andi. $idx,$idx,15 + add $len,$len,$idx # in "tweak chaining" + # mode only complete + # blocks are processed +Lxts_dec: + lvx $inptail,0,$inp + addi $inp,$inp,16 + + ?lvsl $keyperm,0,$key1 # prepare for unaligned key + lwz $rounds,240($key1) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + vslb $eighty7,$seven,$seven # 0x808080..80 + vor $eighty7,$eighty7,$seven # 0x878787..87 + vspltisb $tmp,1 # 0x010101..01 + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 + + ${UCMP}i $len,96 + bge _aesp8_xts_decrypt6x + + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + mtctr $rounds + + ${UCMP}i $len,16 + blt Ltail_xts_dec + be?b Loop_xts_dec + +.align 5 +Loop_xts_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak + vncipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + addi $out,$out,16 + + subic. 
$len,$len,16 + beq Lxts_dec_done + + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + mtctr $rounds + ${UCMP}i $len,16 + bge Loop_xts_dec + +Ltail_xts_dec: + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak1,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak1,$tweak1,$tmp + + subi $inp,$inp,16 + add $inp,$inp,$len + + vxor $inout,$inout,$tweak # :-( + vxor $inout,$inout,$tweak1 # :-) + +Loop_xts_dec_short: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_dec_short + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak1 + vncipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + + vmr $inout,$inptail + lvx $inptail,0,$inp + #addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + + lvsr $inpperm,0,$len # $inpperm is no longer needed + vxor $inptail,$inptail,$inptail # $inptail is no longer needed + vspltisb $tmp,-1 + vperm $inptail,$inptail,$tmp,$inpperm + vsel $inout,$inout,$output,$inptail + + vxor $rndkey0,$rndkey0,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + subi r11,$out,1 + mtctr $len + li $len,16 +Loop_xts_dec_steal: + lbzu r0,1(r11) + stb r0,16(r11) + bdnz Loop_xts_dec_steal + + mtctr $rounds + b Loop_xts_dec # one more time... 
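+# The recurring vsrab/vaddubm/vsldoi/vand/vxor group commented "next tweak
+# value" multiplies the tweak by x in GF(2^128) with the XTS reduction
+# polynomial x^128+x^7+x^2+x+1, hence the 0x870101..01 constant in $eighty7.
+# A minimal Perl sketch on two 64-bit halves; 'xts_double', '$hi' and '$lo'
+# are illustrative names, not part of the generated code, and the masks
+# assume 64-bit integers:
+#
+#	sub xts_double {
+#	    my ($hi, $lo) = @_;		# 128-bit tweak, $lo least significant
+#	    my $carry = ($hi >> 63) & 1;	# bit 127 decides whether to reduce
+#	    $hi = (($hi << 1) | (($lo >> 63) & 1)) & 0xffffffffffffffff;
+#	    $lo = (($lo << 1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
+#	    return ($hi, $lo);
+#	}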
+ +Lxts_dec_done: + ${UCMP}i $ivp,0 + beq Lxts_dec_ret + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_dec_ret: + mtspr 256,r12 # restore vrsave + li r3,0 + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt +___ +######################################################################### +{{ # Optimized XTS procedures # +my $key_=$key2; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); +my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); +my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($keyperm)=($out0); # aliases with "caller", redundant assignment +my $taillen=$x70; + +$code.=<<___; +.align 5 +_aesp8_xts_encrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r11 + li r7,`$FRAME+8*16+15` + li r3,`$FRAME+8*16+31` + $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r3,$sp + addi r3,r3,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r3,$sp + addi r3,r3,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r3,$sp + addi r3,r3,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r3,$sp + addi r3,r3,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r3,$sp + addi r3,r3,32 + stvx v30,r7,$sp + stvx v31,r3,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_xts_enc_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_enc_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor 
$twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_enc6x + +.align 5 +Loop_xts_enc6x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + + subfe. 
r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out0,$out0,v27 + vcipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vcipher $out4,$out4,v27 + vcipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vxor $tweak,$tweak,$tmp + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v28 + vcipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vcipher $out0,$out0,v29 + vcipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vcipher $out4,$out4,v29 + vcipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vcipher $out0,$out0,v30 + vcipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v30 + vcipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vcipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vcipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vcipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vcipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vcipherlast $tmp,$out5,$in5 # last block might be needed + # in stealing mode + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$tmp,$tmp,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + le?stvx_u $out5,$x50,$out + be?stvx_u $tmp, $x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr 
$rounds + beq Loop_xts_enc6x # did $len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_enc6x_zero + cmpwi $len,0x20 + blt Lxts_enc6x_one + nop + beq Lxts_enc6x_two + cmpwi $len,0x40 + blt Lxts_enc6x_three + nop + beq Lxts_enc6x_four + +Lxts_enc6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $tmp,$out4,$twk5 # last block prep for stealing + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $tmp,$out3,$twk4 # last block prep for stealing + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $tmp,$out2,$twk3 # last block prep for stealing + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vxor $tmp,$out1,$twk2 # last block prep for stealing + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_enc1x: + vcipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc1x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + + lvsr $inpperm,0,$taillen + vcipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vcipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vcipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vcipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vcipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vxor $tmp,$out0,$twk1 # last block prep for stealing + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_zero: + cmpwi 
$taillen,0 + beq Lxts_enc6x_done + + add $inp,$inp,$taillen + subi $inp,$inp,16 + lvx_u $in0,0,$inp + lvsr $inpperm,0,$taillen # $in5 is no more + le?vperm $in0,$in0,$in0,$leperm + vperm $in0,$in0,$in0,$inpperm + vxor $tmp,$tmp,$twk0 +Lxts_enc6x_steal: + vxor $in0,$in0,$twk0 + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember? + + subi r30,$out,17 + subi $out,$out,16 + mtctr $taillen +Loop_xts_enc6x_steal: + lbzu r0,1(r30) + stb r0,16(r30) + bdnz Loop_xts_enc6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_enc1x # one more time... + +.align 4 +Lxts_enc6x_done: + ${UCMP}i $ivp,0 + beq Lxts_enc6x_ret + + vxor $tweak,$twk0,$rndkey0 + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_enc6x_ret: + mtlr r11 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_enc5x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_enc5x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + vcipher $out0,$out0,v26 + lvsr $inpperm,0,$taillen # $in5 is no more + vcipher $out1,$out1,v26 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vcipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vcipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vcipher $out1,$out1,v27 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vcipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vcipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vcipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vcipher $out1,$out1,v29 + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vcipher $out4,$out4,v29 + lvx v25,$x10,$key_ # 
re-pre-load round[2] + vxor $in4,$twk4,v31 + + vcipher $out0,$out0,v30 + vperm $in0,$in0,$in0,$inpperm + vcipher $out1,$out1,v30 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vcipher $out4,$out4,v30 + + vcipherlast $out0,$out0,$twk0 + vcipherlast $out1,$out1,$in1 + vcipherlast $out2,$out2,$in2 + vcipherlast $out3,$out3,$in3 + vcipherlast $out4,$out4,$in4 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.align 5 +_aesp8_xts_decrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r11 + li r7,`$FRAME+8*16+15` + li r3,`$FRAME+8*16+31` + $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r3,$sp + addi r3,r3,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r3,$sp + addi r3,r3,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r3,$sp + addi r3,r3,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r3,$sp + addi r3,r3,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r3,$sp + addi r3,r3,32 + stvx v30,r7,$sp + stvx v31,r3,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_xts_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_dec_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. 
$taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_dec6x + +.align 5 +Loop_xts_dec6x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + + subfe. 
r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vxor $tweak,$tweak,$tmp + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v30 + vncipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vncipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vncipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vncipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vncipherlast $out5,$out5,$in5 + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$out5,$out5,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + stvx_u $out5,$x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr $rounds + beq Loop_xts_dec6x # did 
$len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_dec6x_zero + cmpwi $len,0x20 + blt Lxts_dec6x_one + nop + beq Lxts_dec6x_two + cmpwi $len,0x40 + blt Lxts_dec6x_three + nop + beq Lxts_dec6x_four + +Lxts_dec6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + vxor $twk1,$tweak,$rndkey0 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk1 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + vmr $twk1,$twk5 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk5 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + vmr $twk1,$twk4 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk4 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vmr $twk1,$twk3 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk3 + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_dec1x: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec1x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + + andi. 
r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + mtctr $rounds + vncipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vmr $twk1,$twk2 + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + vxor $out0,$in0,$twk2 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_zero: + cmpwi $taillen,0 + beq Lxts_dec6x_done + + lvx_u $in0,0,$inp + le?vperm $in0,$in0,$in0,$leperm + vxor $out0,$in0,$twk1 +Lxts_dec6x_steal: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Lxts_dec6x_steal + + add $inp,$inp,$taillen + vncipher $out0,$out0,v24 + + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v26 + + lvsr $inpperm,0,$taillen # $in5 is no more + vncipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk1,$twk1,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vncipherlast $tmp,$out0,$twk1 + + le?vperm $out0,$tmp,$tmp,$leperm + le?stvx_u $out0,0,$out + be?stvx_u $tmp,0,$out + + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 + vxor $out0,$out0,$twk0 + + subi r30,$out,1 + mtctr $taillen +Loop_xts_dec6x_steal: + lbzu r0,1(r30) + stb r0,16(r30) + bdnz Loop_xts_dec6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_dec1x # one more time... 
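The sequence annotated "# next tweak value" above (vsrab, vaddubm, vsldoi, then vand against $eighty7 and vxor) is a branch-free multiplication of the 128-bit tweak by x in GF(2^128) modulo x^128+x^7+x^2+x+1, the lbzu/stb copies labelled Loop_xts_*_steal perform the byte shuffle of ciphertext stealing, and the subfe./and/add triple adjusts $inp without a branch so the final pass of the 6x loop reloads the last full blocks. A minimal Perl model of the tweak update and of encrypt-side stealing, assuming IEEE P1619 byte order (illustrative only, not part of the generated module; the sub names are invented):

    #!/usr/bin/env perl
    use strict;
    use warnings;

    # T <- T*x in GF(2^128): the scalar equivalent of one
    # vsrab/vaddubm/vsldoi/vand/vxor "next tweak value" group.
    sub xts_next_tweak {
        my @t = unpack("C16", $_[0]);   # byte 0 is least significant
        my $carry = 0;
        for my $i (0 .. 15) {           # shift the 128-bit value left by 1
            my $b = $t[$i];
            $t[$i] = (($b << 1) | $carry) & 0xff;
            $carry = $b >> 7;
        }
        $t[0] ^= 0x87 if $carry;        # reduce by x^128+x^7+x^2+x+1
        return pack("C16", @t);
    }

    # Encrypt-side ciphertext stealing: the partial tail is padded with
    # bytes stolen from the last full ciphertext block (and encrypted once
    # more), while the stolen prefix is emitted as the short final block --
    # the job of the 17-byte-offset lbzu/stb loop above.
    sub xts_steal {
        my ($last_ct, $tail) = @_;      # 16-byte block, 1..15-byte tail
        my $n = length($tail);
        my $short_out = substr($last_ct, 0, $n);
        my $refill    = $tail . substr($last_ct, $n);
        return ($refill, $short_out);   # $refill still needs encrypting
    }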
+ +.align 4 +Lxts_dec6x_done: + ${UCMP}i $ivp,0 + beq Lxts_dec6x_ret + + vxor $tweak,$twk0,$rndkey0 + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_dec6x_ret: + mtlr r11 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_dec5x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_dec5x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + + andi. 
r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vncipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vncipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $in4,$twk4,v31 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vncipher $out4,$out4,v30 + + vncipherlast $out0,$out0,$twk0 + vncipherlast $out1,$out1,$in1 + vncipherlast $out2,$out2,$in2 + vncipherlast $out3,$out3,$in3 + vncipherlast $out4,$out4,$in4 + mtctr $rounds + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +___ +}} }}} + my $consts=1; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; diff --git a/deps/openssl/openssl/crypto/aes/asm/aest4-sparcv9.pl b/deps/openssl/openssl/crypto/aes/asm/aest4-sparcv9.pl index 536f23b47c..bf479c60ae 100644 --- a/deps/openssl/openssl/crypto/aes/asm/aest4-sparcv9.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aest4-sparcv9.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by David S. Miller <davem@devemloft.net> and Andy Polyakov @@ -68,7 +75,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "sparcv9_modes.pl"; -&asm_init(@ARGV); +$output = pop; +open STDOUT,">$output"; $::evp=1; # if $evp is set to 0, script generates module with # AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry @@ -83,12 +91,14 @@ $::evp=1; # if $evp is set to 0, script generates module with { my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5)); -$code.=<<___ if ($::abibits==64); +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch +#endif -___ -$code.=<<___; .text .globl aes_t4_encrypt diff --git a/deps/openssl/openssl/crypto/aes/asm/aesv8-armx.pl b/deps/openssl/openssl/crypto/aes/asm/aesv8-armx.pl index 95ebae3beb..1782d5b414 100755 --- a/deps/openssl/openssl/crypto/aes/asm/aesv8-armx.pl +++ b/deps/openssl/openssl/crypto/aes/asm/aesv8-armx.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -27,12 +34,21 @@ # Cortex-A53 1.32 1.29 1.46 # Cortex-A57(*) 1.95 0.85 0.93 # Denver 1.96 0.86 0.80 +# Mongoose 1.33 1.20 1.20 # # (*) original 3.64/1.34/1.32 results were for r0p0 revision # and are still same even for updated module; $flavour = shift; -open STDOUT,">".shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; $prefix="aes_v8"; @@ -43,9 +59,12 @@ $code=<<___; .text ___ $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); -$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/); - #^^^^^^ this is done to simplify adoption by not depending - # on latest binutils. +$code.=<<___ if ($flavour !~ /64/); +.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) +.fpu neon +.code 32 +#undef __thumb2__ +___ # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to @@ -60,7 +79,7 @@ my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= $code.=<<___; .align 5 -rcon: +.Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b @@ -89,7 +108,7 @@ $code.=<<___; tst $bits,#0x3f b.ne .Lenc_key_abort - adr $ptr,rcon + adr $ptr,.Lrcon cmp $bits,#192 veor $zero,$zero,$zero diff --git a/deps/openssl/openssl/crypto/aes/asm/bsaes-armv7.pl b/deps/openssl/openssl/crypto/aes/asm/bsaes-armv7.pl index ec66b0502a..7af38afcb6 100644 --- a/deps/openssl/openssl/crypto/aes/asm/bsaes-armv7.pl +++ b/deps/openssl/openssl/crypto/aes/asm/bsaes-armv7.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -47,8 +54,20 @@ # # <ard.biesheuvel@linaro.org> -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); my @XMM=map("q$_",(0..15)); @@ -702,7 +721,7 @@ $code.=<<___; # define BSAES_ASM_EXTENDED_KEY # define XTS_CHAIN_TWEAK # define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 #endif #ifdef __thumb__ @@ -715,10 +734,11 @@ $code.=<<___; .text .syntax unified @ ARMv7-capable assembler is expected to handle this -#ifdef __thumb2__ +#if defined(__thumb2__) && !defined(__APPLE__) .thumb #else .code 32 +# undef __thumb2__ #endif .type _bsaes_decrypt8,%function @@ -726,7 +746,11 @@ $code.=<<___; _bsaes_decrypt8: adr $const,. vldmia $key!, {@XMM[9]} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0ISR +#else add $const,$const,#.LM0ISR-_bsaes_decrypt8 +#endif vldmia $const!, {@XMM[8]} @ .LM0ISR veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key @@ -821,7 +845,11 @@ _bsaes_const: _bsaes_encrypt8: adr $const,. vldmia $key!, {@XMM[9]} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0SR +#else sub $const,$const,#_bsaes_encrypt8-.LM0SR +#endif vldmia $const!, {@XMM[8]} @ .LM0SR _bsaes_encrypt8_alt: @@ -925,7 +953,11 @@ $code.=<<___; _bsaes_key_convert: adr $const,. vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0 +#else sub $const,$const,#_bsaes_key_convert-.LM0 +#endif vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key vmov.i8 @XMM[8], #0x01 @ bit masks @@ -1392,7 +1424,12 @@ bsaes_ctr32_encrypt_blocks: vstmia r12, {@XMM[7]} @ save last round key vld1.8 {@XMM[0]}, [$ctr] @ load counter +#ifdef __APPLE__ + mov $ctr, #:lower16:(.LREVM0SR-.LM0) + add $ctr, $const, $ctr +#else add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr +#endif vldmia $keysched, {@XMM[4]} @ load round0 key #else ldr r12, [$key, #244] @@ -1449,7 +1486,12 @@ bsaes_ctr32_encrypt_blocks: vldmia $ctr, {@XMM[8]} @ .LREVM0SR mov r5, $rounds @ pass rounds vstmia $fp, {@XMM[10]} @ save next counter +#ifdef __APPLE__ + mov $const, #:lower16:(.LREVM0SR-.LSR) + sub $const, $ctr, $const +#else sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants +#endif bl _bsaes_encrypt8_alt @@ -1550,7 +1592,7 @@ bsaes_ctr32_encrypt_blocks: rev r8, r8 #endif sub sp, sp, #0x10 - vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value + vst1.8 {@XMM[1]}, [sp] @ copy counter value sub sp, sp, #0x10 .Lctr_enc_short_loop: @@ -1561,7 +1603,7 @@ bsaes_ctr32_encrypt_blocks: bl AES_encrypt vld1.8 {@XMM[0]}, [r4]! 
@ load input - vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter + vld1.8 {@XMM[1]}, [sp] @ load encrypted counter add r8, r8, #1 #ifdef __ARMEL__ rev r0, r8 @@ -2068,9 +2110,11 @@ bsaes_xts_decrypt: vld1.8 {@XMM[8]}, [r0] @ initial tweak adr $magic, .Lxts_magic +#ifndef XTS_CHAIN_TWEAK tst $len, #0xf @ if not multiple of 16 it ne @ Thumb2 thing, sanity check in ARM subne $len, #0x10 @ subtract another 16 bytes +#endif subs $len, #0x80 blo .Lxts_dec_short diff --git a/deps/openssl/openssl/crypto/aes/asm/bsaes-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/bsaes-x86_64.pl index 3f7d33c45b..921d870e98 100644 --- a/deps/openssl/openssl/crypto/aes/asm/bsaes-x86_64.pl +++ b/deps/openssl/openssl/crypto/aes/asm/bsaes-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + ################################################################### ### AES-128 [originally in CTR mode] ### @@ -41,6 +48,7 @@ # Nehalem(**) 7.63 6.88 +11% # Atom 17.1 16.4 +4% # Silvermont - 12.9 +# Goldmont - 8.85 # # (*) Comparison is not completely fair, because "this" is ECB, # i.e. no extra processing such as counter values calculation @@ -80,6 +88,7 @@ # Nehalem 7.80 # Atom 17.9 # Silvermont 14.0 +# Goldmont 10.2 # # November 2011. # @@ -99,7 +108,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); diff --git a/deps/openssl/openssl/crypto/aes/asm/vpaes-armv8.pl b/deps/openssl/openssl/crypto/aes/asm/vpaes-armv8.pl new file mode 100755 index 0000000000..2e704a2124 --- /dev/null +++ b/deps/openssl/openssl/crypto/aes/asm/vpaes-armv8.pl @@ -0,0 +1,1259 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# ARMv8 NEON adaptation by <appro@openssl.org> +# +# Reason for undertaken effort is that there is at least one popular +# SoC based on Cortex-A53 that doesn't have crypto extensions. 
+# +# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] +# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] +# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] +# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] +# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] +# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] +# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] +# +# (*) ECB denotes approximate result for parallelizeable modes +# such as CBC decrypt, CTR, etc.; +# (**) these results are worse than scalar compiler-generated +# code, but it's constant-time and therefore preferred; +# (***) presented for reference/comparison purposes; + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$code.=<<___; +.text + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:// mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +.Lk_dipt: // decryption input transform + .quad 0x0F505B040B545F00, 0x154A411E114E451A + .quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo: // decryption sbox final output + .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D + .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9: // decryption sbox output *9*u, *9*t + .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 + .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: // decryption sbox output *D*u, *D*t + .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 + .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: // decryption sbox output *B*u, *B*t + .quad 0xD022649296B44200, 0x602646F6B0F2D404 + .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: // decryption sbox output *E*u, *E*t + .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 + .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // 
decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.asciz "Vector Permutaion AES for ARMv8, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 +___ + +{ +my ($inp,$out,$key) = map("x$_",(0..2)); + +my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); +my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); + +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adr x10, .Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adr x11, .Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // 
and \$0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [$inp] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_encrypt,.-vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adr x11, .Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {$iptlo}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {$ipthi}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {$sb1t}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {$sb1u}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {$sb2t}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {$sb2u}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {$sbot}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x + +.type _vpaes_decrypt_preheat,%function +.align 4 +_vpaes_decrypt_preheat: + adr x10, .Lk_inv + movi v17.16b, #0x0f + adr x11, .Lk_dipt + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + ret +.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat + +## +## Decryption core +## +## Same API as encryption core. 
+## +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 + eor x11, x11, #0x30 // xor \$0x30, %r11 + adr x10, .Lk_sr + and x11, x11, #0x30 // and \$0x30, %r11 + add x11, x11, x10 + adr x10, .Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Ldec_entry + +.align 4 +.Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub \$1,%rax # nr-- + +.Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 
# 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [$inp] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_decrypt,.-vpaes_decrypt + +// v14-v15 input, v0-v1 output +.type _vpaes_decrypt_2x,%function +.align 4 +_vpaes_decrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 + eor x11, x11, #0x30 // xor \$0x30, %r11 + adr x10, .Lk_sr + and x11, x11, #0x30 // and \$0x30, %r11 + add x11, x11, x10 + adr x10, .Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {$iptlo},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {$ipthi},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Ldec_2x_entry + +.align 4 +.Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {$sb9u}, v10.16b + tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {$sb9t}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {$sbdu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {$sbdt}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {$sbbu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, 
%xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {$sbbt}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {$sbeu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {$sbet}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub \$1,%rax # nr-- + +.Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {$sbot}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret +.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x +___ +} +{ +my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); +my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); + +$code.=<<___; +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr x10, .Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adr x11, .Lk_sb1 
+	movi	v17.16b, #0x0f			// .Lk_s0F
+	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
+	adr	x10, .Lk_dksd
+	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
+	adr	x11, .Lk_mc_forward
+	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
+	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
+	ld1	{v8.2d}, [x10]			// .Lk_rcon
+	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
+	ret
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [$inp],#16		// vmovdqu (%rdi), %xmm0	# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa %xmm0, %xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa %xmm0, %xmm7
+
+	adr	x10, .Lk_sr			// lea .Lk_sr(%rip),%r10
+	add	x8, x8, x10
+	cbnz	$dir, .Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [$out]			// vmovdqu %xmm0, (%rdx)
+	b	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	st1	{v3.2d}, [$out]			// vmovdqu %xmm3, (%rdx)
+	eor	x8, x8, #0x30			// xor \$0x30, %r8
+
+.Lschedule_go:
+	cmp	$bits, #192			// cmp \$192, %esi
+	b.hi	.Lschedule_256
+	b.eq	.Lschedule_192
+	// 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	$inp, #10			// mov \$10, %esi
+
+.Loop_schedule_128:
+	sub	$inp, $inp, #1			// dec %esi
+	bl	_vpaes_schedule_round
+	cbz	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b	.Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align	4
+.Lschedule_192:
+	sub	$inp, $inp, #8
+	ld1	{v0.16b}, [$inp]		// vmovdqu 8(%rdi),%xmm0	# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps %xmm4, %xmm6, %xmm6	# clobber low side with zeros
+	mov	$inp, #4			// mov \$4, %esi
+
+.Loop_schedule_192:
+	sub	$inp, $inp, #1			// dec %esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr \$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
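+##
+## Round-key accounting (a sketch added for orientation, not in the
+## original comments): AES-256 needs 15 round keys. Round key 0 was
+## already written by the common code above, and each of the 7 trips
+## through .Loop_schedule_256 below emits two more -- one low round,
+## one high round, the final high round leaving through
+## .Lschedule_mangle_last -- for 1 + 7*2 = 15 in total.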
+##
+.align	4
+.Lschedule_256:
+	ld1	{v0.16b}, [$inp]		// vmovdqu 16(%rdi),%xmm0	# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	$inp, #7			// mov \$7, %esi
+
+.Loop_schedule_256:
+	sub	$inp, $inp, #1			// dec %esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd \$0xFF, %xmm0, %xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa %xmm7, %xmm5
+	mov	v7.16b, v6.16b			// vmovdqa %xmm6, %xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa %xmm5, %xmm7
+
+	b	.Loop_schedule_256
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+##   when encrypting, outputs out(%xmm0) ^ 63
+##   when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align	4
+.Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adr	x11, .Lk_deskew			// lea .Lk_deskew(%rip),%r11	# prepare to deskew
+	cbnz	$dir, .Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10),%xmm1
+	adr	x11, .Lk_opt			// lea .Lk_opt(%rip), %r11	# prepare to output transform
+	add	$out, $out, #32			// add \$32, %rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0	# output permute
+
+.Lschedule_mangle_last_dec:
+	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
+	sub	$out, $out, #16			// add \$-16, %rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [$out]			// vmovdqu %xmm0, (%rdx)	# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor %xmm0, %xmm0, %xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor %xmm2, %xmm2, %xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor %xmm3, %xmm3, %xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor %xmm5, %xmm5, %xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor %xmm6, %xmm6, %xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor %xmm7, %xmm7, %xmm7
+	ldp	x29, x30, [sp],#16
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+##   %xmm7: high side, b a x y
+##   %xmm6: low side, d c 0 0
+##   %xmm13: 0
+##
+## Outputs:
+##   %xmm6: b+c+d b+c 0 0
+##   %xmm0: b+c+d b+c b a
+##
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]		// vpshufd \$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]		// vpshufd \$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b		// vpxor %xmm1, %xmm6, %xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
+	eor	v6.16b, v6.16b, v0.16b		// vpxor %xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b			// vmovdqa %xmm6, %xmm0
+	ins	v6.d[0], v1.d[0]		// vmovhlps %xmm1, %xmm6, %xmm6	# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
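+##
+## (In scalar terms -- an illustrative C fragment, not part of the
+## original, with a hypothetical Sbox[] table and a 16-byte round
+## key k[] -- one such round of the 128-bit schedule is the textbook
+## expansion step:
+##
+##     uint8_t t[4] = { k[13], k[14], k[15], k[12] };   /* RotWord */
+##     for (i = 0; i < 4; i++) t[i] = Sbox[t[i]];       /* SubWord */
+##     t[0] ^= rcon;
+##     for (i = 0; i < 16; i++)                         /* smear   */
+##         k[i] ^= (i < 4) ? t[i] : k[i - 4];
+##
+## everything below is that computation, carried out in the vpaes
+## basis without a byte-indexed table lookup.)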
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor %xmm4, %xmm4, %xmm4
+	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr \$15, %xmm8, %xmm4, %xmm1
+	ext	$rcon, $rcon, $rcon, #15	// vpalignr \$15, %xmm8, %xmm8, %xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd \$0xFF, %xmm0, %xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr \$1, %xmm0, %xmm0, %xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq \$4, %xmm7, %xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq \$8, %xmm7, %xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0	# 1 = i
+	eor	v7.16b, v7.16b, v4.16b		// vpxor %xmm4, %xmm7, %xmm7
+	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
+	eor	v7.16b, v7.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm7, %xmm7
+	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm3	# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm2	# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor %xmm1, %xmm3, %xmm3	# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor %xmm0, %xmm2, %xmm2	# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm13, %xmm4	# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm12, %xmm1	# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm1	# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor %xmm7, %xmm1, %xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa %xmm0, %xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
+					// vmovdqa (%r11), %xmm2	# lo
+	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
+					// vmovdqa 16(%r11), %xmm1	# hi
+	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
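+##
+## ("Basis-transformed" here means the round key is still in the
+## alternate field representation vpaes computes in; mangle folds
+## the constants below into it and permutes it back out. Roughly,
+## for the encrypt side, out = shiftrows(mc(y) ^ mc^2(y) ^ mc^3(y))
+## with y = x ^ 0x63, where mc() is the per-column byte rotation
+## that .Lk_mc_forward performs -- that is the 0,1,1,1 circulant
+## named below. This reading is inferred from the code, not stated
+## in the original.)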
+##
+## On encrypt,
+##   xor with 0x63
+##   multiply by circulant 0,1,1,1
+##   apply shiftrows transform
+##
+## On decrypt,
+##   xor with 0x63
+##   multiply by "inverse mixcolumns" circulant E,B,D,9
+##   deskew
+##   apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa %xmm0, %xmm4	# save xmm0 for later
+					// vmovdqa .Lk_mc_forward(%rip),%xmm5
+	cbnz	$dir, .Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm4
+	add	$out, $out, #16			// add \$16, %rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb %xmm5, %xmm1, %xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor %xmm1, %xmm4, %xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor %xmm4, %xmm3, %xmm3
+
+	b	.Lschedule_mangle_both
+.align	4
+.Lschedule_mangle_dec:
+	// inverse mix columns
+					// lea .Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb \$4, %xmm4, %xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand %xmm9, %xmm4, %xmm4	# 4 = lo
+
+					// vmovdqa 0x00(%r11), %xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
+					// vmovdqa 0x10(%r11), %xmm3
+	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
+
+					// vmovdqa 0x20(%r11), %xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
+					// vmovdqa 0x30(%r11), %xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
+
+					// vmovdqa 0x40(%r11), %xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
+					// vmovdqa 0x50(%r11), %xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
+
+					// vmovdqa 0x60(%r11), %xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
+					// vmovdqa 0x70(%r11), %xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb %xmm1, %xmm4, %xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm3
+
+	sub	$out, $out, #16			// add \$-16, %rdx
+
+.Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
+	add	x8, x8, #64-16			// add \$-16, %r8
+	and	x8, x8, #~(1<<6)		// and \$0x30, %r8
+	st1	{v3.2d}, [$out]			// vmovdqu %xmm3, (%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,%function
+.align	4
+vpaes_set_encrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
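+
+	// Worked arithmetic for the store below (an aside inferred from
+	// the code, not stated in it): nbits>>5 + 5 gives 9, 11 or 13
+	// for 128/192/256-bit keys, i.e. the count of "middle" rounds
+	// the encrypt/decrypt cores loop over; the final round is applied
+	// after their loop, giving the familiar 10/12/14 round totals.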
+
+	lsr	w9, $bits, #5		// shr \$5,%eax
+	add	w9, w9, #5		// \$5,%eax
+	str	w9, [$out,#240]		// mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	$dir, #0		// mov \$0,%ecx
+	mov	x8, #0x30		// mov \$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,%function
+.align	4
+vpaes_set_decrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, $bits, #5		// shr \$5,%eax
+	add	w9, w9, #5		// \$5,%eax
+	str	w9, [$out,#240]		// mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl \$4,%eax
+	add	$out, $out, #16		// lea 16(%rdx,%rax),%rdx
+	add	$out, $out, x9
+
+	mov	$dir, #1		// mov \$1,%ecx
+	lsr	w8, $bits, #1		// shr \$1,%r8d
+	and	x8, x8, #32		// and \$32,%r8d
+	eor	x8, x8, #32		// xor \$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+___
+}
+{
+my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
+
+$code.=<<___;
+.globl	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,%function
+.align	4
+vpaes_cbc_encrypt:
+	cbz	$len, .Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, $len		// reassign
+	mov	x2, $key		// reassign
+
+	ld1	{v0.16b}, [$ivec]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	.Lcbc_enc_loop
+
+.align	4
+.Lcbc_enc_loop:
+	ld1	{v7.16b}, [$inp],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out],#16	// save output
+	subs	x17, x17, #16
+	b.hi	.Lcbc_enc_loop
+
+	st1	{v0.16b}, [$ivec]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+.Lcbc_abort:
+	ret
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type	vpaes_cbc_decrypt,%function
+.align	4
+vpaes_cbc_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, $len		// reassign
+	mov	x2, $key		// reassign
+	ld1	{v6.16b}, [$ivec]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [$inp], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [$out], #16
+	subs	x17, x17, #16
+	b.ls	.Lcbc_dec_done
+
+.align	4
+.Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+	st1	{v6.16b}, [$ivec]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+___
+if (1) {
+$code.=<<___;
+.globl	vpaes_ecb_encrypt
+.type	vpaes_ecb_encrypt,%function
+.align	4
+vpaes_ecb_encrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
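+
+	// ECB blocks are independent, so the main loop below feeds pairs
+	// of blocks to _vpaes_encrypt_2x, which runs two copies of the
+	// round function side by side (the v0... and v8... register
+	// banks), presumably to hide tbl/eor latency; an odd leading
+	// block goes through the single-block core first.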
+
+	mov	x17, $len
+	mov	x2, $key
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #16
+	b.eq	.Lecb_enc_loop
+
+	ld1	{v7.16b}, [$inp],#16
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out],#16
+	subs	x17, x17, #16
+	b.ls	.Lecb_enc_done
+
+.align	4
+.Lecb_enc_loop:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_encrypt_2x
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lecb_enc_loop
+
+.Lecb_enc_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+.globl	vpaes_ecb_decrypt
+.type	vpaes_ecb_decrypt,%function
+.align	4
+vpaes_ecb_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, $len
+	mov	x2, $key
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lecb_dec_loop
+
+	ld1	{v7.16b}, [$inp],#16
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [$out],#16
+	subs	x17, x17, #16
+	b.ls	.Lecb_dec_done
+
+.align	4
+.Lecb_dec_loop:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_decrypt_2x
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lecb_dec_loop
+
+.Lecb_dec_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
+___
+} }
+print $code;
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/aes/asm/vpaes-ppc.pl b/deps/openssl/openssl/crypto/aes/asm/vpaes-ppc.pl
index 1759ae9dcf..bb38fbe60c 100644
--- a/deps/openssl/openssl/crypto/aes/asm/vpaes-ppc.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/vpaes-ppc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -14,7 +21,8 @@
 # 128-bit key.
 #
 #			aes-ppc.pl		this
-# G4e			35.5/52.1/(23.8)	11.9(*)/15.4
+# PPC74x0/G4e		35.5/52.1/(23.8)	11.9(*)/15.4
+# PPC970/G5		37.9/55.0/(28.5)	22.2/28.5
 # POWER6		42.7/54.3/(28.2)	63.0/92.8(**)
 # POWER7		32.3/42.9/(18.4)	18.5/23.3
 #
diff --git a/deps/openssl/openssl/crypto/aes/asm/vpaes-x86.pl b/deps/openssl/openssl/crypto/aes/asm/vpaes-x86.pl
index 2ba149c3f9..47615c0795 100644
--- a/deps/openssl/openssl/crypto/aes/asm/vpaes-x86.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/vpaes-x86.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -51,6 +58,10 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
+$output = pop;
+open OUT,">$output";
+*STDOUT=*OUT;
+
 &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
 
 $PREFIX="vpaes";
@@ -901,3 +912,5 @@
 $k_dsbo=0x2c0;	# decryption sbox final output
 &function_end("${PREFIX}_cbc_encrypt");
 &asm_finish();
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/aes/asm/vpaes-x86_64.pl b/deps/openssl/openssl/crypto/aes/asm/vpaes-x86_64.pl
index f2ef318fae..422e8ee442 100644
--- a/deps/openssl/openssl/crypto/aes/asm/vpaes-x86_64.pl
+++ b/deps/openssl/openssl/crypto/aes/asm/vpaes-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -31,6 +38,7 @@
 # Nehalem	29.6/40.3/14.6	10.0/11.8
 # Atom		57.3/74.2/32.1	60.9/77.2(***)
 # Silvermont	52.7/64.0/19.5	48.8/60.8(***)
+# Goldmont	38.9/49.0/17.8	10.6/12.6
 #
 # (*)	"Hyper-threading" in the context refers rather to cache shared
 #	among multiple cores, than to specifically Intel HTT. As vast
@@ -57,7 +65,7 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 $PREFIX="vpaes";
diff --git a/deps/openssl/openssl/crypto/aes/build.info b/deps/openssl/openssl/crypto/aes/build.info
new file mode 100644
index 0000000000..cf6cb5ec25
--- /dev/null
+++ b/deps/openssl/openssl/crypto/aes/build.info
@@ -0,0 +1,57 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+        aes_misc.c aes_ecb.c aes_cfb.c aes_ofb.c \
+        aes_ige.c aes_wrap.c {- $target{aes_asm_src} -}
+
+GENERATE[aes-ia64.s]=asm/aes-ia64.S
+
+GENERATE[aes-586.s]=asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[aes-586.s]=../perlasm/x86asm.pl
+GENERATE[vpaes-x86.s]=asm/vpaes-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[vpaes-586.s]=../perlasm/x86asm.pl
+GENERATE[aesni-x86.s]=asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[aesni-586.s]=../perlasm/x86asm.pl
+
+GENERATE[aes-x86_64.s]=asm/aes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[vpaes-x86_64.s]=asm/vpaes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[bsaes-x86_64.s]=asm/bsaes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-x86_64.s]=asm/aesni-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-sha1-x86_64.s]=asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-sha256-x86_64.s]=asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-mb-x86_64.s]=asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-sparcv9.S]=asm/aes-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aes-sparcv9.o]=..
+GENERATE[aest4-sparcv9.S]=asm/aest4-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aest4-sparcv9.o]=..
+DEPEND[aest4-sparcv9.S]=../perlasm/sparcv9_modes.pl
+GENERATE[aesfx-sparcv9.S]=asm/aesfx-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aesfx-sparcv9.o]=..
+
+GENERATE[aes-ppc.s]=asm/aes-ppc.pl $(PERLASM_SCHEME)
+GENERATE[vpaes-ppc.s]=asm/vpaes-ppc.pl $(PERLASM_SCHEME)
+GENERATE[aesp8-ppc.s]=asm/aesp8-ppc.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-parisc.s]=asm/aes-parisc.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-mips.S]=asm/aes-mips.pl $(PERLASM_SCHEME)
+
+GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl $(PERLASM_SCHEME)
+INCLUDE[aesv8-armx.o]=..
+GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-armv4.S]=asm/aes-armv4.pl $(PERLASM_SCHEME)
+INCLUDE[aes-armv4.o]=..
+GENERATE[bsaes-armv7.S]=asm/bsaes-armv7.pl $(PERLASM_SCHEME)
+INCLUDE[bsaes-armv7.o]=..
+
+BEGINRAW[Makefile]
+##### AES assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/aes-%.S:	{- $sourcedir -}/asm/aes-%.pl
+	CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+{- $builddir -}/bsaes-%.S:	{- $sourcedir -}/asm/bsaes-%.pl
+	CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+
+ENDRAW[Makefile]
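
For readers following the generated AArch64 code above: the register comments map ($inp,$bits,$out) and ($inp,$out,$len,$key,$ivec,$dir) onto x0..x5, so the exported vpaes routines take the same arguments as the classic AES_* functions. A minimal C sketch of how glue code would drive them follows; the prototypes are assumptions based on that register mapping and on how OpenSSL's C glue declares them, not something this diff itself shows.

    /* Illustrative only: encrypt a buffer with AES-128-CBC via the
     * vpaes entry points generated above (assumed prototypes). */
    #include <stddef.h>
    #include <openssl/aes.h>

    int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
                               AES_KEY *key);
    void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
                           size_t length, const AES_KEY *key,
                           unsigned char *ivec, int enc);

    int encrypt_cbc(unsigned char *buf, size_t len /* multiple of 16 */,
                    const unsigned char k[16], unsigned char iv[16])
    {
        AES_KEY ks;
        /* rounds field at offset 240 is set to nbits/32+5 by the asm */
        if (vpaes_set_encrypt_key(k, 128, &ks) != 0)
            return -1;
        /* enc != 0 selects the encrypt path (w5 checked on entry) */
        vpaes_cbc_encrypt(buf, buf, len, &ks, iv, 1);
        return 0;
    }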