author     Ben Noordhuis <info@bnoordhuis.nl>  2016-03-01 14:03:58 +0100
committer  Ben Noordhuis <info@bnoordhuis.nl>  2016-03-01 18:29:20 +0100
commit     32719950df8e792684a49b89fe03a00bf65c7010 (patch)
tree       04a2aca536b988902c4dae21d7d5d9c956b26d59 /deps
parent     a7e49c886f5cb8c351673f413dc66086ff1d75bc (diff)
download   android-node-v8-32719950df8e792684a49b89fe03a00bf65c7010.tar.gz
           android-node-v8-32719950df8e792684a49b89fe03a00bf65c7010.tar.bz2
           android-node-v8-32719950df8e792684a49b89fe03a00bf65c7010.zip
deps: upgrade openssl to 1.0.2g
PR-URL: https://github.com/nodejs/node/pull/5507
Reviewed-By: Fedor Indutny <fedor@indutny.com>
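
A quick way to confirm which OpenSSL release a given build actually carries is to ask the library itself. The sketch below is illustrative only (it is not part of this commit) and assumes the OpenSSL 1.0.x-era API; build it against the bundled headers and link with -lcrypto:

    /* openssl_version.c - print compile-time and run-time OpenSSL versions.
     * Illustrative sketch; uses the OpenSSL 1.0.x API (SSLeay_version). */
    #include <stdio.h>
    #include <openssl/crypto.h>
    #include <openssl/opensslv.h>

    int main(void) {
        /* Headers the program was compiled against, e.g. "OpenSSL 1.0.2g  1 Mar 2016" */
        printf("compiled against: %s\n", OPENSSL_VERSION_TEXT);
        /* Library actually loaded at run time */
        printf("running with:     %s\n", SSLeay_version(SSLEAY_VERSION));
        return 0;
    }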
Diffstat (limited to 'deps')
-rw-r--r--  deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s | 929
-rw-r--r--  deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s | 1335
-rw-r--r--  deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s | 4297
-rw-r--r--  deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s | 1637
-rw-r--r--  deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s | 856
-rw-r--r--  deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s | 2
-rw-r--r--  deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s | 423
-rw-r--r--  deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s | 2037
-rw-r--r--  deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s | 1529
-rw-r--r--  deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s | 752
-rw-r--r--  deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s | 557
-rw-r--r--  deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s | 4303
-rw-r--r--  deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s | 2813
-rw-r--r--  deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s | 4728
-rw-r--r--  deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s | 2353
-rw-r--r--  deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s | 3582
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s | 929
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s | 1335
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s | 4296
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s | 1632
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s | 857
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s | 2
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s | 421
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s | 2033
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s | 1529
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s | 749
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s | 557
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s | 4303
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s | 2813
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s | 4728
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s | 2353
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s | 3581
-rw-r--r--  deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm | 171
-rw-r--r--  deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm | 353
-rw-r--r--  deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm | 219
-rw-r--r--  deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm | 1424
-rw-r--r--  deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm | 22
-rw-r--r--  deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm | 2
-rw-r--r--  deps/openssl/asm/x86-elf-gas/sha/sha1-586.s | 1175
-rw-r--r--  deps/openssl/asm/x86-elf-gas/sha/sha256-586.s | 2248
-rw-r--r--  deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s | 1173
-rw-r--r--  deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s | 2248
-rw-r--r--  deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm | 1174
-rw-r--r--  deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm | 2248
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s | 70
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s | 8
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s | 112
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s | 158
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s | 20
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s | 218
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s | 2
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s | 108
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s | 901
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s | 2
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s | 27
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s | 82
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s | 16
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s | 8
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s | 84
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s | 44
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s | 48
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s | 70
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s | 8
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s | 112
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s | 158
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s | 20
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s | 219
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s | 2
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s | 108
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s | 897
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s | 2
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s | 27
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s | 82
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s | 16
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s | 8
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s | 84
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s | 44
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s | 48
-rw-r--r--  deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm | 291
-rw-r--r--  deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm | 108
-rw-r--r--  deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm | 935
-rw-r--r--  deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm | 11
-rw-r--r--  deps/openssl/openssl/CHANGES | 134
-rwxr-xr-x  deps/openssl/openssl/Configure | 8
-rw-r--r--  deps/openssl/openssl/Makefile.shared | 6
-rw-r--r--  deps/openssl/openssl/NEWS | 13
-rw-r--r--  deps/openssl/openssl/README | 2
-rw-r--r--  deps/openssl/openssl/apps/apps.c | 8
-rw-r--r--  deps/openssl/openssl/apps/apps.h | 2
-rw-r--r--  deps/openssl/openssl/apps/pkeyutl.c | 90
-rw-r--r--  deps/openssl/openssl/apps/req.c | 4
-rw-r--r--  deps/openssl/openssl/apps/rsautl.c | 6
-rw-r--r--  deps/openssl/openssl/apps/s_client.c | 2
-rw-r--r--  deps/openssl/openssl/apps/s_server.c | 49
-rwxr-xr-x  deps/openssl/openssl/config | 3
-rw-r--r--  deps/openssl/openssl/crypto/asn1/tasn_dec.c | 14
-rw-r--r--  deps/openssl/openssl/crypto/bio/b_print.c | 187
-rw-r--r--  deps/openssl/openssl/crypto/bio/bio.h | 4
-rw-r--r--  deps/openssl/openssl/crypto/bio/bss_mem.c | 6
-rw-r--r--  deps/openssl/openssl/crypto/bn/Makefile | 4
-rwxr-xr-x  deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl | 219
-rwxr-xr-x  deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl | 375
-rwxr-xr-x  deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl | 227
-rwxr-xr-x  deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl | 1276
-rw-r--r--  deps/openssl/openssl/crypto/bn/bn.h | 14
-rw-r--r--  deps/openssl/openssl/crypto/bn/bn_exp.c | 103
-rw-r--r--  deps/openssl/openssl/crypto/bn/bn_print.c | 17
-rw-r--r--  deps/openssl/openssl/crypto/bn/bn_recp.c | 1
-rw-r--r--  deps/openssl/openssl/crypto/cmac/cmac.c | 8
-rw-r--r--  deps/openssl/openssl/crypto/cryptlib.c | 6
-rw-r--r--  deps/openssl/openssl/crypto/crypto.h | 2
-rw-r--r--  deps/openssl/openssl/crypto/dh/dh.h | 2
-rw-r--r--  deps/openssl/openssl/crypto/dh/dh_check.c | 7
-rw-r--r--  deps/openssl/openssl/crypto/dsa/dsa_ameth.c | 22
-rw-r--r--  deps/openssl/openssl/crypto/dso/dso_lib.c | 1
-rwxr-xr-x  deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl | 11
-rw-r--r--  deps/openssl/openssl/crypto/ec/ecp_nistp224.c | 4
-rw-r--r--  deps/openssl/openssl/crypto/ec/ecp_nistp256.c | 4
-rw-r--r--  deps/openssl/openssl/crypto/ec/ecp_nistp521.c | 4
-rw-r--r--  deps/openssl/openssl/crypto/ec/ectest.c | 9
-rw-r--r--  deps/openssl/openssl/crypto/engine/eng_dyn.c | 4
-rw-r--r--  deps/openssl/openssl/crypto/evp/e_des.c | 11
-rw-r--r--  deps/openssl/openssl/crypto/evp/e_des3.c | 13
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl | 4
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl | 2
-rw-r--r--  deps/openssl/openssl/crypto/modes/ctr128.c | 41
-rw-r--r--  deps/openssl/openssl/crypto/opensslv.h | 6
-rwxr-xr-x  deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl | 7
-rw-r--r--  deps/openssl/openssl/crypto/pkcs7/pk7_smime.c | 17
-rw-r--r--  deps/openssl/openssl/crypto/rsa/rsa_sign.c | 4
-rw-r--r--  deps/openssl/openssl/crypto/srp/srp.h | 10
-rw-r--r--  deps/openssl/openssl/crypto/srp/srp_vfy.c | 57
-rw-r--r--  deps/openssl/openssl/crypto/stack/stack.c | 2
-rw-r--r--  deps/openssl/openssl/crypto/x509/x509_vfy.c | 70
-rw-r--r--  deps/openssl/openssl/doc/apps/ciphers.pod | 59
-rw-r--r--  deps/openssl/openssl/doc/apps/pkeyutl.pod | 13
-rw-r--r--  deps/openssl/openssl/doc/apps/req.pod | 9
-rw-r--r--  deps/openssl/openssl/doc/apps/s_client.pod | 12
-rw-r--r--  deps/openssl/openssl/doc/apps/s_server.pod | 8
-rw-r--r--  deps/openssl/openssl/doc/crypto/BIO_s_mem.pod | 4
-rw-r--r--  deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod | 33
-rw-r--r--  deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod | 168
-rw-r--r--  deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod | 10
-rw-r--r--  deps/openssl/openssl/doc/ssl/ssl.pod | 77
-rw-r--r--  deps/openssl/openssl/engines/e_capi.c | 32
-rw-r--r--  deps/openssl/openssl/include/openssl/bio.h | 4
-rw-r--r--  deps/openssl/openssl/include/openssl/bn.h | 14
-rw-r--r--  deps/openssl/openssl/include/openssl/crypto.h | 2
-rw-r--r--  deps/openssl/openssl/include/openssl/dh.h | 2
-rw-r--r--  deps/openssl/openssl/include/openssl/opensslv.h | 6
-rw-r--r--  deps/openssl/openssl/include/openssl/srp.h | 10
-rw-r--r--  deps/openssl/openssl/include/openssl/ssl.h | 1
-rwxr-xr-x  deps/openssl/openssl/ms/uplink-x86.pl | 4
-rw-r--r--  deps/openssl/openssl/openssl.spec | 2
-rw-r--r--  deps/openssl/openssl/ssl/Makefile | 69
-rw-r--r--  deps/openssl/openssl/ssl/s2_lib.c | 6
-rw-r--r--  deps/openssl/openssl/ssl/s3_lib.c | 69
-rw-r--r--  deps/openssl/openssl/ssl/ssl.h | 1
-rw-r--r--  deps/openssl/openssl/ssl/ssl_conf.c | 10
-rw-r--r--  deps/openssl/openssl/ssl/ssl_err.c | 1
-rw-r--r--  deps/openssl/openssl/ssl/ssl_lib.c | 14
-rw-r--r--  deps/openssl/openssl/ssl/sslv2conftest.c | 231
-rw-r--r--  deps/openssl/openssl/test/Makefile | 35
-rwxr-xr-x  deps/openssl/openssl/util/libeay.num | 2
-rwxr-xr-x  deps/openssl/openssl/util/mk1mf.pl | 4
-rw-r--r--  deps/openssl/openssl/util/pl/BC-32.pl | 4
-rw-r--r--  deps/openssl/openssl/util/pl/Mingw32.pl | 2
-rw-r--r--  deps/openssl/openssl/util/pl/OS2-EMX.pl | 4
-rw-r--r--  deps/openssl/openssl/util/pl/VC-32.pl | 10
-rw-r--r--  deps/openssl/openssl/util/pl/linux.pl | 2
-rw-r--r--  deps/openssl/openssl/util/pl/netware.pl | 8
-rw-r--r--  deps/openssl/openssl/util/pl/ultrix.pl | 2
-rw-r--r--  deps/openssl/openssl/util/pl/unix.pl | 2
173 files changed, 9791 insertions, 76123 deletions
diff --git a/deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s
index 543b588316..b0467f2f9f 100644
--- a/deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s
@@ -6,14 +6,6 @@
.type aesni_multi_cbc_encrypt,@function
.align 32
aesni_multi_cbc_encrypt:
- cmpl $2,%edx
- jb .Lenc_non_avx
- movl OPENSSL_ia32cap_P+4(%rip),%ecx
- testl $268435456,%ecx
- jnz _avx_cbc_enc_shortcut
- jmp .Lenc_non_avx
-.align 16
-.Lenc_non_avx:
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -270,14 +262,6 @@ aesni_multi_cbc_encrypt:
.type aesni_multi_cbc_decrypt,@function
.align 32
aesni_multi_cbc_decrypt:
- cmpl $2,%edx
- jb .Ldec_non_avx
- movl OPENSSL_ia32cap_P+4(%rip),%ecx
- testl $268435456,%ecx
- jnz _avx_cbc_dec_shortcut
- jmp .Ldec_non_avx
-.align 16
-.Ldec_non_avx:
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -520,916 +504,3 @@ aesni_multi_cbc_decrypt:
.Ldec4x_epilogue:
.byte 0xf3,0xc3
.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
-.type aesni_multi_cbc_encrypt_avx,@function
-.align 32
-aesni_multi_cbc_encrypt_avx:
-_avx_cbc_enc_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
-
-
-
-
-
-
-
- subq $192,%rsp
- andq $-128,%rsp
- movq %rax,16(%rsp)
-
-.Lenc8x_body:
- vzeroupper
- vmovdqu (%rsi),%xmm15
- leaq 120(%rsi),%rsi
- leaq 160(%rdi),%rdi
- shrl $1,%edx
-
-.Lenc8x_loop_grande:
-
- xorl %edx,%edx
- movl -144(%rdi),%ecx
- movq -160(%rdi),%r8
- cmpl %edx,%ecx
- movq -152(%rdi),%rbx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -136(%rdi),%xmm2
- movl %ecx,32(%rsp)
- cmovleq %rsp,%r8
- subq %r8,%rbx
- movq %rbx,64(%rsp)
- movl -104(%rdi),%ecx
- movq -120(%rdi),%r9
- cmpl %edx,%ecx
- movq -112(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -96(%rdi),%xmm3
- movl %ecx,36(%rsp)
- cmovleq %rsp,%r9
- subq %r9,%rbp
- movq %rbp,72(%rsp)
- movl -64(%rdi),%ecx
- movq -80(%rdi),%r10
- cmpl %edx,%ecx
- movq -72(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -56(%rdi),%xmm4
- movl %ecx,40(%rsp)
- cmovleq %rsp,%r10
- subq %r10,%rbp
- movq %rbp,80(%rsp)
- movl -24(%rdi),%ecx
- movq -40(%rdi),%r11
- cmpl %edx,%ecx
- movq -32(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -16(%rdi),%xmm5
- movl %ecx,44(%rsp)
- cmovleq %rsp,%r11
- subq %r11,%rbp
- movq %rbp,88(%rsp)
- movl 16(%rdi),%ecx
- movq 0(%rdi),%r12
- cmpl %edx,%ecx
- movq 8(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 24(%rdi),%xmm6
- movl %ecx,48(%rsp)
- cmovleq %rsp,%r12
- subq %r12,%rbp
- movq %rbp,96(%rsp)
- movl 56(%rdi),%ecx
- movq 40(%rdi),%r13
- cmpl %edx,%ecx
- movq 48(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 64(%rdi),%xmm7
- movl %ecx,52(%rsp)
- cmovleq %rsp,%r13
- subq %r13,%rbp
- movq %rbp,104(%rsp)
- movl 96(%rdi),%ecx
- movq 80(%rdi),%r14
- cmpl %edx,%ecx
- movq 88(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 104(%rdi),%xmm8
- movl %ecx,56(%rsp)
- cmovleq %rsp,%r14
- subq %r14,%rbp
- movq %rbp,112(%rsp)
- movl 136(%rdi),%ecx
- movq 120(%rdi),%r15
- cmpl %edx,%ecx
- movq 128(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 144(%rdi),%xmm9
- movl %ecx,60(%rsp)
- cmovleq %rsp,%r15
- subq %r15,%rbp
- movq %rbp,120(%rsp)
- testl %edx,%edx
- jz .Lenc8x_done
-
- vmovups 16-120(%rsi),%xmm1
- vmovups 32-120(%rsi),%xmm0
- movl 240-120(%rsi),%eax
-
- vpxor (%r8),%xmm15,%xmm10
- leaq 128(%rsp),%rbp
- vpxor (%r9),%xmm15,%xmm11
- vpxor (%r10),%xmm15,%xmm12
- vpxor (%r11),%xmm15,%xmm13
- vpxor %xmm10,%xmm2,%xmm2
- vpxor (%r12),%xmm15,%xmm10
- vpxor %xmm11,%xmm3,%xmm3
- vpxor (%r13),%xmm15,%xmm11
- vpxor %xmm12,%xmm4,%xmm4
- vpxor (%r14),%xmm15,%xmm12
- vpxor %xmm13,%xmm5,%xmm5
- vpxor (%r15),%xmm15,%xmm13
- vpxor %xmm10,%xmm6,%xmm6
- movl $1,%ecx
- vpxor %xmm11,%xmm7,%xmm7
- vpxor %xmm12,%xmm8,%xmm8
- vpxor %xmm13,%xmm9,%xmm9
- jmp .Loop_enc8x
-
-.align 32
-.Loop_enc8x:
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+0(%rsp),%ecx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r8)
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r8,%rbx,1),%rbx
- cmovgeq %rsp,%r8
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r8,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r8),%xmm15,%xmm10
- movq %rbx,64+0(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -72(%rsi),%xmm1
- leaq 16(%r8,%rbx,1),%r8
- vmovdqu %xmm10,0(%rbp)
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+4(%rsp),%ecx
- movq 64+8(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r9)
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r9,%rbx,1),%rbx
- cmovgeq %rsp,%r9
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r9,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r9),%xmm15,%xmm11
- movq %rbx,64+8(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups -56(%rsi),%xmm0
- leaq 16(%r9,%rbx,1),%r9
- vmovdqu %xmm11,16(%rbp)
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+8(%rsp),%ecx
- movq 64+16(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r10)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r8)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r10,%rbx,1),%rbx
- cmovgeq %rsp,%r10
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r10,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r10),%xmm15,%xmm12
- movq %rbx,64+16(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -40(%rsi),%xmm1
- leaq 16(%r10,%rbx,1),%r10
- vmovdqu %xmm12,32(%rbp)
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+12(%rsp),%ecx
- movq 64+24(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r11)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r9)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r11,%rbx,1),%rbx
- cmovgeq %rsp,%r11
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r11,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r11),%xmm15,%xmm13
- movq %rbx,64+24(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups -24(%rsi),%xmm0
- leaq 16(%r11,%rbx,1),%r11
- vmovdqu %xmm13,48(%rbp)
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+16(%rsp),%ecx
- movq 64+32(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r12)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r10)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r12,%rbx,1),%rbx
- cmovgeq %rsp,%r12
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r12,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r12),%xmm15,%xmm10
- movq %rbx,64+32(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -8(%rsi),%xmm1
- leaq 16(%r12,%rbx,1),%r12
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+20(%rsp),%ecx
- movq 64+40(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r13)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r11)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%rbx,%r13,1),%rbx
- cmovgeq %rsp,%r13
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r13,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r13),%xmm15,%xmm11
- movq %rbx,64+40(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 8(%rsi),%xmm0
- leaq 16(%r13,%rbx,1),%r13
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+24(%rsp),%ecx
- movq 64+48(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r14)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r12)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r14,%rbx,1),%rbx
- cmovgeq %rsp,%r14
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r14,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r14),%xmm15,%xmm12
- movq %rbx,64+48(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 24(%rsi),%xmm1
- leaq 16(%r14,%rbx,1),%r14
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+28(%rsp),%ecx
- movq 64+56(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r15)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r13)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r15,%rbx,1),%rbx
- cmovgeq %rsp,%r15
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r15,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r15),%xmm15,%xmm13
- movq %rbx,64+56(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 40(%rsi),%xmm0
- leaq 16(%r15,%rbx,1),%r15
- vmovdqu 32(%rsp),%xmm14
- prefetcht0 15(%r14)
- prefetcht0 15(%r15)
- cmpl $11,%eax
- jb .Lenc8x_tail
-
- vaesenc %xmm1,%xmm2,%xmm2
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 176-120(%rsi),%xmm1
-
- vaesenc %xmm0,%xmm2,%xmm2
- vaesenc %xmm0,%xmm3,%xmm3
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- vaesenc %xmm0,%xmm6,%xmm6
- vaesenc %xmm0,%xmm7,%xmm7
- vaesenc %xmm0,%xmm8,%xmm8
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 192-120(%rsi),%xmm0
- je .Lenc8x_tail
-
- vaesenc %xmm1,%xmm2,%xmm2
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 208-120(%rsi),%xmm1
-
- vaesenc %xmm0,%xmm2,%xmm2
- vaesenc %xmm0,%xmm3,%xmm3
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- vaesenc %xmm0,%xmm6,%xmm6
- vaesenc %xmm0,%xmm7,%xmm7
- vaesenc %xmm0,%xmm8,%xmm8
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 224-120(%rsi),%xmm0
-
-.Lenc8x_tail:
- vaesenc %xmm1,%xmm2,%xmm2
- vpxor %xmm15,%xmm15,%xmm15
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vpaddd %xmm14,%xmm15,%xmm15
- vmovdqu 48(%rsp),%xmm14
- vaesenc %xmm1,%xmm7,%xmm7
- movq 64(%rsp),%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 16-120(%rsi),%xmm1
-
- vaesenclast %xmm0,%xmm2,%xmm2
- vmovdqa %xmm15,32(%rsp)
- vpxor %xmm15,%xmm15,%xmm15
- vaesenclast %xmm0,%xmm3,%xmm3
- vaesenclast %xmm0,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesenclast %xmm0,%xmm5,%xmm5
- vaesenclast %xmm0,%xmm6,%xmm6
- vpaddd %xmm15,%xmm14,%xmm14
- vmovdqu -120(%rsi),%xmm15
- vaesenclast %xmm0,%xmm7,%xmm7
- vaesenclast %xmm0,%xmm8,%xmm8
- vmovdqa %xmm14,48(%rsp)
- vaesenclast %xmm0,%xmm9,%xmm9
- vmovups 32-120(%rsi),%xmm0
-
- vmovups %xmm2,-16(%r8)
- subq %rbx,%r8
- vpxor 0(%rbp),%xmm2,%xmm2
- vmovups %xmm3,-16(%r9)
- subq 72(%rsp),%r9
- vpxor 16(%rbp),%xmm3,%xmm3
- vmovups %xmm4,-16(%r10)
- subq 80(%rsp),%r10
- vpxor 32(%rbp),%xmm4,%xmm4
- vmovups %xmm5,-16(%r11)
- subq 88(%rsp),%r11
- vpxor 48(%rbp),%xmm5,%xmm5
- vmovups %xmm6,-16(%r12)
- subq 96(%rsp),%r12
- vpxor %xmm10,%xmm6,%xmm6
- vmovups %xmm7,-16(%r13)
- subq 104(%rsp),%r13
- vpxor %xmm11,%xmm7,%xmm7
- vmovups %xmm8,-16(%r14)
- subq 112(%rsp),%r14
- vpxor %xmm12,%xmm8,%xmm8
- vmovups %xmm9,-16(%r15)
- subq 120(%rsp),%r15
- vpxor %xmm13,%xmm9,%xmm9
-
- decl %edx
- jnz .Loop_enc8x
-
- movq 16(%rsp),%rax
-
-
-
-
-
-.Lenc8x_done:
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lenc8x_epilogue:
- .byte 0xf3,0xc3
-.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
-
-.type aesni_multi_cbc_decrypt_avx,@function
-.align 32
-aesni_multi_cbc_decrypt_avx:
-_avx_cbc_dec_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
-
-
-
-
-
-
-
-
- subq $256,%rsp
- andq $-256,%rsp
- subq $192,%rsp
- movq %rax,16(%rsp)
-
-.Ldec8x_body:
- vzeroupper
- vmovdqu (%rsi),%xmm15
- leaq 120(%rsi),%rsi
- leaq 160(%rdi),%rdi
- shrl $1,%edx
-
-.Ldec8x_loop_grande:
-
- xorl %edx,%edx
- movl -144(%rdi),%ecx
- movq -160(%rdi),%r8
- cmpl %edx,%ecx
- movq -152(%rdi),%rbx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -136(%rdi),%xmm2
- movl %ecx,32(%rsp)
- cmovleq %rsp,%r8
- subq %r8,%rbx
- movq %rbx,64(%rsp)
- vmovdqu %xmm2,192(%rsp)
- movl -104(%rdi),%ecx
- movq -120(%rdi),%r9
- cmpl %edx,%ecx
- movq -112(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -96(%rdi),%xmm3
- movl %ecx,36(%rsp)
- cmovleq %rsp,%r9
- subq %r9,%rbp
- movq %rbp,72(%rsp)
- vmovdqu %xmm3,208(%rsp)
- movl -64(%rdi),%ecx
- movq -80(%rdi),%r10
- cmpl %edx,%ecx
- movq -72(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -56(%rdi),%xmm4
- movl %ecx,40(%rsp)
- cmovleq %rsp,%r10
- subq %r10,%rbp
- movq %rbp,80(%rsp)
- vmovdqu %xmm4,224(%rsp)
- movl -24(%rdi),%ecx
- movq -40(%rdi),%r11
- cmpl %edx,%ecx
- movq -32(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -16(%rdi),%xmm5
- movl %ecx,44(%rsp)
- cmovleq %rsp,%r11
- subq %r11,%rbp
- movq %rbp,88(%rsp)
- vmovdqu %xmm5,240(%rsp)
- movl 16(%rdi),%ecx
- movq 0(%rdi),%r12
- cmpl %edx,%ecx
- movq 8(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 24(%rdi),%xmm6
- movl %ecx,48(%rsp)
- cmovleq %rsp,%r12
- subq %r12,%rbp
- movq %rbp,96(%rsp)
- vmovdqu %xmm6,256(%rsp)
- movl 56(%rdi),%ecx
- movq 40(%rdi),%r13
- cmpl %edx,%ecx
- movq 48(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 64(%rdi),%xmm7
- movl %ecx,52(%rsp)
- cmovleq %rsp,%r13
- subq %r13,%rbp
- movq %rbp,104(%rsp)
- vmovdqu %xmm7,272(%rsp)
- movl 96(%rdi),%ecx
- movq 80(%rdi),%r14
- cmpl %edx,%ecx
- movq 88(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 104(%rdi),%xmm8
- movl %ecx,56(%rsp)
- cmovleq %rsp,%r14
- subq %r14,%rbp
- movq %rbp,112(%rsp)
- vmovdqu %xmm8,288(%rsp)
- movl 136(%rdi),%ecx
- movq 120(%rdi),%r15
- cmpl %edx,%ecx
- movq 128(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 144(%rdi),%xmm9
- movl %ecx,60(%rsp)
- cmovleq %rsp,%r15
- subq %r15,%rbp
- movq %rbp,120(%rsp)
- vmovdqu %xmm9,304(%rsp)
- testl %edx,%edx
- jz .Ldec8x_done
-
- vmovups 16-120(%rsi),%xmm1
- vmovups 32-120(%rsi),%xmm0
- movl 240-120(%rsi),%eax
- leaq 192+128(%rsp),%rbp
-
- vmovdqu (%r8),%xmm2
- vmovdqu (%r9),%xmm3
- vmovdqu (%r10),%xmm4
- vmovdqu (%r11),%xmm5
- vmovdqu (%r12),%xmm6
- vmovdqu (%r13),%xmm7
- vmovdqu (%r14),%xmm8
- vmovdqu (%r15),%xmm9
- vmovdqu %xmm2,0(%rbp)
- vpxor %xmm15,%xmm2,%xmm2
- vmovdqu %xmm3,16(%rbp)
- vpxor %xmm15,%xmm3,%xmm3
- vmovdqu %xmm4,32(%rbp)
- vpxor %xmm15,%xmm4,%xmm4
- vmovdqu %xmm5,48(%rbp)
- vpxor %xmm15,%xmm5,%xmm5
- vmovdqu %xmm6,64(%rbp)
- vpxor %xmm15,%xmm6,%xmm6
- vmovdqu %xmm7,80(%rbp)
- vpxor %xmm15,%xmm7,%xmm7
- vmovdqu %xmm8,96(%rbp)
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu %xmm9,112(%rbp)
- vpxor %xmm15,%xmm9,%xmm9
- xorq $128,%rbp
- movl $1,%ecx
- jmp .Loop_dec8x
-
-.align 32
-.Loop_dec8x:
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+0(%rsp),%ecx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r8)
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r8,%rbx,1),%rbx
- cmovgeq %rsp,%r8
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r8,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r8),%xmm10
- movq %rbx,64+0(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -72(%rsi),%xmm1
- leaq 16(%r8,%rbx,1),%r8
- vmovdqu %xmm10,128(%rsp)
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+4(%rsp),%ecx
- movq 64+8(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r9)
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r9,%rbx,1),%rbx
- cmovgeq %rsp,%r9
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r9,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r9),%xmm11
- movq %rbx,64+8(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups -56(%rsi),%xmm0
- leaq 16(%r9,%rbx,1),%r9
- vmovdqu %xmm11,144(%rsp)
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+8(%rsp),%ecx
- movq 64+16(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r10)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r8)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r10,%rbx,1),%rbx
- cmovgeq %rsp,%r10
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r10,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r10),%xmm12
- movq %rbx,64+16(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -40(%rsi),%xmm1
- leaq 16(%r10,%rbx,1),%r10
- vmovdqu %xmm12,160(%rsp)
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+12(%rsp),%ecx
- movq 64+24(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r11)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r9)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r11,%rbx,1),%rbx
- cmovgeq %rsp,%r11
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r11,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r11),%xmm13
- movq %rbx,64+24(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups -24(%rsi),%xmm0
- leaq 16(%r11,%rbx,1),%r11
- vmovdqu %xmm13,176(%rsp)
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+16(%rsp),%ecx
- movq 64+32(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r12)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r10)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r12,%rbx,1),%rbx
- cmovgeq %rsp,%r12
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r12,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r12),%xmm10
- movq %rbx,64+32(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -8(%rsi),%xmm1
- leaq 16(%r12,%rbx,1),%r12
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+20(%rsp),%ecx
- movq 64+40(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r13)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r11)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%rbx,%r13,1),%rbx
- cmovgeq %rsp,%r13
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r13,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r13),%xmm11
- movq %rbx,64+40(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 8(%rsi),%xmm0
- leaq 16(%r13,%rbx,1),%r13
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+24(%rsp),%ecx
- movq 64+48(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r14)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r12)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r14,%rbx,1),%rbx
- cmovgeq %rsp,%r14
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r14,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r14),%xmm12
- movq %rbx,64+48(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 24(%rsi),%xmm1
- leaq 16(%r14,%rbx,1),%r14
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+28(%rsp),%ecx
- movq 64+56(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r15)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r13)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r15,%rbx,1),%rbx
- cmovgeq %rsp,%r15
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r15,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r15),%xmm13
- movq %rbx,64+56(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 40(%rsi),%xmm0
- leaq 16(%r15,%rbx,1),%r15
- vmovdqu 32(%rsp),%xmm14
- prefetcht0 15(%r14)
- prefetcht0 15(%r15)
- cmpl $11,%eax
- jb .Ldec8x_tail
-
- vaesdec %xmm1,%xmm2,%xmm2
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vaesdec %xmm1,%xmm7,%xmm7
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 176-120(%rsi),%xmm1
-
- vaesdec %xmm0,%xmm2,%xmm2
- vaesdec %xmm0,%xmm3,%xmm3
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- vaesdec %xmm0,%xmm6,%xmm6
- vaesdec %xmm0,%xmm7,%xmm7
- vaesdec %xmm0,%xmm8,%xmm8
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 192-120(%rsi),%xmm0
- je .Ldec8x_tail
-
- vaesdec %xmm1,%xmm2,%xmm2
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vaesdec %xmm1,%xmm7,%xmm7
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 208-120(%rsi),%xmm1
-
- vaesdec %xmm0,%xmm2,%xmm2
- vaesdec %xmm0,%xmm3,%xmm3
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- vaesdec %xmm0,%xmm6,%xmm6
- vaesdec %xmm0,%xmm7,%xmm7
- vaesdec %xmm0,%xmm8,%xmm8
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 224-120(%rsi),%xmm0
-
-.Ldec8x_tail:
- vaesdec %xmm1,%xmm2,%xmm2
- vpxor %xmm15,%xmm15,%xmm15
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vpaddd %xmm14,%xmm15,%xmm15
- vmovdqu 48(%rsp),%xmm14
- vaesdec %xmm1,%xmm7,%xmm7
- movq 64(%rsp),%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 16-120(%rsi),%xmm1
-
- vaesdeclast %xmm0,%xmm2,%xmm2
- vmovdqa %xmm15,32(%rsp)
- vpxor %xmm15,%xmm15,%xmm15
- vaesdeclast %xmm0,%xmm3,%xmm3
- vpxor 0(%rbp),%xmm2,%xmm2
- vaesdeclast %xmm0,%xmm4,%xmm4
- vpxor 16(%rbp),%xmm3,%xmm3
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesdeclast %xmm0,%xmm5,%xmm5
- vpxor 32(%rbp),%xmm4,%xmm4
- vaesdeclast %xmm0,%xmm6,%xmm6
- vpxor 48(%rbp),%xmm5,%xmm5
- vpaddd %xmm15,%xmm14,%xmm14
- vmovdqu -120(%rsi),%xmm15
- vaesdeclast %xmm0,%xmm7,%xmm7
- vpxor 64(%rbp),%xmm6,%xmm6
- vaesdeclast %xmm0,%xmm8,%xmm8
- vpxor 80(%rbp),%xmm7,%xmm7
- vmovdqa %xmm14,48(%rsp)
- vaesdeclast %xmm0,%xmm9,%xmm9
- vpxor 96(%rbp),%xmm8,%xmm8
- vmovups 32-120(%rsi),%xmm0
-
- vmovups %xmm2,-16(%r8)
- subq %rbx,%r8
- vmovdqu 128+0(%rsp),%xmm2
- vpxor 112(%rbp),%xmm9,%xmm9
- vmovups %xmm3,-16(%r9)
- subq 72(%rsp),%r9
- vmovdqu %xmm2,0(%rbp)
- vpxor %xmm15,%xmm2,%xmm2
- vmovdqu 128+16(%rsp),%xmm3
- vmovups %xmm4,-16(%r10)
- subq 80(%rsp),%r10
- vmovdqu %xmm3,16(%rbp)
- vpxor %xmm15,%xmm3,%xmm3
- vmovdqu 128+32(%rsp),%xmm4
- vmovups %xmm5,-16(%r11)
- subq 88(%rsp),%r11
- vmovdqu %xmm4,32(%rbp)
- vpxor %xmm15,%xmm4,%xmm4
- vmovdqu 128+48(%rsp),%xmm5
- vmovups %xmm6,-16(%r12)
- subq 96(%rsp),%r12
- vmovdqu %xmm5,48(%rbp)
- vpxor %xmm15,%xmm5,%xmm5
- vmovdqu %xmm10,64(%rbp)
- vpxor %xmm10,%xmm15,%xmm6
- vmovups %xmm7,-16(%r13)
- subq 104(%rsp),%r13
- vmovdqu %xmm11,80(%rbp)
- vpxor %xmm11,%xmm15,%xmm7
- vmovups %xmm8,-16(%r14)
- subq 112(%rsp),%r14
- vmovdqu %xmm12,96(%rbp)
- vpxor %xmm12,%xmm15,%xmm8
- vmovups %xmm9,-16(%r15)
- subq 120(%rsp),%r15
- vmovdqu %xmm13,112(%rbp)
- vpxor %xmm13,%xmm15,%xmm9
-
- xorq $128,%rbp
- decl %edx
- jnz .Loop_dec8x
-
- movq 16(%rsp),%rax
-
-
-
-
-
-.Ldec8x_done:
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Ldec8x_epilogue:
- .byte 0xf3,0xc3
-.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
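
The hunks above strip the AVX variants (aesni_multi_cbc_encrypt_avx / aesni_multi_cbc_decrypt_avx) out of the regenerated assembly, together with the small dispatch stubs that used to select them: the deleted prologues loaded the capability word at OPENSSL_ia32cap_P+4 and tested the AVX feature bit (268435456 = 0x10000000, CPUID.1:ECX bit 28) before jumping to the _avx_cbc_*_shortcut labels. As an illustrative sketch only, not OpenSSL's mechanism, an equivalent run-time AVX probe in C can use the GCC/Clang builtins:

    /* avx_probe.c - illustrative run-time AVX check (not OpenSSL code).
     * The deleted asm tested the cached CPUID AVX bit (0x10000000) instead. */
    #include <stdio.h>

    int main(void) {
        __builtin_cpu_init();                 /* populate the compiler's CPU feature model */
        if (__builtin_cpu_supports("avx"))
            printf("AVX available: an *_avx path could have been dispatched\n");
        else
            printf("no AVX: the baseline (non-AVX) path runs\n");
        return 0;
    }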
diff --git a/deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s
index 8804224868..edbd5cb343 100644
--- a/deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s
@@ -10,11 +10,6 @@ aesni_cbc_sha1_enc:
movq OPENSSL_ia32cap_P+4(%rip),%r11
btq $61,%r11
jc aesni_cbc_sha1_enc_shaext
- andl $268435456,%r11d
- andl $1073741824,%r10d
- orl %r11d,%r10d
- cmpl $1342177280,%r10d
- je aesni_cbc_sha1_enc_avx
jmp aesni_cbc_sha1_enc_ssse3
.byte 0xf3,0xc3
.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
@@ -1372,1304 +1367,6 @@ aesni_cbc_sha1_enc_ssse3:
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
-.type aesni_cbc_sha1_enc_avx,@function
-.align 32
-aesni_cbc_sha1_enc_avx:
- movq 8(%rsp),%r10
-
-
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- leaq -104(%rsp),%rsp
-
-
- vzeroall
- movq %rdi,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- leaq 112(%rcx),%r15
- vmovdqu (%r8),%xmm12
- movq %r8,88(%rsp)
- shlq $6,%r14
- subq %r12,%r13
- movl 240-112(%r15),%r8d
- addq %r10,%r14
-
- leaq K_XX_XX(%rip),%r11
- movl 0(%r9),%eax
- movl 4(%r9),%ebx
- movl 8(%r9),%ecx
- movl 12(%r9),%edx
- movl %ebx,%esi
- movl 16(%r9),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r11),%xmm6
- vmovdqa 0(%r11),%xmm10
- vmovdqu 0(%r10),%xmm0
- vmovdqu 16(%r10),%xmm1
- vmovdqu 32(%r10),%xmm2
- vmovdqu 48(%r10),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r10
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm10,%xmm0,%xmm4
- vpaddd %xmm10,%xmm1,%xmm5
- vpaddd %xmm10,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- jmp .Loop_avx
-.align 32
-.Loop_avx:
- shrdl $2,%ebx,%ebx
- vmovdqu 0(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm10,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- vpxor %xmm8,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm8
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm9
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpor %xmm8,%xmm4,%xmm4
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm4,%xmm4
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 12(%rsp),%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- vpxor %xmm9,%xmm4,%xmm4
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%edi
- addl 16(%rsp),%eax
- vpaddd %xmm4,%xmm10,%xmm9
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm8,%xmm8
- shrdl $7,%ebx,%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %edx,%edi
- movl %eax,%esi
- addl 20(%rsp),%ebp
- vpxor %xmm8,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ebp
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm8
- xorl %ecx,%ebx
- addl %eax,%ebp
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm9
- vpaddd %xmm5,%xmm5,%xmm5
- movl %ebp,%edi
- addl 24(%rsp),%edx
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpor %xmm8,%xmm5,%xmm5
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- andl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm5,%xmm5
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- movl %edx,%esi
- addl 28(%rsp),%ecx
- vpxor %xmm9,%xmm5,%xmm5
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vmovdqa 16(%r11),%xmm10
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%edi
- addl 32(%rsp),%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- vpaddd %xmm5,%xmm10,%xmm9
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm8
- addl %esi,%ebx
- andl %edx,%edi
- vpxor %xmm2,%xmm6,%xmm6
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm8,%xmm8
- shrdl $7,%ecx,%ecx
- xorl %ebp,%edi
- movl %ebx,%esi
- addl 36(%rsp),%eax
- vpxor %xmm8,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm8
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm9
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%edi
- addl 40(%rsp),%ebp
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpor %xmm8,%xmm6,%xmm6
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 44(%rsp),%edx
- vpxor %xmm9,%xmm6,%xmm6
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- andl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%edi
- addl 48(%rsp),%ecx
- vpaddd %xmm6,%xmm10,%xmm9
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%ebp
- addl %edx,%ecx
- vpxor %xmm5,%xmm8,%xmm8
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 52(%rsp),%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm8
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpslldq $12,%xmm7,%xmm9
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%edi
- addl 56(%rsp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpor %xmm8,%xmm7,%xmm7
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- cmpl $11,%r8d
- jb .Lvaesenclast6
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast6
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast6:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %edx,%edi
- movl %eax,%esi
- addl 60(%rsp),%ebp
- vpxor %xmm9,%xmm7,%xmm7
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %edi,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %ebp,%edi
- addl 0(%rsp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpaddd %xmm7,%xmm10,%xmm9
- addl %esi,%edx
- vmovdqu 16(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,0(%r12,%r13,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- andl %eax,%edi
- vpxor %xmm8,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- movl %edx,%esi
- addl 4(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%edi
- addl 8(%rsp),%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- vpor %xmm8,%xmm0,%xmm0
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- andl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 12(%rsp),%eax
- xorl %ebp,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm0,%xmm10,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm1,%xmm1
- addl 20(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm1,%xmm1
- addl 28(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- addl %esi,%eax
- xorl %edx,%edi
- vpaddd %xmm1,%xmm10,%xmm9
- vmovdqa 32(%r11),%xmm10
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm8,%xmm2,%xmm2
- addl 36(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm2,%xmm2
- addl 40(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpor %xmm8,%xmm2,%xmm2
- addl 44(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebx
- xorl %ebp,%edi
- vpaddd %xmm2,%xmm10,%xmm9
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm8
- vpxor %xmm0,%xmm4,%xmm4
- addl 0(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- vpaddd %xmm3,%xmm10,%xmm9
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpxor %xmm8,%xmm4,%xmm4
- addl 4(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm8
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm8,%xmm4,%xmm4
- addl 12(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm3,%xmm4,%xmm8
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpxor %xmm6,%xmm5,%xmm5
- addl %esi,%edx
- xorl %ebx,%edi
- vpaddd %xmm4,%xmm10,%xmm9
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpxor %xmm8,%xmm5,%xmm5
- addl 20(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ecx
- cmpl $11,%r8d
- jb .Lvaesenclast7
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast7
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast7:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm8,%xmm5,%xmm5
- addl 28(%rsp),%eax
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm8
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%rsp),%ebp
- vmovdqu 32(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,16(%r13,%r12,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- movl %eax,%edi
- xorl %ecx,%esi
- vpaddd %xmm5,%xmm10,%xmm9
- shldl $5,%eax,%eax
- addl %esi,%ebp
- vpxor %xmm8,%xmm6,%xmm6
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 36(%rsp),%edx
- vpsrld $30,%xmm6,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 40(%rsp),%ecx
- andl %eax,%esi
- vpor %xmm8,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%edi
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 44(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- movl %ecx,%esi
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm8
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- movl %ebx,%edi
- xorl %edx,%esi
- vpaddd %xmm6,%xmm10,%xmm9
- vmovdqa 48(%r11),%xmm10
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%rsp),%ebp
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- vpsrld $30,%xmm7,%xmm8
- vmovdqa %xmm9,32(%rsp)
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 56(%rsp),%edx
- andl %ebx,%esi
- vpor %xmm8,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%edi
- xorl %ebx,%esi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 60(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- addl 0(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- vpxor %xmm1,%xmm0,%xmm0
- movl %ecx,%edi
- xorl %ebp,%esi
- vpaddd %xmm7,%xmm10,%xmm9
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm8,%xmm0,%xmm0
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 4(%rsp),%eax
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- andl %ecx,%esi
- vpor %xmm8,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%edi
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 12(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- vpxor %xmm2,%xmm1,%xmm1
- movl %edx,%edi
- xorl %eax,%esi
- vpaddd %xmm0,%xmm10,%xmm9
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 20(%rsp),%ebx
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 24(%rsp),%eax
- andl %edx,%esi
- vpor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%edi
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%rsp),%ebp
- cmpl $11,%r8d
- jb .Lvaesenclast8
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast8
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast8:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- movl %ebp,%edi
- xorl %ebx,%esi
- vpaddd %xmm1,%xmm10,%xmm9
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vmovdqu 48(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,32(%r13,%r12,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- vpxor %xmm8,%xmm2,%xmm2
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 36(%rsp),%ecx
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 40(%rsp),%ebx
- andl %ebp,%esi
- vpor %xmm8,%xmm2,%xmm2
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- movl %ecx,%edi
- xorl %ebp,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 44(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm2,%xmm10,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 0(%rsp),%eax
- vpaddd %xmm3,%xmm10,%xmm9
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm9,48(%rsp)
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 8(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 12(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- cmpq %r14,%r10
- je .Ldone_avx
- vmovdqa 64(%r11),%xmm9
- vmovdqa 0(%r11),%xmm10
- vmovdqu 0(%r10),%xmm0
- vmovdqu 16(%r10),%xmm1
- vmovdqu 32(%r10),%xmm2
- vmovdqu 48(%r10),%xmm3
- vpshufb %xmm9,%xmm0,%xmm0
- addq $64,%r10
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- vpshufb %xmm9,%xmm1,%xmm1
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpaddd %xmm10,%xmm0,%xmm8
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm8,0(%rsp)
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- vpshufb %xmm9,%xmm2,%xmm2
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpaddd %xmm10,%xmm1,%xmm8
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vmovdqa %xmm8,16(%rsp)
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- vpshufb %xmm9,%xmm3,%xmm3
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpaddd %xmm10,%xmm2,%xmm8
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vmovdqa %xmm8,32(%rsp)
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- cmpl $11,%r8d
- jb .Lvaesenclast9
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast9
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast9:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vmovups %xmm12,48(%r13,%r12,1)
- leaq 64(%r12),%r12
-
- addl 0(%r9),%eax
- addl 4(%r9),%esi
- addl 8(%r9),%ecx
- addl 12(%r9),%edx
- movl %eax,0(%r9)
- addl 16(%r9),%ebp
- movl %esi,4(%r9)
- movl %esi,%ebx
- movl %ecx,8(%r9)
- movl %ecx,%edi
- movl %edx,12(%r9)
- xorl %edx,%edi
- movl %ebp,16(%r9)
- andl %edi,%esi
- jmp .Loop_avx
-
-.Ldone_avx:
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- cmpl $11,%r8d
- jb .Lvaesenclast10
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast10
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast10:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vmovups %xmm12,48(%r13,%r12,1)
- movq 88(%rsp),%r8
-
- addl 0(%r9),%eax
- addl 4(%r9),%esi
- addl 8(%r9),%ecx
- movl %eax,0(%r9)
- addl 12(%r9),%edx
- movl %esi,4(%r9)
- addl 16(%r9),%ebp
- movl %ecx,8(%r9)
- movl %edx,12(%r9)
- movl %ebp,16(%r9)
- vmovups %xmm12,(%r8)
- vzeroall
- leaq 104(%rsp),%rsi
- movq 0(%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -2695,8 +1392,8 @@ aesni_cbc_sha1_enc_shaext:
movups 16(%rcx),%xmm0
leaq 112(%rcx),%rcx
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
jmp .Loop_shaext
.align 16
@@ -2759,17 +1456,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm3,%xmm5
.byte 15,56,201,243
cmpl $11,%r11d
- jb .Laesenclast11
+ jb .Laesenclast6
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast11
+ je .Laesenclast6
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast11:
+.Laesenclast6:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -2825,17 +1522,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm4,%xmm6
.byte 15,56,201,220
cmpl $11,%r11d
- jb .Laesenclast12
+ jb .Laesenclast7
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast12
+ je .Laesenclast7
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast12:
+.Laesenclast7:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm9
@@ -2891,17 +1588,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm5,%xmm3
.byte 15,56,201,229
cmpl $11,%r11d
- jb .Laesenclast13
+ jb .Laesenclast8
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast13
+ je .Laesenclast8
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast13:
+.Laesenclast8:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -2955,17 +1652,17 @@ aesni_cbc_sha1_enc_shaext:
movups 48(%rcx),%xmm1
.byte 102,15,56,220,208
cmpl $11,%r11d
- jb .Laesenclast14
+ jb .Laesenclast9
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast14
+ je .Laesenclast9
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast14:
+.Laesenclast9:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
decq %rdx
@@ -2975,8 +1672,8 @@ aesni_cbc_sha1_enc_shaext:
leaq 64(%rdi),%rdi
jnz .Loop_shaext
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
movups %xmm2,(%r8)
movdqu %xmm8,(%r9)
movd %xmm9,16(%r9)
diff --git a/deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s
index 3df7f1bf52..2c85f62495 100644
--- a/deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s
@@ -5,25 +5,6 @@
.type aesni_cbc_sha256_enc,@function
.align 16
aesni_cbc_sha256_enc:
- leaq OPENSSL_ia32cap_P(%rip),%r11
- movl $1,%eax
- cmpq $0,%rdi
- je .Lprobe
- movl 0(%r11),%eax
- movq 4(%r11),%r10
- btq $61,%r10
- jc aesni_cbc_sha256_enc_shaext
- movq %r10,%r11
- shrq $32,%r11
-
- testl $2048,%r10d
- jnz aesni_cbc_sha256_enc_xop
- andl $296,%r11d
- cmpl $296,%r11d
- je aesni_cbc_sha256_enc_avx2
- andl $268435456,%r10d
- jnz aesni_cbc_sha256_enc_avx
- ud2
xorl %eax,%eax
cmpq $0,%rdi
je .Lprobe
@@ -74,4281 +55,3 @@ K256:
.long 0,0,0,0, 0,0,0,0
.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
-.type aesni_cbc_sha256_enc_xop,@function
-.align 64
-aesni_cbc_sha256_enc_xop:
-.Lxop_shortcut:
- movq 8(%rsp),%r10
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $128,%rsp
- andq $-64,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
-.Lprologue_xop:
- vzeroall
-
- movq %rdi,%r12
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r13
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- subq $9,%r14
-
- movl 0(%r15),%eax
- movl 4(%r15),%ebx
- movl 8(%r15),%ecx
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
-
- vmovdqa 0(%r13,%r14,8),%xmm14
- vmovdqa 16(%r13,%r14,8),%xmm13
- vmovdqa 32(%r13,%r14,8),%xmm12
- vmovdqu 0-128(%rdi),%xmm10
- jmp .Lloop_xop
-.align 16
-.Lloop_xop:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi,%r12,1),%xmm0
- vmovdqu 16(%rsi,%r12,1),%xmm1
- vmovdqu 32(%rsi,%r12,1),%xmm2
- vmovdqu 48(%rsi,%r12,1),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%esi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%esi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- subq $-32*4,%rbp
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- vpalignr $4,%xmm0,%xmm1,%xmm4
- rorl $14,%r13d
- movl %r14d,%eax
- vpalignr $4,%xmm2,%xmm3,%xmm7
- movl %r9d,%r12d
- xorl %r8d,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %r10d,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %eax,%r14d
- vpaddd %xmm7,%xmm0,%xmm0
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %r10d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
-.byte 143,232,120,194,251,13
- xorl %eax,%r14d
- addl %r13d,%r11d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%esi
- addl %r11d,%edx
- vpsrld $10,%xmm3,%xmm6
- rorl $2,%r14d
- addl %esi,%r11d
- vpaddd %xmm4,%xmm0,%xmm0
- movl %edx,%r13d
- addl %r11d,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%r11d
- vpxor %xmm6,%xmm7,%xmm7
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
-.byte 143,232,120,194,248,13
- xorl %r11d,%r14d
- addl %r13d,%r10d
- vpsrld $10,%xmm0,%xmm6
- xorl %eax,%r15d
- addl %r10d,%ecx
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%r10d
- vpxor %xmm6,%xmm7,%xmm7
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- vpxor %xmm5,%xmm7,%xmm7
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- vpaddd %xmm7,%xmm0,%xmm0
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- vpaddd 0(%rbp),%xmm0,%xmm6
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- rorl $14,%r13d
- movl %r14d,%r8d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- movl %ebx,%r12d
- xorl %eax,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %ecx,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %r8d,%r14d
- vpaddd %xmm7,%xmm1,%xmm1
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %ecx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
-.byte 143,232,120,194,248,13
- xorl %r8d,%r14d
- addl %r13d,%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r9d,%esi
- addl %edx,%r11d
- vpsrld $10,%xmm0,%xmm6
- rorl $2,%r14d
- addl %esi,%edx
- vpaddd %xmm4,%xmm1,%xmm1
- movl %r11d,%r13d
- addl %edx,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%edx
- vpxor %xmm6,%xmm7,%xmm7
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 20(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
-.byte 143,232,120,194,249,13
- xorl %edx,%r14d
- addl %r13d,%ecx
- vpsrld $10,%xmm1,%xmm6
- xorl %r8d,%r15d
- addl %ecx,%r10d
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%ecx
- vpxor %xmm6,%xmm7,%xmm7
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- vpxor %xmm5,%xmm7,%xmm7
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- vpaddd %xmm7,%xmm1,%xmm1
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- vpaddd 32(%rbp),%xmm1,%xmm6
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- rorl $14,%r13d
- movl %r14d,%eax
- vpalignr $4,%xmm0,%xmm1,%xmm7
- movl %r9d,%r12d
- xorl %r8d,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %r10d,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %eax,%r14d
- vpaddd %xmm7,%xmm2,%xmm2
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %r10d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
-.byte 143,232,120,194,249,13
- xorl %eax,%r14d
- addl %r13d,%r11d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%esi
- addl %r11d,%edx
- vpsrld $10,%xmm1,%xmm6
- rorl $2,%r14d
- addl %esi,%r11d
- vpaddd %xmm4,%xmm2,%xmm2
- movl %edx,%r13d
- addl %r11d,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%r11d
- vpxor %xmm6,%xmm7,%xmm7
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
-.byte 143,232,120,194,250,13
- xorl %r11d,%r14d
- addl %r13d,%r10d
- vpsrld $10,%xmm2,%xmm6
- xorl %eax,%r15d
- addl %r10d,%ecx
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%r10d
- vpxor %xmm6,%xmm7,%xmm7
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- vpxor %xmm5,%xmm7,%xmm7
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- vpaddd %xmm7,%xmm2,%xmm2
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- vpaddd 64(%rbp),%xmm2,%xmm6
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- rorl $14,%r13d
- movl %r14d,%r8d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- movl %ebx,%r12d
- xorl %eax,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %ecx,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %r8d,%r14d
- vpaddd %xmm7,%xmm3,%xmm3
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %ecx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
-.byte 143,232,120,194,250,13
- xorl %r8d,%r14d
- addl %r13d,%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r9d,%esi
- addl %edx,%r11d
- vpsrld $10,%xmm2,%xmm6
- rorl $2,%r14d
- addl %esi,%edx
- vpaddd %xmm4,%xmm3,%xmm3
- movl %r11d,%r13d
- addl %edx,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%edx
- vpxor %xmm6,%xmm7,%xmm7
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 52(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
-.byte 143,232,120,194,251,13
- xorl %edx,%r14d
- addl %r13d,%ecx
- vpsrld $10,%xmm3,%xmm6
- xorl %r8d,%r15d
- addl %ecx,%r10d
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%ecx
- vpxor %xmm6,%xmm7,%xmm7
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- vpxor %xmm5,%xmm7,%xmm7
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- vpaddd %xmm7,%xmm3,%xmm3
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- vpaddd 96(%rbp),%xmm3,%xmm6
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- movq 64+0(%rsp),%r12
- vpand %xmm14,%xmm11,%xmm11
- movq 64+8(%rsp),%r15
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r12,1)
- leaq 16(%r12),%r12
- cmpb $0,131(%rbp)
- jne .Lxop_00_47
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- rorl $14,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- rorl $11,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- rorl $2,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- rorl $14,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- rorl $2,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- rorl $14,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- rorl $11,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- rorl $2,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- rorl $14,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- rorl $2,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- rorl $14,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- rorl $11,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- rorl $2,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- rorl $14,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- rorl $2,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- rorl $14,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- rorl $11,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- rorl $2,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- rorl $14,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- rorl $2,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%r12
- movq 64+8(%rsp),%r13
- movq 64+40(%rsp),%r15
- movq 64+48(%rsp),%rsi
-
- vpand %xmm14,%xmm11,%xmm11
- movl %r14d,%eax
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r12),%r12
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r12
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- jb .Lloop_xop
-
- movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_xop:
- .byte 0xf3,0xc3
-.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop
-.type aesni_cbc_sha256_enc_avx,@function
-.align 64
-aesni_cbc_sha256_enc_avx:
-.Lavx_shortcut:
- movq 8(%rsp),%r10
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $128,%rsp
- andq $-64,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
-.Lprologue_avx:
- vzeroall
-
- movq %rdi,%r12
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r13
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- subq $9,%r14
-
- movl 0(%r15),%eax
- movl 4(%r15),%ebx
- movl 8(%r15),%ecx
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
-
- vmovdqa 0(%r13,%r14,8),%xmm14
- vmovdqa 16(%r13,%r14,8),%xmm13
- vmovdqa 32(%r13,%r14,8),%xmm12
- vmovdqu 0-128(%rdi),%xmm10
- jmp .Lloop_avx
-.align 16
-.Lloop_avx:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi,%r12,1),%xmm0
- vmovdqu 16(%rsi,%r12,1),%xmm1
- vmovdqu 32(%rsi,%r12,1),%xmm2
- vmovdqu 48(%rsi,%r12,1),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%esi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%esi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp .Lavx_00_47
-
-.align 16
-.Lavx_00_47:
- subq $-32*4,%rbp
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- vpalignr $4,%xmm0,%xmm1,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm2,%xmm3,%xmm7
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- vpshufd $250,%xmm3,%xmm7
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm0,%xmm0
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- vpaddd %xmm6,%xmm0,%xmm0
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- vpshufd $80,%xmm0,%xmm7
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpsrlq $17,%xmm7,%xmm7
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- vpaddd %xmm6,%xmm0,%xmm0
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- vpaddd 0(%rbp),%xmm0,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- vpshufd $250,%xmm0,%xmm7
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 20(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm1,%xmm1
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- vpaddd %xmm6,%xmm1,%xmm1
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- vpshufd $80,%xmm1,%xmm7
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpsrlq $17,%xmm7,%xmm7
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- vpaddd %xmm6,%xmm1,%xmm1
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- vpaddd 32(%rbp),%xmm1,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm0,%xmm1,%xmm7
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- vpshufd $250,%xmm1,%xmm7
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm2,%xmm2
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- vpaddd %xmm6,%xmm2,%xmm2
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- vpshufd $80,%xmm2,%xmm7
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpsrlq $17,%xmm7,%xmm7
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- vpaddd %xmm6,%xmm2,%xmm2
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- vpaddd 64(%rbp),%xmm2,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- vpshufd $250,%xmm2,%xmm7
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 52(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm3,%xmm3
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- vpaddd %xmm6,%xmm3,%xmm3
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- vpshufd $80,%xmm3,%xmm7
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpsrlq $17,%xmm7,%xmm7
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpslldq $8,%xmm6,%xmm6
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- vpaddd %xmm6,%xmm3,%xmm3
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- vpaddd 96(%rbp),%xmm3,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- movq 64+0(%rsp),%r12
- vpand %xmm14,%xmm11,%xmm11
- movq 64+8(%rsp),%r15
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r12,1)
- leaq 16(%r12),%r12
- cmpb $0,131(%rbp)
- jne .Lavx_00_47
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%r12
- movq 64+8(%rsp),%r13
- movq 64+40(%rsp),%r15
- movq 64+48(%rsp),%rsi
-
- vpand %xmm14,%xmm11,%xmm11
- movl %r14d,%eax
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r12),%r12
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r12
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
- jb .Lloop_avx
-
- movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx
-.type aesni_cbc_sha256_enc_avx2,@function
-.align 64
-aesni_cbc_sha256_enc_avx2:
-.Lavx2_shortcut:
- movq 8(%rsp),%r10
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $576,%rsp
- andq $-1024,%rsp
- addq $448,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
-
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
-.Lprologue_avx2:
- vzeroall
-
- movq %rdi,%r13
- vpinsrq $1,%rsi,%xmm15,%xmm15
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r12
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- leaq -9(%r14),%r14
-
- vmovdqa 0(%r12,%r14,8),%xmm14
- vmovdqa 16(%r12,%r14,8),%xmm13
- vmovdqa 32(%r12,%r14,8),%xmm12
-
- subq $-64,%r13
- movl 0(%r15),%eax
- leaq (%rsi,%r13,1),%r12
- movl 4(%r15),%ebx
- cmpq %rdx,%r13
- movl 8(%r15),%ecx
- cmoveq %rsp,%r12
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
- vmovdqu 0-128(%rdi),%xmm10
- jmp .Loop_avx2
-.align 16
-.Loop_avx2:
- vmovdqa K256+512(%rip),%ymm7
- vmovdqu -64+0(%rsi,%r13,1),%xmm0
- vmovdqu -64+16(%rsi,%r13,1),%xmm1
- vmovdqu -64+32(%rsi,%r13,1),%xmm2
- vmovdqu -64+48(%rsi,%r13,1),%xmm3
-
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm7,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm7,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
-
- leaq K256(%rip),%rbp
- vpshufb %ymm7,%ymm2,%ymm2
- leaq -64(%r13),%r13
- vpaddd 0(%rbp),%ymm0,%ymm4
- vpshufb %ymm7,%ymm3,%ymm3
- vpaddd 32(%rbp),%ymm1,%ymm5
- vpaddd 64(%rbp),%ymm2,%ymm6
- vpaddd 96(%rbp),%ymm3,%ymm7
- vmovdqa %ymm4,0(%rsp)
- xorl %r14d,%r14d
- vmovdqa %ymm5,32(%rsp)
- leaq -64(%rsp),%rsp
- movl %ebx,%esi
- vmovdqa %ymm6,0(%rsp)
- xorl %ecx,%esi
- vmovdqa %ymm7,32(%rsp)
- movl %r9d,%r12d
- subq $-32*4,%rbp
- jmp .Lavx2_00_47
-
-.align 16
-.Lavx2_00_47:
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- leaq -64(%rsp),%rsp
- vpalignr $4,%ymm0,%ymm1,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm2,%ymm3,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm0,%ymm0
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- vpshufd $250,%ymm3,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm0,%ymm0
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpshufd $80,%ymm0,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpsrlq $2,%ymm7,%ymm7
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- vpaddd %ymm6,%ymm0,%ymm0
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- vpaddd 0(%rbp),%ymm0,%ymm6
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm1,%ymm2,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm3,%ymm0,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm1,%ymm1
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- vpshufd $250,%ymm0,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm1,%ymm1
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpshufd $80,%ymm1,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpsrlq $2,%ymm7,%ymm7
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- vpaddd %ymm6,%ymm1,%ymm1
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- vpaddd 32(%rbp),%ymm1,%ymm6
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq -64(%rsp),%rsp
- vpalignr $4,%ymm2,%ymm3,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm0,%ymm1,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm2,%ymm2
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- vpshufd $250,%ymm1,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm2,%ymm2
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpshufd $80,%ymm2,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpsrlq $2,%ymm7,%ymm7
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- vpaddd %ymm6,%ymm2,%ymm2
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- vpaddd 64(%rbp),%ymm2,%ymm6
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm3,%ymm0,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm1,%ymm2,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm3,%ymm3
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- vpshufd $250,%ymm2,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm3,%ymm3
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpshufd $80,%ymm3,%ymm7
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpsrlq $2,%ymm7,%ymm7
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- vpaddd %ymm6,%ymm3,%ymm3
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- vpaddd 96(%rbp),%ymm3,%ymm6
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- vmovq %xmm15,%r13
- vpextrq $1,%xmm15,%r15
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r13,1)
- leaq 16(%r13),%r13
- leaq 128(%rbp),%rbp
- cmpb $0,3(%rbp)
- jne .Lavx2_00_47
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- addl 0+64(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+64(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+64(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+64(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+64(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+64(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+64(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+64(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- addl 0(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vpextrq $1,%xmm15,%r12
- vmovq %xmm15,%r13
- movq 552(%rsp),%r15
- addl %r14d,%eax
- leaq 448(%rsp),%rbp
-
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r13),%r13
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- cmpq 80(%rbp),%r13
- je .Ldone_avx2
-
- xorl %r14d,%r14d
- movl %ebx,%esi
- movl %r9d,%r12d
- xorl %ecx,%esi
- jmp .Lower_avx2
-.align 16
-.Lower_avx2:
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- leaq -64(%rbp),%rbp
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovq %xmm15,%r13
- vpextrq $1,%xmm15,%r15
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- leaq -64(%rbp),%rbp
- vmovdqu %xmm8,(%r15,%r13,1)
- leaq 16(%r13),%r13
- cmpq %rsp,%rbp
- jae .Lower_avx2
-
- movq 552(%rsp),%r15
- leaq 64(%r13),%r13
- movq 560(%rsp),%rsi
- addl %r14d,%eax
- leaq 448(%rsp),%rsp
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- leaq (%rsi,%r13,1),%r12
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r13
-
- movl %eax,0(%r15)
- cmoveq %rsp,%r12
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- jbe .Loop_avx2
- leaq (%rsp),%rbp
-
-.Ldone_avx2:
- leaq (%rbp),%rsp
- movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2
-.type aesni_cbc_sha256_enc_shaext,@function
-.align 32
-aesni_cbc_sha256_enc_shaext:
- movq 8(%rsp),%r10
- leaq K256+128(%rip),%rax
- movdqu (%r9),%xmm1
- movdqu 16(%r9),%xmm2
- movdqa 512-128(%rax),%xmm3
-
- movl 240(%rcx),%r11d
- subq %rdi,%rsi
- movups (%rcx),%xmm15
- movups 16(%rcx),%xmm4
- leaq 112(%rcx),%rcx
-
- pshufd $27,%xmm1,%xmm0
- pshufd $177,%xmm1,%xmm1
- pshufd $27,%xmm2,%xmm2
- movdqa %xmm3,%xmm7
-.byte 102,15,58,15,202,8
- punpcklqdq %xmm0,%xmm2
-
- jmp .Loop_shaext
-
-.align 16
-.Loop_shaext:
- movdqu (%r10),%xmm10
- movdqu 16(%r10),%xmm11
- movdqu 32(%r10),%xmm12
-.byte 102,68,15,56,0,211
- movdqu 48(%r10),%xmm13
-
- movdqa 0-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 102,68,15,56,0,219
- movdqa %xmm2,%xmm9
- movdqa %xmm1,%xmm8
- movups 0(%rdi),%xmm14
- xorps %xmm15,%xmm14
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 32-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 102,68,15,56,0,227
- leaq 64(%r10),%r10
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 64-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 102,68,15,56,0,235
-.byte 69,15,56,204,211
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 96-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
-.byte 15,56,203,202
- movdqa 128-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- cmpl $11,%r11d
- jb .Laesenclast1
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je .Laesenclast1
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.Laesenclast1:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,202
- movups 16(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,0(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movdqa 160-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 192-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
-.byte 69,15,56,204,211
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 224-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 256-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- cmpl $11,%r11d
- jb .Laesenclast2
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je .Laesenclast2
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.Laesenclast2:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,202
- movups 32(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,16(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movdqa 288-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 320-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
-.byte 69,15,56,204,211
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 352-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 384-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 416-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- cmpl $11,%r11d
- jb .Laesenclast3
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je .Laesenclast3
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.Laesenclast3:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups 48(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,32(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 448-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
- movdqa %xmm7,%xmm3
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 480-128(%rax),%xmm0
- paddd %xmm13,%xmm0
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
-
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- cmpl $11,%r11d
- jb .Laesenclast4
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je .Laesenclast4
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.Laesenclast4:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-
- paddd %xmm9,%xmm2
- paddd %xmm8,%xmm1
-
- decq %rdx
- movups %xmm6,48(%rsi,%rdi,1)
- leaq 64(%rdi),%rdi
- jnz .Loop_shaext
-
- pshufd $177,%xmm2,%xmm2
- pshufd $27,%xmm1,%xmm3
- pshufd $177,%xmm1,%xmm1
- punpckhqdq %xmm2,%xmm1
-.byte 102,15,58,15,211,8
-
- movups %xmm6,(%r8)
- movdqu %xmm1,(%r9)
- movdqu %xmm2,16(%r9)
- .byte 0xf3,0xc3
-.size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext
diff --git a/deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s b/deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s
index 8f356fc3d5..d8b8bd8de5 100644
--- a/deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s
+++ b/deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s
@@ -1,1632 +1,25 @@
.text
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,@function
+rsaz_avx2_eligible:
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
.globl rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.globl rsaz_1024_norm2red_avx2
+.globl rsaz_1024_red2norm_avx2
+.globl rsaz_1024_scatter5_avx2
+.globl rsaz_1024_gather5_avx2
.type rsaz_1024_sqr_avx2,@function
-.align 64
rsaz_1024_sqr_avx2:
- leaq (%rsp),%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- vzeroupper
- movq %rax,%rbp
- movq %rdx,%r13
- subq $832,%rsp
- movq %r13,%r15
- subq $-128,%rdi
- subq $-128,%rsi
- subq $-128,%r13
-
- andq $4095,%r15
- addq $320,%r15
- shrq $12,%r15
- vpxor %ymm9,%ymm9,%ymm9
- jz .Lsqr_1024_no_n_copy
-
-
-
-
-
- subq $320,%rsp
- vmovdqu 0-128(%r13),%ymm0
- andq $-2048,%rsp
- vmovdqu 32-128(%r13),%ymm1
- vmovdqu 64-128(%r13),%ymm2
- vmovdqu 96-128(%r13),%ymm3
- vmovdqu 128-128(%r13),%ymm4
- vmovdqu 160-128(%r13),%ymm5
- vmovdqu 192-128(%r13),%ymm6
- vmovdqu 224-128(%r13),%ymm7
- vmovdqu 256-128(%r13),%ymm8
- leaq 832+128(%rsp),%r13
- vmovdqu %ymm0,0-128(%r13)
- vmovdqu %ymm1,32-128(%r13)
- vmovdqu %ymm2,64-128(%r13)
- vmovdqu %ymm3,96-128(%r13)
- vmovdqu %ymm4,128-128(%r13)
- vmovdqu %ymm5,160-128(%r13)
- vmovdqu %ymm6,192-128(%r13)
- vmovdqu %ymm7,224-128(%r13)
- vmovdqu %ymm8,256-128(%r13)
- vmovdqu %ymm9,288-128(%r13)
-
-.Lsqr_1024_no_n_copy:
- andq $-1024,%rsp
-
- vmovdqu 32-128(%rsi),%ymm1
- vmovdqu 64-128(%rsi),%ymm2
- vmovdqu 96-128(%rsi),%ymm3
- vmovdqu 128-128(%rsi),%ymm4
- vmovdqu 160-128(%rsi),%ymm5
- vmovdqu 192-128(%rsi),%ymm6
- vmovdqu 224-128(%rsi),%ymm7
- vmovdqu 256-128(%rsi),%ymm8
-
- leaq 192(%rsp),%rbx
- vpbroadcastq .Land_mask(%rip),%ymm15
- jmp .LOOP_GRANDE_SQR_1024
-
-.align 32
-.LOOP_GRANDE_SQR_1024:
- leaq 576+128(%rsp),%r9
- leaq 448(%rsp),%r12
-
-
-
-
- vpaddq %ymm1,%ymm1,%ymm1
- vpbroadcastq 0-128(%rsi),%ymm10
- vpaddq %ymm2,%ymm2,%ymm2
- vmovdqa %ymm1,0-128(%r9)
- vpaddq %ymm3,%ymm3,%ymm3
- vmovdqa %ymm2,32-128(%r9)
- vpaddq %ymm4,%ymm4,%ymm4
- vmovdqa %ymm3,64-128(%r9)
- vpaddq %ymm5,%ymm5,%ymm5
- vmovdqa %ymm4,96-128(%r9)
- vpaddq %ymm6,%ymm6,%ymm6
- vmovdqa %ymm5,128-128(%r9)
- vpaddq %ymm7,%ymm7,%ymm7
- vmovdqa %ymm6,160-128(%r9)
- vpaddq %ymm8,%ymm8,%ymm8
- vmovdqa %ymm7,192-128(%r9)
- vpxor %ymm9,%ymm9,%ymm9
- vmovdqa %ymm8,224-128(%r9)
-
- vpmuludq 0-128(%rsi),%ymm10,%ymm0
- vpbroadcastq 32-128(%rsi),%ymm11
- vmovdqu %ymm9,288-192(%rbx)
- vpmuludq %ymm10,%ymm1,%ymm1
- vmovdqu %ymm9,320-448(%r12)
- vpmuludq %ymm10,%ymm2,%ymm2
- vmovdqu %ymm9,352-448(%r12)
- vpmuludq %ymm10,%ymm3,%ymm3
- vmovdqu %ymm9,384-448(%r12)
- vpmuludq %ymm10,%ymm4,%ymm4
- vmovdqu %ymm9,416-448(%r12)
- vpmuludq %ymm10,%ymm5,%ymm5
- vmovdqu %ymm9,448-448(%r12)
- vpmuludq %ymm10,%ymm6,%ymm6
- vmovdqu %ymm9,480-448(%r12)
- vpmuludq %ymm10,%ymm7,%ymm7
- vmovdqu %ymm9,512-448(%r12)
- vpmuludq %ymm10,%ymm8,%ymm8
- vpbroadcastq 64-128(%rsi),%ymm10
- vmovdqu %ymm9,544-448(%r12)
-
- movq %rsi,%r15
- movl $4,%r14d
- jmp .Lsqr_entry_1024
-.align 32
-.LOOP_SQR_1024:
- vpbroadcastq 32-128(%r15),%ymm11
- vpmuludq 0-128(%rsi),%ymm10,%ymm0
- vpaddq 0-192(%rbx),%ymm0,%ymm0
- vpmuludq 0-128(%r9),%ymm10,%ymm1
- vpaddq 32-192(%rbx),%ymm1,%ymm1
- vpmuludq 32-128(%r9),%ymm10,%ymm2
- vpaddq 64-192(%rbx),%ymm2,%ymm2
- vpmuludq 64-128(%r9),%ymm10,%ymm3
- vpaddq 96-192(%rbx),%ymm3,%ymm3
- vpmuludq 96-128(%r9),%ymm10,%ymm4
- vpaddq 128-192(%rbx),%ymm4,%ymm4
- vpmuludq 128-128(%r9),%ymm10,%ymm5
- vpaddq 160-192(%rbx),%ymm5,%ymm5
- vpmuludq 160-128(%r9),%ymm10,%ymm6
- vpaddq 192-192(%rbx),%ymm6,%ymm6
- vpmuludq 192-128(%r9),%ymm10,%ymm7
- vpaddq 224-192(%rbx),%ymm7,%ymm7
- vpmuludq 224-128(%r9),%ymm10,%ymm8
- vpbroadcastq 64-128(%r15),%ymm10
- vpaddq 256-192(%rbx),%ymm8,%ymm8
-.Lsqr_entry_1024:
- vmovdqu %ymm0,0-192(%rbx)
- vmovdqu %ymm1,32-192(%rbx)
-
- vpmuludq 32-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 32-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq 64-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 96-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 128-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq 160-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 192-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 224-128(%r9),%ymm11,%ymm0
- vpbroadcastq 96-128(%r15),%ymm11
- vpaddq 288-192(%rbx),%ymm0,%ymm0
-
- vmovdqu %ymm2,64-192(%rbx)
- vmovdqu %ymm3,96-192(%rbx)
-
- vpmuludq 64-128(%rsi),%ymm10,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 64-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 96-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq 128-128(%r9),%ymm10,%ymm13
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 160-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 192-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm0,%ymm0
- vpmuludq 224-128(%r9),%ymm10,%ymm1
- vpbroadcastq 128-128(%r15),%ymm10
- vpaddq 320-448(%r12),%ymm1,%ymm1
-
- vmovdqu %ymm4,128-192(%rbx)
- vmovdqu %ymm5,160-192(%rbx)
-
- vpmuludq 96-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm6,%ymm6
- vpmuludq 96-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm7,%ymm7
- vpmuludq 128-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm8,%ymm8
- vpmuludq 160-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm0,%ymm0
- vpmuludq 192-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm1,%ymm1
- vpmuludq 224-128(%r9),%ymm11,%ymm2
- vpbroadcastq 160-128(%r15),%ymm11
- vpaddq 352-448(%r12),%ymm2,%ymm2
-
- vmovdqu %ymm6,192-192(%rbx)
- vmovdqu %ymm7,224-192(%rbx)
-
- vpmuludq 128-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 128-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm0,%ymm0
- vpmuludq 160-128(%r9),%ymm10,%ymm13
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 192-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 224-128(%r9),%ymm10,%ymm3
- vpbroadcastq 192-128(%r15),%ymm10
- vpaddq 384-448(%r12),%ymm3,%ymm3
-
- vmovdqu %ymm8,256-192(%rbx)
- vmovdqu %ymm0,288-192(%rbx)
- leaq 8(%rbx),%rbx
-
- vpmuludq 160-128(%rsi),%ymm11,%ymm13
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 160-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 192-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq 224-128(%r9),%ymm11,%ymm4
- vpbroadcastq 224-128(%r15),%ymm11
- vpaddq 416-448(%r12),%ymm4,%ymm4
-
- vmovdqu %ymm1,320-448(%r12)
- vmovdqu %ymm2,352-448(%r12)
-
- vpmuludq 192-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm3,%ymm3
- vpmuludq 192-128(%r9),%ymm10,%ymm14
- vpbroadcastq 256-128(%r15),%ymm0
- vpaddq %ymm14,%ymm4,%ymm4
- vpmuludq 224-128(%r9),%ymm10,%ymm5
- vpbroadcastq 0+8-128(%r15),%ymm10
- vpaddq 448-448(%r12),%ymm5,%ymm5
-
- vmovdqu %ymm3,384-448(%r12)
- vmovdqu %ymm4,416-448(%r12)
- leaq 8(%r15),%r15
-
- vpmuludq 224-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 224-128(%r9),%ymm11,%ymm6
- vpaddq 480-448(%r12),%ymm6,%ymm6
-
- vpmuludq 256-128(%rsi),%ymm0,%ymm7
- vmovdqu %ymm5,448-448(%r12)
- vpaddq 512-448(%r12),%ymm7,%ymm7
- vmovdqu %ymm6,480-448(%r12)
- vmovdqu %ymm7,512-448(%r12)
- leaq 8(%r12),%r12
-
- decl %r14d
- jnz .LOOP_SQR_1024
-
- vmovdqu 256(%rsp),%ymm8
- vmovdqu 288(%rsp),%ymm1
- vmovdqu 320(%rsp),%ymm2
- leaq 192(%rsp),%rbx
-
- vpsrlq $29,%ymm8,%ymm14
- vpand %ymm15,%ymm8,%ymm8
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
-
- vpermq $147,%ymm14,%ymm14
- vpxor %ymm9,%ymm9,%ymm9
- vpermq $147,%ymm11,%ymm11
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm8,%ymm8
- vpblendd $3,%ymm11,%ymm9,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vpaddq %ymm11,%ymm2,%ymm2
- vmovdqu %ymm1,288-192(%rbx)
- vmovdqu %ymm2,320-192(%rbx)
-
- movq (%rsp),%rax
- movq 8(%rsp),%r10
- movq 16(%rsp),%r11
- movq 24(%rsp),%r12
- vmovdqu 32(%rsp),%ymm1
- vmovdqu 64-192(%rbx),%ymm2
- vmovdqu 96-192(%rbx),%ymm3
- vmovdqu 128-192(%rbx),%ymm4
- vmovdqu 160-192(%rbx),%ymm5
- vmovdqu 192-192(%rbx),%ymm6
- vmovdqu 224-192(%rbx),%ymm7
-
- movq %rax,%r9
- imull %ecx,%eax
- andl $536870911,%eax
- vmovd %eax,%xmm12
-
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpbroadcastq %xmm12,%ymm12
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- shrq $29,%r9
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- addq %r9,%r10
- addq %rax,%r11
- imulq 24-128(%r13),%rdx
- addq %rdx,%r12
-
- movq %r10,%rax
- imull %ecx,%eax
- andl $536870911,%eax
-
- movl $9,%r14d
- jmp .LOOP_REDUCE_1024
-
-.align 32
-.LOOP_REDUCE_1024:
- vmovd %eax,%xmm13
- vpbroadcastq %xmm13,%ymm13
-
- vpmuludq 32-128(%r13),%ymm12,%ymm10
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpaddq %ymm10,%ymm1,%ymm1
- addq %rax,%r10
- vpmuludq 64-128(%r13),%ymm12,%ymm14
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- vpaddq %ymm14,%ymm2,%ymm2
- vpmuludq 96-128(%r13),%ymm12,%ymm11
-.byte 0x67
- addq %rax,%r11
-.byte 0x67
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- shrq $29,%r10
- vpaddq %ymm11,%ymm3,%ymm3
- vpmuludq 128-128(%r13),%ymm12,%ymm10
- addq %rax,%r12
- addq %r10,%r11
- vpaddq %ymm10,%ymm4,%ymm4
- vpmuludq 160-128(%r13),%ymm12,%ymm14
- movq %r11,%rax
- imull %ecx,%eax
- vpaddq %ymm14,%ymm5,%ymm5
- vpmuludq 192-128(%r13),%ymm12,%ymm11
- andl $536870911,%eax
- vpaddq %ymm11,%ymm6,%ymm6
- vpmuludq 224-128(%r13),%ymm12,%ymm10
- vpaddq %ymm10,%ymm7,%ymm7
- vpmuludq 256-128(%r13),%ymm12,%ymm14
- vmovd %eax,%xmm12
-
- vpaddq %ymm14,%ymm8,%ymm8
-
- vpbroadcastq %xmm12,%ymm12
-
- vpmuludq 32-8-128(%r13),%ymm13,%ymm11
- vmovdqu 96-8-128(%r13),%ymm14
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpaddq %ymm11,%ymm1,%ymm1
- vpmuludq 64-8-128(%r13),%ymm13,%ymm10
- vmovdqu 128-8-128(%r13),%ymm11
- addq %rax,%r11
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- vpaddq %ymm10,%ymm2,%ymm2
- addq %r12,%rax
- shrq $29,%r11
- vpmuludq %ymm13,%ymm14,%ymm14
- vmovdqu 160-8-128(%r13),%ymm10
- addq %r11,%rax
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq %ymm13,%ymm11,%ymm11
- vmovdqu 192-8-128(%r13),%ymm14
-.byte 0x67
- movq %rax,%r12
- imull %ecx,%eax
- vpaddq %ymm11,%ymm4,%ymm4
- vpmuludq %ymm13,%ymm10,%ymm10
-.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
- andl $536870911,%eax
- vpaddq %ymm10,%ymm5,%ymm5
- vpmuludq %ymm13,%ymm14,%ymm14
- vmovdqu 256-8-128(%r13),%ymm10
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq %ymm13,%ymm11,%ymm11
- vmovdqu 288-8-128(%r13),%ymm9
- vmovd %eax,%xmm0
- imulq -128(%r13),%rax
- vpaddq %ymm11,%ymm7,%ymm7
- vpmuludq %ymm13,%ymm10,%ymm10
- vmovdqu 32-16-128(%r13),%ymm14
- vpbroadcastq %xmm0,%ymm0
- vpaddq %ymm10,%ymm8,%ymm8
- vpmuludq %ymm13,%ymm9,%ymm9
- vmovdqu 64-16-128(%r13),%ymm11
- addq %rax,%r12
-
- vmovdqu 32-24-128(%r13),%ymm13
- vpmuludq %ymm12,%ymm14,%ymm14
- vmovdqu 96-16-128(%r13),%ymm10
- vpaddq %ymm14,%ymm1,%ymm1
- vpmuludq %ymm0,%ymm13,%ymm13
- vpmuludq %ymm12,%ymm11,%ymm11
-.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
- vpaddq %ymm1,%ymm13,%ymm13
- vpaddq %ymm11,%ymm2,%ymm2
- vpmuludq %ymm12,%ymm10,%ymm10
- vmovdqu 160-16-128(%r13),%ymm11
-.byte 0x67
- vmovq %xmm13,%rax
- vmovdqu %ymm13,(%rsp)
- vpaddq %ymm10,%ymm3,%ymm3
- vpmuludq %ymm12,%ymm14,%ymm14
- vmovdqu 192-16-128(%r13),%ymm10
- vpaddq %ymm14,%ymm4,%ymm4
- vpmuludq %ymm12,%ymm11,%ymm11
- vmovdqu 224-16-128(%r13),%ymm14
- vpaddq %ymm11,%ymm5,%ymm5
- vpmuludq %ymm12,%ymm10,%ymm10
- vmovdqu 256-16-128(%r13),%ymm11
- vpaddq %ymm10,%ymm6,%ymm6
- vpmuludq %ymm12,%ymm14,%ymm14
- shrq $29,%r12
- vmovdqu 288-16-128(%r13),%ymm10
- addq %r12,%rax
- vpaddq %ymm14,%ymm7,%ymm7
- vpmuludq %ymm12,%ymm11,%ymm11
-
- movq %rax,%r9
- imull %ecx,%eax
- vpaddq %ymm11,%ymm8,%ymm8
- vpmuludq %ymm12,%ymm10,%ymm10
- andl $536870911,%eax
- vmovd %eax,%xmm12
- vmovdqu 96-24-128(%r13),%ymm11
-.byte 0x67
- vpaddq %ymm10,%ymm9,%ymm9
- vpbroadcastq %xmm12,%ymm12
-
- vpmuludq 64-24-128(%r13),%ymm0,%ymm14
- vmovdqu 128-24-128(%r13),%ymm10
- movq %rax,%rdx
- imulq -128(%r13),%rax
- movq 8(%rsp),%r10
- vpaddq %ymm14,%ymm2,%ymm1
- vpmuludq %ymm0,%ymm11,%ymm11
- vmovdqu 160-24-128(%r13),%ymm14
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
-.byte 0x67
- shrq $29,%r9
- movq 16(%rsp),%r11
- vpaddq %ymm11,%ymm3,%ymm2
- vpmuludq %ymm0,%ymm10,%ymm10
- vmovdqu 192-24-128(%r13),%ymm11
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- vpaddq %ymm10,%ymm4,%ymm3
- vpmuludq %ymm0,%ymm14,%ymm14
- vmovdqu 224-24-128(%r13),%ymm10
- imulq 24-128(%r13),%rdx
- addq %rax,%r11
- leaq (%r9,%r10,1),%rax
- vpaddq %ymm14,%ymm5,%ymm4
- vpmuludq %ymm0,%ymm11,%ymm11
- vmovdqu 256-24-128(%r13),%ymm14
- movq %rax,%r10
- imull %ecx,%eax
- vpmuludq %ymm0,%ymm10,%ymm10
- vpaddq %ymm11,%ymm6,%ymm5
- vmovdqu 288-24-128(%r13),%ymm11
- andl $536870911,%eax
- vpaddq %ymm10,%ymm7,%ymm6
- vpmuludq %ymm0,%ymm14,%ymm14
- addq 24(%rsp),%rdx
- vpaddq %ymm14,%ymm8,%ymm7
- vpmuludq %ymm0,%ymm11,%ymm11
- vpaddq %ymm11,%ymm9,%ymm8
- vmovq %r12,%xmm9
- movq %rdx,%r12
-
- decl %r14d
- jnz .LOOP_REDUCE_1024
- leaq 448(%rsp),%r12
- vpaddq %ymm9,%ymm13,%ymm0
- vpxor %ymm9,%ymm9,%ymm9
-
- vpaddq 288-192(%rbx),%ymm0,%ymm0
- vpaddq 320-448(%r12),%ymm1,%ymm1
- vpaddq 352-448(%r12),%ymm2,%ymm2
- vpaddq 384-448(%r12),%ymm3,%ymm3
- vpaddq 416-448(%r12),%ymm4,%ymm4
- vpaddq 448-448(%r12),%ymm5,%ymm5
- vpaddq 480-448(%r12),%ymm6,%ymm6
- vpaddq 512-448(%r12),%ymm7,%ymm7
- vpaddq 544-448(%r12),%ymm8,%ymm8
-
- vpsrlq $29,%ymm0,%ymm14
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm12
- vpermq $147,%ymm14,%ymm14
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm13
- vpermq $147,%ymm11,%ymm11
- vpand %ymm15,%ymm3,%ymm3
- vpermq $147,%ymm12,%ymm12
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $147,%ymm13,%ymm13
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm0,%ymm0
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm2,%ymm2
- vpblendd $3,%ymm13,%ymm9,%ymm13
- vpaddq %ymm12,%ymm3,%ymm3
- vpaddq %ymm13,%ymm4,%ymm4
-
- vpsrlq $29,%ymm0,%ymm14
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm12
- vpermq $147,%ymm14,%ymm14
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm13
- vpermq $147,%ymm11,%ymm11
- vpand %ymm15,%ymm3,%ymm3
- vpermq $147,%ymm12,%ymm12
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $147,%ymm13,%ymm13
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm0,%ymm0
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vmovdqu %ymm0,0-128(%rdi)
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm2,%ymm2
- vmovdqu %ymm1,32-128(%rdi)
- vpblendd $3,%ymm13,%ymm9,%ymm13
- vpaddq %ymm12,%ymm3,%ymm3
- vmovdqu %ymm2,64-128(%rdi)
- vpaddq %ymm13,%ymm4,%ymm4
- vmovdqu %ymm3,96-128(%rdi)
- vpsrlq $29,%ymm4,%ymm14
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm11
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm12
- vpermq $147,%ymm14,%ymm14
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm13
- vpermq $147,%ymm11,%ymm11
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm8,%ymm8
- vpermq $147,%ymm13,%ymm13
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $147,%ymm0,%ymm0
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm4,%ymm4
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm5,%ymm5
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm6,%ymm6
- vpblendd $3,%ymm13,%ymm0,%ymm13
- vpaddq %ymm12,%ymm7,%ymm7
- vpaddq %ymm13,%ymm8,%ymm8
-
- vpsrlq $29,%ymm4,%ymm14
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm11
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm12
- vpermq $147,%ymm14,%ymm14
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm13
- vpermq $147,%ymm11,%ymm11
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm8,%ymm8
- vpermq $147,%ymm13,%ymm13
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $147,%ymm0,%ymm0
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm4,%ymm4
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm5,%ymm5
- vmovdqu %ymm4,128-128(%rdi)
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm6,%ymm6
- vmovdqu %ymm5,160-128(%rdi)
- vpblendd $3,%ymm13,%ymm0,%ymm13
- vpaddq %ymm12,%ymm7,%ymm7
- vmovdqu %ymm6,192-128(%rdi)
- vpaddq %ymm13,%ymm8,%ymm8
- vmovdqu %ymm7,224-128(%rdi)
- vmovdqu %ymm8,256-128(%rdi)
-
- movq %rdi,%rsi
- decl %r8d
- jne .LOOP_GRANDE_SQR_1024
-
- vzeroall
- movq %rbp,%rax
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lsqr_1024_epilogue:
- .byte 0xf3,0xc3
-.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
-.globl rsaz_1024_mul_avx2
-.type rsaz_1024_mul_avx2,@function
-.align 64
rsaz_1024_mul_avx2:
- leaq (%rsp),%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rax,%rbp
- vzeroall
- movq %rdx,%r13
- subq $64,%rsp
-
-
-
-
-
-
-.byte 0x67,0x67
- movq %rsi,%r15
- andq $4095,%r15
- addq $320,%r15
- shrq $12,%r15
- movq %rsi,%r15
- cmovnzq %r13,%rsi
- cmovnzq %r15,%r13
-
- movq %rcx,%r15
- subq $-128,%rsi
- subq $-128,%rcx
- subq $-128,%rdi
-
- andq $4095,%r15
- addq $320,%r15
-.byte 0x67,0x67
- shrq $12,%r15
- jz .Lmul_1024_no_n_copy
-
-
-
-
-
- subq $320,%rsp
- vmovdqu 0-128(%rcx),%ymm0
- andq $-512,%rsp
- vmovdqu 32-128(%rcx),%ymm1
- vmovdqu 64-128(%rcx),%ymm2
- vmovdqu 96-128(%rcx),%ymm3
- vmovdqu 128-128(%rcx),%ymm4
- vmovdqu 160-128(%rcx),%ymm5
- vmovdqu 192-128(%rcx),%ymm6
- vmovdqu 224-128(%rcx),%ymm7
- vmovdqu 256-128(%rcx),%ymm8
- leaq 64+128(%rsp),%rcx
- vmovdqu %ymm0,0-128(%rcx)
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm1,32-128(%rcx)
- vpxor %ymm1,%ymm1,%ymm1
- vmovdqu %ymm2,64-128(%rcx)
- vpxor %ymm2,%ymm2,%ymm2
- vmovdqu %ymm3,96-128(%rcx)
- vpxor %ymm3,%ymm3,%ymm3
- vmovdqu %ymm4,128-128(%rcx)
- vpxor %ymm4,%ymm4,%ymm4
- vmovdqu %ymm5,160-128(%rcx)
- vpxor %ymm5,%ymm5,%ymm5
- vmovdqu %ymm6,192-128(%rcx)
- vpxor %ymm6,%ymm6,%ymm6
- vmovdqu %ymm7,224-128(%rcx)
- vpxor %ymm7,%ymm7,%ymm7
- vmovdqu %ymm8,256-128(%rcx)
- vmovdqa %ymm0,%ymm8
- vmovdqu %ymm9,288-128(%rcx)
-.Lmul_1024_no_n_copy:
- andq $-64,%rsp
-
- movq (%r13),%rbx
- vpbroadcastq (%r13),%ymm10
- vmovdqu %ymm0,(%rsp)
- xorq %r9,%r9
-.byte 0x67
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r12,%r12
-
- vmovdqu .Land_mask(%rip),%ymm15
- movl $9,%r14d
- vmovdqu %ymm9,288-128(%rdi)
- jmp .Loop_mul_1024
-
-.align 32
-.Loop_mul_1024:
- vpsrlq $29,%ymm3,%ymm9
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %r9,%rax
- movq %rbx,%r10
- imulq 8-128(%rsi),%r10
- addq 8(%rsp),%r10
-
- movq %rax,%r9
- imull %r8d,%eax
- andl $536870911,%eax
-
- movq %rbx,%r11
- imulq 16-128(%rsi),%r11
- addq 16(%rsp),%r11
-
- movq %rbx,%r12
- imulq 24-128(%rsi),%r12
- addq 24(%rsp),%r12
- vpmuludq 32-128(%rsi),%ymm10,%ymm0
- vmovd %eax,%xmm11
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq 64-128(%rsi),%ymm10,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 96-128(%rsi),%ymm10,%ymm13
- vpand %ymm15,%ymm3,%ymm3
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq 128-128(%rsi),%ymm10,%ymm0
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq 160-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 192-128(%rsi),%ymm10,%ymm13
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq 224-128(%rsi),%ymm10,%ymm0
- vpermq $147,%ymm9,%ymm9
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq 256-128(%rsi),%ymm10,%ymm12
- vpbroadcastq 8(%r13),%ymm10
- vpaddq %ymm12,%ymm8,%ymm8
-
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%rcx),%rax
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%rcx),%rax
- addq %rax,%r11
- shrq $29,%r9
- imulq 24-128(%rcx),%rdx
- addq %rdx,%r12
- addq %r9,%r10
-
- vpmuludq 32-128(%rcx),%ymm11,%ymm13
- vmovq %xmm10,%rbx
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 64-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm0,%ymm2,%ymm2
- vpmuludq 96-128(%rcx),%ymm11,%ymm12
- vpaddq %ymm12,%ymm3,%ymm3
- vpmuludq 128-128(%rcx),%ymm11,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 160-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm0,%ymm5,%ymm5
- vpmuludq 192-128(%rcx),%ymm11,%ymm12
- vpaddq %ymm12,%ymm6,%ymm6
- vpmuludq 224-128(%rcx),%ymm11,%ymm13
- vpblendd $3,%ymm14,%ymm9,%ymm9
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 256-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm9,%ymm3,%ymm3
- vpaddq %ymm0,%ymm8,%ymm8
-
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %rax,%r10
- vmovdqu -8+32-128(%rsi),%ymm12
- movq %rbx,%rax
- imulq 8-128(%rsi),%rax
- addq %rax,%r11
- vmovdqu -8+64-128(%rsi),%ymm13
-
- movq %r10,%rax
- imull %r8d,%eax
- andl $536870911,%eax
-
- imulq 16-128(%rsi),%rbx
- addq %rbx,%r12
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovd %eax,%xmm11
- vmovdqu -8+96-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -8+128-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -8+160-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -8+192-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -8+224-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -8+256-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -8+288-128(%rsi),%ymm9
- vpaddq %ymm12,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm13,%ymm13
- vpaddq %ymm13,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm9,%ymm9
- vpbroadcastq 16(%r13),%ymm10
-
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r10
- vmovdqu -8+32-128(%rcx),%ymm0
- movq %rdx,%rax
- imulq 8-128(%rcx),%rax
- addq %rax,%r11
- vmovdqu -8+64-128(%rcx),%ymm12
- shrq $29,%r10
- imulq 16-128(%rcx),%rdx
- addq %rdx,%r12
- addq %r10,%r11
-
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -8+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -8+128-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -8+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -8+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -8+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -8+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -8+288-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm11,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm11,%ymm13,%ymm13
- vpaddq %ymm13,%ymm9,%ymm9
-
- vmovdqu -16+32-128(%rsi),%ymm0
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %r11,%rax
-
- vmovdqu -16+64-128(%rsi),%ymm12
- movq %rax,%r11
- imull %r8d,%eax
- andl $536870911,%eax
-
- imulq 8-128(%rsi),%rbx
- addq %rbx,%r12
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovd %eax,%xmm11
- vmovdqu -16+96-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm12,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -16+128-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -16+160-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -16+192-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -16+224-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -16+256-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -16+288-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq 24(%r13),%ymm10
- vpaddq %ymm13,%ymm9,%ymm9
-
- vmovdqu -16+32-128(%rcx),%ymm0
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r11
- vmovdqu -16+64-128(%rcx),%ymm12
- imulq 8-128(%rcx),%rdx
- addq %rdx,%r12
- shrq $29,%r11
-
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -16+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -16+128-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -16+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -16+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -16+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -16+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -16+288-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -24+32-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+64-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm9,%ymm9
-
- addq %r11,%r12
- imulq -128(%rsi),%rbx
- addq %rbx,%r12
-
- movq %r12,%rax
- imull %r8d,%eax
- andl $536870911,%eax
-
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovd %eax,%xmm11
- vmovdqu -24+96-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm12,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -24+128-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -24+160-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -24+192-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -24+224-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -24+256-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -24+288-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq 32(%r13),%ymm10
- vpaddq %ymm13,%ymm9,%ymm9
- addq $32,%r13
-
- vmovdqu -24+32-128(%rcx),%ymm0
- imulq -128(%rcx),%rax
- addq %rax,%r12
- shrq $29,%r12
-
- vmovdqu -24+64-128(%rcx),%ymm12
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -24+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm0
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu %ymm0,(%rsp)
- vpaddq %ymm12,%ymm2,%ymm1
- vmovdqu -24+128-128(%rcx),%ymm0
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm2
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -24+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm3
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -24+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm4
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm5
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -24+288-128(%rcx),%ymm13
- movq %r12,%r9
- vpaddq %ymm0,%ymm7,%ymm6
- vpmuludq %ymm11,%ymm12,%ymm12
- addq (%rsp),%r9
- vpaddq %ymm12,%ymm8,%ymm7
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovq %r12,%xmm12
- vpaddq %ymm13,%ymm9,%ymm8
-
- decl %r14d
- jnz .Loop_mul_1024
- vpermq $0,%ymm15,%ymm15
- vpaddq (%rsp),%ymm12,%ymm0
-
- vpsrlq $29,%ymm0,%ymm12
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm13
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm10
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm11
- vpermq $147,%ymm13,%ymm13
- vpand %ymm15,%ymm3,%ymm3
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $147,%ymm10,%ymm10
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpermq $147,%ymm11,%ymm11
- vpaddq %ymm9,%ymm0,%ymm0
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm1,%ymm1
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm2,%ymm2
- vpblendd $3,%ymm11,%ymm14,%ymm11
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm11,%ymm4,%ymm4
-
- vpsrlq $29,%ymm0,%ymm12
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm13
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm10
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm11
- vpermq $147,%ymm13,%ymm13
- vpand %ymm15,%ymm3,%ymm3
- vpermq $147,%ymm10,%ymm10
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $147,%ymm11,%ymm11
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm0,%ymm0
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm1,%ymm1
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm2,%ymm2
- vpblendd $3,%ymm11,%ymm14,%ymm11
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm11,%ymm4,%ymm4
-
- vmovdqu %ymm0,0-128(%rdi)
- vmovdqu %ymm1,32-128(%rdi)
- vmovdqu %ymm2,64-128(%rdi)
- vmovdqu %ymm3,96-128(%rdi)
- vpsrlq $29,%ymm4,%ymm12
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm13
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm10
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm11
- vpermq $147,%ymm13,%ymm13
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $147,%ymm10,%ymm10
- vpand %ymm15,%ymm8,%ymm8
- vpermq $147,%ymm11,%ymm11
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $147,%ymm0,%ymm0
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm4,%ymm4
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm5,%ymm5
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm6,%ymm6
- vpblendd $3,%ymm11,%ymm0,%ymm11
- vpaddq %ymm10,%ymm7,%ymm7
- vpaddq %ymm11,%ymm8,%ymm8
-
- vpsrlq $29,%ymm4,%ymm12
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm13
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm10
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm11
- vpermq $147,%ymm13,%ymm13
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $147,%ymm10,%ymm10
- vpand %ymm15,%ymm8,%ymm8
- vpermq $147,%ymm11,%ymm11
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $147,%ymm0,%ymm0
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm4,%ymm4
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm5,%ymm5
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm6,%ymm6
- vpblendd $3,%ymm11,%ymm0,%ymm11
- vpaddq %ymm10,%ymm7,%ymm7
- vpaddq %ymm11,%ymm8,%ymm8
-
- vmovdqu %ymm4,128-128(%rdi)
- vmovdqu %ymm5,160-128(%rdi)
- vmovdqu %ymm6,192-128(%rdi)
- vmovdqu %ymm7,224-128(%rdi)
- vmovdqu %ymm8,256-128(%rdi)
- vzeroupper
-
- movq %rbp,%rax
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lmul_1024_epilogue:
- .byte 0xf3,0xc3
-.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
-.globl rsaz_1024_red2norm_avx2
-.type rsaz_1024_red2norm_avx2,@function
-.align 32
-rsaz_1024_red2norm_avx2:
- subq $-128,%rsi
- xorq %rax,%rax
- movq -128(%rsi),%r8
- movq -120(%rsi),%r9
- movq -112(%rsi),%r10
- shlq $0,%r8
- shlq $29,%r9
- movq %r10,%r11
- shlq $58,%r10
- shrq $6,%r11
- addq %r8,%rax
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,0(%rdi)
- movq %r11,%rax
- movq -104(%rsi),%r8
- movq -96(%rsi),%r9
- shlq $23,%r8
- movq %r9,%r10
- shlq $52,%r9
- shrq $12,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,8(%rdi)
- movq %r10,%rax
- movq -88(%rsi),%r11
- movq -80(%rsi),%r8
- shlq $17,%r11
- movq %r8,%r9
- shlq $46,%r8
- shrq $18,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,16(%rdi)
- movq %r9,%rax
- movq -72(%rsi),%r10
- movq -64(%rsi),%r11
- shlq $11,%r10
- movq %r11,%r8
- shlq $40,%r11
- shrq $24,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,24(%rdi)
- movq %r8,%rax
- movq -56(%rsi),%r9
- movq -48(%rsi),%r10
- movq -40(%rsi),%r11
- shlq $5,%r9
- shlq $34,%r10
- movq %r11,%r8
- shlq $63,%r11
- shrq $1,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,32(%rdi)
- movq %r8,%rax
- movq -32(%rsi),%r9
- movq -24(%rsi),%r10
- shlq $28,%r9
- movq %r10,%r11
- shlq $57,%r10
- shrq $7,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,40(%rdi)
- movq %r11,%rax
- movq -16(%rsi),%r8
- movq -8(%rsi),%r9
- shlq $22,%r8
- movq %r9,%r10
- shlq $51,%r9
- shrq $13,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,48(%rdi)
- movq %r10,%rax
- movq 0(%rsi),%r11
- movq 8(%rsi),%r8
- shlq $16,%r11
- movq %r8,%r9
- shlq $45,%r8
- shrq $19,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,56(%rdi)
- movq %r9,%rax
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- shlq $10,%r10
- movq %r11,%r8
- shlq $39,%r11
- shrq $25,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,64(%rdi)
- movq %r8,%rax
- movq 32(%rsi),%r9
- movq 40(%rsi),%r10
- movq 48(%rsi),%r11
- shlq $4,%r9
- shlq $33,%r10
- movq %r11,%r8
- shlq $62,%r11
- shrq $2,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,72(%rdi)
- movq %r8,%rax
- movq 56(%rsi),%r9
- movq 64(%rsi),%r10
- shlq $27,%r9
- movq %r10,%r11
- shlq $56,%r10
- shrq $8,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,80(%rdi)
- movq %r11,%rax
- movq 72(%rsi),%r8
- movq 80(%rsi),%r9
- shlq $21,%r8
- movq %r9,%r10
- shlq $50,%r9
- shrq $14,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,88(%rdi)
- movq %r10,%rax
- movq 88(%rsi),%r11
- movq 96(%rsi),%r8
- shlq $15,%r11
- movq %r8,%r9
- shlq $44,%r8
- shrq $20,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,96(%rdi)
- movq %r9,%rax
- movq 104(%rsi),%r10
- movq 112(%rsi),%r11
- shlq $9,%r10
- movq %r11,%r8
- shlq $38,%r11
- shrq $26,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,104(%rdi)
- movq %r8,%rax
- movq 120(%rsi),%r9
- movq 128(%rsi),%r10
- movq 136(%rsi),%r11
- shlq $3,%r9
- shlq $32,%r10
- movq %r11,%r8
- shlq $61,%r11
- shrq $3,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,112(%rdi)
- movq %r8,%rax
- movq 144(%rsi),%r9
- movq 152(%rsi),%r10
- shlq $26,%r9
- movq %r10,%r11
- shlq $55,%r10
- shrq $9,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,120(%rdi)
- movq %r11,%rax
- .byte 0xf3,0xc3
-.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
-
-.globl rsaz_1024_norm2red_avx2
-.type rsaz_1024_norm2red_avx2,@function
-.align 32
rsaz_1024_norm2red_avx2:
- subq $-128,%rdi
- movq (%rsi),%r8
- movl $536870911,%eax
- movq 8(%rsi),%r9
- movq %r8,%r11
- shrq $0,%r11
- andq %rax,%r11
- movq %r11,-128(%rdi)
- movq %r8,%r10
- shrq $29,%r10
- andq %rax,%r10
- movq %r10,-120(%rdi)
- shrdq $58,%r9,%r8
- andq %rax,%r8
- movq %r8,-112(%rdi)
- movq 16(%rsi),%r10
- movq %r9,%r8
- shrq $23,%r8
- andq %rax,%r8
- movq %r8,-104(%rdi)
- shrdq $52,%r10,%r9
- andq %rax,%r9
- movq %r9,-96(%rdi)
- movq 24(%rsi),%r11
- movq %r10,%r9
- shrq $17,%r9
- andq %rax,%r9
- movq %r9,-88(%rdi)
- shrdq $46,%r11,%r10
- andq %rax,%r10
- movq %r10,-80(%rdi)
- movq 32(%rsi),%r8
- movq %r11,%r10
- shrq $11,%r10
- andq %rax,%r10
- movq %r10,-72(%rdi)
- shrdq $40,%r8,%r11
- andq %rax,%r11
- movq %r11,-64(%rdi)
- movq 40(%rsi),%r9
- movq %r8,%r11
- shrq $5,%r11
- andq %rax,%r11
- movq %r11,-56(%rdi)
- movq %r8,%r10
- shrq $34,%r10
- andq %rax,%r10
- movq %r10,-48(%rdi)
- shrdq $63,%r9,%r8
- andq %rax,%r8
- movq %r8,-40(%rdi)
- movq 48(%rsi),%r10
- movq %r9,%r8
- shrq $28,%r8
- andq %rax,%r8
- movq %r8,-32(%rdi)
- shrdq $57,%r10,%r9
- andq %rax,%r9
- movq %r9,-24(%rdi)
- movq 56(%rsi),%r11
- movq %r10,%r9
- shrq $22,%r9
- andq %rax,%r9
- movq %r9,-16(%rdi)
- shrdq $51,%r11,%r10
- andq %rax,%r10
- movq %r10,-8(%rdi)
- movq 64(%rsi),%r8
- movq %r11,%r10
- shrq $16,%r10
- andq %rax,%r10
- movq %r10,0(%rdi)
- shrdq $45,%r8,%r11
- andq %rax,%r11
- movq %r11,8(%rdi)
- movq 72(%rsi),%r9
- movq %r8,%r11
- shrq $10,%r11
- andq %rax,%r11
- movq %r11,16(%rdi)
- shrdq $39,%r9,%r8
- andq %rax,%r8
- movq %r8,24(%rdi)
- movq 80(%rsi),%r10
- movq %r9,%r8
- shrq $4,%r8
- andq %rax,%r8
- movq %r8,32(%rdi)
- movq %r9,%r11
- shrq $33,%r11
- andq %rax,%r11
- movq %r11,40(%rdi)
- shrdq $62,%r10,%r9
- andq %rax,%r9
- movq %r9,48(%rdi)
- movq 88(%rsi),%r11
- movq %r10,%r9
- shrq $27,%r9
- andq %rax,%r9
- movq %r9,56(%rdi)
- shrdq $56,%r11,%r10
- andq %rax,%r10
- movq %r10,64(%rdi)
- movq 96(%rsi),%r8
- movq %r11,%r10
- shrq $21,%r10
- andq %rax,%r10
- movq %r10,72(%rdi)
- shrdq $50,%r8,%r11
- andq %rax,%r11
- movq %r11,80(%rdi)
- movq 104(%rsi),%r9
- movq %r8,%r11
- shrq $15,%r11
- andq %rax,%r11
- movq %r11,88(%rdi)
- shrdq $44,%r9,%r8
- andq %rax,%r8
- movq %r8,96(%rdi)
- movq 112(%rsi),%r10
- movq %r9,%r8
- shrq $9,%r8
- andq %rax,%r8
- movq %r8,104(%rdi)
- shrdq $38,%r10,%r9
- andq %rax,%r9
- movq %r9,112(%rdi)
- movq 120(%rsi),%r11
- movq %r10,%r9
- shrq $3,%r9
- andq %rax,%r9
- movq %r9,120(%rdi)
- movq %r10,%r8
- shrq $32,%r8
- andq %rax,%r8
- movq %r8,128(%rdi)
- shrdq $61,%r11,%r10
- andq %rax,%r10
- movq %r10,136(%rdi)
- xorq %r8,%r8
- movq %r11,%r10
- shrq $26,%r10
- andq %rax,%r10
- movq %r10,144(%rdi)
- shrdq $55,%r8,%r11
- andq %rax,%r11
- movq %r11,152(%rdi)
- movq %r8,160(%rdi)
- movq %r8,168(%rdi)
- movq %r8,176(%rdi)
- movq %r8,184(%rdi)
- .byte 0xf3,0xc3
-.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
-.globl rsaz_1024_scatter5_avx2
-.type rsaz_1024_scatter5_avx2,@function
-.align 32
+rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
- vzeroupper
- vmovdqu .Lscatter_permd(%rip),%ymm5
- shll $4,%edx
- leaq (%rdi,%rdx,1),%rdi
- movl $9,%eax
- jmp .Loop_scatter_1024
-
-.align 32
-.Loop_scatter_1024:
- vmovdqu (%rsi),%ymm0
- leaq 32(%rsi),%rsi
- vpermd %ymm0,%ymm5,%ymm0
- vmovdqu %xmm0,(%rdi)
- leaq 512(%rdi),%rdi
- decl %eax
- jnz .Loop_scatter_1024
-
- vzeroupper
- .byte 0xf3,0xc3
-.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
-
-.globl rsaz_1024_gather5_avx2
-.type rsaz_1024_gather5_avx2,@function
-.align 32
rsaz_1024_gather5_avx2:
- leaq .Lgather_table(%rip),%r11
- movl %edx,%eax
- andl $3,%edx
- shrl $2,%eax
- shll $4,%edx
-
- vmovdqu -32(%r11),%ymm7
- vpbroadcastb 8(%r11,%rax,1),%xmm8
- vpbroadcastb 7(%r11,%rax,1),%xmm9
- vpbroadcastb 6(%r11,%rax,1),%xmm10
- vpbroadcastb 5(%r11,%rax,1),%xmm11
- vpbroadcastb 4(%r11,%rax,1),%xmm12
- vpbroadcastb 3(%r11,%rax,1),%xmm13
- vpbroadcastb 2(%r11,%rax,1),%xmm14
- vpbroadcastb 1(%r11,%rax,1),%xmm15
-
- leaq 64(%rsi,%rdx,1),%rsi
- movq $64,%r11
- movl $9,%eax
- jmp .Loop_gather_1024
-
-.align 32
-.Loop_gather_1024:
- vpand -64(%rsi),%xmm8,%xmm0
- vpand (%rsi),%xmm9,%xmm1
- vpand 64(%rsi),%xmm10,%xmm2
- vpand (%rsi,%r11,2),%xmm11,%xmm3
- vpor %xmm0,%xmm1,%xmm1
- vpand 64(%rsi,%r11,2),%xmm12,%xmm4
- vpor %xmm2,%xmm3,%xmm3
- vpand (%rsi,%r11,4),%xmm13,%xmm5
- vpor %xmm1,%xmm3,%xmm3
- vpand 64(%rsi,%r11,4),%xmm14,%xmm6
- vpor %xmm4,%xmm5,%xmm5
- vpand -128(%rsi,%r11,8),%xmm15,%xmm2
- leaq (%rsi,%r11,8),%rsi
- vpor %xmm3,%xmm5,%xmm5
- vpor %xmm2,%xmm6,%xmm6
- vpor %xmm5,%xmm6,%xmm6
- vpermd %ymm6,%ymm7,%ymm6
- vmovdqu %ymm6,(%rdi)
- leaq 32(%rdi),%rdi
- decl %eax
- jnz .Loop_gather_1024
-
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm0,(%rdi)
- vzeroupper
+.byte 0x0f,0x0b
.byte 0xf3,0xc3
-.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
-
-.globl rsaz_avx2_eligible
-.type rsaz_avx2_eligible,@function
-.align 32
-rsaz_avx2_eligible:
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- movl $524544,%ecx
- movl $0,%edx
- andl %eax,%ecx
- cmpl $524544,%ecx
- cmovel %edx,%eax
- andl $32,%eax
- shrl $5,%eax
- .byte 0xf3,0xc3
-.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
-
-.align 64
-.Land_mask:
-.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
-.Lscatter_permd:
-.long 0,2,4,6,7,7,7,7
-.Lgather_permd:
-.long 0,7,1,7,2,7,3,7
-.Lgather_table:
-.byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
-.align 64
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
diff --git a/deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s b/deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s
index f42075571e..4a1211329c 100644
--- a/deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s
@@ -19,10 +19,6 @@ rsaz_512_sqr:
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
- movl $524544,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $524544,%r11d
- je .Loop_sqrx
jmp .Loop_sqr
.align 32
@@ -386,276 +382,6 @@ rsaz_512_sqr:
decl %r8d
jnz .Loop_sqr
- jmp .Lsqr_tail
-
-.align 32
-.Loop_sqrx:
- movl %r8d,128+8(%rsp)
-.byte 102,72,15,110,199
-.byte 102,72,15,110,205
-
- mulxq %rax,%r8,%r9
-
- mulxq 16(%rsi),%rcx,%r10
- xorq %rbp,%rbp
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rcx,%r9
-
- mulxq 32(%rsi),%rcx,%r12
- adcxq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rcx,%r11
-
-.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r12
- adcxq %rcx,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
- adcxq %rax,%r14
- adcxq %rbp,%r15
-
- movq %r9,%rcx
- shldq $1,%r8,%r9
- shlq $1,%r8
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rdx,%r8
- movq 8(%rsi),%rdx
- adcxq %rbp,%r9
-
- movq %rax,(%rsp)
- movq %r8,8(%rsp)
-
-
- mulxq 16(%rsi),%rax,%rbx
- adoxq %rax,%r10
- adcxq %rbx,%r11
-
-.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00
- adoxq %rdi,%r11
- adcxq %r8,%r12
-
- mulxq 32(%rsi),%rax,%rbx
- adoxq %rax,%r12
- adcxq %rbx,%r13
-
- mulxq 40(%rsi),%rdi,%r8
- adoxq %rdi,%r13
- adcxq %r8,%r14
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adoxq %rax,%r14
- adcxq %rbx,%r15
-
-.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
- adoxq %rdi,%r15
- adcxq %rbp,%r8
- adoxq %rbp,%r8
-
- movq %r11,%rbx
- shldq $1,%r10,%r11
- shldq $1,%rcx,%r10
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rcx
- movq 16(%rsi),%rdx
- adcxq %rax,%r9
- adcxq %rcx,%r10
- adcxq %rbp,%r11
-
- movq %r9,16(%rsp)
-.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
-
-
-.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00
- adoxq %rdi,%r12
- adcxq %r9,%r13
-
- mulxq 32(%rsi),%rax,%rcx
- adoxq %rax,%r13
- adcxq %rcx,%r14
-
- mulxq 40(%rsi),%rdi,%r9
- adoxq %rdi,%r14
- adcxq %r9,%r15
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
- adoxq %rax,%r15
- adcxq %rcx,%r8
-
-.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00
- adoxq %rdi,%r8
- adcxq %rbp,%r9
- adoxq %rbp,%r9
-
- movq %r13,%rcx
- shldq $1,%r12,%r13
- shldq $1,%rbx,%r12
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r11
- adcxq %rdx,%r12
- movq 24(%rsi),%rdx
- adcxq %rbp,%r13
-
- movq %r11,32(%rsp)
-.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00
-
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00
- adoxq %rax,%r14
- adcxq %rbx,%r15
-
- mulxq 40(%rsi),%rdi,%r10
- adoxq %rdi,%r15
- adcxq %r10,%r8
-
- mulxq 48(%rsi),%rax,%rbx
- adoxq %rax,%r8
- adcxq %rbx,%r9
-
- mulxq 56(%rsi),%rdi,%r10
- adoxq %rdi,%r9
- adcxq %rbp,%r10
- adoxq %rbp,%r10
-
-.byte 0x66
- movq %r15,%rbx
- shldq $1,%r14,%r15
- shldq $1,%rcx,%r14
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r13
- adcxq %rdx,%r14
- movq 32(%rsi),%rdx
- adcxq %rbp,%r15
-
- movq %r13,48(%rsp)
- movq %r14,56(%rsp)
-
-
-.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00
- adoxq %rdi,%r8
- adcxq %r11,%r9
-
- mulxq 48(%rsi),%rax,%rcx
- adoxq %rax,%r9
- adcxq %rcx,%r10
-
- mulxq 56(%rsi),%rdi,%r11
- adoxq %rdi,%r10
- adcxq %rbp,%r11
- adoxq %rbp,%r11
-
- movq %r9,%rcx
- shldq $1,%r8,%r9
- shldq $1,%rbx,%r8
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r15
- adcxq %rdx,%r8
- movq 40(%rsi),%rdx
- adcxq %rbp,%r9
-
- movq %r15,64(%rsp)
- movq %r8,72(%rsp)
-
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adoxq %rax,%r10
- adcxq %rbx,%r11
-
-.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
- adoxq %rdi,%r11
- adcxq %rbp,%r12
- adoxq %rbp,%r12
-
- movq %r11,%rbx
- shldq $1,%r10,%r11
- shldq $1,%rcx,%r10
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r9
- adcxq %rdx,%r10
- movq 48(%rsi),%rdx
- adcxq %rbp,%r11
-
- movq %r9,80(%rsp)
- movq %r10,88(%rsp)
-
-
-.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
- adoxq %rax,%r12
- adoxq %rbp,%r13
-
- xorq %r14,%r14
- shldq $1,%r13,%r14
- shldq $1,%r12,%r13
- shldq $1,%rbx,%r12
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r11
- adcxq %rdx,%r12
- movq 56(%rsi),%rdx
- adcxq %rbp,%r13
-
-.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
-.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
-
-
- mulxq %rdx,%rax,%rdx
- adoxq %rax,%r13
- adoxq %rbp,%rdx
-
-.byte 0x66
- addq %rdx,%r14
-
- movq %r13,112(%rsp)
- movq %r14,120(%rsp)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
- addq 64(%rsp),%r8
- adcq 72(%rsp),%r9
- adcq 80(%rsp),%r10
- adcq 88(%rsp),%r11
- adcq 96(%rsp),%r12
- adcq 104(%rsp),%r13
- adcq 112(%rsp),%r14
- adcq 120(%rsp),%r15
- sbbq %rcx,%rcx
-
- call __rsaz_512_subtract
-
- movq %r8,%rdx
- movq %r9,%rax
- movl 128+8(%rsp),%r8d
- movq %rdi,%rsi
-
- decl %r8d
- jnz .Loop_sqrx
-
-.Lsqr_tail:
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -684,10 +410,6 @@ rsaz_512_mul:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
- movl $524544,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $524544,%r11d
- je .Lmulx
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul
@@ -705,29 +427,6 @@ rsaz_512_mul:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_tail
-
-.align 32
-.Lmulx:
- movq %rdx,%rbp
- movq (%rdx),%rdx
- call __rsaz_512_mulx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-.Lmul_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -762,52 +461,94 @@ rsaz_512_mul_gather4:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- subq $128+24,%rsp
+ subq $152,%rsp
.Lmul_gather4_body:
- movl $524544,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $524544,%r11d
- je .Lmulx_gather
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- movl (%rdx,%r9,4),%ebx
-.byte 102,72,15,110,201
+ movd %r9d,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 0(%rdx),%xmm8
+ movdqa 16(%rdx),%xmm9
+ movdqa 32(%rdx),%xmm10
+ movdqa 48(%rdx),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rdx),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rdx),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rdx),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rdx),%xmm15
+ leaq 128(%rdx),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
- shlq $32,%rax
- orq %rax,%rbx
movq (%rsi),%rax
movq 8(%rsi),%rcx
- leaq 128(%rdx,%r9,4),%rbp
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
@@ -820,14 +561,12 @@ rsaz_512_mul_gather4:
adcq $0,%r13
mulq %rbx
- leaq 128(%rbp),%rbp
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
@@ -839,6 +578,35 @@ rsaz_512_mul_gather4:
.align 32
.Loop_mul_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
@@ -847,7 +615,6 @@ rsaz_512_mul_gather4:
adcq $0,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
@@ -856,7 +623,6 @@ rsaz_512_mul_gather4:
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
@@ -865,7 +631,6 @@ rsaz_512_mul_gather4:
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
@@ -874,7 +639,6 @@ rsaz_512_mul_gather4:
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -899,7 +663,6 @@ rsaz_512_mul_gather4:
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
@@ -907,7 +670,6 @@ rsaz_512_mul_gather4:
movq %rdx,%r15
adcq $0,%r15
- leaq 128(%rbp),%rbp
leaq 8(%rdi),%rdi
decl %ecx
@@ -922,8 +684,8 @@ rsaz_512_mul_gather4:
movq %r14,48(%rdi)
movq %r15,56(%rdi)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
@@ -935,126 +697,6 @@ rsaz_512_mul_gather4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_gather_tail
-
-.align 32
-.Lmulx_gather:
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- leaq 128(%rdx,%r9,4),%rbp
- movl (%rdx,%r9,4),%edx
-.byte 102,72,15,110,201
- movq %r8,128(%rsp)
-
- shlq $32,%rax
- orq %rax,%rdx
- mulxq (%rsi),%rbx,%r8
- movq %rbx,(%rsp)
- xorl %edi,%edi
-
- mulxq 8(%rsi),%rax,%r9
- movd (%rbp),%xmm4
-
- mulxq 16(%rsi),%rbx,%r10
- movd 64(%rbp),%xmm5
- adcxq %rax,%r8
-
- mulxq 24(%rsi),%rax,%r11
- pslldq $4,%xmm5
- adcxq %rbx,%r9
-
- mulxq 32(%rsi),%rbx,%r12
- por %xmm5,%xmm4
- adcxq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rbx,%r11
-
- mulxq 48(%rsi),%rbx,%r14
- leaq 128(%rbp),%rbp
- adcxq %rax,%r12
-
- mulxq 56(%rsi),%rax,%r15
-.byte 102,72,15,126,226
- adcxq %rbx,%r13
- adcxq %rax,%r14
- movq %r8,%rbx
- adcxq %rdi,%r15
-
- movq $-7,%rcx
- jmp .Loop_mulx_gather
-
-.align 32
-.Loop_mulx_gather:
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rsi),%rax,%r9
-.byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rsi),%rax,%r10
- movd 64(%rbp),%xmm5
- leaq 128(%rbp),%rbp
- adcxq %rax,%r9
- adoxq %r11,%r10
-
-.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
- pslldq $4,%xmm5
- por %xmm5,%xmm4
- adcxq %rax,%r10
- adoxq %r12,%r11
-
- mulxq 32(%rsi),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rsi),%rax,%r15
-.byte 102,72,15,126,226
- movq %rbx,64(%rsp,%rcx,8)
- adcxq %rax,%r14
- adoxq %rdi,%r15
- movq %r8,%rbx
- adcxq %rdi,%r15
-
- incq %rcx
- jnz .Loop_mulx_gather
-
- movq %r8,64(%rsp)
- movq %r9,64+8(%rsp)
- movq %r10,64+16(%rsp)
- movq %r11,64+24(%rsp)
- movq %r12,64+32(%rsp)
- movq %r13,64+40(%rsp)
- movq %r14,64+48(%rsp)
- movq %r15,64+56(%rsp)
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
-.Lmul_gather_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -1092,17 +734,13 @@ rsaz_512_mul_scatter4:
movl %r9d,%r9d
subq $128+24,%rsp
.Lmul_scatter4_body:
- leaq (%r8,%r9,4),%r8
+ leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
movq %rcx,128(%rsp)
movq %rdi,%rbp
- movl $524544,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $524544,%r11d
- je .Lmulx_scatter
movq (%rdi),%rbx
call __rsaz_512_mul
@@ -1119,29 +757,6 @@ rsaz_512_mul_scatter4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_scatter_tail
-
-.align 32
-.Lmulx_scatter:
- movq (%rdi),%rdx
- call __rsaz_512_mulx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
-.Lmul_scatter_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -1155,30 +770,14 @@ rsaz_512_mul_scatter4:
call __rsaz_512_subtract
- movl %r8d,0(%rsi)
- shrq $32,%r8
- movl %r9d,128(%rsi)
- shrq $32,%r9
- movl %r10d,256(%rsi)
- shrq $32,%r10
- movl %r11d,384(%rsi)
- shrq $32,%r11
- movl %r12d,512(%rsi)
- shrq $32,%r12
- movl %r13d,640(%rsi)
- shrq $32,%r13
- movl %r14d,768(%rsi)
- shrq $32,%r14
- movl %r15d,896(%rsi)
- shrq $32,%r15
- movl %r8d,64(%rsi)
- movl %r9d,192(%rsi)
- movl %r10d,320(%rsi)
- movl %r11d,448(%rsi)
- movl %r12d,576(%rsi)
- movl %r13d,704(%rsi)
- movl %r14d,832(%rsi)
- movl %r15d,960(%rsi)
+ movq %r8,0(%rsi)
+ movq %r9,128(%rsi)
+ movq %r10,256(%rsi)
+ movq %r11,384(%rsi)
+ movq %r12,512(%rsi)
+ movq %r13,640(%rsi)
+ movq %r14,768(%rsi)
+ movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -1204,7 +803,6 @@ rsaz_512_mul_by_one:
subq $128+24,%rsp
.Lmul_by_one_body:
- movl OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
movq %rcx,128(%rsp)
@@ -1225,16 +823,7 @@ rsaz_512_mul_by_one:
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
- andl $524544,%eax
- cmpl $524544,%eax
- je .Lby_one_callx
call __rsaz_512_reduce
- jmp .Lby_one_tail
-.align 32
-.Lby_one_callx:
- movq 128(%rsp),%rdx
- call __rsaz_512_reducex
-.Lby_one_tail:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
@@ -1338,62 +927,6 @@ __rsaz_512_reduce:
.byte 0xf3,0xc3
.size __rsaz_512_reduce,.-__rsaz_512_reduce
-.type __rsaz_512_reducex,@function
-.align 32
-__rsaz_512_reducex:
-
- imulq %r8,%rdx
- xorq %rsi,%rsi
- movl $8,%ecx
- jmp .Lreduction_loopx
-
-.align 32
-.Lreduction_loopx:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rbx,%rax
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rbx,%r10
- adcxq %rbx,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rbx,%r11
- adcxq %rbx,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
- movq %rdx,%rax
- movq %r8,%rdx
- adcxq %rbx,%r11
- adoxq %r13,%r12
-
- mulxq 128+8(%rsp),%rbx,%rdx
- movq %rax,%rdx
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rbp),%rax,%r15
- movq %rbx,%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- adcxq %rsi,%r15
-
- decl %ecx
- jne .Lreduction_loopx
-
- .byte 0xf3,0xc3
-.size __rsaz_512_reducex,.-__rsaz_512_reducex
.type __rsaz_512_subtract,@function
.align 32
__rsaz_512_subtract:
@@ -1593,140 +1126,18 @@ __rsaz_512_mul:
.byte 0xf3,0xc3
.size __rsaz_512_mul,.-__rsaz_512_mul
-.type __rsaz_512_mulx,@function
-.align 32
-__rsaz_512_mulx:
- mulxq (%rsi),%rbx,%r8
- movq $-6,%rcx
-
- mulxq 8(%rsi),%rax,%r9
- movq %rbx,8(%rsp)
-
- mulxq 16(%rsi),%rbx,%r10
- adcq %rax,%r8
-
- mulxq 24(%rsi),%rax,%r11
- adcq %rbx,%r9
-
- mulxq 32(%rsi),%rbx,%r12
- adcq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcq %rbx,%r11
-
- mulxq 48(%rsi),%rbx,%r14
- adcq %rax,%r12
-
- mulxq 56(%rsi),%rax,%r15
- movq 8(%rbp),%rdx
- adcq %rbx,%r13
- adcq %rax,%r14
- adcq $0,%r15
-
- xorq %rdi,%rdi
- jmp .Loop_mulx
-
-.align 32
-.Loop_mulx:
- movq %r8,%rbx
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rsi),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rsi),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rsi),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rsi),%rax,%r15
- movq 64(%rbp,%rcx,8),%rdx
- movq %rbx,8+64-8(%rsp,%rcx,8)
- adcxq %rax,%r14
- adoxq %rdi,%r15
- adcxq %rdi,%r15
-
- incq %rcx
- jnz .Loop_mulx
-
- movq %r8,%rbx
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
-.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
- adcxq %rax,%r8
- adoxq %r10,%r9
-
-.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
- mulxq 32(%rsi),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
- adcxq %rax,%r14
- adoxq %rdi,%r15
- adcxq %rdi,%r15
-
- movq %rbx,8+64-8(%rsp)
- movq %r8,8+64(%rsp)
- movq %r9,8+64+8(%rsp)
- movq %r10,8+64+16(%rsp)
- movq %r11,8+64+24(%rsp)
- movq %r12,8+64+32(%rsp)
- movq %r13,8+64+40(%rsp)
- movq %r14,8+64+48(%rsp)
- movq %r15,8+64+56(%rsp)
-
- .byte 0xf3,0xc3
-.size __rsaz_512_mulx,.-__rsaz_512_mulx
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,@function
.align 16
rsaz_512_scatter4:
- leaq (%rdi,%rdx,4),%rdi
+ leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
- movl %eax,(%rdi)
- shrq $32,%rax
- movl %eax,64(%rdi)
+ movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz .Loop_scatter
@@ -1737,19 +1148,72 @@ rsaz_512_scatter4:
.type rsaz_512_gather4,@function
.align 16
rsaz_512_gather4:
- leaq (%rsi,%rdx,4),%rsi
+ movd %edx,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp .Loop_gather
.align 16
.Loop_gather:
- movl (%rsi),%eax
- movl 64(%rsi),%r8d
+ movdqa 0(%rsi),%xmm8
+ movdqa 16(%rsi),%xmm9
+ movdqa 32(%rsi),%xmm10
+ movdqa 48(%rsi),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rsi),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rsi),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rsi),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
- shlq $32,%r8
- orq %r8,%rax
- movq %rax,(%rdi)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz .Loop_gather
.byte 0xf3,0xc3
+.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s
index eed057ad6a..f4e5337565 100644
--- a/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s
+++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s
@@ -242,7 +242,7 @@ bn_GF2m_mul_2x2:
movq %rcx,56(%rsp)
movq %r8,64(%rsp)
- movq $15,%r8
+ movq $0xf,%r8
movq %rsi,%rax
movq %rcx,%rbp
call _mul_1x1
diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s
index 45d19cd8b5..9e0019c163 100644
--- a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s
+++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s
@@ -10,7 +10,6 @@ bn_mul_mont:
jnz .Lmul_enter
cmpl $8,%r9d
jb .Lmul_enter
- movl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpq %rsi,%rdx
jne .Lmul4x_enter
testl $7,%r9d
@@ -216,9 +215,6 @@ bn_mul_mont:
.align 16
bn_mul4x_mont:
.Lmul4x_enter:
- andl $524544,%r11d
- cmpl $524544,%r11d
- je .Lmulx4x_enter
pushq %rbx
pushq %rbp
pushq %r12
@@ -615,7 +611,6 @@ bn_mul4x_mont:
.size bn_mul4x_mont,.-bn_mul4x_mont
-
.type bn_sqr8x_mont,@function
.align 32
bn_sqr8x_mont:
@@ -638,20 +633,20 @@ bn_sqr8x_mont:
- leaq -64(%rsp,%r9,4),%r11
+ leaq -64(%rsp,%r9,2),%r11
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lsqr8x_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,4),%rsp
+ leaq -64(%rsp,%r9,2),%rsp
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
- leaq 4096-64(,%r9,4),%r10
- leaq -64(%rsp,%r9,4),%rsp
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -661,385 +656,81 @@ bn_sqr8x_mont:
movq %r9,%r10
negq %r9
- leaq 64(%rsp,%r9,2),%r11
movq %r8,32(%rsp)
movq %rax,40(%rsp)
.Lsqr8x_body:
- movq %r9,%rbp
-.byte 102,73,15,110,211
- shrq $3+2,%rbp
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- jmp .Lsqr8x_copy_n
-
-.align 32
-.Lsqr8x_copy_n:
- movq 0(%rcx),%xmm0
- movq 8(%rcx),%xmm1
- movq 16(%rcx),%xmm3
- movq 24(%rcx),%xmm4
- leaq 32(%rcx),%rcx
- movdqa %xmm0,0(%r11)
- movdqa %xmm1,16(%r11)
- movdqa %xmm3,32(%r11)
- movdqa %xmm4,48(%r11)
- leaq 64(%r11),%r11
- decq %rbp
- jnz .Lsqr8x_copy_n
-
+.byte 102,72,15,110,209
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
- andl $524544,%eax
- cmpl $524544,%eax
- jne .Lsqr8x_nox
-
- call bn_sqrx8x_internal
-
- pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- leaq 64(%rsp,%r9,2),%rdx
- shrq $3+2,%r9
- movq 40(%rsp),%rsi
- jmp .Lsqr8x_zero
-
-.align 32
-.Lsqr8x_nox:
call bn_sqr8x_internal
- pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- leaq 64(%rsp,%r9,2),%rdx
- shrq $3+2,%r9
- movq 40(%rsp),%rsi
- jmp .Lsqr8x_zero
-
-.align 32
-.Lsqr8x_zero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- movdqa %xmm0,32(%rax)
- movdqa %xmm0,48(%rax)
- leaq 64(%rax),%rax
- movdqa %xmm0,0(%rdx)
- movdqa %xmm0,16(%rdx)
- movdqa %xmm0,32(%rdx)
- movdqa %xmm0,48(%rdx)
- leaq 64(%rdx),%rdx
- decq %r9
- jnz .Lsqr8x_zero
-
- movq $1,%rax
- movq -48(%rsi),%r15
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-.Lsqr8x_epilogue:
- .byte 0xf3,0xc3
-.size bn_sqr8x_mont,.-bn_sqr8x_mont
-.type bn_mulx4x_mont,@function
-.align 32
-bn_mulx4x_mont:
-.Lmulx4x_enter:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- shll $3,%r9d
-.byte 0x67
- xorq %r10,%r10
- subq %r9,%r10
- movq (%r8),%r8
- leaq -72(%rsp,%r10,1),%rsp
- leaq (%rdx,%r9,1),%r10
- andq $-128,%rsp
-
-
-
-
-
-
-
-
-
- movq %r9,0(%rsp)
- shrq $5,%r9
- movq %r10,16(%rsp)
- subq $1,%r9
- movq %r8,24(%rsp)
- movq %rdi,32(%rsp)
- movq %rax,40(%rsp)
- movq %r9,48(%rsp)
- jmp .Lmulx4x_body
-
-.align 32
-.Lmulx4x_body:
- leaq 8(%rdx),%rdi
- movq (%rdx),%rdx
- leaq 64+32(%rsp),%rbx
- movq %rdx,%r9
-
- mulxq 0(%rsi),%r8,%rax
- mulxq 8(%rsi),%r11,%r14
- addq %rax,%r11
- movq %rdi,8(%rsp)
- mulxq 16(%rsi),%r12,%r13
- adcq %r14,%r12
- adcq $0,%r13
-
- movq %r8,%rdi
- imulq 24(%rsp),%r8
- xorq %rbp,%rbp
-
- mulxq 24(%rsi),%rax,%r14
- movq %r8,%rdx
- leaq 32(%rsi),%rsi
- adcxq %rax,%r13
- adcxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%rdi
- adoxq %r11,%r10
- mulxq 8(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
- movq 48(%rsp),%rdi
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r11,-24(%rbx)
- adcxq %rax,%r12
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r12,-16(%rbx)
-
- jmp .Lmulx4x_1st
-
-.align 32
-.Lmulx4x_1st:
- adcxq %rbp,%r15
- mulxq 0(%rsi),%r10,%rax
- adcxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
-.byte 0x67,0x67
- movq %r8,%rdx
- adcxq %rax,%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 8(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 16(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- movq %r11,-32(%rbx)
- adoxq %r15,%r13
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz .Lmulx4x_1st
-
- movq 0(%rsp),%rax
- movq 8(%rsp),%rdi
- adcq %rbp,%r15
- addq %r15,%r14
- sbbq %r15,%r15
- movq %r14,-8(%rbx)
- jmp .Lmulx4x_outer
-
-.align 32
-.Lmulx4x_outer:
- movq (%rdi),%rdx
- leaq 8(%rdi),%rdi
- subq %rax,%rsi
- movq %r15,(%rbx)
- leaq 64+32(%rsp),%rbx
- subq %rax,%rcx
-
- mulxq 0(%rsi),%r8,%r11
- xorl %ebp,%ebp
- movq %rdx,%r9
- mulxq 8(%rsi),%r14,%r12
- adoxq -32(%rbx),%r8
- adcxq %r14,%r11
- mulxq 16(%rsi),%r15,%r13
- adoxq -24(%rbx),%r11
- adcxq %r15,%r12
- adoxq %rbp,%r12
- adcxq %rbp,%r13
-
- movq %rdi,8(%rsp)
-.byte 0x67
- movq %r8,%r15
- imulq 24(%rsp),%r8
- xorl %ebp,%ebp
-
- mulxq 24(%rsi),%rax,%r14
- movq %r8,%rdx
- adoxq -16(%rbx),%r12
- adcxq %rax,%r13
- adoxq -8(%rbx),%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- adoxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 8(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 16(%rcx),%rax,%r12
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 24(%rcx),%rax,%r15
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
movq %r9,%rdx
- movq %r11,-24(%rbx)
- leaq 32(%rcx),%rcx
- adcxq %rax,%r12
- adoxq %rbp,%r15
- movq 48(%rsp),%rdi
- movq %r12,-16(%rbx)
-
- jmp .Lmulx4x_inner
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
.align 32
-.Lmulx4x_inner:
- mulxq 0(%rsi),%r10,%rax
- adcxq %rbp,%r15
- adoxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq 0(%rbx),%r10
- adoxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq 8(%rbx),%r11
- adoxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
- movq %r8,%rdx
- adcxq 16(%rbx),%r12
- adoxq %rax,%r13
- adcxq 24(%rbx),%r13
- adoxq %rbp,%r14
- leaq 32(%rsi),%rsi
+.Lsqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
leaq 32(%rbx),%rbx
- adcxq %rbp,%r14
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 8(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 16(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- adoxq %r15,%r13
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r11,-32(%rbx)
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz .Lmulx4x_inner
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz .Lsqr8x_sub
- movq 0(%rsp),%rax
- movq 8(%rsp),%rdi
- adcq %rbp,%r15
- subq 0(%rbx),%rbp
- adcq %r15,%r14
- movq -8(%rcx),%r8
- sbbq %r15,%r15
- movq %r14,-8(%rbx)
-
- cmpq 16(%rsp),%rdi
- jne .Lmulx4x_outer
-
- subq %r14,%r8
- sbbq %r8,%r8
- orq %r8,%r15
-
- negq %rax
- xorq %rdx,%rdx
- movq 32(%rsp),%rdi
- leaq 64(%rsp),%rbx
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+.byte 102,72,15,110,200
pxor %xmm0,%xmm0
- movq 0(%rcx,%rax,1),%r8
- movq 8(%rcx,%rax,1),%r9
- negq %r8
- jmp .Lmulx4x_sub_entry
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+ jmp .Lsqr8x_cond_copy
.align 32
-.Lmulx4x_sub:
- movq 0(%rcx,%rax,1),%r8
- movq 8(%rcx,%rax,1),%r9
- notq %r8
-.Lmulx4x_sub_entry:
- movq 16(%rcx,%rax,1),%r10
- notq %r9
- andq %r15,%r8
- movq 24(%rcx,%rax,1),%r11
- notq %r10
- andq %r15,%r9
- notq %r11
- andq %r15,%r10
- andq %r15,%r11
-
- negq %rdx
- adcq 0(%rbx),%r8
- adcq 8(%rbx),%r9
- movdqa %xmm0,(%rbx)
- adcq 16(%rbx),%r10
- adcq 24(%rbx),%r11
- movdqa %xmm0,16(%rbx)
+.Lsqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
leaq 32(%rbx),%rbx
- sbbq %rdx,%rdx
-
- movq %r8,0(%rdi)
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz .Lsqr8x_cond_copy
- addq $32,%rax
- jnz .Lmulx4x_sub
-
- movq 40(%rsp),%rsi
movq $1,%rax
movq -48(%rsi),%r15
movq -40(%rsi),%r14
@@ -1048,8 +739,8 @@ bn_mulx4x_mont:
movq -16(%rsi),%rbp
movq -8(%rsi),%rbx
leaq (%rsi),%rsp
-.Lmulx4x_epilogue:
+.Lsqr8x_epilogue:
.byte 0xf3,0xc3
-.size bn_mulx4x_mont,.-bn_mulx4x_mont
+.size bn_sqr8x_mont,.-bn_sqr8x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s
index 41e96c8e90..8afe249695 100644
--- a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s
+++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s
@@ -8,53 +8,157 @@
bn_mul_mont_gather5:
testl $7,%r9d
jnz .Lmul_enter
- movl OPENSSL_ia32cap_P+8(%rip),%r11d
jmp .Lmul4x_enter
.align 16
.Lmul_enter:
movl %r9d,%r9d
movq %rsp,%rax
- movl 8(%rsp),%r10d
+ movd 8(%rsp),%xmm5
+ leaq .Linc(%rip),%r10
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+
leaq 2(%r9),%r11
negq %r11
- leaq (%rsp,%r11,8),%rsp
+ leaq -264(%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul_body:
- movq %rdx,%r12
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%r12,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
-
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
.byte 102,72,15,126,195
movq (%r8),%r8
@@ -63,29 +167,14 @@ bn_mul_mont_gather5:
xorq %r14,%r14
xorq %r15,%r15
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -118,14 +207,12 @@ bn_mul_mont_gather5:
cmpq %r9,%r15
jne .L1st
-.byte 102,72,15,126,195
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
movq %r10,%r11
@@ -139,33 +226,78 @@ bn_mul_mont_gather5:
jmp .Louter
.align 16
.Louter:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -201,15 +333,12 @@ bn_mul_mont_gather5:
cmpq %r9,%r15
jne .Linner
-.byte 102,72,15,126,195
-
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
- movq (%rsp,%r15,8),%r10
+ movq (%rsp,%r9,8),%r10
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
xorq %rdx,%rdx
@@ -256,6 +385,7 @@ bn_mul_mont_gather5:
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -270,9 +400,6 @@ bn_mul_mont_gather5:
.align 32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
- andl $524544,%r11d
- cmpl $524544,%r11d
- je .Lmulx4x_enter
.byte 0x67
movq %rsp,%rax
pushq %rbx
@@ -281,10 +408,10 @@ bn_mul4x_mont_gather5:
pushq %r13
pushq %r14
pushq %r15
+
.byte 0x67
- movl %r9d,%r10d
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
@@ -294,19 +421,21 @@ bn_mul4x_mont_gather5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lmul4xsp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -322,6 +451,7 @@ bn_mul4x_mont_gather5:
movq 40(%rsp),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -337,47 +467,141 @@ bn_mul4x_mont_gather5:
.align 32
mul4x_internal:
shlq $5,%r9
- movl 8(%rax),%r10d
- leaq 256(%rdx,%r9,1),%r13
+ movd 8(%rax),%xmm5
+ leaq .Linc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
shrq $5,%r9
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%rdx,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- addq $7,%r11
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
- andq $7,%r11
-
- movq -96(%r12),%xmm0
- leaq 256(%r12),%r14
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
-.byte 0x67
- por %xmm1,%xmm0
- movq -96(%r14),%xmm1
-.byte 0x67
- pand %xmm7,%xmm3
-.byte 0x67
- por %xmm2,%xmm0
- movq -32(%r14),%xmm2
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq 32(%r14),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
.byte 102,72,15,126,195
- movq 96(%r14),%xmm0
+
movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)
@@ -391,26 +615,10 @@ mul4x_internal:
movq %rax,%r10
movq (%rcx),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq %r10,%rbp
-
-
-
-
-
-
-
- leaq 64+8(%rsp,%r11,8),%r14
+ leaq 64+8(%rsp),%r14
movq %rdx,%r11
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- leaq 512(%r12),%r12
- por %xmm1,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi,%r9,1),%rax
@@ -419,7 +627,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -429,7 +637,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -439,7 +647,7 @@ mul4x_internal:
.L1st4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -455,7 +663,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -485,7 +693,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -494,7 +702,7 @@ mul4x_internal:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -504,7 +712,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -520,7 +728,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -533,8 +741,7 @@ mul4x_internal:
movq %rdi,-16(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -545,6 +752,63 @@ mul4x_internal:
.align 32
.Louter4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
movq (%r14,%r9,1),%r10
movq %r8,%rbp
mulq %rbx
@@ -552,25 +816,11 @@ mul4x_internal:
movq (%rcx),%rax
adcq $0,%rdx
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
-
imulq %r10,%rbp
-.byte 0x67
movq %rdx,%r11
movq %rdi,(%r14)
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
leaq (%r14,%r9,1),%r14
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
@@ -580,7 +830,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -592,7 +842,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdx,%r13
jmp .Linner4x
@@ -601,7 +851,7 @@ mul4x_internal:
.Linner4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -619,7 +869,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -653,7 +903,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -664,7 +914,7 @@ mul4x_internal:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %r13,-8(%r14)
movq %rdx,%r13
@@ -674,7 +924,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -693,7 +943,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
movq %rbp,%rax
- movq -16(%rcx),%rbp
+ movq -8(%rcx),%rbp
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -708,9 +958,8 @@ mul4x_internal:
movq %r13,-24(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
movq %rdi,-16(%r14)
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -721,25 +970,28 @@ mul4x_internal:
cmpq 16+8(%rsp),%r12
jb .Louter4x
+ xorq %rax,%rax
subq %r13,%rbp
adcq %r15,%r15
orq %r15,%rdi
- xorq $1,%rdi
+ subq %rdi,%rax
leaq (%r14,%r9,1),%rbx
- leaq (%rcx,%rdi,8),%rbp
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
movq %r9,%rcx
sarq $3+2,%rcx
movq 56+8(%rsp),%rdi
- jmp .Lsqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
.size mul4x_internal,.-mul4x_internal
.globl bn_power5
.type bn_power5,@function
.align 32
bn_power5:
- movl OPENSSL_ia32cap_P+8(%rip),%r11d
- andl $524544,%r11d
- cmpl $524544,%r11d
- je .Lpowerx5_enter
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -747,9 +999,9 @@ bn_power5:
pushq %r13
pushq %r14
pushq %r15
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leal (%r9,%r9,2),%r10d
negq %r9
movq (%r8),%r8
@@ -759,19 +1011,20 @@ bn_power5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lpwr_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -799,10 +1052,15 @@ bn_power5:
.byte 102,72,15,110,226
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
.byte 102,72,15,126,209
.byte 102,72,15,126,226
@@ -1346,9 +1604,9 @@ __bn_sqr8x_internal:
movq %rbx,-16(%rdi)
movq %r8,-8(%rdi)
.byte 102,72,15,126,213
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xorq %rax,%rax
- leaq (%rbp,%r9,2),%rcx
+ leaq (%r9,%rbp,1),%rcx
leaq 48+8(%rsp,%r9,2),%rdx
movq %rcx,0+8(%rsp)
leaq 48+8(%rsp,%r9,1),%rdi
@@ -1381,14 +1639,14 @@ sqr8x_reduction:
.align 32
.L8x_reduce:
mulq %rbx
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
negq %r8
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rbx,48-8+8(%rsp,%rcx,8)
@@ -1397,7 +1655,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq 32+8(%rsp),%rsi
@@ -1406,7 +1664,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi
addq %r11,%r10
@@ -1415,7 +1673,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1423,7 +1681,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1431,7 +1689,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1449,7 +1707,7 @@ sqr8x_reduction:
decl %ecx
jnz .L8x_reduce
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
xorq %rax,%rax
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
@@ -1475,14 +1733,14 @@ sqr8x_reduction:
.L8x_tail:
mulq %rbx
addq %rax,%r8
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
leaq 8(%rdi),%rdi
@@ -1491,7 +1749,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
@@ -1499,7 +1757,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
@@ -1507,7 +1765,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1515,7 +1773,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1523,7 +1781,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1541,7 +1799,7 @@ sqr8x_reduction:
decl %ecx
jnz .L8x_tail
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
jae .L8x_tail_done
@@ -1587,7 +1845,7 @@ sqr8x_reduction:
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax
- movq -16(%rbp),%rcx
+ movq -8(%rbp),%rcx
xorq %rsi,%rsi
.byte 102,72,15,126,213
@@ -1605,44 +1863,62 @@ sqr8x_reduction:
cmpq %rdx,%rdi
jb .L8x_reduction_loop
-
- subq %r15,%rcx
+ .byte 0xf3,0xc3
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.type __bn_post4x_internal,@function
+.align 32
+__bn_post4x_internal:
+ movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx
- adcq %rsi,%rsi
movq %r9,%rcx
- orq %rsi,%rax
.byte 102,72,15,126,207
- xorq $1,%rax
+ negq %rax
.byte 102,72,15,126,206
- leaq (%rbp,%rax,8),%rbp
sarq $3+2,%rcx
- jmp .Lsqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
-.align 32
+.align 16
.Lsqr4x_sub:
-.byte 0x66
- movq 0(%rbx),%r12
- movq 8(%rbx),%r13
- sbbq 0(%rbp),%r12
- movq 16(%rbx),%r14
- sbbq 16(%rbp),%r13
- movq 24(%rbx),%r15
- leaq 32(%rbx),%rbx
- sbbq 32(%rbp),%r14
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
movq %r12,0(%rdi)
- sbbq 48(%rbp),%r15
- leaq 64(%rbp),%rbp
+ leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
+ sbbq %r10,%r10
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi
incq %rcx
jnz .Lsqr4x_sub
+
movq %r9,%r10
negq %r9
.byte 0xf3,0xc3
-.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.size __bn_post4x_internal,.-__bn_post4x_internal
.globl bn_from_montgomery
.type bn_from_montgomery,@function
.align 32
@@ -1664,10 +1940,9 @@ bn_from_mont8x:
pushq %r13
pushq %r14
pushq %r15
-.byte 0x67
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
movq (%r8),%r8
@@ -1677,19 +1952,20 @@ bn_from_mont8x:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lfrom_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -1740,22 +2016,8 @@ bn_from_mont8x:
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218
- movl OPENSSL_ia32cap_P+8(%rip),%r11d
- andl $524544,%r11d
- cmpl $524544,%r11d
- jne .Lfrom_mont_nox
-
- leaq (%rax,%r9,1),%rdi
- call sqrx8x_reduction
-
- pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- movq 40(%rsp),%rsi
- jmp .Lfrom_mont_zero
-
-.align 32
-.Lfrom_mont_nox:
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
@@ -1783,1119 +2045,6 @@ bn_from_mont8x:
.Lfrom_epilogue:
.byte 0xf3,0xc3
.size bn_from_mont8x,.-bn_from_mont8x
-.type bn_mulx4x_mont_gather5,@function
-.align 32
-bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
-.byte 0x67
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-.byte 0x67
- movl %r9d,%r10d
- shll $3,%r9d
- shll $3+2,%r10d
- negq %r9
- movq (%r8),%r8
-
-
-
-
-
-
-
-
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
- andq $4095,%r11
- cmpq %r11,%r10
- jb .Lmulx4xsp_alt
- subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
- jmp .Lmulx4xsp_done
-
-.align 32
-.Lmulx4xsp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
- subq %r10,%r11
- movq $0,%r10
- cmovcq %r10,%r11
- subq %r11,%rsp
-.Lmulx4xsp_done:
- andq $-64,%rsp
-
-
-
-
-
-
-
-
-
-
-
-
- movq %r8,32(%rsp)
- movq %rax,40(%rsp)
-.Lmulx4x_body:
- call mulx4x_internal
-
- movq 40(%rsp),%rsi
- movq $1,%rax
- movq -48(%rsi),%r15
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-.Lmulx4x_epilogue:
- .byte 0xf3,0xc3
-.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
-
-.type mulx4x_internal,@function
-.align 32
-mulx4x_internal:
-.byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00
-.byte 0x67
- negq %r9
- shlq $5,%r9
- leaq 256(%rdx,%r9,1),%r13
- shrq $5+5,%r9
- movl 8(%rax),%r10d
- subq $1,%r9
- movq %r13,16+8(%rsp)
- movq %r9,24+8(%rsp)
- movq %rdi,56+8(%rsp)
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%rdx,%r11,8),%rdi
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- addq $7,%r11
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
- andq $7,%r11
-
- movq -96(%rdi),%xmm0
- leaq 256(%rdi),%rbx
- movq -32(%rdi),%xmm1
- pand %xmm4,%xmm0
- movq 32(%rdi),%xmm2
- pand %xmm5,%xmm1
- movq 96(%rdi),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- movq -96(%rbx),%xmm1
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
- movq -32(%rbx),%xmm2
- por %xmm3,%xmm0
-.byte 0x67,0x67
- pand %xmm4,%xmm1
- movq 32(%rbx),%xmm3
-
-.byte 102,72,15,126,194
- movq 96(%rbx),%xmm0
- leaq 512(%rdi),%rdi
- pand %xmm5,%xmm2
-.byte 0x67,0x67
- pand %xmm6,%xmm3
-
-
-
-
-
-
-
- leaq 64+32+8(%rsp,%r11,8),%rbx
-
- movq %rdx,%r9
- mulxq 0(%rsi),%r8,%rax
- mulxq 8(%rsi),%r11,%r12
- addq %rax,%r11
- mulxq 16(%rsi),%rax,%r13
- adcq %rax,%r12
- adcq $0,%r13
- mulxq 24(%rsi),%rax,%r14
-
- movq %r8,%r15
- imulq 32+8(%rsp),%r8
- xorq %rbp,%rbp
- movq %r8,%rdx
-
- por %xmm2,%xmm1
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- movq %rdi,8+8(%rsp)
- por %xmm1,%xmm0
-
-.byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00
- adcxq %rax,%r13
- adcxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 16(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 32(%rcx),%rax,%r12
- movq 24+8(%rsp),%rdi
-.byte 0x66
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 48(%rcx),%rax,%r15
-.byte 0x67,0x67
- movq %r9,%rdx
- movq %r11,-24(%rbx)
- adcxq %rax,%r12
- adoxq %rbp,%r15
-.byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00
- movq %r12,-16(%rbx)
-
-
-.align 32
-.Lmulx4x_1st:
- adcxq %rbp,%r15
- mulxq 0(%rsi),%r10,%rax
- adcxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
-.byte 0x67,0x67
- movq %r8,%rdx
- adcxq %rax,%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 16(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 32(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- movq %r11,-32(%rbx)
- adoxq %r15,%r13
- mulxq 48(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 64(%rcx),%rcx
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz .Lmulx4x_1st
-
- movq 8(%rsp),%rax
-.byte 102,72,15,126,194
- adcq %rbp,%r15
- leaq (%rsi,%rax,1),%rsi
- addq %r15,%r14
- movq 8+8(%rsp),%rdi
- adcq %rbp,%rbp
- movq %r14,-8(%rbx)
- jmp .Lmulx4x_outer
-
-.align 32
-.Lmulx4x_outer:
- movq %rbp,(%rbx)
- leaq 32(%rbx,%rax,1),%rbx
- mulxq 0(%rsi),%r8,%r11
- xorq %rbp,%rbp
- movq %rdx,%r9
- mulxq 8(%rsi),%r14,%r12
- adoxq -32(%rbx),%r8
- adcxq %r14,%r11
- mulxq 16(%rsi),%r15,%r13
- adoxq -24(%rbx),%r11
- adcxq %r15,%r12
- mulxq 24(%rsi),%rdx,%r14
- adoxq -16(%rbx),%r12
- adcxq %rdx,%r13
- leaq (%rcx,%rax,2),%rcx
- leaq 32(%rsi),%rsi
- adoxq -8(%rbx),%r13
- adcxq %rbp,%r14
- adoxq %rbp,%r14
-
-.byte 0x67
- movq %r8,%r15
- imulq 32+8(%rsp),%r8
-
- movq -96(%rdi),%xmm0
-.byte 0x67,0x67
- movq %r8,%rdx
- movq -32(%rdi),%xmm1
-.byte 0x67
- pand %xmm4,%xmm0
- movq 32(%rdi),%xmm2
-.byte 0x67
- pand %xmm5,%xmm1
- movq 96(%rdi),%xmm3
- addq $256,%rdi
-.byte 0x67
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- xorq %rbp,%rbp
- movq %rdi,8+8(%rsp)
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 16(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 32(%rcx),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 48(%rcx),%rax,%r15
- movq %r9,%rdx
- por %xmm2,%xmm0
- movq 24+8(%rsp),%rdi
- movq %r10,-32(%rbx)
- por %xmm3,%xmm0
- adcxq %rax,%r12
- movq %r11,-24(%rbx)
- adoxq %rbp,%r15
- movq %r12,-16(%rbx)
- leaq 64(%rcx),%rcx
- jmp .Lmulx4x_inner
-
-.align 32
-.Lmulx4x_inner:
- mulxq 0(%rsi),%r10,%rax
- adcxq %rbp,%r15
- adoxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq 0(%rbx),%r10
- adoxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq 8(%rbx),%r11
- adoxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
- movq %r8,%rdx
- adcxq 16(%rbx),%r12
- adoxq %rax,%r13
- adcxq 24(%rbx),%r13
- adoxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
- adcxq %rbp,%r14
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 16(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 32(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- adoxq %r15,%r13
- movq %r11,-32(%rbx)
- mulxq 48(%rcx),%rax,%r15
- movq %r9,%rdx
- leaq 64(%rcx),%rcx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz .Lmulx4x_inner
-
- movq 0+8(%rsp),%rax
-.byte 102,72,15,126,194
- adcq %rbp,%r15
- subq 0(%rbx),%rdi
- movq 8+8(%rsp),%rdi
- movq 16+8(%rsp),%r10
- adcq %r15,%r14
- leaq (%rsi,%rax,1),%rsi
- adcq %rbp,%rbp
- movq %r14,-8(%rbx)
-
- cmpq %r10,%rdi
- jb .Lmulx4x_outer
-
- movq -16(%rcx),%r10
- xorq %r15,%r15
- subq %r14,%r10
- adcq %r15,%r15
- orq %r15,%rbp
- xorq $1,%rbp
- leaq (%rbx,%rax,1),%rdi
- leaq (%rcx,%rax,2),%rcx
-.byte 0x67,0x67
- sarq $3+2,%rax
- leaq (%rcx,%rbp,8),%rbp
- movq 56+8(%rsp),%rdx
- movq %rax,%rcx
- jmp .Lsqrx4x_sub
-.size mulx4x_internal,.-mulx4x_internal
-.type bn_powerx5,@function
-.align 32
-bn_powerx5:
-.Lpowerx5_enter:
-.byte 0x67
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-.byte 0x67
- movl %r9d,%r10d
- shll $3,%r9d
- shll $3+2,%r10d
- negq %r9
- movq (%r8),%r8
-
-
-
-
-
-
-
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
- andq $4095,%r11
- cmpq %r11,%r10
- jb .Lpwrx_sp_alt
- subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
- jmp .Lpwrx_sp_done
-
-.align 32
-.Lpwrx_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
- subq %r10,%r11
- movq $0,%r10
- cmovcq %r10,%r11
- subq %r11,%rsp
-.Lpwrx_sp_done:
- andq $-64,%rsp
- movq %r9,%r10
- negq %r9
-
-
-
-
-
-
-
-
-
-
-
-
- pxor %xmm0,%xmm0
-.byte 102,72,15,110,207
-.byte 102,72,15,110,209
-.byte 102,73,15,110,218
-.byte 102,72,15,110,226
- movq %r8,32(%rsp)
- movq %rax,40(%rsp)
-.Lpowerx5_body:
-
- call __bn_sqrx8x_internal
- call __bn_sqrx8x_internal
- call __bn_sqrx8x_internal
- call __bn_sqrx8x_internal
- call __bn_sqrx8x_internal
-
- movq %r10,%r9
- movq %rsi,%rdi
-.byte 102,72,15,126,209
-.byte 102,72,15,126,226
- movq 40(%rsp),%rax
-
- call mulx4x_internal
-
- movq 40(%rsp),%rsi
- movq $1,%rax
- movq -48(%rsi),%r15
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-.Lpowerx5_epilogue:
- .byte 0xf3,0xc3
-.size bn_powerx5,.-bn_powerx5
-
-.globl bn_sqrx8x_internal
-.hidden bn_sqrx8x_internal
-.type bn_sqrx8x_internal,@function
-.align 32
-bn_sqrx8x_internal:
-__bn_sqrx8x_internal:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- leaq 48+8(%rsp),%rdi
- leaq (%rsi,%r9,1),%rbp
- movq %r9,0+8(%rsp)
- movq %rbp,8+8(%rsp)
- jmp .Lsqr8x_zero_start
-
-.align 32
-.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
-.Lsqrx8x_zero:
-.byte 0x3e
- movdqa %xmm0,0(%rdi)
- movdqa %xmm0,16(%rdi)
- movdqa %xmm0,32(%rdi)
- movdqa %xmm0,48(%rdi)
-.Lsqr8x_zero_start:
- movdqa %xmm0,64(%rdi)
- movdqa %xmm0,80(%rdi)
- movdqa %xmm0,96(%rdi)
- movdqa %xmm0,112(%rdi)
- leaq 128(%rdi),%rdi
- subq $64,%r9
- jnz .Lsqrx8x_zero
-
- movq 0(%rsi),%rdx
-
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r12,%r12
- xorq %r13,%r13
- xorq %r14,%r14
- xorq %r15,%r15
- leaq 48+8(%rsp),%rdi
- xorq %rbp,%rbp
- jmp .Lsqrx8x_outer_loop
-
-.align 32
-.Lsqrx8x_outer_loop:
- mulxq 8(%rsi),%r8,%rax
- adcxq %r9,%r8
- adoxq %rax,%r10
- mulxq 16(%rsi),%r9,%rax
- adcxq %r10,%r9
- adoxq %rax,%r11
-.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
- adcxq %r11,%r10
- adoxq %rax,%r12
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
- adcxq %r12,%r11
- adoxq %rax,%r13
- mulxq 40(%rsi),%r12,%rax
- adcxq %r13,%r12
- adoxq %rax,%r14
- mulxq 48(%rsi),%r13,%rax
- adcxq %r14,%r13
- adoxq %r15,%rax
- mulxq 56(%rsi),%r14,%r15
- movq 8(%rsi),%rdx
- adcxq %rax,%r14
- adoxq %rbp,%r15
- adcq 64(%rdi),%r15
- movq %r8,8(%rdi)
- movq %r9,16(%rdi)
- sbbq %rcx,%rcx
- xorq %rbp,%rbp
-
-
- mulxq 16(%rsi),%r8,%rbx
- mulxq 24(%rsi),%r9,%rax
- adcxq %r10,%r8
- adoxq %rbx,%r9
- mulxq 32(%rsi),%r10,%rbx
- adcxq %r11,%r9
- adoxq %rax,%r10
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
- adcxq %r12,%r10
- adoxq %rbx,%r11
-.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
- adcxq %r13,%r11
- adoxq %r14,%r12
-.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
- movq 16(%rsi),%rdx
- adcxq %rax,%r12
- adoxq %rbx,%r13
- adcxq %r15,%r13
- adoxq %rbp,%r14
- adcxq %rbp,%r14
-
- movq %r8,24(%rdi)
- movq %r9,32(%rdi)
-
- mulxq 24(%rsi),%r8,%rbx
- mulxq 32(%rsi),%r9,%rax
- adcxq %r10,%r8
- adoxq %rbx,%r9
- mulxq 40(%rsi),%r10,%rbx
- adcxq %r11,%r9
- adoxq %rax,%r10
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
- adcxq %r12,%r10
- adoxq %r13,%r11
-.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
-.byte 0x3e
- movq 24(%rsi),%rdx
- adcxq %rbx,%r11
- adoxq %rax,%r12
- adcxq %r14,%r12
- movq %r8,40(%rdi)
- movq %r9,48(%rdi)
- mulxq 32(%rsi),%r8,%rax
- adoxq %rbp,%r13
- adcxq %rbp,%r13
-
- mulxq 40(%rsi),%r9,%rbx
- adcxq %r10,%r8
- adoxq %rax,%r9
- mulxq 48(%rsi),%r10,%rax
- adcxq %r11,%r9
- adoxq %r12,%r10
- mulxq 56(%rsi),%r11,%r12
- movq 32(%rsi),%rdx
- movq 40(%rsi),%r14
- adcxq %rbx,%r10
- adoxq %rax,%r11
- movq 48(%rsi),%r15
- adcxq %r13,%r11
- adoxq %rbp,%r12
- adcxq %rbp,%r12
-
- movq %r8,56(%rdi)
- movq %r9,64(%rdi)
-
- mulxq %r14,%r9,%rax
- movq 56(%rsi),%r8
- adcxq %r10,%r9
- mulxq %r15,%r10,%rbx
- adoxq %rax,%r10
- adcxq %r11,%r10
- mulxq %r8,%r11,%rax
- movq %r14,%rdx
- adoxq %rbx,%r11
- adcxq %r12,%r11
-
- adcxq %rbp,%rax
-
- mulxq %r15,%r14,%rbx
- mulxq %r8,%r12,%r13
- movq %r15,%rdx
- leaq 64(%rsi),%rsi
- adcxq %r14,%r11
- adoxq %rbx,%r12
- adcxq %rax,%r12
- adoxq %rbp,%r13
-
-.byte 0x67,0x67
- mulxq %r8,%r8,%r14
- adcxq %r8,%r13
- adcxq %rbp,%r14
-
- cmpq 8+8(%rsp),%rsi
- je .Lsqrx8x_outer_break
-
- negq %rcx
- movq $-8,%rcx
- movq %rbp,%r15
- movq 64(%rdi),%r8
- adcxq 72(%rdi),%r9
- adcxq 80(%rdi),%r10
- adcxq 88(%rdi),%r11
- adcq 96(%rdi),%r12
- adcq 104(%rdi),%r13
- adcq 112(%rdi),%r14
- adcq 120(%rdi),%r15
- leaq (%rsi),%rbp
- leaq 128(%rdi),%rdi
- sbbq %rax,%rax
-
- movq -64(%rsi),%rdx
- movq %rax,16+8(%rsp)
- movq %rdi,24+8(%rsp)
-
-
- xorl %eax,%eax
- jmp .Lsqrx8x_loop
-
-.align 32
-.Lsqrx8x_loop:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rbp),%rax,%r14
- movq %rbx,(%rdi,%rcx,8)
- movl $0,%ebx
- adcxq %rax,%r13
- adoxq %r15,%r14
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
- movq 8(%rsi,%rcx,8),%rdx
- adcxq %rax,%r14
- adoxq %rbx,%r15
- adcxq %rbx,%r15
-
-.byte 0x67
- incq %rcx
- jnz .Lsqrx8x_loop
-
- leaq 64(%rbp),%rbp
- movq $-8,%rcx
- cmpq 8+8(%rsp),%rbp
- je .Lsqrx8x_break
-
- subq 16+8(%rsp),%rbx
-.byte 0x66
- movq -64(%rsi),%rdx
- adcxq 0(%rdi),%r8
- adcxq 8(%rdi),%r9
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
-.byte 0x67
- sbbq %rax,%rax
- xorl %ebx,%ebx
- movq %rax,16+8(%rsp)
- jmp .Lsqrx8x_loop
-
-.align 32
-.Lsqrx8x_break:
- subq 16+8(%rsp),%r8
- movq 24+8(%rsp),%rcx
- movq 0(%rsi),%rdx
- xorl %ebp,%ebp
- movq %r8,0(%rdi)
- cmpq %rcx,%rdi
- je .Lsqrx8x_outer_loop
-
- movq %r9,8(%rdi)
- movq 8(%rcx),%r9
- movq %r10,16(%rdi)
- movq 16(%rcx),%r10
- movq %r11,24(%rdi)
- movq 24(%rcx),%r11
- movq %r12,32(%rdi)
- movq 32(%rcx),%r12
- movq %r13,40(%rdi)
- movq 40(%rcx),%r13
- movq %r14,48(%rdi)
- movq 48(%rcx),%r14
- movq %r15,56(%rdi)
- movq 56(%rcx),%r15
- movq %rcx,%rdi
- jmp .Lsqrx8x_outer_loop
-
-.align 32
-.Lsqrx8x_outer_break:
- movq %r9,72(%rdi)
-.byte 102,72,15,126,217
- movq %r10,80(%rdi)
- movq %r11,88(%rdi)
- movq %r12,96(%rdi)
- movq %r13,104(%rdi)
- movq %r14,112(%rdi)
- leaq 48+8(%rsp),%rdi
- movq (%rsi,%rcx,1),%rdx
-
- movq 8(%rdi),%r11
- xorq %r10,%r10
- movq 0+8(%rsp),%r9
- adoxq %r11,%r11
- movq 16(%rdi),%r12
- movq 24(%rdi),%r13
-
-
-.align 32
-.Lsqrx4x_shift_n_add:
- mulxq %rdx,%rax,%rbx
- adoxq %r12,%r12
- adcxq %r10,%rax
-.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
-.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
- adoxq %r13,%r13
- adcxq %r11,%rbx
- movq 40(%rdi),%r11
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r10,%r10
- adcxq %r12,%rax
- movq 16(%rsi,%rcx,1),%rdx
- movq 48(%rdi),%r12
- adoxq %r11,%r11
- adcxq %r13,%rbx
- movq 56(%rdi),%r13
- movq %rax,16(%rdi)
- movq %rbx,24(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r12,%r12
- adcxq %r10,%rax
- movq 24(%rsi,%rcx,1),%rdx
- leaq 32(%rcx),%rcx
- movq 64(%rdi),%r10
- adoxq %r13,%r13
- adcxq %r11,%rbx
- movq 72(%rdi),%r11
- movq %rax,32(%rdi)
- movq %rbx,40(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r10,%r10
- adcxq %r12,%rax
- jrcxz .Lsqrx4x_shift_n_add_break
-.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
- adoxq %r11,%r11
- adcxq %r13,%rbx
- movq 80(%rdi),%r12
- movq 88(%rdi),%r13
- movq %rax,48(%rdi)
- movq %rbx,56(%rdi)
- leaq 64(%rdi),%rdi
- nop
- jmp .Lsqrx4x_shift_n_add
-
-.align 32
-.Lsqrx4x_shift_n_add_break:
- adcxq %r13,%rbx
- movq %rax,48(%rdi)
- movq %rbx,56(%rdi)
- leaq 64(%rdi),%rdi
-.byte 102,72,15,126,213
-sqrx8x_reduction:
- xorl %eax,%eax
- movq 32+8(%rsp),%rbx
- movq 48+8(%rsp),%rdx
- leaq -128(%rbp,%r9,2),%rcx
-
- movq %rcx,0+8(%rsp)
- movq %rdi,8+8(%rsp)
-
- leaq 48+8(%rsp),%rdi
- jmp .Lsqrx8x_reduction_loop
-
-.align 32
-.Lsqrx8x_reduction_loop:
- movq 8(%rdi),%r9
- movq 16(%rdi),%r10
- movq 24(%rdi),%r11
- movq 32(%rdi),%r12
- movq %rdx,%r8
- imulq %rbx,%rdx
- movq 40(%rdi),%r13
- movq 48(%rdi),%r14
- movq 56(%rdi),%r15
- movq %rax,24+8(%rsp)
-
- leaq 64(%rdi),%rdi
- xorq %rsi,%rsi
- movq $-8,%rcx
- jmp .Lsqrx8x_reduce
-
-.align 32
-.Lsqrx8x_reduce:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rbx,%rax
- adoxq %r9,%r8
-
- mulxq 16(%rbp),%rbx,%r9
- adcxq %rbx,%r8
- adoxq %r10,%r9
-
- mulxq 32(%rbp),%rbx,%r10
- adcxq %rbx,%r9
- adoxq %r11,%r10
-
- mulxq 48(%rbp),%rbx,%r11
- adcxq %rbx,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00
- movq %rdx,%rax
- movq %r8,%rdx
- adcxq %rbx,%r11
- adoxq %r13,%r12
-
- mulxq 32+8(%rsp),%rbx,%rdx
- movq %rax,%rdx
- movq %rax,64+48+8(%rsp,%rcx,8)
-
- mulxq 80(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 96(%rbp),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 112(%rbp),%rax,%r15
- movq %rbx,%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- adcxq %rsi,%r15
-
-.byte 0x67,0x67,0x67
- incq %rcx
- jnz .Lsqrx8x_reduce
-
- movq %rsi,%rax
- cmpq 0+8(%rsp),%rbp
- jae .Lsqrx8x_no_tail
-
- movq 48+8(%rsp),%rdx
- addq 0(%rdi),%r8
- leaq 128(%rbp),%rbp
- movq $-8,%rcx
- adcxq 8(%rdi),%r9
- adcxq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
- sbbq %rax,%rax
-
- xorq %rsi,%rsi
- movq %rax,16+8(%rsp)
- jmp .Lsqrx8x_tail
-
-.align 32
-.Lsqrx8x_tail:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 16(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 32(%rbp),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 48(%rbp),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 80(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 96(%rbp),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 112(%rbp),%rax,%r15
- movq 72+48+8(%rsp,%rcx,8),%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- movq %rbx,(%rdi,%rcx,8)
- movq %r8,%rbx
- adcxq %rsi,%r15
-
- incq %rcx
- jnz .Lsqrx8x_tail
-
- cmpq 0+8(%rsp),%rbp
- jae .Lsqrx8x_tail_done
-
- subq 16+8(%rsp),%rsi
- movq 48+8(%rsp),%rdx
- leaq 128(%rbp),%rbp
- adcq 0(%rdi),%r8
- adcq 8(%rdi),%r9
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
- sbbq %rax,%rax
- subq $8,%rcx
-
- xorq %rsi,%rsi
- movq %rax,16+8(%rsp)
- jmp .Lsqrx8x_tail
-
-.align 32
-.Lsqrx8x_tail_done:
- addq 24+8(%rsp),%r8
- adcq $0,%r9
- adcq $0,%r10
- adcq $0,%r11
- adcq $0,%r12
- adcq $0,%r13
- adcq $0,%r14
- adcq $0,%r15
-
-
- movq %rsi,%rax
-
- subq 16+8(%rsp),%rsi
-.Lsqrx8x_no_tail:
- adcq 0(%rdi),%r8
-.byte 102,72,15,126,217
- adcq 8(%rdi),%r9
- movq 112(%rbp),%rsi
-.byte 102,72,15,126,213
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- adcq %rax,%rax
-
- movq 32+8(%rsp),%rbx
- movq 64(%rdi,%rcx,1),%rdx
-
- movq %r8,0(%rdi)
- leaq 64(%rdi),%r8
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
- movq %r12,32(%rdi)
- movq %r13,40(%rdi)
- movq %r14,48(%rdi)
- movq %r15,56(%rdi)
-
- leaq 64(%rdi,%rcx,1),%rdi
- cmpq 8+8(%rsp),%r8
- jb .Lsqrx8x_reduction_loop
- xorl %ebx,%ebx
- subq %r15,%rsi
- adcq %rbx,%rbx
- movq %rcx,%r10
- orq %rbx,%rax
- movq %rcx,%r9
- xorq $1,%rax
- sarq $3+2,%rcx
-
- leaq (%rbp,%rax,8),%rbp
-.byte 102,72,15,126,202
-.byte 102,72,15,126,206
- jmp .Lsqrx4x_sub
-
-.align 32
-.Lsqrx4x_sub:
-.byte 0x66
- movq 0(%rdi),%r12
- movq 8(%rdi),%r13
- sbbq 0(%rbp),%r12
- movq 16(%rdi),%r14
- sbbq 16(%rbp),%r13
- movq 24(%rdi),%r15
- leaq 32(%rdi),%rdi
- sbbq 32(%rbp),%r14
- movq %r12,0(%rdx)
- sbbq 48(%rbp),%r15
- leaq 64(%rbp),%rbp
- movq %r13,8(%rdx)
- movq %r14,16(%rdx)
- movq %r15,24(%rdx)
- leaq 32(%rdx),%rdx
-
- incq %rcx
- jnz .Lsqrx4x_sub
- negq %r9
-
- .byte 0xf3,0xc3
-.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
.globl bn_get_bits5
.type bn_get_bits5,@function
.align 16
@@ -2935,45 +2084,169 @@ bn_scatter5:
.globl bn_gather5
.type bn_gather5,@function
-.align 16
+.align 32
bn_gather5:
- movl %ecx,%r11d
- shrl $3,%ecx
- andq $7,%r11
- notl %ecx
- leaq .Lmagic_masks(%rip),%rax
- andl $3,%ecx
- leaq 128(%rdx,%r11,8),%rdx
- movq 0(%rax,%rcx,8),%xmm4
- movq 8(%rax,%rcx,8),%xmm5
- movq 16(%rax,%rcx,8),%xmm6
- movq 24(%rax,%rcx,8),%xmm7
+.LSEH_begin_bn_gather5:
+
+.byte 0x4c,0x8d,0x14,0x24
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq .Linc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
jmp .Lgather
-.align 16
-.Lgather:
- movq -128(%rdx),%xmm0
- movq -64(%rdx),%xmm1
- pand %xmm4,%xmm0
- movq 0(%rdx),%xmm2
- pand %xmm5,%xmm1
- movq 64(%rdx),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-.byte 0x67,0x67
- por %xmm2,%xmm0
- leaq 256(%rdx),%rdx
- por %xmm3,%xmm0
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subl $1,%esi
jnz .Lgather
+
+ leaq (%r10),%rsp
.byte 0xf3,0xc3
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
.align 64
-.Lmagic_masks:
-.long 0,0, 0,0, 0,0, -1,-1
-.long 0,0, 0,0, 0,0, 0,0
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
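
Note on the x86_64-mont5.s hunks above (editorial annotation, not part of the patch): the old code selected one of eight interleaved table slots via the .Lmagic_masks constants, so the address of each 64-bit load depended on part of the secret window value. The replacement builds sixteen 128-bit selection masks from .Linc with pshufd/paddd/pcmpeqd, then reads every table line and combines them with pand/por, so the memory access pattern is identical for every window value; the same gather pattern recurs in the .Louter loops, mul4x_internal and bn_gather5, and the MULX/ADX alternates (bn_mulx4x_mont_gather5, bn_powerx5, bn_sqrx8x_internal) are dropped from this generated file. This corresponds to the constant-time scatter/gather rework shipped in OpenSSL 1.0.2g (the CacheBleed / CVE-2016-0702 mitigation). The C sketch below only illustrates that masking idea; it is not code from the patch, and the helper name and table shape (32 entries of 8 limbs) are assumptions for illustration.

    /* const_time_gather.c -- illustration only; the real assembly works on
     * 128-bit lanes with pcmpeqd/pand/por, this shows the same idea with
     * scalar 64-bit masks. */
    #include <stdint.h>
    #include <stdio.h>

    /* Read all 32 entries but keep only table[idx], without any
     * idx-dependent address: mask is all-ones exactly when i == idx. */
    static void const_time_gather(uint64_t out[8],
                                  const uint64_t table[32][8],
                                  uint64_t idx)
    {
        for (int limb = 0; limb < 8; limb++)
            out[limb] = 0;

        for (uint64_t i = 0; i < 32; i++) {
            uint64_t diff = i ^ idx;                         /* 0 iff i == idx  */
            uint64_t mask = ((diff | (0 - diff)) >> 63) - 1; /* all-ones iff 0  */
            for (int limb = 0; limb < 8; limb++)
                out[limb] |= table[i][limb] & mask;          /* touch every row */
        }
    }

    int main(void)
    {
        uint64_t table[32][8], out[8];
        for (uint64_t i = 0; i < 32; i++)
            for (int limb = 0; limb < 8; limb++)
                table[i][limb] = i * 100 + (uint64_t)limb;
        const_time_gather(out, (const uint64_t (*)[8])table, 7);
        printf("%llu\n", (unsigned long long)out[0]);        /* prints 700 */
        return 0;
    }
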
diff --git a/deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s b/deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s
index 3a999664c7..7876e38299 100644
--- a/deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s
@@ -332,8 +332,6 @@ ecp_nistz256_neg:
.type ecp_nistz256_to_mont,@function
.align 32
ecp_nistz256_to_mont:
- movl $524544,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
leaq .LRR(%rip),%rdx
jmp .Lmul_mont
.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
@@ -348,8 +346,6 @@ ecp_nistz256_to_mont:
.type ecp_nistz256_mul_mont,@function
.align 32
ecp_nistz256_mul_mont:
- movl $524544,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
.Lmul_mont:
pushq %rbp
pushq %rbx
@@ -357,8 +353,6 @@ ecp_nistz256_mul_mont:
pushq %r13
pushq %r14
pushq %r15
- cmpl $524544,%ecx
- je .Lmul_montx
movq %rdx,%rbx
movq 0(%rdx),%rax
movq 0(%rsi),%r9
@@ -367,19 +361,6 @@ ecp_nistz256_mul_mont:
movq 24(%rsi),%r12
call __ecp_nistz256_mul_montq
- jmp .Lmul_mont_done
-
-.align 32
-.Lmul_montx:
- movq %rdx,%rbx
- movq 0(%rdx),%rdx
- movq 0(%rsi),%r9
- movq 8(%rsi),%r10
- movq 16(%rsi),%r11
- movq 24(%rsi),%r12
- leaq -128(%rsi),%rsi
-
- call __ecp_nistz256_mul_montx
.Lmul_mont_done:
popq %r15
popq %r14
@@ -617,33 +598,18 @@ __ecp_nistz256_mul_montq:
.type ecp_nistz256_sqr_mont,@function
.align 32
ecp_nistz256_sqr_mont:
- movl $524544,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
pushq %rbp
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- cmpl $524544,%ecx
- je .Lsqr_montx
movq 0(%rsi),%rax
movq 8(%rsi),%r14
movq 16(%rsi),%r15
movq 24(%rsi),%r8
call __ecp_nistz256_sqr_montq
- jmp .Lsqr_mont_done
-
-.align 32
-.Lsqr_montx:
- movq 0(%rsi),%rdx
- movq 8(%rsi),%r14
- movq 16(%rsi),%r15
- movq 24(%rsi),%r8
- leaq -128(%rsi),%rsi
-
- call __ecp_nistz256_sqr_montx
.Lsqr_mont_done:
popq %r15
popq %r14
@@ -815,304 +781,6 @@ __ecp_nistz256_sqr_montq:
.byte 0xf3,0xc3
.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
-.type __ecp_nistz256_mul_montx,@function
-.align 32
-__ecp_nistz256_mul_montx:
-
-
- mulxq %r9,%r8,%r9
- mulxq %r10,%rcx,%r10
- movq $32,%r14
- xorq %r13,%r13
- mulxq %r11,%rbp,%r11
- movq .Lpoly+24(%rip),%r15
- adcq %rcx,%r9
- mulxq %r12,%rcx,%r12
- movq %r8,%rdx
- adcq %rbp,%r10
- shlxq %r14,%r8,%rbp
- adcq %rcx,%r11
- shrxq %r14,%r8,%rcx
- adcq $0,%r12
-
-
-
- addq %rbp,%r9
- adcq %rcx,%r10
-
- mulxq %r15,%rcx,%rbp
- movq 8(%rbx),%rdx
- adcq %rcx,%r11
- adcq %rbp,%r12
- adcq $0,%r13
- xorq %r8,%r8
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r9,%rdx
- adcxq %rcx,%r12
- shlxq %r14,%r9,%rcx
- adoxq %rbp,%r13
- shrxq %r14,%r9,%rbp
-
- adcxq %r8,%r13
- adoxq %r8,%r8
- adcq $0,%r8
-
-
-
- addq %rcx,%r10
- adcq %rbp,%r11
-
- mulxq %r15,%rcx,%rbp
- movq 16(%rbx),%rdx
- adcq %rcx,%r12
- adcq %rbp,%r13
- adcq $0,%r8
- xorq %r9,%r9
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r10,%rdx
- adcxq %rcx,%r13
- shlxq %r14,%r10,%rcx
- adoxq %rbp,%r8
- shrxq %r14,%r10,%rbp
-
- adcxq %r9,%r8
- adoxq %r9,%r9
- adcq $0,%r9
-
-
-
- addq %rcx,%r11
- adcq %rbp,%r12
-
- mulxq %r15,%rcx,%rbp
- movq 24(%rbx),%rdx
- adcq %rcx,%r13
- adcq %rbp,%r8
- adcq $0,%r9
- xorq %r10,%r10
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r11,%rdx
- adcxq %rcx,%r8
- shlxq %r14,%r11,%rcx
- adoxq %rbp,%r9
- shrxq %r14,%r11,%rbp
-
- adcxq %r10,%r9
- adoxq %r10,%r10
- adcq $0,%r10
-
-
-
- addq %rcx,%r12
- adcq %rbp,%r13
-
- mulxq %r15,%rcx,%rbp
- movq %r12,%rbx
- movq .Lpoly+8(%rip),%r14
- adcq %rcx,%r8
- movq %r13,%rdx
- adcq %rbp,%r9
- adcq $0,%r10
-
-
-
- xorl %eax,%eax
- movq %r8,%rcx
- sbbq $-1,%r12
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%rbp
- sbbq %r15,%r9
- sbbq $0,%r10
-
- cmovcq %rbx,%r12
- cmovcq %rdx,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %rbp,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
-
-.type __ecp_nistz256_sqr_montx,@function
-.align 32
-__ecp_nistz256_sqr_montx:
- mulxq %r14,%r9,%r10
- mulxq %r15,%rcx,%r11
- xorl %eax,%eax
- adcq %rcx,%r10
- mulxq %r8,%rbp,%r12
- movq %r14,%rdx
- adcq %rbp,%r11
- adcq $0,%r12
- xorq %r13,%r13
-
-
- mulxq %r15,%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq %r8,%rcx,%rbp
- movq %r15,%rdx
- adcxq %rcx,%r12
- adoxq %rbp,%r13
- adcq $0,%r13
-
-
- mulxq %r8,%rcx,%r14
- movq 0+128(%rsi),%rdx
- xorq %r15,%r15
- adcxq %r9,%r9
- adoxq %rcx,%r13
- adcxq %r10,%r10
- adoxq %r15,%r14
-
- mulxq %rdx,%r8,%rbp
- movq 8+128(%rsi),%rdx
- adcxq %r11,%r11
- adoxq %rbp,%r9
- adcxq %r12,%r12
- mulxq %rdx,%rcx,%rax
- movq 16+128(%rsi),%rdx
- adcxq %r13,%r13
- adoxq %rcx,%r10
- adcxq %r14,%r14
-.byte 0x67
- mulxq %rdx,%rcx,%rbp
- movq 24+128(%rsi),%rdx
- adoxq %rax,%r11
- adcxq %r15,%r15
- adoxq %rcx,%r12
- movq $32,%rsi
- adoxq %rbp,%r13
-.byte 0x67,0x67
- mulxq %rdx,%rcx,%rax
- movq %r8,%rdx
- adoxq %rcx,%r14
- shlxq %rsi,%r8,%rcx
- adoxq %rax,%r15
- shrxq %rsi,%r8,%rax
- movq .Lpoly+24(%rip),%rbp
-
-
- addq %rcx,%r9
- adcq %rax,%r10
-
- mulxq %rbp,%rcx,%r8
- movq %r9,%rdx
- adcq %rcx,%r11
- shlxq %rsi,%r9,%rcx
- adcq $0,%r8
- shrxq %rsi,%r9,%rax
-
-
- addq %rcx,%r10
- adcq %rax,%r11
-
- mulxq %rbp,%rcx,%r9
- movq %r10,%rdx
- adcq %rcx,%r8
- shlxq %rsi,%r10,%rcx
- adcq $0,%r9
- shrxq %rsi,%r10,%rax
-
-
- addq %rcx,%r11
- adcq %rax,%r8
-
- mulxq %rbp,%rcx,%r10
- movq %r11,%rdx
- adcq %rcx,%r9
- shlxq %rsi,%r11,%rcx
- adcq $0,%r10
- shrxq %rsi,%r11,%rax
-
-
- addq %rcx,%r8
- adcq %rax,%r9
-
- mulxq %rbp,%rcx,%r11
- adcq %rcx,%r10
- adcq $0,%r11
-
- xorq %rdx,%rdx
- adcq %r8,%r12
- movq .Lpoly+8(%rip),%rsi
- adcq %r9,%r13
- movq %r12,%r8
- adcq %r10,%r14
- adcq %r11,%r15
- movq %r13,%r9
- adcq $0,%rdx
-
- xorl %eax,%eax
- sbbq $-1,%r12
- movq %r14,%r10
- sbbq %rsi,%r13
- sbbq $0,%r14
- movq %r15,%r11
- sbbq %rbp,%r15
- sbbq $0,%rdx
-
- cmovcq %r8,%r12
- cmovcq %r9,%r13
- movq %r12,0(%rdi)
- cmovcq %r10,%r14
- movq %r13,8(%rdi)
- cmovcq %r11,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
-
- .byte 0xf3,0xc3
-.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
@@ -1215,9 +883,6 @@ ecp_nistz256_from_mont:
.type ecp_nistz256_select_w5,@function
.align 32
ecp_nistz256_select_w5:
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- testl $32,%eax
- jnz .Lavx2_select_w5
movdqa .LOne(%rip),%xmm0
movd %edx,%xmm1
@@ -1277,9 +942,6 @@ ecp_nistz256_select_w5:
.type ecp_nistz256_select_w7,@function
.align 32
ecp_nistz256_select_w7:
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- testl $32,%eax
- jnz .Lavx2_select_w7
movdqa .LOne(%rip),%xmm8
movd %edx,%xmm1
@@ -1321,141 +983,11 @@ ecp_nistz256_select_w7:
movdqu %xmm5,48(%rdi)
.byte 0xf3,0xc3
.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
-
-
-.type ecp_nistz256_avx2_select_w5,@function
-.align 32
-ecp_nistz256_avx2_select_w5:
-.Lavx2_select_w5:
- vzeroupper
- vmovdqa .LTwo(%rip),%ymm0
-
- vpxor %ymm2,%ymm2,%ymm2
- vpxor %ymm3,%ymm3,%ymm3
- vpxor %ymm4,%ymm4,%ymm4
-
- vmovdqa .LOne(%rip),%ymm5
- vmovdqa .LTwo(%rip),%ymm10
-
- vmovd %edx,%xmm1
- vpermd %ymm1,%ymm2,%ymm1
-
- movq $8,%rax
-.Lselect_loop_avx2_w5:
-
- vmovdqa 0(%rsi),%ymm6
- vmovdqa 32(%rsi),%ymm7
- vmovdqa 64(%rsi),%ymm8
-
- vmovdqa 96(%rsi),%ymm11
- vmovdqa 128(%rsi),%ymm12
- vmovdqa 160(%rsi),%ymm13
-
- vpcmpeqd %ymm1,%ymm5,%ymm9
- vpcmpeqd %ymm1,%ymm10,%ymm14
-
- vpaddd %ymm0,%ymm5,%ymm5
- vpaddd %ymm0,%ymm10,%ymm10
- leaq 192(%rsi),%rsi
-
- vpand %ymm9,%ymm6,%ymm6
- vpand %ymm9,%ymm7,%ymm7
- vpand %ymm9,%ymm8,%ymm8
- vpand %ymm14,%ymm11,%ymm11
- vpand %ymm14,%ymm12,%ymm12
- vpand %ymm14,%ymm13,%ymm13
-
- vpxor %ymm6,%ymm2,%ymm2
- vpxor %ymm7,%ymm3,%ymm3
- vpxor %ymm8,%ymm4,%ymm4
- vpxor %ymm11,%ymm2,%ymm2
- vpxor %ymm12,%ymm3,%ymm3
- vpxor %ymm13,%ymm4,%ymm4
-
- decq %rax
- jnz .Lselect_loop_avx2_w5
-
- vmovdqu %ymm2,0(%rdi)
- vmovdqu %ymm3,32(%rdi)
- vmovdqu %ymm4,64(%rdi)
- vzeroupper
- .byte 0xf3,0xc3
-.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
-
-
-
.globl ecp_nistz256_avx2_select_w7
.type ecp_nistz256_avx2_select_w7,@function
.align 32
ecp_nistz256_avx2_select_w7:
-.Lavx2_select_w7:
- vzeroupper
- vmovdqa .LThree(%rip),%ymm0
-
- vpxor %ymm2,%ymm2,%ymm2
- vpxor %ymm3,%ymm3,%ymm3
-
- vmovdqa .LOne(%rip),%ymm4
- vmovdqa .LTwo(%rip),%ymm8
- vmovdqa .LThree(%rip),%ymm12
-
- vmovd %edx,%xmm1
- vpermd %ymm1,%ymm2,%ymm1
-
-
- movq $21,%rax
-.Lselect_loop_avx2_w7:
-
- vmovdqa 0(%rsi),%ymm5
- vmovdqa 32(%rsi),%ymm6
-
- vmovdqa 64(%rsi),%ymm9
- vmovdqa 96(%rsi),%ymm10
-
- vmovdqa 128(%rsi),%ymm13
- vmovdqa 160(%rsi),%ymm14
-
- vpcmpeqd %ymm1,%ymm4,%ymm7
- vpcmpeqd %ymm1,%ymm8,%ymm11
- vpcmpeqd %ymm1,%ymm12,%ymm15
-
- vpaddd %ymm0,%ymm4,%ymm4
- vpaddd %ymm0,%ymm8,%ymm8
- vpaddd %ymm0,%ymm12,%ymm12
- leaq 192(%rsi),%rsi
-
- vpand %ymm7,%ymm5,%ymm5
- vpand %ymm7,%ymm6,%ymm6
- vpand %ymm11,%ymm9,%ymm9
- vpand %ymm11,%ymm10,%ymm10
- vpand %ymm15,%ymm13,%ymm13
- vpand %ymm15,%ymm14,%ymm14
-
- vpxor %ymm5,%ymm2,%ymm2
- vpxor %ymm6,%ymm3,%ymm3
- vpxor %ymm9,%ymm2,%ymm2
- vpxor %ymm10,%ymm3,%ymm3
- vpxor %ymm13,%ymm2,%ymm2
- vpxor %ymm14,%ymm3,%ymm3
-
- decq %rax
- jnz .Lselect_loop_avx2_w7
-
-
- vmovdqa 0(%rsi),%ymm5
- vmovdqa 32(%rsi),%ymm6
-
- vpcmpeqd %ymm1,%ymm4,%ymm7
-
- vpand %ymm7,%ymm5,%ymm5
- vpand %ymm7,%ymm6,%ymm6
-
- vpxor %ymm5,%ymm2,%ymm2
- vpxor %ymm6,%ymm3,%ymm3
-
- vmovdqu %ymm2,0(%rdi)
- vmovdqu %ymm3,32(%rdi)
- vzeroupper
+.byte 0x0f,0x0b
.byte 0xf3,0xc3
.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
.type __ecp_nistz256_add_toq,@function
@@ -1581,10 +1113,6 @@ __ecp_nistz256_mul_by_2q:
.type ecp_nistz256_point_double,@function
.align 32
ecp_nistz256_point_double:
- movl $524544,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $524544,%ecx
- je .Lpoint_doublex
pushq %rbp
pushq %rbx
pushq %r12
@@ -1593,6 +1121,7 @@ ecp_nistz256_point_double:
pushq %r15
subq $160+8,%rsp
+.Lpoint_double_shortcutq:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
movdqu 16(%rsi),%xmm1
@@ -1786,10 +1315,6 @@ ecp_nistz256_point_double:
.type ecp_nistz256_point_add,@function
.align 32
ecp_nistz256_point_add:
- movl $524544,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $524544,%ecx
- je .Lpoint_addx
pushq %rbp
pushq %rbx
pushq %r12
@@ -1817,7 +1342,7 @@ ecp_nistz256_point_add:
por %xmm1,%xmm3
movdqu 0(%rsi),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
por %xmm3,%xmm5
@@ -1827,7 +1352,7 @@ ecp_nistz256_point_add:
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
movdqa %xmm0,480(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,480+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1847,10 +1372,10 @@ ecp_nistz256_point_add:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
por %xmm3,%xmm4
pxor %xmm3,%xmm3
pcmpeqd %xmm3,%xmm4
@@ -1859,6 +1384,7 @@ ecp_nistz256_point_add:
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -1950,7 +1476,7 @@ ecp_nistz256_point_add:
testq %r8,%r8
jnz .Ladd_proceedq
testq %r9,%r9
- jz .Ladd_proceedq
+ jz .Ladd_doubleq
.byte 102,72,15,126,199
pxor %xmm0,%xmm0
@@ -1963,6 +1489,13 @@ ecp_nistz256_point_add:
jmp .Ladd_doneq
.align 32
+.Ladd_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+ jmp .Lpoint_double_shortcutq
+
+.align 32
.Ladd_proceedq:
movq 0+64(%rsp),%rax
movq 8+64(%rsp),%r14
@@ -2179,10 +1712,6 @@ ecp_nistz256_point_add:
.type ecp_nistz256_point_add_affine,@function
.align 32
ecp_nistz256_point_add_affine:
- movl $524544,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $524544,%ecx
- je .Lpoint_add_affinex
pushq %rbp
pushq %rbx
pushq %r12
@@ -2213,13 +1742,13 @@ ecp_nistz256_point_add_affine:
por %xmm1,%xmm3
movdqu 0(%rbx),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rbx),%xmm1
movdqu 32(%rbx),%xmm2
por %xmm3,%xmm5
movdqu 48(%rbx),%xmm3
movdqa %xmm0,416(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -2235,13 +1764,13 @@ ecp_nistz256_point_add_affine:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
movq 0(%rbx),%rax
movq %r12,%r9
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
movq %r13,%r10
por %xmm3,%xmm4
pxor %xmm3,%xmm3
@@ -2482,1023 +2011,3 @@ ecp_nistz256_point_add_affine:
popq %rbp
.byte 0xf3,0xc3
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
-.type __ecp_nistz256_add_tox,@function
-.align 32
-__ecp_nistz256_add_tox:
- xorq %r11,%r11
- adcq 0(%rbx),%r12
- adcq 8(%rbx),%r13
- movq %r12,%rax
- adcq 16(%rbx),%r8
- adcq 24(%rbx),%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- xorq %r10,%r10
- sbbq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
-
- btq $0,%r11
- cmovncq %rax,%r12
- cmovncq %rbp,%r13
- movq %r12,0(%rdi)
- cmovncq %rcx,%r8
- movq %r13,8(%rdi)
- cmovncq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
-
-.type __ecp_nistz256_sub_fromx,@function
-.align 32
-__ecp_nistz256_sub_fromx:
- xorq %r11,%r11
- sbbq 0(%rbx),%r12
- sbbq 8(%rbx),%r13
- movq %r12,%rax
- sbbq 16(%rbx),%r8
- sbbq 24(%rbx),%r9
- movq %r13,%rbp
- sbbq $0,%r11
-
- xorq %r10,%r10
- adcq $-1,%r12
- movq %r8,%rcx
- adcq %r14,%r13
- adcq $0,%r8
- movq %r9,%r10
- adcq %r15,%r9
-
- btq $0,%r11
- cmovncq %rax,%r12
- cmovncq %rbp,%r13
- movq %r12,0(%rdi)
- cmovncq %rcx,%r8
- movq %r13,8(%rdi)
- cmovncq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
-
-.type __ecp_nistz256_subx,@function
-.align 32
-__ecp_nistz256_subx:
- xorq %r11,%r11
- sbbq %r12,%rax
- sbbq %r13,%rbp
- movq %rax,%r12
- sbbq %r8,%rcx
- sbbq %r9,%r10
- movq %rbp,%r13
- sbbq $0,%r11
-
- xorq %r9,%r9
- adcq $-1,%rax
- movq %rcx,%r8
- adcq %r14,%rbp
- adcq $0,%rcx
- movq %r10,%r9
- adcq %r15,%r10
-
- btq $0,%r11
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- cmovcq %rcx,%r8
- cmovcq %r10,%r9
-
- .byte 0xf3,0xc3
-.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
-
-.type __ecp_nistz256_mul_by_2x,@function
-.align 32
-__ecp_nistz256_mul_by_2x:
- xorq %r11,%r11
- adcq %r12,%r12
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- xorq %r10,%r10
- sbbq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
-
- btq $0,%r11
- cmovncq %rax,%r12
- cmovncq %rbp,%r13
- movq %r12,0(%rdi)
- cmovncq %rcx,%r8
- movq %r13,8(%rdi)
- cmovncq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
-.type ecp_nistz256_point_doublex,@function
-.align 32
-ecp_nistz256_point_doublex:
-.Lpoint_doublex:
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $160+8,%rsp
-
- movdqu 0(%rsi),%xmm0
- movq %rsi,%rbx
- movdqu 16(%rsi),%xmm1
- movq 32+0(%rsi),%r12
- movq 32+8(%rsi),%r13
- movq 32+16(%rsi),%r8
- movq 32+24(%rsi),%r9
- movq .Lpoly+8(%rip),%r14
- movq .Lpoly+24(%rip),%r15
- movdqa %xmm0,96(%rsp)
- movdqa %xmm1,96+16(%rsp)
- leaq 32(%rdi),%r10
- leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
-
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- leaq 64-128(%rsi),%rsi
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 32(%rbx),%rdx
- movq 64+0(%rbx),%r9
- movq 64+8(%rbx),%r10
- movq 64+16(%rbx),%r11
- movq 64+24(%rbx),%r12
- leaq 64-128(%rbx),%rsi
- leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
- call __ecp_nistz256_mul_montx
- call __ecp_nistz256_mul_by_2x
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_tox
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
- call __ecp_nistz256_sqr_montx
- xorq %r9,%r9
- movq %r12,%rax
- addq $-1,%r12
- movq %r13,%r10
- adcq %rsi,%r13
- movq %r14,%rcx
- adcq $0,%r14
- movq %r15,%r8
- adcq %rbp,%r15
- adcq $0,%r9
- xorq %rsi,%rsi
- testq $1,%rax
-
- cmovzq %rax,%r12
- cmovzq %r10,%r13
- cmovzq %rcx,%r14
- cmovzq %r8,%r15
- cmovzq %rsi,%r9
-
- movq %r13,%rax
- shrq $1,%r12
- shlq $63,%rax
- movq %r14,%r10
- shrq $1,%r13
- orq %rax,%r12
- shlq $63,%r10
- movq %r15,%rcx
- shrq $1,%r14
- orq %r10,%r13
- shlq $63,%rcx
- movq %r12,0(%rdi)
- shrq $1,%r15
- movq %r13,8(%rdi)
- shlq $63,%r9
- orq %rcx,%r14
- orq %r9,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
- movq 64(%rsp),%rdx
- leaq 64(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- leaq 32(%rsp),%rbx
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_tox
-
- movq 96(%rsp),%rdx
- leaq 96(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- movq 0+32(%rsp),%rdx
- movq 8+32(%rsp),%r14
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r15
- movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
- call __ecp_nistz256_sqr_montx
-
- leaq 128(%rsp),%rbx
- movq %r14,%r8
- movq %r15,%r9
- movq %rsi,%r14
- movq %rbp,%r15
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_subx
-
- movq 32(%rsp),%rdx
- leaq 32(%rsp),%rbx
- movq %r12,%r14
- xorl %ecx,%ecx
- movq %r12,0+0(%rsp)
- movq %r13,%r10
- movq %r13,0+8(%rsp)
- cmovzq %r8,%r11
- movq %r8,0+16(%rsp)
- leaq 0-128(%rsp),%rsi
- cmovzq %r9,%r12
- movq %r9,0+24(%rsp)
- movq %r14,%r9
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
- call __ecp_nistz256_sub_fromx
-
- addq $160+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- .byte 0xf3,0xc3
-.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
-.type ecp_nistz256_point_addx,@function
-.align 32
-ecp_nistz256_point_addx:
-.Lpoint_addx:
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $576+8,%rsp
-
- movdqu 0(%rsi),%xmm0
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq %rsi,%rbx
- movq %rdx,%rsi
- movdqa %xmm0,384(%rsp)
- movdqa %xmm1,384+16(%rsp)
- por %xmm0,%xmm1
- movdqa %xmm2,416(%rsp)
- movdqa %xmm3,416+16(%rsp)
- por %xmm2,%xmm3
- movdqa %xmm4,448(%rsp)
- movdqa %xmm5,448+16(%rsp)
- por %xmm1,%xmm3
-
- movdqu 0(%rsi),%xmm0
- pshufd $177,%xmm3,%xmm5
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rsi),%xmm3
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,480(%rsp)
- pshufd $30,%xmm5,%xmm4
- movdqa %xmm1,480+16(%rsp)
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
- movdqa %xmm2,512(%rsp)
- movdqa %xmm3,512+16(%rsp)
- por %xmm2,%xmm3
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm1,%xmm3
-
- leaq 64-128(%rsi),%rsi
- movq %rdx,544+0(%rsp)
- movq %r14,544+8(%rsp)
- movq %r15,544+16(%rsp)
- movq %r8,544+24(%rsp)
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
- por %xmm3,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
- movq 64+0(%rbx),%rdx
- movq 64+8(%rbx),%r14
- movq 64+16(%rbx),%r15
- movq 64+24(%rbx),%r8
-
- leaq 64-128(%rbx),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 544(%rsp),%rdx
- leaq 544(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 416(%rsp),%rdx
- leaq 416(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq -128+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 512(%rsp),%rdx
- leaq 512(%rsp),%rbx
- movq 0+256(%rsp),%r9
- movq 8+256(%rsp),%r10
- leaq -128+256(%rsp),%rsi
- movq 16+256(%rsp),%r11
- movq 24+256(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 224(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- orq %r13,%r12
- movdqa %xmm4,%xmm2
- orq %r8,%r12
- orq %r9,%r12
- por %xmm5,%xmm2
-.byte 102,73,15,110,220
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 480(%rsp),%rdx
- leaq 480(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 160(%rsp),%rbx
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- orq %r13,%r12
- orq %r8,%r12
- orq %r9,%r12
-
-.byte 0x3e
- jnz .Ladd_proceedx
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
- testq %r8,%r8
- jnz .Ladd_proceedx
- testq %r9,%r9
- jz .Ladd_proceedx
-
-.byte 102,72,15,126,199
- pxor %xmm0,%xmm0
- movdqu %xmm0,0(%rdi)
- movdqu %xmm0,16(%rdi)
- movdqu %xmm0,32(%rdi)
- movdqu %xmm0,48(%rdi)
- movdqu %xmm0,64(%rdi)
- movdqu %xmm0,80(%rdi)
- jmp .Ladd_donex
-
-.align 32
-.Ladd_proceedx:
- movq 0+64(%rsp),%rdx
- movq 8+64(%rsp),%r14
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 544(%rsp),%rdx
- leaq 544(%rsp),%rbx
- movq 0+352(%rsp),%r9
- movq 8+352(%rsp),%r10
- leaq -128+352(%rsp),%rsi
- movq 16+352(%rsp),%r11
- movq 24+352(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 0(%rsp),%rdx
- leaq 0(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 160(%rsp),%rdx
- leaq 160(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-
-
-
- addq %r12,%r12
- leaq 96(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- sbbq %r11,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- testq %r11,%r11
-
- cmovzq %rax,%r12
- movq 0(%rsi),%rax
- cmovzq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovzq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovzq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subx
-
- leaq 128(%rsp),%rbx
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 192+0(%rsp),%rax
- movq 192+8(%rsp),%rbp
- movq 192+16(%rsp),%rcx
- movq 192+24(%rsp),%r10
- leaq 320(%rsp),%rdi
-
- call __ecp_nistz256_subx
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 128(%rsp),%rdx
- leaq 128(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq -128+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 320(%rsp),%rdx
- leaq 320(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 256(%rsp),%rbx
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 352(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 352+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 544(%rsp),%xmm2
- pand 544+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 480(%rsp),%xmm2
- pand 480+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 320(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 320+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 512(%rsp),%xmm2
- pand 512+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
-.Ladd_donex:
- addq $576+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- .byte 0xf3,0xc3
-.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
-.type ecp_nistz256_point_add_affinex,@function
-.align 32
-ecp_nistz256_point_add_affinex:
-.Lpoint_add_affinex:
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $480+8,%rsp
-
- movdqu 0(%rsi),%xmm0
- movq %rdx,%rbx
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,320(%rsp)
- movdqa %xmm1,320+16(%rsp)
- por %xmm0,%xmm1
- movdqa %xmm2,352(%rsp)
- movdqa %xmm3,352+16(%rsp)
- por %xmm2,%xmm3
- movdqa %xmm4,384(%rsp)
- movdqa %xmm5,384+16(%rsp)
- por %xmm1,%xmm3
-
- movdqu 0(%rbx),%xmm0
- pshufd $177,%xmm3,%xmm5
- movdqu 16(%rbx),%xmm1
- movdqu 32(%rbx),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rbx),%xmm3
- movdqa %xmm0,416(%rsp)
- pshufd $30,%xmm5,%xmm4
- movdqa %xmm1,416+16(%rsp)
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
- movdqa %xmm2,448(%rsp)
- movdqa %xmm3,448+16(%rsp)
- por %xmm2,%xmm3
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm1,%xmm3
-
- leaq 64-128(%rsi),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
- movq 0(%rbx),%rdx
-
- movq %r12,%r9
- por %xmm3,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
- movq %r13,%r10
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- movq %r14,%r11
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
-
- leaq 32-128(%rsp),%rsi
- movq %r15,%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 320(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 352(%rsp),%rbx
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+64(%rsp),%rdx
- movq 8+64(%rsp),%r14
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 0+96(%rsp),%rdx
- movq 8+96(%rsp),%r14
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r15
- movq 24+96(%rsp),%r8
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 128(%rsp),%rdx
- leaq 128(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 320(%rsp),%rdx
- leaq 320(%rsp),%rbx
- movq 0+128(%rsp),%r9
- movq 8+128(%rsp),%r10
- leaq -128+128(%rsp),%rsi
- movq 16+128(%rsp),%r11
- movq 24+128(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-
-
-
- addq %r12,%r12
- leaq 192(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- sbbq %r11,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- testq %r11,%r11
-
- cmovzq %rax,%r12
- movq 0(%rsi),%rax
- cmovzq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovzq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovzq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subx
-
- leaq 160(%rsp),%rbx
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 64(%rsp),%rdi
-
- call __ecp_nistz256_subx
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 352(%rsp),%rdx
- leaq 352(%rsp),%rbx
- movq 0+160(%rsp),%r9
- movq 8+160(%rsp),%r10
- leaq -128+160(%rsp),%rsi
- movq 16+160(%rsp),%r11
- movq 24+160(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 96(%rsp),%rdx
- leaq 96(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 32(%rsp),%rbx
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand .LONE_mont(%rip),%xmm2
- pand .LONE_mont+16(%rip),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 224(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 224+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 320(%rsp),%xmm2
- pand 320+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 256(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 256+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 352(%rsp),%xmm2
- pand 352+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
- addq $480+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- .byte 0xf3,0xc3
-.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
diff --git a/deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s b/deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s
index 4e82736a3e..35ebd9b4e0 100644
--- a/deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s
@@ -1,753 +1,15 @@
.text
-.type _aesni_ctr32_ghash_6x,@function
-.align 32
-_aesni_ctr32_ghash_6x:
- vmovdqu 32(%r11),%xmm2
- subq $6,%rdx
- vpxor %xmm4,%xmm4,%xmm4
- vmovdqu 0-128(%rcx),%xmm15
- vpaddb %xmm2,%xmm1,%xmm10
- vpaddb %xmm2,%xmm10,%xmm11
- vpaddb %xmm2,%xmm11,%xmm12
- vpaddb %xmm2,%xmm12,%xmm13
- vpaddb %xmm2,%xmm13,%xmm14
- vpxor %xmm15,%xmm1,%xmm9
- vmovdqu %xmm4,16+8(%rsp)
- jmp .Loop6x
-
-.align 32
-.Loop6x:
- addl $100663296,%ebx
- jc .Lhandle_ctr32
- vmovdqu 0-32(%r9),%xmm3
- vpaddb %xmm2,%xmm14,%xmm1
- vpxor %xmm15,%xmm10,%xmm10
- vpxor %xmm15,%xmm11,%xmm11
-
-.Lresume_ctr32:
- vmovdqu %xmm1,(%r8)
- vpclmulqdq $16,%xmm3,%xmm7,%xmm5
- vpxor %xmm15,%xmm12,%xmm12
- vmovups 16-128(%rcx),%xmm2
- vpclmulqdq $1,%xmm3,%xmm7,%xmm6
- xorq %r12,%r12
- cmpq %r14,%r15
-
- vaesenc %xmm2,%xmm9,%xmm9
- vmovdqu 48+8(%rsp),%xmm0
- vpxor %xmm15,%xmm13,%xmm13
- vpclmulqdq $0,%xmm3,%xmm7,%xmm1
- vaesenc %xmm2,%xmm10,%xmm10
- vpxor %xmm15,%xmm14,%xmm14
- setnc %r12b
- vpclmulqdq $17,%xmm3,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vmovdqu 16-32(%r9),%xmm3
- negq %r12
- vaesenc %xmm2,%xmm12,%xmm12
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $0,%xmm3,%xmm0,%xmm5
- vpxor %xmm4,%xmm8,%xmm8
- vaesenc %xmm2,%xmm13,%xmm13
- vpxor %xmm5,%xmm1,%xmm4
- andq $96,%r12
- vmovups 32-128(%rcx),%xmm15
- vpclmulqdq $16,%xmm3,%xmm0,%xmm1
- vaesenc %xmm2,%xmm14,%xmm14
-
- vpclmulqdq $1,%xmm3,%xmm0,%xmm2
- leaq (%r14,%r12,1),%r14
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor 16+8(%rsp),%xmm8,%xmm8
- vpclmulqdq $17,%xmm3,%xmm0,%xmm3
- vmovdqu 64+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 88(%r14),%r13
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 80(%r14),%r12
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,32+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,40+8(%rsp)
- vmovdqu 48-32(%r9),%xmm5
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 48-128(%rcx),%xmm15
- vpxor %xmm1,%xmm6,%xmm6
- vpclmulqdq $0,%xmm5,%xmm0,%xmm1
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm2,%xmm6,%xmm6
- vpclmulqdq $16,%xmm5,%xmm0,%xmm2
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor %xmm3,%xmm7,%xmm7
- vpclmulqdq $1,%xmm5,%xmm0,%xmm3
- vaesenc %xmm15,%xmm11,%xmm11
- vpclmulqdq $17,%xmm5,%xmm0,%xmm5
- vmovdqu 80+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqu 64-32(%r9),%xmm1
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 64-128(%rcx),%xmm15
- vpxor %xmm2,%xmm6,%xmm6
- vpclmulqdq $0,%xmm1,%xmm0,%xmm2
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm3,%xmm6,%xmm6
- vpclmulqdq $16,%xmm1,%xmm0,%xmm3
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 72(%r14),%r13
- vpxor %xmm5,%xmm7,%xmm7
- vpclmulqdq $1,%xmm1,%xmm0,%xmm5
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 64(%r14),%r12
- vpclmulqdq $17,%xmm1,%xmm0,%xmm1
- vmovdqu 96+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,48+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,56+8(%rsp)
- vpxor %xmm2,%xmm4,%xmm4
- vmovdqu 96-32(%r9),%xmm2
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 80-128(%rcx),%xmm15
- vpxor %xmm3,%xmm6,%xmm6
- vpclmulqdq $0,%xmm2,%xmm0,%xmm3
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $16,%xmm2,%xmm0,%xmm5
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 56(%r14),%r13
- vpxor %xmm1,%xmm7,%xmm7
- vpclmulqdq $1,%xmm2,%xmm0,%xmm1
- vpxor 112+8(%rsp),%xmm8,%xmm8
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 48(%r14),%r12
- vpclmulqdq $17,%xmm2,%xmm0,%xmm2
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,64+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,72+8(%rsp)
- vpxor %xmm3,%xmm4,%xmm4
- vmovdqu 112-32(%r9),%xmm3
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 96-128(%rcx),%xmm15
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $16,%xmm3,%xmm8,%xmm5
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm1,%xmm6,%xmm6
- vpclmulqdq $1,%xmm3,%xmm8,%xmm1
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 40(%r14),%r13
- vpxor %xmm2,%xmm7,%xmm7
- vpclmulqdq $0,%xmm3,%xmm8,%xmm2
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 32(%r14),%r12
- vpclmulqdq $17,%xmm3,%xmm8,%xmm8
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,80+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,88+8(%rsp)
- vpxor %xmm5,%xmm6,%xmm6
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor %xmm1,%xmm6,%xmm6
-
- vmovups 112-128(%rcx),%xmm15
- vpslldq $8,%xmm6,%xmm5
- vpxor %xmm2,%xmm4,%xmm4
- vmovdqu 16(%r11),%xmm3
-
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm8,%xmm7,%xmm7
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor %xmm5,%xmm4,%xmm4
- movbeq 24(%r14),%r13
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 16(%r14),%r12
- vpalignr $8,%xmm4,%xmm4,%xmm0
- vpclmulqdq $16,%xmm3,%xmm4,%xmm4
- movq %r13,96+8(%rsp)
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r12,104+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- vmovups 128-128(%rcx),%xmm1
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 144-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm10,%xmm10
- vpsrldq $8,%xmm6,%xmm6
- vaesenc %xmm1,%xmm11,%xmm11
- vpxor %xmm6,%xmm7,%xmm7
- vaesenc %xmm1,%xmm12,%xmm12
- vpxor %xmm0,%xmm4,%xmm4
- movbeq 8(%r14),%r13
- vaesenc %xmm1,%xmm13,%xmm13
- movbeq 0(%r14),%r12
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 160-128(%rcx),%xmm1
- cmpl $11,%ebp
- jb .Lenc_tail
-
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
- vmovups 176-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 192-128(%rcx),%xmm1
- je .Lenc_tail
-
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
- vmovups 208-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 224-128(%rcx),%xmm1
- jmp .Lenc_tail
-
-.align 32
-.Lhandle_ctr32:
- vmovdqu (%r11),%xmm0
- vpshufb %xmm0,%xmm1,%xmm6
- vmovdqu 48(%r11),%xmm5
- vpaddd 64(%r11),%xmm6,%xmm10
- vpaddd %xmm5,%xmm6,%xmm11
- vmovdqu 0-32(%r9),%xmm3
- vpaddd %xmm5,%xmm10,%xmm12
- vpshufb %xmm0,%xmm10,%xmm10
- vpaddd %xmm5,%xmm11,%xmm13
- vpshufb %xmm0,%xmm11,%xmm11
- vpxor %xmm15,%xmm10,%xmm10
- vpaddd %xmm5,%xmm12,%xmm14
- vpshufb %xmm0,%xmm12,%xmm12
- vpxor %xmm15,%xmm11,%xmm11
- vpaddd %xmm5,%xmm13,%xmm1
- vpshufb %xmm0,%xmm13,%xmm13
- vpshufb %xmm0,%xmm14,%xmm14
- vpshufb %xmm0,%xmm1,%xmm1
- jmp .Lresume_ctr32
-
-.align 32
-.Lenc_tail:
- vaesenc %xmm15,%xmm9,%xmm9
- vmovdqu %xmm7,16+8(%rsp)
- vpalignr $8,%xmm4,%xmm4,%xmm8
- vaesenc %xmm15,%xmm10,%xmm10
- vpclmulqdq $16,%xmm3,%xmm4,%xmm4
- vpxor 0(%rdi),%xmm1,%xmm2
- vaesenc %xmm15,%xmm11,%xmm11
- vpxor 16(%rdi),%xmm1,%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- vpxor 32(%rdi),%xmm1,%xmm5
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor 48(%rdi),%xmm1,%xmm6
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor 64(%rdi),%xmm1,%xmm7
- vpxor 80(%rdi),%xmm1,%xmm3
- vmovdqu (%r8),%xmm1
-
- vaesenclast %xmm2,%xmm9,%xmm9
- vmovdqu 32(%r11),%xmm2
- vaesenclast %xmm0,%xmm10,%xmm10
- vpaddb %xmm2,%xmm1,%xmm0
- movq %r13,112+8(%rsp)
- leaq 96(%rdi),%rdi
- vaesenclast %xmm5,%xmm11,%xmm11
- vpaddb %xmm2,%xmm0,%xmm5
- movq %r12,120+8(%rsp)
- leaq 96(%rsi),%rsi
- vmovdqu 0-128(%rcx),%xmm15
- vaesenclast %xmm6,%xmm12,%xmm12
- vpaddb %xmm2,%xmm5,%xmm6
- vaesenclast %xmm7,%xmm13,%xmm13
- vpaddb %xmm2,%xmm6,%xmm7
- vaesenclast %xmm3,%xmm14,%xmm14
- vpaddb %xmm2,%xmm7,%xmm3
-
- addq $96,%r10
- subq $6,%rdx
- jc .L6x_done
-
- vmovups %xmm9,-96(%rsi)
- vpxor %xmm15,%xmm1,%xmm9
- vmovups %xmm10,-80(%rsi)
- vmovdqa %xmm0,%xmm10
- vmovups %xmm11,-64(%rsi)
- vmovdqa %xmm5,%xmm11
- vmovups %xmm12,-48(%rsi)
- vmovdqa %xmm6,%xmm12
- vmovups %xmm13,-32(%rsi)
- vmovdqa %xmm7,%xmm13
- vmovups %xmm14,-16(%rsi)
- vmovdqa %xmm3,%xmm14
- vmovdqu 32+8(%rsp),%xmm7
- jmp .Loop6x
-
-.L6x_done:
- vpxor 16+8(%rsp),%xmm8,%xmm8
- vpxor %xmm4,%xmm8,%xmm8
-
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+aesni_gcm_encrypt:
+ xorl %eax,%eax
.byte 0xf3,0xc3
-.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
-.align 32
aesni_gcm_decrypt:
- xorq %r10,%r10
- cmpq $96,%rdx
- jb .Lgcm_dec_abort
-
- leaq (%rsp),%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- vzeroupper
-
- vmovdqu (%r8),%xmm1
- addq $-128,%rsp
- movl 12(%r8),%ebx
- leaq .Lbswap_mask(%rip),%r11
- leaq -128(%rcx),%r14
- movq $3968,%r15
- vmovdqu (%r9),%xmm8
- andq $-128,%rsp
- vmovdqu (%r11),%xmm0
- leaq 128(%rcx),%rcx
- leaq 32+32(%r9),%r9
- movl 240-128(%rcx),%ebp
- vpshufb %xmm0,%xmm8,%xmm8
-
- andq %r15,%r14
- andq %rsp,%r15
- subq %r14,%r15
- jc .Ldec_no_key_aliasing
- cmpq $768,%r15
- jnc .Ldec_no_key_aliasing
- subq %r15,%rsp
-.Ldec_no_key_aliasing:
-
- vmovdqu 80(%rdi),%xmm7
- leaq (%rdi),%r14
- vmovdqu 64(%rdi),%xmm4
- leaq -192(%rdi,%rdx,1),%r15
- vmovdqu 48(%rdi),%xmm5
- shrq $4,%rdx
- xorq %r10,%r10
- vmovdqu 32(%rdi),%xmm6
- vpshufb %xmm0,%xmm7,%xmm7
- vmovdqu 16(%rdi),%xmm2
- vpshufb %xmm0,%xmm4,%xmm4
- vmovdqu (%rdi),%xmm3
- vpshufb %xmm0,%xmm5,%xmm5
- vmovdqu %xmm4,48(%rsp)
- vpshufb %xmm0,%xmm6,%xmm6
- vmovdqu %xmm5,64(%rsp)
- vpshufb %xmm0,%xmm2,%xmm2
- vmovdqu %xmm6,80(%rsp)
- vpshufb %xmm0,%xmm3,%xmm3
- vmovdqu %xmm2,96(%rsp)
- vmovdqu %xmm3,112(%rsp)
-
- call _aesni_ctr32_ghash_6x
-
- vmovups %xmm9,-96(%rsi)
- vmovups %xmm10,-80(%rsi)
- vmovups %xmm11,-64(%rsi)
- vmovups %xmm12,-48(%rsi)
- vmovups %xmm13,-32(%rsi)
- vmovups %xmm14,-16(%rsi)
-
- vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,-64(%r9)
-
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lgcm_dec_abort:
- movq %r10,%rax
+ xorl %eax,%eax
.byte 0xf3,0xc3
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
-.type _aesni_ctr32_6x,@function
-.align 32
-_aesni_ctr32_6x:
- vmovdqu 0-128(%rcx),%xmm4
- vmovdqu 32(%r11),%xmm2
- leaq -1(%rbp),%r13
- vmovups 16-128(%rcx),%xmm15
- leaq 32-128(%rcx),%r12
- vpxor %xmm4,%xmm1,%xmm9
- addl $100663296,%ebx
- jc .Lhandle_ctr32_2
- vpaddb %xmm2,%xmm1,%xmm10
- vpaddb %xmm2,%xmm10,%xmm11
- vpxor %xmm4,%xmm10,%xmm10
- vpaddb %xmm2,%xmm11,%xmm12
- vpxor %xmm4,%xmm11,%xmm11
- vpaddb %xmm2,%xmm12,%xmm13
- vpxor %xmm4,%xmm12,%xmm12
- vpaddb %xmm2,%xmm13,%xmm14
- vpxor %xmm4,%xmm13,%xmm13
- vpaddb %xmm2,%xmm14,%xmm1
- vpxor %xmm4,%xmm14,%xmm14
- jmp .Loop_ctr32
-
-.align 16
-.Loop_ctr32:
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
- vmovups (%r12),%xmm15
- leaq 16(%r12),%r12
- decl %r13d
- jnz .Loop_ctr32
-
- vmovdqu (%r12),%xmm3
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor 0(%rdi),%xmm3,%xmm4
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor 16(%rdi),%xmm3,%xmm5
- vaesenc %xmm15,%xmm11,%xmm11
- vpxor 32(%rdi),%xmm3,%xmm6
- vaesenc %xmm15,%xmm12,%xmm12
- vpxor 48(%rdi),%xmm3,%xmm8
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor 64(%rdi),%xmm3,%xmm2
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor 80(%rdi),%xmm3,%xmm3
- leaq 96(%rdi),%rdi
-
- vaesenclast %xmm4,%xmm9,%xmm9
- vaesenclast %xmm5,%xmm10,%xmm10
- vaesenclast %xmm6,%xmm11,%xmm11
- vaesenclast %xmm8,%xmm12,%xmm12
- vaesenclast %xmm2,%xmm13,%xmm13
- vaesenclast %xmm3,%xmm14,%xmm14
- vmovups %xmm9,0(%rsi)
- vmovups %xmm10,16(%rsi)
- vmovups %xmm11,32(%rsi)
- vmovups %xmm12,48(%rsi)
- vmovups %xmm13,64(%rsi)
- vmovups %xmm14,80(%rsi)
- leaq 96(%rsi),%rsi
-
- .byte 0xf3,0xc3
-.align 32
-.Lhandle_ctr32_2:
- vpshufb %xmm0,%xmm1,%xmm6
- vmovdqu 48(%r11),%xmm5
- vpaddd 64(%r11),%xmm6,%xmm10
- vpaddd %xmm5,%xmm6,%xmm11
- vpaddd %xmm5,%xmm10,%xmm12
- vpshufb %xmm0,%xmm10,%xmm10
- vpaddd %xmm5,%xmm11,%xmm13
- vpshufb %xmm0,%xmm11,%xmm11
- vpxor %xmm4,%xmm10,%xmm10
- vpaddd %xmm5,%xmm12,%xmm14
- vpshufb %xmm0,%xmm12,%xmm12
- vpxor %xmm4,%xmm11,%xmm11
- vpaddd %xmm5,%xmm13,%xmm1
- vpshufb %xmm0,%xmm13,%xmm13
- vpxor %xmm4,%xmm12,%xmm12
- vpshufb %xmm0,%xmm14,%xmm14
- vpxor %xmm4,%xmm13,%xmm13
- vpshufb %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm14,%xmm14
- jmp .Loop_ctr32
-.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
-
-.globl aesni_gcm_encrypt
-.type aesni_gcm_encrypt,@function
-.align 32
-aesni_gcm_encrypt:
- xorq %r10,%r10
- cmpq $288,%rdx
- jb .Lgcm_enc_abort
-
- leaq (%rsp),%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- vzeroupper
-
- vmovdqu (%r8),%xmm1
- addq $-128,%rsp
- movl 12(%r8),%ebx
- leaq .Lbswap_mask(%rip),%r11
- leaq -128(%rcx),%r14
- movq $3968,%r15
- leaq 128(%rcx),%rcx
- vmovdqu (%r11),%xmm0
- andq $-128,%rsp
- movl 240-128(%rcx),%ebp
-
- andq %r15,%r14
- andq %rsp,%r15
- subq %r14,%r15
- jc .Lenc_no_key_aliasing
- cmpq $768,%r15
- jnc .Lenc_no_key_aliasing
- subq %r15,%rsp
-.Lenc_no_key_aliasing:
-
- leaq (%rsi),%r14
- leaq -192(%rsi,%rdx,1),%r15
- shrq $4,%rdx
-
- call _aesni_ctr32_6x
- vpshufb %xmm0,%xmm9,%xmm8
- vpshufb %xmm0,%xmm10,%xmm2
- vmovdqu %xmm8,112(%rsp)
- vpshufb %xmm0,%xmm11,%xmm4
- vmovdqu %xmm2,96(%rsp)
- vpshufb %xmm0,%xmm12,%xmm5
- vmovdqu %xmm4,80(%rsp)
- vpshufb %xmm0,%xmm13,%xmm6
- vmovdqu %xmm5,64(%rsp)
- vpshufb %xmm0,%xmm14,%xmm7
- vmovdqu %xmm6,48(%rsp)
-
- call _aesni_ctr32_6x
-
- vmovdqu (%r9),%xmm8
- leaq 32+32(%r9),%r9
- subq $12,%rdx
- movq $192,%r10
- vpshufb %xmm0,%xmm8,%xmm8
-
- call _aesni_ctr32_ghash_6x
- vmovdqu 32(%rsp),%xmm7
- vmovdqu (%r11),%xmm0
- vmovdqu 0-32(%r9),%xmm3
- vpunpckhqdq %xmm7,%xmm7,%xmm1
- vmovdqu 32-32(%r9),%xmm15
- vmovups %xmm9,-96(%rsi)
- vpshufb %xmm0,%xmm9,%xmm9
- vpxor %xmm7,%xmm1,%xmm1
- vmovups %xmm10,-80(%rsi)
- vpshufb %xmm0,%xmm10,%xmm10
- vmovups %xmm11,-64(%rsi)
- vpshufb %xmm0,%xmm11,%xmm11
- vmovups %xmm12,-48(%rsi)
- vpshufb %xmm0,%xmm12,%xmm12
- vmovups %xmm13,-32(%rsi)
- vpshufb %xmm0,%xmm13,%xmm13
- vmovups %xmm14,-16(%rsi)
- vpshufb %xmm0,%xmm14,%xmm14
- vmovdqu %xmm9,16(%rsp)
- vmovdqu 48(%rsp),%xmm6
- vmovdqu 16-32(%r9),%xmm0
- vpunpckhqdq %xmm6,%xmm6,%xmm2
- vpclmulqdq $0,%xmm3,%xmm7,%xmm5
- vpxor %xmm6,%xmm2,%xmm2
- vpclmulqdq $17,%xmm3,%xmm7,%xmm7
- vpclmulqdq $0,%xmm15,%xmm1,%xmm1
-
- vmovdqu 64(%rsp),%xmm9
- vpclmulqdq $0,%xmm0,%xmm6,%xmm4
- vmovdqu 48-32(%r9),%xmm3
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm9,%xmm9,%xmm5
- vpclmulqdq $17,%xmm0,%xmm6,%xmm6
- vpxor %xmm9,%xmm5,%xmm5
- vpxor %xmm7,%xmm6,%xmm6
- vpclmulqdq $16,%xmm15,%xmm2,%xmm2
- vmovdqu 80-32(%r9),%xmm15
- vpxor %xmm1,%xmm2,%xmm2
-
- vmovdqu 80(%rsp),%xmm1
- vpclmulqdq $0,%xmm3,%xmm9,%xmm7
- vmovdqu 64-32(%r9),%xmm0
- vpxor %xmm4,%xmm7,%xmm7
- vpunpckhqdq %xmm1,%xmm1,%xmm4
- vpclmulqdq $17,%xmm3,%xmm9,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpxor %xmm6,%xmm9,%xmm9
- vpclmulqdq $0,%xmm15,%xmm5,%xmm5
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 96(%rsp),%xmm2
- vpclmulqdq $0,%xmm0,%xmm1,%xmm6
- vmovdqu 96-32(%r9),%xmm3
- vpxor %xmm7,%xmm6,%xmm6
- vpunpckhqdq %xmm2,%xmm2,%xmm7
- vpclmulqdq $17,%xmm0,%xmm1,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpxor %xmm9,%xmm1,%xmm1
- vpclmulqdq $16,%xmm15,%xmm4,%xmm4
- vmovdqu 128-32(%r9),%xmm15
- vpxor %xmm5,%xmm4,%xmm4
-
- vpxor 112(%rsp),%xmm8,%xmm8
- vpclmulqdq $0,%xmm3,%xmm2,%xmm5
- vmovdqu 112-32(%r9),%xmm0
- vpunpckhqdq %xmm8,%xmm8,%xmm9
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $17,%xmm3,%xmm2,%xmm2
- vpxor %xmm8,%xmm9,%xmm9
- vpxor %xmm1,%xmm2,%xmm2
- vpclmulqdq $0,%xmm15,%xmm7,%xmm7
- vpxor %xmm4,%xmm7,%xmm4
-
- vpclmulqdq $0,%xmm0,%xmm8,%xmm6
- vmovdqu 0-32(%r9),%xmm3
- vpunpckhqdq %xmm14,%xmm14,%xmm1
- vpclmulqdq $17,%xmm0,%xmm8,%xmm8
- vpxor %xmm14,%xmm1,%xmm1
- vpxor %xmm5,%xmm6,%xmm5
- vpclmulqdq $16,%xmm15,%xmm9,%xmm9
- vmovdqu 32-32(%r9),%xmm15
- vpxor %xmm2,%xmm8,%xmm7
- vpxor %xmm4,%xmm9,%xmm6
-
- vmovdqu 16-32(%r9),%xmm0
- vpxor %xmm5,%xmm7,%xmm9
- vpclmulqdq $0,%xmm3,%xmm14,%xmm4
- vpxor %xmm9,%xmm6,%xmm6
- vpunpckhqdq %xmm13,%xmm13,%xmm2
- vpclmulqdq $17,%xmm3,%xmm14,%xmm14
- vpxor %xmm13,%xmm2,%xmm2
- vpslldq $8,%xmm6,%xmm9
- vpclmulqdq $0,%xmm15,%xmm1,%xmm1
- vpxor %xmm9,%xmm5,%xmm8
- vpsrldq $8,%xmm6,%xmm6
- vpxor %xmm6,%xmm7,%xmm7
-
- vpclmulqdq $0,%xmm0,%xmm13,%xmm5
- vmovdqu 48-32(%r9),%xmm3
- vpxor %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm12,%xmm12,%xmm9
- vpclmulqdq $17,%xmm0,%xmm13,%xmm13
- vpxor %xmm12,%xmm9,%xmm9
- vpxor %xmm14,%xmm13,%xmm13
- vpalignr $8,%xmm8,%xmm8,%xmm14
- vpclmulqdq $16,%xmm15,%xmm2,%xmm2
- vmovdqu 80-32(%r9),%xmm15
- vpxor %xmm1,%xmm2,%xmm2
-
- vpclmulqdq $0,%xmm3,%xmm12,%xmm4
- vmovdqu 64-32(%r9),%xmm0
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm11,%xmm11,%xmm1
- vpclmulqdq $17,%xmm3,%xmm12,%xmm12
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm13,%xmm12,%xmm12
- vxorps 16(%rsp),%xmm7,%xmm7
- vpclmulqdq $0,%xmm15,%xmm9,%xmm9
- vpxor %xmm2,%xmm9,%xmm9
-
- vpclmulqdq $16,16(%r11),%xmm8,%xmm8
- vxorps %xmm14,%xmm8,%xmm8
-
- vpclmulqdq $0,%xmm0,%xmm11,%xmm5
- vmovdqu 96-32(%r9),%xmm3
- vpxor %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm10,%xmm10,%xmm2
- vpclmulqdq $17,%xmm0,%xmm11,%xmm11
- vpxor %xmm10,%xmm2,%xmm2
- vpalignr $8,%xmm8,%xmm8,%xmm14
- vpxor %xmm12,%xmm11,%xmm11
- vpclmulqdq $16,%xmm15,%xmm1,%xmm1
- vmovdqu 128-32(%r9),%xmm15
- vpxor %xmm9,%xmm1,%xmm1
-
- vxorps %xmm7,%xmm14,%xmm14
- vpclmulqdq $16,16(%r11),%xmm8,%xmm8
- vxorps %xmm14,%xmm8,%xmm8
-
- vpclmulqdq $0,%xmm3,%xmm10,%xmm4
- vmovdqu 112-32(%r9),%xmm0
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm8,%xmm8,%xmm9
- vpclmulqdq $17,%xmm3,%xmm10,%xmm10
- vpxor %xmm8,%xmm9,%xmm9
- vpxor %xmm11,%xmm10,%xmm10
- vpclmulqdq $0,%xmm15,%xmm2,%xmm2
- vpxor %xmm1,%xmm2,%xmm2
-
- vpclmulqdq $0,%xmm0,%xmm8,%xmm5
- vpclmulqdq $17,%xmm0,%xmm8,%xmm7
- vpxor %xmm4,%xmm5,%xmm5
- vpclmulqdq $16,%xmm15,%xmm9,%xmm6
- vpxor %xmm10,%xmm7,%xmm7
- vpxor %xmm2,%xmm6,%xmm6
-
- vpxor %xmm5,%xmm7,%xmm4
- vpxor %xmm4,%xmm6,%xmm6
- vpslldq $8,%xmm6,%xmm1
- vmovdqu 16(%r11),%xmm3
- vpsrldq $8,%xmm6,%xmm6
- vpxor %xmm1,%xmm5,%xmm8
- vpxor %xmm6,%xmm7,%xmm7
-
- vpalignr $8,%xmm8,%xmm8,%xmm2
- vpclmulqdq $16,%xmm3,%xmm8,%xmm8
- vpxor %xmm2,%xmm8,%xmm8
-
- vpalignr $8,%xmm8,%xmm8,%xmm2
- vpclmulqdq $16,%xmm3,%xmm8,%xmm8
- vpxor %xmm7,%xmm2,%xmm2
- vpxor %xmm2,%xmm8,%xmm8
- vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,-64(%r9)
-
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lgcm_enc_abort:
- movq %r10,%rax
- .byte 0xf3,0xc3
-.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
-.align 64
-.Lbswap_mask:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.Lpoly:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-.Lone_msb:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
-.Ltwo_lsb:
-.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.Lone_lsb:
-.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 64
diff --git a/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s b/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s
index 1cfe19cb55..e9ffdc2de2 100644
--- a/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s
@@ -20,14 +20,14 @@ gcm_gmult_4bit:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.align 16
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -43,13 +43,13 @@ gcm_gmult_4bit:
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -58,19 +58,19 @@ gcm_gmult_4bit:
.align 16
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -661,10 +661,10 @@ gcm_ghash_4bit:
gcm_init_clmul:
.L_init_clmul:
movdqu (%rsi),%xmm2
- pshufd $78,%xmm2,%xmm2
+ pshufd $0b01001110,%xmm2,%xmm2
- pshufd $255,%xmm2,%xmm4
+ pshufd $0b11111111,%xmm2,%xmm4
movdqa %xmm2,%xmm3
psllq $1,%xmm2
pxor %xmm5,%xmm5
@@ -678,11 +678,11 @@ gcm_init_clmul:
pxor %xmm5,%xmm2
- pshufd $78,%xmm2,%xmm6
+ pshufd $0b01001110,%xmm2,%xmm6
movdqa %xmm2,%xmm0
pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -718,8 +718,8 @@ gcm_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm2,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm2,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm2,%xmm3
movdqu %xmm2,0(%rdi)
pxor %xmm0,%xmm4
@@ -727,7 +727,7 @@ gcm_init_clmul:
.byte 102,15,58,15,227,8
movdqu %xmm4,32(%rdi)
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -765,7 +765,7 @@ gcm_init_clmul:
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm5
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -801,8 +801,8 @@ gcm_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm5,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm5,%xmm3
movdqu %xmm5,48(%rdi)
pxor %xmm0,%xmm4
@@ -822,7 +822,7 @@ gcm_gmult_clmul:
movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -874,20 +874,20 @@ gcm_ghash_clmul:
movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
movdqu 16(%rsi),%xmm6
movl OPENSSL_ia32cap_P+4(%rip),%eax
- cmpq $48,%rcx
+ cmpq $0x30,%rcx
jb .Lskip4x
andl $71303168,%eax
cmpl $4194304,%eax
je .Lskip4x
- subq $48,%rcx
- movq $11547335547999543296,%rax
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
@@ -899,14 +899,14 @@ gcm_ghash_clmul:
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
@@ -921,12 +921,12 @@ gcm_ghash_clmul:
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
@@ -934,7 +934,7 @@ gcm_ghash_clmul:
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc .Ltail4x
jmp .Lmod4_loop
@@ -949,14 +949,14 @@ gcm_ghash_clmul:
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rsi),%xmm7
xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm0,%xmm8
movdqa %xmm3,%xmm5
@@ -1000,7 +1000,7 @@ gcm_ghash_clmul:
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
@@ -1010,14 +1010,14 @@ gcm_ghash_clmul:
movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc .Lmod4_loop
.Ltail4x:
@@ -1061,10 +1061,10 @@ gcm_ghash_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz .Ldone
movdqu 32(%rsi),%xmm7
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
.Lskip4x:
@@ -1079,7 +1079,7 @@ gcm_ghash_clmul:
pxor %xmm8,%xmm0
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
@@ -1087,7 +1087,7 @@ gcm_ghash_clmul:
leaq 32(%rdx),%rdx
nop
- subq $32,%rcx
+ subq $0x20,%rcx
jbe .Leven_tail
nop
jmp .Lmod_loop
@@ -1096,7 +1096,7 @@ gcm_ghash_clmul:
.Lmod_loop:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1134,7 +1134,7 @@ gcm_ghash_clmul:
pslldq $8,%xmm0
psrldq $8,%xmm8
pxor %xmm9,%xmm0
- pshufd $78,%xmm5,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm4
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
@@ -1150,13 +1150,13 @@ gcm_ghash_clmul:
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- subq $32,%rcx
+ subq $0x20,%rcx
ja .Lmod_loop
.Leven_tail:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1204,7 +1204,7 @@ gcm_ghash_clmul:
.byte 102,69,15,56,0,194
pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -1249,108 +1249,7 @@ gcm_ghash_clmul:
.type gcm_init_avx,@function
.align 32
gcm_init_avx:
- vzeroupper
-
- vmovdqu (%rsi),%xmm2
- vpshufd $78,%xmm2,%xmm2
-
-
- vpshufd $255,%xmm2,%xmm4
- vpsrlq $63,%xmm2,%xmm3
- vpsllq $1,%xmm2,%xmm2
- vpxor %xmm5,%xmm5,%xmm5
- vpcmpgtd %xmm4,%xmm5,%xmm5
- vpslldq $8,%xmm3,%xmm3
- vpor %xmm3,%xmm2,%xmm2
-
-
- vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
- vpxor %xmm5,%xmm2,%xmm2
-
- vpunpckhqdq %xmm2,%xmm2,%xmm6
- vmovdqa %xmm2,%xmm0
- vpxor %xmm2,%xmm6,%xmm6
- movq $4,%r10
- jmp .Linit_start_avx
-.align 32
-.Linit_loop_avx:
- vpalignr $8,%xmm3,%xmm4,%xmm5
- vmovdqu %xmm5,-16(%rdi)
- vpunpckhqdq %xmm0,%xmm0,%xmm3
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0,%xmm2,%xmm0,%xmm0
- vpclmulqdq $0,%xmm6,%xmm3,%xmm3
- vpxor %xmm0,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
-
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- vpsllq $57,%xmm0,%xmm3
- vpsllq $62,%xmm0,%xmm4
- vpxor %xmm3,%xmm4,%xmm4
- vpsllq $63,%xmm0,%xmm3
- vpxor %xmm3,%xmm4,%xmm4
- vpslldq $8,%xmm4,%xmm3
- vpsrldq $8,%xmm4,%xmm4
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrlq $1,%xmm0,%xmm4
- vpxor %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $5,%xmm4,%xmm4
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $1,%xmm0,%xmm0
- vpxor %xmm1,%xmm0,%xmm0
-.Linit_start_avx:
- vmovdqa %xmm0,%xmm5
- vpunpckhqdq %xmm0,%xmm0,%xmm3
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0,%xmm2,%xmm0,%xmm0
- vpclmulqdq $0,%xmm6,%xmm3,%xmm3
- vpxor %xmm0,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
-
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- vpsllq $57,%xmm0,%xmm3
- vpsllq $62,%xmm0,%xmm4
- vpxor %xmm3,%xmm4,%xmm4
- vpsllq $63,%xmm0,%xmm3
- vpxor %xmm3,%xmm4,%xmm4
- vpslldq $8,%xmm4,%xmm3
- vpsrldq $8,%xmm4,%xmm4
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrlq $1,%xmm0,%xmm4
- vpxor %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $5,%xmm4,%xmm4
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $1,%xmm0,%xmm0
- vpxor %xmm1,%xmm0,%xmm0
- vpshufd $78,%xmm5,%xmm3
- vpshufd $78,%xmm0,%xmm4
- vpxor %xmm5,%xmm3,%xmm3
- vmovdqu %xmm5,0(%rdi)
- vpxor %xmm0,%xmm4,%xmm4
- vmovdqu %xmm0,16(%rdi)
- leaq 48(%rdi),%rdi
- subq $1,%r10
- jnz .Linit_loop_avx
-
- vpalignr $8,%xmm4,%xmm3,%xmm5
- vmovdqu %xmm5,-16(%rdi)
-
- vzeroupper
- .byte 0xf3,0xc3
+ jmp .L_init_clmul
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
.type gcm_gmult_avx,@function
@@ -1362,377 +1261,7 @@ gcm_gmult_avx:
.type gcm_ghash_avx,@function
.align 32
gcm_ghash_avx:
- vzeroupper
-
- vmovdqu (%rdi),%xmm10
- leaq .L0x1c2_polynomial(%rip),%r10
- leaq 64(%rsi),%rsi
- vmovdqu .Lbswap_mask(%rip),%xmm13
- vpshufb %xmm13,%xmm10,%xmm10
- cmpq $128,%rcx
- jb .Lshort_avx
- subq $128,%rcx
-
- vmovdqu 112(%rdx),%xmm14
- vmovdqu 0-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm14
- vmovdqu 32-64(%rsi),%xmm7
-
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vmovdqu 96(%rdx),%xmm15
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm14,%xmm9,%xmm9
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 16-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vmovdqu 80(%rdx),%xmm14
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 48-64(%rsi),%xmm6
- vpxor %xmm14,%xmm9,%xmm9
- vmovdqu 64(%rdx),%xmm15
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 80-64(%rsi),%xmm7
-
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vmovdqu 48(%rdx),%xmm14
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 96-64(%rsi),%xmm6
- vpxor %xmm5,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 128-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu 32(%rdx),%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vmovdqu 16(%rdx),%xmm14
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 144-64(%rsi),%xmm6
- vpxor %xmm5,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 176-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu (%rdx),%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 160-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $16,%xmm7,%xmm9,%xmm2
-
- leaq 128(%rdx),%rdx
- cmpq $128,%rcx
- jb .Ltail_avx
-
- vpxor %xmm10,%xmm15,%xmm15
- subq $128,%rcx
- jmp .Loop8x_avx
-
-.align 32
-.Loop8x_avx:
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vmovdqu 112(%rdx),%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpxor %xmm15,%xmm8,%xmm8
- vpclmulqdq $0,%xmm6,%xmm15,%xmm10
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm11
- vmovdqu 0-64(%rsi),%xmm6
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm12
- vmovdqu 32-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu 96(%rdx),%xmm15
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm3,%xmm10,%xmm10
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vxorps %xmm4,%xmm11,%xmm11
- vmovdqu 16-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm5,%xmm12,%xmm12
- vxorps %xmm15,%xmm8,%xmm8
-
- vmovdqu 80(%rdx),%xmm14
- vpxor %xmm10,%xmm12,%xmm12
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpxor %xmm11,%xmm12,%xmm12
- vpslldq $8,%xmm12,%xmm9
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vpsrldq $8,%xmm12,%xmm12
- vpxor %xmm9,%xmm10,%xmm10
- vmovdqu 48-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm14
- vxorps %xmm12,%xmm11,%xmm11
- vpxor %xmm1,%xmm4,%xmm4
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 80-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 64(%rdx),%xmm15
- vpalignr $8,%xmm10,%xmm10,%xmm12
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vxorps %xmm15,%xmm8,%xmm8
- vpxor %xmm5,%xmm2,%xmm2
-
- vmovdqu 48(%rdx),%xmm14
- vpclmulqdq $16,(%r10),%xmm10,%xmm10
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 96-64(%rsi),%xmm6
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 128-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 32(%rdx),%xmm15
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm5,%xmm2,%xmm2
- vxorps %xmm12,%xmm10,%xmm10
-
- vmovdqu 16(%rdx),%xmm14
- vpalignr $8,%xmm10,%xmm10,%xmm12
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 144-64(%rsi),%xmm6
- vpclmulqdq $16,(%r10),%xmm10,%xmm10
- vxorps %xmm11,%xmm12,%xmm12
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 176-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu (%rdx),%xmm15
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 160-64(%rsi),%xmm6
- vpxor %xmm12,%xmm15,%xmm15
- vpclmulqdq $16,%xmm7,%xmm9,%xmm2
- vpxor %xmm10,%xmm15,%xmm15
-
- leaq 128(%rdx),%rdx
- subq $128,%rcx
- jnc .Loop8x_avx
-
- addq $128,%rcx
- jmp .Ltail_no_xor_avx
-
-.align 32
-.Lshort_avx:
- vmovdqu -16(%rdx,%rcx,1),%xmm14
- leaq (%rdx,%rcx,1),%rdx
- vmovdqu 0-64(%rsi),%xmm6
- vmovdqu 32-64(%rsi),%xmm7
- vpshufb %xmm13,%xmm14,%xmm15
-
- vmovdqa %xmm0,%xmm3
- vmovdqa %xmm1,%xmm4
- vmovdqa %xmm2,%xmm5
- subq $16,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -32(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 16-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $16,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -48(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 48-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vmovdqu 80-64(%rsi),%xmm7
- subq $16,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -64(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $16,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -80(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 96-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vmovdqu 128-64(%rsi),%xmm7
- subq $16,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -96(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $16,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -112(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 144-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vmovq 184-64(%rsi),%xmm7
- subq $16,%rcx
- jmp .Ltail_avx
-
-.align 32
-.Ltail_avx:
- vpxor %xmm10,%xmm15,%xmm15
-.Ltail_no_xor_avx:
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
-
- vmovdqu (%r10),%xmm12
-
- vpxor %xmm0,%xmm3,%xmm10
- vpxor %xmm1,%xmm4,%xmm11
- vpxor %xmm2,%xmm5,%xmm5
-
- vpxor %xmm10,%xmm5,%xmm5
- vpxor %xmm11,%xmm5,%xmm5
- vpslldq $8,%xmm5,%xmm9
- vpsrldq $8,%xmm5,%xmm5
- vpxor %xmm9,%xmm10,%xmm10
- vpxor %xmm5,%xmm11,%xmm11
-
- vpclmulqdq $16,%xmm12,%xmm10,%xmm9
- vpalignr $8,%xmm10,%xmm10,%xmm10
- vpxor %xmm9,%xmm10,%xmm10
-
- vpclmulqdq $16,%xmm12,%xmm10,%xmm9
- vpalignr $8,%xmm10,%xmm10,%xmm10
- vpxor %xmm11,%xmm10,%xmm10
- vpxor %xmm9,%xmm10,%xmm10
-
- cmpq $0,%rcx
- jne .Lshort_avx
-
- vpshufb %xmm13,%xmm10,%xmm10
- vmovdqu %xmm10,(%rdi)
- vzeroupper
- .byte 0xf3,0xc3
+ jmp .L_ghash_clmul
.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
.Lbswap_mask:
diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s
index 8da489ea45..4d25c99cf6 100644
--- a/deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s
@@ -9,8 +9,6 @@ sha1_multi_block:
movq OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
- testl $268435456,%ecx
- jnz _avx_shortcut
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -2601,10 +2599,10 @@ _shaext_shortcut:
punpcklqdq %xmm5,%xmm0
punpckhqdq %xmm5,%xmm8
- pshufd $63,%xmm7,%xmm1
- pshufd $127,%xmm7,%xmm9
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00111111,%xmm7,%xmm1
+ pshufd $0b01111111,%xmm7,%xmm9
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
jmp .Loop_shaext
.align 32
@@ -2859,8 +2857,8 @@ _shaext_shortcut:
.byte 69,15,58,204,193,3
.byte 69,15,56,200,214
- pshufd $0,%xmm6,%xmm11
- pshufd $85,%xmm6,%xmm12
+ pshufd $0x00,%xmm6,%xmm11
+ pshufd $0x55,%xmm6,%xmm12
movdqa %xmm6,%xmm7
pcmpgtd %xmm4,%xmm11
pcmpgtd %xmm4,%xmm12
@@ -2890,8 +2888,8 @@ _shaext_shortcut:
movl 280(%rsp),%edx
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
movdqa %xmm0,%xmm6
punpckldq %xmm8,%xmm0
@@ -2919,4291 +2917,6 @@ _shaext_shortcut:
.Lepilogue_shaext:
.byte 0xf3,0xc3
.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
-.type sha1_multi_block_avx,@function
-.align 32
-sha1_multi_block_avx:
-_avx_shortcut:
- shrq $32,%rcx
- cmpl $2,%edx
- jb .Lavx
- testl $32,%ecx
- jnz _avx2_shortcut
- jmp .Lavx
-.align 32
-.Lavx:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- subq $288,%rsp
- andq $-256,%rsp
- movq %rax,272(%rsp)
-.Lbody_avx:
- leaq K_XX_XX(%rip),%rbp
- leaq 256(%rsp),%rbx
-
- vzeroupper
-.Loop_grande_avx:
- movl %edx,280(%rsp)
- xorl %edx,%edx
- movq 0(%rsi),%r8
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r8
- movq 16(%rsi),%r9
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r9
- movq 32(%rsi),%r10
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r10
- movq 48(%rsi),%r11
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r11
- testl %edx,%edx
- jz .Ldone_avx
-
- vmovdqu 0(%rdi),%xmm10
- leaq 128(%rsp),%rax
- vmovdqu 32(%rdi),%xmm11
- vmovdqu 64(%rdi),%xmm12
- vmovdqu 96(%rdi),%xmm13
- vmovdqu 128(%rdi),%xmm14
- vmovdqu 96(%rbp),%xmm5
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- vmovdqa -32(%rbp),%xmm15
- vmovd (%r8),%xmm0
- leaq 64(%r8),%r8
- vmovd (%r9),%xmm2
- leaq 64(%r9),%r9
- vpinsrd $1,(%r10),%xmm0,%xmm0
- leaq 64(%r10),%r10
- vpinsrd $1,(%r11),%xmm2,%xmm2
- leaq 64(%r11),%r11
- vmovd -60(%r8),%xmm1
- vpunpckldq %xmm2,%xmm0,%xmm0
- vmovd -60(%r9),%xmm9
- vpshufb %xmm5,%xmm0,%xmm0
- vpinsrd $1,-60(%r10),%xmm1,%xmm1
- vpinsrd $1,-60(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,0-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -56(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -56(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-56(%r10),%xmm2,%xmm2
- vpinsrd $1,-56(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,16-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -52(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -52(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-52(%r10),%xmm3,%xmm3
- vpinsrd $1,-52(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,32-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -48(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -48(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-48(%r10),%xmm4,%xmm4
- vpinsrd $1,-48(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,48-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -44(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -44(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpinsrd $1,-44(%r10),%xmm0,%xmm0
- vpinsrd $1,-44(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,64-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -40(%r8),%xmm1
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -40(%r9),%xmm9
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpinsrd $1,-40(%r10),%xmm1,%xmm1
- vpinsrd $1,-40(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,80-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -36(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -36(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-36(%r10),%xmm2,%xmm2
- vpinsrd $1,-36(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,96-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -32(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -32(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-32(%r10),%xmm3,%xmm3
- vpinsrd $1,-32(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,112-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -28(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -28(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-28(%r10),%xmm4,%xmm4
- vpinsrd $1,-28(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,128-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -24(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -24(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpinsrd $1,-24(%r10),%xmm0,%xmm0
- vpinsrd $1,-24(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,144-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -20(%r8),%xmm1
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -20(%r9),%xmm9
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpinsrd $1,-20(%r10),%xmm1,%xmm1
- vpinsrd $1,-20(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,160-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -16(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -16(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-16(%r10),%xmm2,%xmm2
- vpinsrd $1,-16(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,176-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -12(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -12(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-12(%r10),%xmm3,%xmm3
- vpinsrd $1,-12(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,192-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -8(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -8(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-8(%r10),%xmm4,%xmm4
- vpinsrd $1,-8(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,208-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -4(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -4(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vmovdqa 0-128(%rax),%xmm1
- vpinsrd $1,-4(%r10),%xmm0,%xmm0
- vpinsrd $1,-4(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- prefetcht0 63(%r8)
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,224-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- prefetcht0 63(%r9)
- vpxor %xmm7,%xmm6,%xmm6
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- prefetcht0 63(%r10)
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- prefetcht0 63(%r11)
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 16-128(%rax),%xmm2
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 32-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
-
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,240-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 128-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
-
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 48-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
-
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,0-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 144-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
-
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 64-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
-
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,16-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 160-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
-
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 80-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
-
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,32-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 176-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
-
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 96-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
-
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,48-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 192-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
-
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 0(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 112-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,64-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 208-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 128-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,80-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 224-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 144-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,96-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 240-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 160-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,112-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 0-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 176-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,128-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 16-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 192-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,144-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 32-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 208-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,160-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 48-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 224-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,176-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 64-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 240-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,192-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 80-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 0-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,208-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 96-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 16-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,224-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 112-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 32-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,240-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 128-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 48-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,0-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 144-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 64-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,16-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 160-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 80-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,32-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 176-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 96-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,48-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 192-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 112-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,64-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 208-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 128-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,80-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 224-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 144-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,96-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 240-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 160-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,112-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 0-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 32(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 176-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 16-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,128-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 192-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 32-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,144-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 208-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 48-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,160-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 224-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 64-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,176-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 240-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 80-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,192-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 0-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 96-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,208-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 16-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 112-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,224-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 32-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 128-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,240-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 48-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 144-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,0-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 64-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 160-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,16-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 80-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 176-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,32-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 96-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 192-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,48-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 112-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 208-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,64-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 128-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 224-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,80-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 144-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 240-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,96-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 160-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 0-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,112-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 176-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 16-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,128-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 192-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 32-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,144-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 208-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 48-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,160-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 224-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 64-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,176-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 64(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 240-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,192-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 80-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 0-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,208-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 96-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 16-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,224-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 112-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 32-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,240-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 128-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 48-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,0-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 144-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 64-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,16-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 160-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 80-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,32-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 176-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 96-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,48-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 192-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 112-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,64-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 208-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 128-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,80-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 224-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 144-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,96-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 240-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 160-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,112-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 0-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 176-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 16-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 192-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 32-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 208-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 48-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 224-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 64-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 240-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 80-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 0-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 96-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 16-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 112-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
-
- vpsrld $27,%xmm11,%xmm9
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor %xmm13,%xmm6,%xmm6
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm7,%xmm12,%xmm12
- movl $1,%ecx
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqu (%rbx),%xmm6
- vpxor %xmm8,%xmm8,%xmm8
- vmovdqa %xmm6,%xmm7
- vpcmpgtd %xmm8,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpand %xmm7,%xmm10,%xmm10
- vpand %xmm7,%xmm11,%xmm11
- vpaddd 0(%rdi),%xmm10,%xmm10
- vpand %xmm7,%xmm12,%xmm12
- vpaddd 32(%rdi),%xmm11,%xmm11
- vpand %xmm7,%xmm13,%xmm13
- vpaddd 64(%rdi),%xmm12,%xmm12
- vpand %xmm7,%xmm14,%xmm14
- vpaddd 96(%rdi),%xmm13,%xmm13
- vpaddd 128(%rdi),%xmm14,%xmm14
- vmovdqu %xmm10,0(%rdi)
- vmovdqu %xmm11,32(%rdi)
- vmovdqu %xmm12,64(%rdi)
- vmovdqu %xmm13,96(%rdi)
- vmovdqu %xmm14,128(%rdi)
-
- vmovdqu %xmm6,(%rbx)
- vmovdqu 96(%rbp),%xmm5
- decl %edx
- jnz .Loop_avx
-
- movl 280(%rsp),%edx
- leaq 16(%rdi),%rdi
- leaq 64(%rsi),%rsi
- decl %edx
- jnz .Loop_grande_avx
-
-.Ldone_avx:
- movq 272(%rsp),%rax
- vzeroupper
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.size sha1_multi_block_avx,.-sha1_multi_block_avx
-.type sha1_multi_block_avx2,@function
-.align 32
-sha1_multi_block_avx2:
-_avx2_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $576,%rsp
- andq $-256,%rsp
- movq %rax,544(%rsp)
-.Lbody_avx2:
- leaq K_XX_XX(%rip),%rbp
- shrl $1,%edx
-
- vzeroupper
-.Loop_grande_avx2:
- movl %edx,552(%rsp)
- xorl %edx,%edx
- leaq 512(%rsp),%rbx
- movq 0(%rsi),%r12
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r12
- movq 16(%rsi),%r13
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r13
- movq 32(%rsi),%r14
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r14
- movq 48(%rsi),%r15
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r15
- movq 64(%rsi),%r8
- movl 72(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,16(%rbx)
- cmovleq %rbp,%r8
- movq 80(%rsi),%r9
- movl 88(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,20(%rbx)
- cmovleq %rbp,%r9
- movq 96(%rsi),%r10
- movl 104(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,24(%rbx)
- cmovleq %rbp,%r10
- movq 112(%rsi),%r11
- movl 120(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,28(%rbx)
- cmovleq %rbp,%r11
- vmovdqu 0(%rdi),%ymm0
- leaq 128(%rsp),%rax
- vmovdqu 32(%rdi),%ymm1
- leaq 256+128(%rsp),%rbx
- vmovdqu 64(%rdi),%ymm2
- vmovdqu 96(%rdi),%ymm3
- vmovdqu 128(%rdi),%ymm4
- vmovdqu 96(%rbp),%ymm9
- jmp .Loop_avx2
-
-.align 32
-.Loop_avx2:
- vmovdqa -32(%rbp),%ymm15
- vmovd (%r12),%xmm10
- leaq 64(%r12),%r12
- vmovd (%r8),%xmm12
- leaq 64(%r8),%r8
- vmovd (%r13),%xmm7
- leaq 64(%r13),%r13
- vmovd (%r9),%xmm6
- leaq 64(%r9),%r9
- vpinsrd $1,(%r14),%xmm10,%xmm10
- leaq 64(%r14),%r14
- vpinsrd $1,(%r10),%xmm12,%xmm12
- leaq 64(%r10),%r10
- vpinsrd $1,(%r15),%xmm7,%xmm7
- leaq 64(%r15),%r15
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,(%r11),%xmm6,%xmm6
- leaq 64(%r11),%r11
- vpunpckldq %ymm6,%ymm12,%ymm12
- vmovd -60(%r12),%xmm11
- vinserti128 $1,%xmm12,%ymm10,%ymm10
- vmovd -60(%r8),%xmm8
- vpshufb %ymm9,%ymm10,%ymm10
- vmovd -60(%r13),%xmm7
- vmovd -60(%r9),%xmm6
- vpinsrd $1,-60(%r14),%xmm11,%xmm11
- vpinsrd $1,-60(%r10),%xmm8,%xmm8
- vpinsrd $1,-60(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-60(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,0-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -56(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -56(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -56(%r13),%xmm7
- vmovd -56(%r9),%xmm6
- vpinsrd $1,-56(%r14),%xmm12,%xmm12
- vpinsrd $1,-56(%r10),%xmm8,%xmm8
- vpinsrd $1,-56(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-56(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,32-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -52(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -52(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -52(%r13),%xmm7
- vmovd -52(%r9),%xmm6
- vpinsrd $1,-52(%r14),%xmm13,%xmm13
- vpinsrd $1,-52(%r10),%xmm8,%xmm8
- vpinsrd $1,-52(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-52(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,64-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -48(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -48(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -48(%r13),%xmm7
- vmovd -48(%r9),%xmm6
- vpinsrd $1,-48(%r14),%xmm14,%xmm14
- vpinsrd $1,-48(%r10),%xmm8,%xmm8
- vpinsrd $1,-48(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-48(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,96-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -44(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -44(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovd -44(%r13),%xmm7
- vmovd -44(%r9),%xmm6
- vpinsrd $1,-44(%r14),%xmm10,%xmm10
- vpinsrd $1,-44(%r10),%xmm8,%xmm8
- vpinsrd $1,-44(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-44(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,128-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -40(%r12),%xmm11
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -40(%r8),%xmm8
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovd -40(%r13),%xmm7
- vmovd -40(%r9),%xmm6
- vpinsrd $1,-40(%r14),%xmm11,%xmm11
- vpinsrd $1,-40(%r10),%xmm8,%xmm8
- vpinsrd $1,-40(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-40(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,160-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -36(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -36(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -36(%r13),%xmm7
- vmovd -36(%r9),%xmm6
- vpinsrd $1,-36(%r14),%xmm12,%xmm12
- vpinsrd $1,-36(%r10),%xmm8,%xmm8
- vpinsrd $1,-36(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-36(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,192-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -32(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -32(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -32(%r13),%xmm7
- vmovd -32(%r9),%xmm6
- vpinsrd $1,-32(%r14),%xmm13,%xmm13
- vpinsrd $1,-32(%r10),%xmm8,%xmm8
- vpinsrd $1,-32(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-32(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,224-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -28(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -28(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -28(%r13),%xmm7
- vmovd -28(%r9),%xmm6
- vpinsrd $1,-28(%r14),%xmm14,%xmm14
- vpinsrd $1,-28(%r10),%xmm8,%xmm8
- vpinsrd $1,-28(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-28(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,256-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -24(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -24(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovd -24(%r13),%xmm7
- vmovd -24(%r9),%xmm6
- vpinsrd $1,-24(%r14),%xmm10,%xmm10
- vpinsrd $1,-24(%r10),%xmm8,%xmm8
- vpinsrd $1,-24(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-24(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,288-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -20(%r12),%xmm11
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -20(%r8),%xmm8
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovd -20(%r13),%xmm7
- vmovd -20(%r9),%xmm6
- vpinsrd $1,-20(%r14),%xmm11,%xmm11
- vpinsrd $1,-20(%r10),%xmm8,%xmm8
- vpinsrd $1,-20(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-20(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,320-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -16(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -16(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -16(%r13),%xmm7
- vmovd -16(%r9),%xmm6
- vpinsrd $1,-16(%r14),%xmm12,%xmm12
- vpinsrd $1,-16(%r10),%xmm8,%xmm8
- vpinsrd $1,-16(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-16(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,352-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -12(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -12(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -12(%r13),%xmm7
- vmovd -12(%r9),%xmm6
- vpinsrd $1,-12(%r14),%xmm13,%xmm13
- vpinsrd $1,-12(%r10),%xmm8,%xmm8
- vpinsrd $1,-12(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-12(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,384-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -8(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -8(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -8(%r13),%xmm7
- vmovd -8(%r9),%xmm6
- vpinsrd $1,-8(%r14),%xmm14,%xmm14
- vpinsrd $1,-8(%r10),%xmm8,%xmm8
- vpinsrd $1,-8(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-8(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,416-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -4(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -4(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovdqa 0-128(%rax),%ymm11
- vmovd -4(%r13),%xmm7
- vmovd -4(%r9),%xmm6
- vpinsrd $1,-4(%r14),%xmm10,%xmm10
- vpinsrd $1,-4(%r10),%xmm8,%xmm8
- vpinsrd $1,-4(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-4(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- prefetcht0 63(%r12)
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,448-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- prefetcht0 63(%r13)
- vpxor %ymm6,%ymm5,%ymm5
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- prefetcht0 63(%r14)
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- prefetcht0 63(%r15)
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 32-128(%rax),%ymm12
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 64-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- prefetcht0 63(%r8)
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,480-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 256-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
- prefetcht0 63(%r9)
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- prefetcht0 63(%r10)
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- prefetcht0 63(%r11)
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 96-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
-
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,0-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 288-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
-
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 128-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
-
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,32-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 320-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
-
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 160-128(%rax),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
-
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,64-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 352-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
-
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 192-128(%rax),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
-
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,96-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 384-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
-
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 0(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 224-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,128-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 416-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 256-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,160-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 448-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 288-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,192-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 480-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 320-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,224-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 0-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 352-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,256-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 32-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 384-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,288-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 64-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 416-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,320-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 96-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 448-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,352-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 128-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 480-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,384-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 160-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 0-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,416-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 192-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 32-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,448-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 224-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 64-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,480-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 256-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 96-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,0-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 288-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 128-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,32-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 320-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 160-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,64-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 352-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 192-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,96-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 384-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 224-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,128-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 416-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 256-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,160-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 448-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 288-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,192-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 480-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 320-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,224-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 0-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 32(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 352-256-128(%rbx),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 32-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,256-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 384-256-128(%rbx),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 64-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,288-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 416-256-128(%rbx),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 96-128(%rax),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,320-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 448-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 128-128(%rax),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,352-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 480-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 160-128(%rax),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,384-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 0-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 192-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,416-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 32-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 224-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,448-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 64-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 256-256-128(%rbx),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,480-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 96-128(%rax),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 288-256-128(%rbx),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,0-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 128-128(%rax),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 320-256-128(%rbx),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,32-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 160-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 352-256-128(%rbx),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,64-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 192-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 384-256-128(%rbx),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,96-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 224-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 416-256-128(%rbx),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,128-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 256-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 448-256-128(%rbx),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,160-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 288-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 480-256-128(%rbx),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,192-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 320-256-128(%rbx),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 0-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,224-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 352-256-128(%rbx),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 32-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,256-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 384-256-128(%rbx),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 64-128(%rax),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,288-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 416-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 96-128(%rax),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,320-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 448-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 128-128(%rax),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,352-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 64(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 480-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,384-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 160-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 0-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,416-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 192-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 32-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,448-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 224-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 64-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,480-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 256-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 96-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,0-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 288-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 128-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,32-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 320-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 160-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,64-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 352-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 192-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,96-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 384-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 224-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,128-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 416-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 256-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,160-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 448-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 288-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,192-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 480-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 320-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,224-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 0-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 352-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 32-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 384-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 64-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 416-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 96-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 448-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 128-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 480-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 160-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 0-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 192-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 32-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 224-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
-
- vpsrld $27,%ymm1,%ymm8
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor %ymm3,%ymm5,%ymm5
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm6,%ymm2,%ymm2
- movl $1,%ecx
- leaq 512(%rsp),%rbx
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r12
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r13
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r14
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r15
- cmpl 16(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 20(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 24(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 28(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqu (%rbx),%ymm5
- vpxor %ymm7,%ymm7,%ymm7
- vmovdqa %ymm5,%ymm6
- vpcmpgtd %ymm7,%ymm6,%ymm6
- vpaddd %ymm6,%ymm5,%ymm5
-
- vpand %ymm6,%ymm0,%ymm0
- vpand %ymm6,%ymm1,%ymm1
- vpaddd 0(%rdi),%ymm0,%ymm0
- vpand %ymm6,%ymm2,%ymm2
- vpaddd 32(%rdi),%ymm1,%ymm1
- vpand %ymm6,%ymm3,%ymm3
- vpaddd 64(%rdi),%ymm2,%ymm2
- vpand %ymm6,%ymm4,%ymm4
- vpaddd 96(%rdi),%ymm3,%ymm3
- vpaddd 128(%rdi),%ymm4,%ymm4
- vmovdqu %ymm0,0(%rdi)
- vmovdqu %ymm1,32(%rdi)
- vmovdqu %ymm2,64(%rdi)
- vmovdqu %ymm3,96(%rdi)
- vmovdqu %ymm4,128(%rdi)
-
- vmovdqu %ymm5,(%rbx)
- leaq 256+128(%rsp),%rbx
- vmovdqu 96(%rbp),%ymm9
- decl %edx
- jnz .Loop_avx2
-
-
-
-
-
-
-
-.Ldone_avx2:
- movq 544(%rsp),%rax
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
.align 256
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s
index 22a031f368..38e9956cb6 100644
--- a/deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s
@@ -12,14 +12,6 @@ sha1_block_data_order:
jz .Lialu
testl $536870912,%r10d
jnz _shaext_shortcut
- andl $296,%r10d
- cmpl $296,%r10d
- je _avx2_shortcut
- andl $268435456,%r8d
- andl $1073741824,%r9d
- orl %r9d,%r8d
- cmpl $1342177280,%r8d
- je _avx_shortcut
jmp _ssse3_shortcut
.align 16
@@ -1248,9 +1240,9 @@ _shaext_shortcut:
movdqa K_XX_XX+160(%rip),%xmm3
movdqu (%rsi),%xmm4
- pshufd $27,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm0,%xmm0
movdqu 16(%rsi),%xmm5
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu 32(%rsi),%xmm6
.byte 102,15,56,0,227
movdqu 48(%rsi),%xmm7
@@ -1400,8 +1392,8 @@ _shaext_shortcut:
jnz .Loop_shaext
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu %xmm0,(%rdi)
movd %xmm1,16(%rdi)
.byte 0xf3,0xc3
@@ -2582,2803 +2574,6 @@ _ssse3_shortcut:
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
-.type sha1_block_data_order_avx,@function
-.align 16
-sha1_block_data_order_avx:
-_avx_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- leaq -64(%rsp),%rsp
- vzeroupper
- movq %rax,%r14
- andq $-64,%rsp
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- shlq $6,%r10
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
-
- movl 0(%r8),%eax
- movl 4(%r8),%ebx
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl %ebx,%esi
- movl 16(%r8),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm11,%xmm0,%xmm4
- vpaddd %xmm11,%xmm1,%xmm5
- vpaddd %xmm11,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- jmp .Loop_avx
-.align 16
-.Loop_avx:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- vpxor %xmm8,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%edx
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm8
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm10
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm4,%xmm4
- addl %esi,%ecx
- andl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm4,%xmm4
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 12(%rsp),%ebx
- vpxor %xmm10,%xmm4,%xmm4
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%edi
- addl 16(%rsp),%eax
- vpaddd %xmm4,%xmm11,%xmm9
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm8,%xmm8
- shrdl $7,%ebx,%ebx
- xorl %edx,%edi
- movl %eax,%esi
- addl 20(%rsp),%ebp
- vpxor %xmm8,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ebp
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm8
- xorl %ecx,%ebx
- addl %eax,%ebp
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm10
- vpaddd %xmm5,%xmm5,%xmm5
- movl %ebp,%edi
- addl 24(%rsp),%edx
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm5,%xmm5
- addl %esi,%edx
- andl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm5,%xmm5
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- movl %edx,%esi
- addl 28(%rsp),%ecx
- vpxor %xmm10,%xmm5,%xmm5
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vmovdqa -32(%r11),%xmm11
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%edi
- addl 32(%rsp),%ebx
- vpaddd %xmm5,%xmm11,%xmm9
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm8
- addl %esi,%ebx
- andl %edx,%edi
- vpxor %xmm2,%xmm6,%xmm6
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm8,%xmm8
- shrdl $7,%ecx,%ecx
- xorl %ebp,%edi
- movl %ebx,%esi
- addl 36(%rsp),%eax
- vpxor %xmm8,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm8
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm10
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%edi
- addl 40(%rsp),%ebp
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm6,%xmm6
- addl %esi,%ebp
- andl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 44(%rsp),%edx
- vpxor %xmm10,%xmm6,%xmm6
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- andl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%edi
- addl 48(%rsp),%ecx
- vpaddd %xmm6,%xmm11,%xmm9
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%ebp
- addl %edx,%ecx
- vpxor %xmm5,%xmm8,%xmm8
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 52(%rsp),%ebx
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm8
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpslldq $12,%xmm7,%xmm10
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%edi
- addl 56(%rsp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm7,%xmm7
- addl %esi,%eax
- andl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- xorl %edx,%edi
- movl %eax,%esi
- addl 60(%rsp),%ebp
- vpxor %xmm10,%xmm7,%xmm7
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %edi,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %ebp,%edi
- addl 0(%rsp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpaddd %xmm7,%xmm11,%xmm9
- addl %esi,%edx
- andl %eax,%edi
- vpxor %xmm8,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- movl %edx,%esi
- addl 4(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%edi
- addl 8(%rsp),%ebx
- vpor %xmm8,%xmm0,%xmm0
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- andl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 12(%rsp),%eax
- xorl %ebp,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm0,%xmm11,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm1,%xmm1
- addl 20(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm1,%xmm1
- addl 28(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- addl %esi,%eax
- xorl %edx,%edi
- vpaddd %xmm1,%xmm11,%xmm9
- vmovdqa 0(%r11),%xmm11
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm8,%xmm2,%xmm2
- addl 36(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm2,%xmm2
- addl 40(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpor %xmm8,%xmm2,%xmm2
- addl 44(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebx
- xorl %ebp,%edi
- vpaddd %xmm2,%xmm11,%xmm9
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm8
- vpxor %xmm0,%xmm4,%xmm4
- addl 0(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- addl %esi,%ecx
- xorl %eax,%edi
- vpaddd %xmm3,%xmm11,%xmm9
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpxor %xmm8,%xmm4,%xmm4
- addl 4(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm8
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm8,%xmm4,%xmm4
- addl 12(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm3,%xmm4,%xmm8
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpxor %xmm6,%xmm5,%xmm5
- addl %esi,%edx
- xorl %ebx,%edi
- vpaddd %xmm4,%xmm11,%xmm9
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpxor %xmm8,%xmm5,%xmm5
- addl 20(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm8,%xmm5,%xmm5
- addl 28(%rsp),%eax
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm8
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%rsp),%ebp
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- movl %eax,%edi
- xorl %ecx,%esi
- vpaddd %xmm5,%xmm11,%xmm9
- shldl $5,%eax,%eax
- addl %esi,%ebp
- vpxor %xmm8,%xmm6,%xmm6
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 36(%rsp),%edx
- vpsrld $30,%xmm6,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 40(%rsp),%ecx
- andl %eax,%esi
- vpor %xmm8,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%edi
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 44(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm8
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- movl %ebx,%edi
- xorl %edx,%esi
- vpaddd %xmm6,%xmm11,%xmm9
- vmovdqa 32(%r11),%xmm11
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%rsp),%ebp
- vpsrld $30,%xmm7,%xmm8
- vmovdqa %xmm9,32(%rsp)
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 56(%rsp),%edx
- andl %ebx,%esi
- vpor %xmm8,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%edi
- xorl %ebx,%esi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 60(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- addl 0(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vpxor %xmm1,%xmm0,%xmm0
- movl %ecx,%edi
- xorl %ebp,%esi
- vpaddd %xmm7,%xmm11,%xmm9
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm8,%xmm0,%xmm0
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 4(%rsp),%eax
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%rsp),%ebp
- andl %ecx,%esi
- vpor %xmm8,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%edi
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 12(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- vpxor %xmm2,%xmm1,%xmm1
- movl %edx,%edi
- xorl %eax,%esi
- vpaddd %xmm0,%xmm11,%xmm9
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 20(%rsp),%ebx
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 24(%rsp),%eax
- andl %edx,%esi
- vpor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%edi
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%rsp),%ebp
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- movl %ebp,%edi
- xorl %ebx,%esi
- vpaddd %xmm1,%xmm11,%xmm9
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vpxor %xmm8,%xmm2,%xmm2
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 36(%rsp),%ecx
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 40(%rsp),%ebx
- andl %ebp,%esi
- vpor %xmm8,%xmm2,%xmm2
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%edi
- xorl %ebp,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 44(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm2,%xmm11,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 0(%rsp),%eax
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm9,48(%rsp)
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 8(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 12(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- cmpq %r10,%r9
- je .Ldone_avx
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- vpshufb %xmm6,%xmm1,%xmm1
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpaddd %xmm11,%xmm0,%xmm4
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm4,0(%rsp)
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- vpshufb %xmm6,%xmm2,%xmm2
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpaddd %xmm11,%xmm1,%xmm5
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vmovdqa %xmm5,16(%rsp)
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- vpshufb %xmm6,%xmm3,%xmm3
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpaddd %xmm11,%xmm2,%xmm6
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vmovdqa %xmm6,32(%rsp)
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- addl 12(%r8),%edx
- movl %eax,0(%r8)
- addl 16(%r8),%ebp
- movl %esi,4(%r8)
- movl %esi,%ebx
- movl %ecx,8(%r8)
- movl %ecx,%edi
- movl %edx,12(%r8)
- xorl %edx,%edi
- movl %ebp,16(%r8)
- andl %edi,%esi
- jmp .Loop_avx
-
-.align 16
-.Ldone_avx:
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vzeroupper
-
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- movl %eax,0(%r8)
- addl 12(%r8),%edx
- movl %esi,4(%r8)
- addl 16(%r8),%ebp
- movl %ecx,8(%r8)
- movl %edx,12(%r8)
- movl %ebp,16(%r8)
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
-.type sha1_block_data_order_avx2,@function
-.align 16
-sha1_block_data_order_avx2:
-_avx2_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- vzeroupper
- movq %rax,%r14
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- leaq -640(%rsp),%rsp
- shlq $6,%r10
- leaq 64(%r9),%r13
- andq $-128,%rsp
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
-
- movl 0(%r8),%eax
- cmpq %r10,%r13
- cmovaeq %r9,%r13
- movl 4(%r8),%ebp
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl 16(%r8),%esi
- vmovdqu 64(%r11),%ymm6
-
- vmovdqu (%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- leaq 64(%r9),%r9
- vinserti128 $1,(%r13),%ymm0,%ymm0
- vinserti128 $1,16(%r13),%ymm1,%ymm1
- vpshufb %ymm6,%ymm0,%ymm0
- vinserti128 $1,32(%r13),%ymm2,%ymm2
- vpshufb %ymm6,%ymm1,%ymm1
- vinserti128 $1,48(%r13),%ymm3,%ymm3
- vpshufb %ymm6,%ymm2,%ymm2
- vmovdqu -64(%r11),%ymm11
- vpshufb %ymm6,%ymm3,%ymm3
-
- vpaddd %ymm11,%ymm0,%ymm4
- vpaddd %ymm11,%ymm1,%ymm5
- vmovdqu %ymm4,0(%rsp)
- vpaddd %ymm11,%ymm2,%ymm6
- vmovdqu %ymm5,32(%rsp)
- vpaddd %ymm11,%ymm3,%ymm7
- vmovdqu %ymm6,64(%rsp)
- vmovdqu %ymm7,96(%rsp)
- vpalignr $8,%ymm0,%ymm1,%ymm4
- vpsrldq $4,%ymm3,%ymm8
- vpxor %ymm0,%ymm4,%ymm4
- vpxor %ymm2,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $31,%ymm4,%ymm8
- vpslldq $12,%ymm4,%ymm10
- vpaddd %ymm4,%ymm4,%ymm4
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm4,%ymm4
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm4,%ymm4
- vpxor %ymm10,%ymm4,%ymm4
- vpaddd %ymm11,%ymm4,%ymm9
- vmovdqu %ymm9,128(%rsp)
- vpalignr $8,%ymm1,%ymm2,%ymm5
- vpsrldq $4,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm3,%ymm8,%ymm8
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r11),%ymm11
- vpslldq $12,%ymm5,%ymm10
- vpaddd %ymm5,%ymm5,%ymm5
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm5,%ymm5
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm10,%ymm5,%ymm5
- vpaddd %ymm11,%ymm5,%ymm9
- vmovdqu %ymm9,160(%rsp)
- vpalignr $8,%ymm2,%ymm3,%ymm6
- vpsrldq $4,%ymm5,%ymm8
- vpxor %ymm2,%ymm6,%ymm6
- vpxor %ymm4,%ymm8,%ymm8
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $31,%ymm6,%ymm8
- vpslldq $12,%ymm6,%ymm10
- vpaddd %ymm6,%ymm6,%ymm6
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm6,%ymm6
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm6,%ymm6
- vpxor %ymm10,%ymm6,%ymm6
- vpaddd %ymm11,%ymm6,%ymm9
- vmovdqu %ymm9,192(%rsp)
- vpalignr $8,%ymm3,%ymm4,%ymm7
- vpsrldq $4,%ymm6,%ymm8
- vpxor %ymm3,%ymm7,%ymm7
- vpxor %ymm5,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm7,%ymm8
- vpslldq $12,%ymm7,%ymm10
- vpaddd %ymm7,%ymm7,%ymm7
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm7,%ymm7
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm7,%ymm7
- vpxor %ymm10,%ymm7,%ymm7
- vpaddd %ymm11,%ymm7,%ymm9
- vmovdqu %ymm9,224(%rsp)
- leaq 128(%rsp),%r13
- jmp .Loop_avx2
-.align 32
-.Loop_avx2:
- rorxl $2,%ebp,%ebx
- andnl %edx,%ebp,%edi
- andl %ecx,%ebp
- xorl %edi,%ebp
- jmp .Lalign32_1
-.align 32
-.Lalign32_1:
- vpalignr $8,%ymm6,%ymm7,%ymm8
- vpxor %ymm4,%ymm0,%ymm0
- addl -128(%r13),%esi
- andnl %ecx,%eax,%edi
- vpxor %ymm1,%ymm0,%ymm0
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpxor %ymm8,%ymm0,%ymm0
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- vpsrld $30,%ymm0,%ymm8
- vpslld $2,%ymm0,%ymm0
- addl -124(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- vpor %ymm8,%ymm0,%ymm0
- addl %r12d,%edx
- xorl %edi,%esi
- addl -120(%r13),%ecx
- andnl %ebp,%edx,%edi
- vpaddd %ymm11,%ymm0,%ymm9
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- vmovdqu %ymm9,256(%rsp)
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -116(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -96(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- vpalignr $8,%ymm7,%ymm0,%ymm8
- vpxor %ymm5,%ymm1,%ymm1
- addl -92(%r13),%eax
- andnl %edx,%ebp,%edi
- vpxor %ymm2,%ymm1,%ymm1
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- vpxor %ymm8,%ymm1,%ymm1
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- vpsrld $30,%ymm1,%ymm8
- vpslld $2,%ymm1,%ymm1
- addl -88(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- vpor %ymm8,%ymm1,%ymm1
- addl %r12d,%esi
- xorl %edi,%eax
- addl -84(%r13),%edx
- andnl %ebx,%esi,%edi
- vpaddd %ymm11,%ymm1,%ymm9
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- vmovdqu %ymm9,288(%rsp)
- addl %r12d,%edx
- xorl %edi,%esi
- addl -64(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -60(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- vpalignr $8,%ymm0,%ymm1,%ymm8
- vpxor %ymm6,%ymm2,%ymm2
- addl -56(%r13),%ebp
- andnl %esi,%ebx,%edi
- vpxor %ymm3,%ymm2,%ymm2
- vmovdqu 0(%r11),%ymm11
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpxor %ymm8,%ymm2,%ymm2
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- vpsrld $30,%ymm2,%ymm8
- vpslld $2,%ymm2,%ymm2
- addl -52(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- vpor %ymm8,%ymm2,%ymm2
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -32(%r13),%esi
- andnl %ecx,%eax,%edi
- vpaddd %ymm11,%ymm2,%ymm9
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- vmovdqu %ymm9,320(%rsp)
- addl %r12d,%esi
- xorl %edi,%eax
- addl -28(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -24(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- vpalignr $8,%ymm1,%ymm2,%ymm8
- vpxor %ymm7,%ymm3,%ymm3
- addl -20(%r13),%ebx
- andnl %eax,%ecx,%edi
- vpxor %ymm4,%ymm3,%ymm3
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpxor %ymm8,%ymm3,%ymm3
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- vpsrld $30,%ymm3,%ymm8
- vpslld $2,%ymm3,%ymm3
- addl 0(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- vpor %ymm8,%ymm3,%ymm3
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl 4(%r13),%eax
- andnl %edx,%ebp,%edi
- vpaddd %ymm11,%ymm3,%ymm9
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- vmovdqu %ymm9,352(%rsp)
- addl %r12d,%eax
- xorl %edi,%ebp
- addl 8(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl 12(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vpalignr $8,%ymm2,%ymm3,%ymm8
- vpxor %ymm0,%ymm4,%ymm4
- addl 32(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpxor %ymm8,%ymm4,%ymm4
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 36(%r13),%ebx
- vpsrld $30,%ymm4,%ymm8
- vpslld $2,%ymm4,%ymm4
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vpor %ymm8,%ymm4,%ymm4
- addl 40(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpaddd %ymm11,%ymm4,%ymm9
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 44(%r13),%eax
- vmovdqu %ymm9,384(%rsp)
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpalignr $8,%ymm3,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- addl 68(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpxor %ymm6,%ymm5,%ymm5
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- vpxor %ymm8,%ymm5,%ymm5
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 72(%r13),%ecx
- vpsrld $30,%ymm5,%ymm8
- vpslld $2,%ymm5,%ymm5
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- vpor %ymm8,%ymm5,%ymm5
- addl 76(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpaddd %ymm11,%ymm5,%ymm9
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 96(%r13),%ebp
- vmovdqu %ymm9,416(%rsp)
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 100(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpalignr $8,%ymm4,%ymm5,%ymm8
- vpxor %ymm2,%ymm6,%ymm6
- addl 104(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- vpxor %ymm8,%ymm6,%ymm6
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 108(%r13),%edx
- leaq 256(%r13),%r13
- vpsrld $30,%ymm6,%ymm8
- vpslld $2,%ymm6,%ymm6
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vpor %ymm8,%ymm6,%ymm6
- addl -128(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpaddd %ymm11,%ymm6,%ymm9
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -124(%r13),%ebx
- vmovdqu %ymm9,448(%rsp)
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -120(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpalignr $8,%ymm5,%ymm6,%ymm8
- vpxor %ymm3,%ymm7,%ymm7
- addl -116(%r13),%eax
- leal (%rax,%rbx,1),%eax
- vpxor %ymm0,%ymm7,%ymm7
- vmovdqu 32(%r11),%ymm11
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- vpxor %ymm8,%ymm7,%ymm7
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -96(%r13),%esi
- vpsrld $30,%ymm7,%ymm8
- vpslld $2,%ymm7,%ymm7
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpor %ymm8,%ymm7,%ymm7
- addl -92(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpaddd %ymm11,%ymm7,%ymm9
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -88(%r13),%ecx
- vmovdqu %ymm9,480(%rsp)
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -84(%r13),%ebx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- jmp .Lalign32_2
-.align 32
-.Lalign32_2:
- vpalignr $8,%ymm6,%ymm7,%ymm8
- vpxor %ymm4,%ymm0,%ymm0
- addl -64(%r13),%ebp
- xorl %esi,%ecx
- vpxor %ymm1,%ymm0,%ymm0
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- vpxor %ymm8,%ymm0,%ymm0
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- vpsrld $30,%ymm0,%ymm8
- vpslld $2,%ymm0,%ymm0
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -60(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- vpor %ymm8,%ymm0,%ymm0
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- vpaddd %ymm11,%ymm0,%ymm9
- addl %r12d,%eax
- andl %edi,%ebp
- addl -56(%r13),%esi
- xorl %ecx,%ebp
- vmovdqu %ymm9,512(%rsp)
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl -52(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- addl -32(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- vpalignr $8,%ymm7,%ymm0,%ymm8
- vpxor %ymm5,%ymm1,%ymm1
- addl -28(%r13),%ebx
- xorl %eax,%edx
- vpxor %ymm2,%ymm1,%ymm1
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- vpxor %ymm8,%ymm1,%ymm1
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vpsrld $30,%ymm1,%ymm8
- vpslld $2,%ymm1,%ymm1
- addl %r12d,%ebx
- andl %edi,%ecx
- addl -24(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- vpor %ymm8,%ymm1,%ymm1
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- vpaddd %ymm11,%ymm1,%ymm9
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -20(%r13),%eax
- xorl %edx,%ebx
- vmovdqu %ymm9,544(%rsp)
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 0(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl 4(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- vpalignr $8,%ymm0,%ymm1,%ymm8
- vpxor %ymm6,%ymm2,%ymm2
- addl 8(%r13),%ecx
- xorl %ebp,%esi
- vpxor %ymm3,%ymm2,%ymm2
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- vpxor %ymm8,%ymm2,%ymm2
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpsrld $30,%ymm2,%ymm8
- vpslld $2,%ymm2,%ymm2
- addl %r12d,%ecx
- andl %edi,%edx
- addl 12(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- vpor %ymm8,%ymm2,%ymm2
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vpaddd %ymm11,%ymm2,%ymm9
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 32(%r13),%ebp
- xorl %esi,%ecx
- vmovdqu %ymm9,576(%rsp)
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 36(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 40(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- vpalignr $8,%ymm1,%ymm2,%ymm8
- vpxor %ymm7,%ymm3,%ymm3
- addl 44(%r13),%edx
- xorl %ebx,%eax
- vpxor %ymm4,%ymm3,%ymm3
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- vpxor %ymm8,%ymm3,%ymm3
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- vpsrld $30,%ymm3,%ymm8
- vpslld $2,%ymm3,%ymm3
- addl %r12d,%edx
- andl %edi,%esi
- addl 64(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- vpor %ymm8,%ymm3,%ymm3
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpaddd %ymm11,%ymm3,%ymm9
- addl %r12d,%ecx
- andl %edi,%edx
- addl 68(%r13),%ebx
- xorl %eax,%edx
- vmovdqu %ymm9,608(%rsp)
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 72(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 76(%r13),%eax
- xorl %edx,%ebx
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 100(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 104(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 108(%r13),%ebx
- leaq 256(%r13),%r13
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -128(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -124(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -120(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -116(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -96(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -92(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -88(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -84(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -60(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -56(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -52(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -32(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -28(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -24(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -20(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- addl %r12d,%edx
- leaq 128(%r9),%r13
- leaq 128(%r9),%rdi
- cmpq %r10,%r13
- cmovaeq %r9,%r13
-
-
- addl 0(%r8),%edx
- addl 4(%r8),%esi
- addl 8(%r8),%ebp
- movl %edx,0(%r8)
- addl 12(%r8),%ebx
- movl %esi,4(%r8)
- movl %edx,%eax
- addl 16(%r8),%ecx
- movl %ebp,%r12d
- movl %ebp,8(%r8)
- movl %ebx,%edx
-
- movl %ebx,12(%r8)
- movl %esi,%ebp
- movl %ecx,16(%r8)
-
- movl %ecx,%esi
- movl %r12d,%ecx
-
-
- cmpq %r10,%r9
- je .Ldone_avx2
- vmovdqu 64(%r11),%ymm6
- cmpq %r10,%rdi
- ja .Last_avx2
-
- vmovdqu -64(%rdi),%xmm0
- vmovdqu -48(%rdi),%xmm1
- vmovdqu -32(%rdi),%xmm2
- vmovdqu -16(%rdi),%xmm3
- vinserti128 $1,0(%r13),%ymm0,%ymm0
- vinserti128 $1,16(%r13),%ymm1,%ymm1
- vinserti128 $1,32(%r13),%ymm2,%ymm2
- vinserti128 $1,48(%r13),%ymm3,%ymm3
- jmp .Last_avx2
-
-.align 32
-.Last_avx2:
- leaq 128+16(%rsp),%r13
- rorxl $2,%ebp,%ebx
- andnl %edx,%ebp,%edi
- andl %ecx,%ebp
- xorl %edi,%ebp
- subq $-128,%r9
- addl -128(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -124(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -120(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -116(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -96(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl -92(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -88(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -84(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -64(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -60(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -56(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl -52(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -32(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -28(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -24(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -20(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl 0(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl 4(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl 8(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl 12(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 32(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 36(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 40(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 44(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vmovdqu -64(%r11),%ymm11
- vpshufb %ymm6,%ymm0,%ymm0
- addl 68(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 72(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 76(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 96(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 100(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpshufb %ymm6,%ymm1,%ymm1
- vpaddd %ymm11,%ymm0,%ymm8
- addl 104(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 108(%r13),%edx
- leaq 256(%r13),%r13
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -128(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -124(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -120(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vmovdqu %ymm8,0(%rsp)
- vpshufb %ymm6,%ymm2,%ymm2
- vpaddd %ymm11,%ymm1,%ymm9
- addl -116(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -92(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -88(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -84(%r13),%ebx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- vmovdqu %ymm9,32(%rsp)
- vpshufb %ymm6,%ymm3,%ymm3
- vpaddd %ymm11,%ymm2,%ymm6
- addl -64(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -60(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl -56(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl -52(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- addl -32(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- jmp .Lalign32_3
-.align 32
-.Lalign32_3:
- vmovdqu %ymm6,64(%rsp)
- vpaddd %ymm11,%ymm3,%ymm7
- addl -28(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl -24(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -20(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 0(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl 4(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- vmovdqu %ymm7,96(%rsp)
- addl 8(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- addl 12(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 32(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 36(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 40(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- vpalignr $8,%ymm0,%ymm1,%ymm4
- addl 44(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- vpsrldq $4,%ymm3,%ymm8
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpxor %ymm0,%ymm4,%ymm4
- vpxor %ymm2,%ymm8,%ymm8
- xorl %ebp,%esi
- addl %r12d,%edx
- vpxor %ymm8,%ymm4,%ymm4
- andl %edi,%esi
- addl 64(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- vpsrld $31,%ymm4,%ymm8
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- vpslldq $12,%ymm4,%ymm10
- vpaddd %ymm4,%ymm4,%ymm4
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm4,%ymm4
- addl %r12d,%ecx
- andl %edi,%edx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm4,%ymm4
- addl 68(%r13),%ebx
- xorl %eax,%edx
- vpxor %ymm10,%ymm4,%ymm4
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- vpaddd %ymm11,%ymm4,%ymm9
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vmovdqu %ymm9,128(%rsp)
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 72(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 76(%r13),%eax
- xorl %edx,%ebx
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpalignr $8,%ymm1,%ymm2,%ymm5
- addl 96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpsrldq $4,%ymm4,%ymm8
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm3,%ymm8,%ymm8
- addl 100(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpxor %ymm8,%ymm5,%ymm5
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r11),%ymm11
- xorl %ebx,%esi
- addl 104(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- vpslldq $12,%ymm5,%ymm10
- vpaddd %ymm5,%ymm5,%ymm5
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm5,%ymm5
- xorl %eax,%edx
- addl %r12d,%ecx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm5,%ymm5
- xorl %ebp,%edx
- addl 108(%r13),%ebx
- leaq 256(%r13),%r13
- vpxor %ymm10,%ymm5,%ymm5
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpaddd %ymm11,%ymm5,%ymm9
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vmovdqu %ymm9,160(%rsp)
- addl -128(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpalignr $8,%ymm2,%ymm3,%ymm6
- addl -124(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- vpsrldq $4,%ymm5,%ymm8
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpxor %ymm2,%ymm6,%ymm6
- vpxor %ymm4,%ymm8,%ymm8
- addl -120(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpxor %ymm8,%ymm6,%ymm6
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- vpsrld $31,%ymm6,%ymm8
- xorl %ecx,%eax
- addl -116(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpslldq $12,%ymm6,%ymm10
- vpaddd %ymm6,%ymm6,%ymm6
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm6,%ymm6
- xorl %ebp,%esi
- addl %r12d,%edx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm6,%ymm6
- xorl %ebx,%esi
- addl -96(%r13),%ecx
- vpxor %ymm10,%ymm6,%ymm6
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpaddd %ymm11,%ymm6,%ymm9
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- vmovdqu %ymm9,192(%rsp)
- addl -92(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vpalignr $8,%ymm3,%ymm4,%ymm7
- addl -88(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpsrldq $4,%ymm6,%ymm8
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpxor %ymm3,%ymm7,%ymm7
- vpxor %ymm5,%ymm8,%ymm8
- addl -84(%r13),%eax
- leal (%rax,%rbx,1),%eax
- vpxor %ymm8,%ymm7,%ymm7
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- vpsrld $31,%ymm7,%ymm8
- xorl %edx,%ebp
- addl -64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpslldq $12,%ymm7,%ymm10
- vpaddd %ymm7,%ymm7,%ymm7
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm7,%ymm7
- xorl %ebx,%eax
- addl %r12d,%esi
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm7,%ymm7
- xorl %ecx,%eax
- addl -60(%r13),%edx
- vpxor %ymm10,%ymm7,%ymm7
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpaddd %ymm11,%ymm7,%ymm9
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vmovdqu %ymm9,224(%rsp)
- addl -56(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -52(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -32(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -28(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -24(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -20(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- addl %r12d,%edx
- leaq 128(%rsp),%r13
-
-
- addl 0(%r8),%edx
- addl 4(%r8),%esi
- addl 8(%r8),%ebp
- movl %edx,0(%r8)
- addl 12(%r8),%ebx
- movl %esi,4(%r8)
- movl %edx,%eax
- addl 16(%r8),%ecx
- movl %ebp,%r12d
- movl %ebp,8(%r8)
- movl %ebx,%edx
-
- movl %ebx,12(%r8)
- movl %esi,%ebp
- movl %ecx,16(%r8)
-
- movl %ecx,%esi
- movl %r12d,%ecx
-
-
- cmpq %r10,%r9
- jbe .Loop_avx2
-
-.Ldone_avx2:
- vzeroupper
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s
index 0c06094084..7655283b98 100644
--- a/deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s
@@ -9,8 +9,6 @@ sha256_multi_block:
movq OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
- testl $268435456,%ecx
- jnz _avx_shortcut
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -2679,10 +2677,10 @@ _shaext_shortcut:
punpckhqdq %xmm8,%xmm14
punpckhqdq %xmm10,%xmm15
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
jmp .Loop_shaext
.align 32
@@ -2714,11 +2712,11 @@ _shaext_shortcut:
movdqa %xmm2,%xmm0
movdqa %xmm15,112(%rsp)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pxor %xmm12,%xmm4
movdqa %xmm12,64(%rsp)
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pxor %xmm14,%xmm8
movdqa %xmm14,96(%rsp)
movdqa 16-128(%rbp),%xmm1
@@ -2736,11 +2734,11 @@ _shaext_shortcut:
.byte 102,68,15,56,0,211
prefetcht0 127(%r9)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,68,15,56,0,219
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 32-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2753,14 +2751,14 @@ _shaext_shortcut:
movdqa %xmm2,%xmm0
movdqa %xmm7,%xmm3
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,15,58,15,222,4
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 48-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2777,13 +2775,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 64-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2799,13 +2797,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 80-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2821,13 +2819,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 96-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2843,13 +2841,13 @@ _shaext_shortcut:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 112-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2865,13 +2863,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 128-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2887,13 +2885,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 144-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2909,13 +2907,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 160-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2931,13 +2929,13 @@ _shaext_shortcut:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 176-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2953,13 +2951,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 192-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2975,13 +2973,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 208-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2997,13 +2995,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 224-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -3020,13 +3018,13 @@ _shaext_shortcut:
pxor %xmm6,%xmm6
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
movdqa 240-128(%rbp),%xmm1
paddd %xmm7,%xmm1
movq (%rbx),%xmm7
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 240-128(%rbp),%xmm2
paddd %xmm11,%xmm2
.byte 69,15,56,203,247
@@ -3036,17 +3034,17 @@ _shaext_shortcut:
cmovgeq %rsp,%r8
cmpl 4(%rbx),%ecx
cmovgeq %rsp,%r9
- pshufd $0,%xmm7,%xmm9
+ pshufd $0x00,%xmm7,%xmm9
.byte 69,15,56,203,236
movdqa %xmm2,%xmm0
- pshufd $85,%xmm7,%xmm10
+ pshufd $0x55,%xmm7,%xmm10
movdqa %xmm7,%xmm11
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pcmpgtd %xmm6,%xmm9
pcmpgtd %xmm6,%xmm10
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pcmpgtd %xmm6,%xmm11
movdqa K256_shaext-16(%rip),%xmm3
.byte 69,15,56,203,247
@@ -3068,10 +3066,10 @@ _shaext_shortcut:
movl 280(%rsp),%edx
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
movdqa %xmm12,%xmm5
movdqa %xmm13,%xmm6
@@ -3107,4648 +3105,6 @@ _shaext_shortcut:
.Lepilogue_shaext:
.byte 0xf3,0xc3
.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
-.type sha256_multi_block_avx,@function
-.align 32
-sha256_multi_block_avx:
-_avx_shortcut:
- shrq $32,%rcx
- cmpl $2,%edx
- jb .Lavx
- testl $32,%ecx
- jnz _avx2_shortcut
- jmp .Lavx
-.align 32
-.Lavx:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- subq $288,%rsp
- andq $-256,%rsp
- movq %rax,272(%rsp)
-.Lbody_avx:
- leaq K256+128(%rip),%rbp
- leaq 256(%rsp),%rbx
- leaq 128(%rdi),%rdi
-
-.Loop_grande_avx:
- movl %edx,280(%rsp)
- xorl %edx,%edx
- movq 0(%rsi),%r8
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r8
- movq 16(%rsi),%r9
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r9
- movq 32(%rsi),%r10
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r10
- movq 48(%rsi),%r11
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r11
- testl %edx,%edx
- jz .Ldone_avx
-
- vmovdqu 0-128(%rdi),%xmm8
- leaq 128(%rsp),%rax
- vmovdqu 32-128(%rdi),%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- vmovdqu 96-128(%rdi),%xmm11
- vmovdqu 128-128(%rdi),%xmm12
- vmovdqu 160-128(%rdi),%xmm13
- vmovdqu 192-128(%rdi),%xmm14
- vmovdqu 224-128(%rdi),%xmm15
- vmovdqu .Lpbswap(%rip),%xmm6
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- vpxor %xmm9,%xmm10,%xmm4
- vmovd 0(%r8),%xmm5
- vmovd 0(%r9),%xmm0
- vpinsrd $1,0(%r10),%xmm5,%xmm5
- vpinsrd $1,0(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,0-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovd 4(%r8),%xmm5
- vmovd 4(%r9),%xmm0
- vpinsrd $1,4(%r10),%xmm5,%xmm5
- vpinsrd $1,4(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm5,16-128(%rax)
- vpaddd %xmm14,%xmm5,%xmm5
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm5,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovd 8(%r8),%xmm5
- vmovd 8(%r9),%xmm0
- vpinsrd $1,8(%r10),%xmm5,%xmm5
- vpinsrd $1,8(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,32-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovd 12(%r8),%xmm5
- vmovd 12(%r9),%xmm0
- vpinsrd $1,12(%r10),%xmm5,%xmm5
- vpinsrd $1,12(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm5,48-128(%rax)
- vpaddd %xmm12,%xmm5,%xmm5
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm5,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovd 16(%r8),%xmm5
- vmovd 16(%r9),%xmm0
- vpinsrd $1,16(%r10),%xmm5,%xmm5
- vpinsrd $1,16(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,64-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovd 20(%r8),%xmm5
- vmovd 20(%r9),%xmm0
- vpinsrd $1,20(%r10),%xmm5,%xmm5
- vpinsrd $1,20(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm5,80-128(%rax)
- vpaddd %xmm10,%xmm5,%xmm5
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm5,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovd 24(%r8),%xmm5
- vmovd 24(%r9),%xmm0
- vpinsrd $1,24(%r10),%xmm5,%xmm5
- vpinsrd $1,24(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,96-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovd 28(%r8),%xmm5
- vmovd 28(%r9),%xmm0
- vpinsrd $1,28(%r10),%xmm5,%xmm5
- vpinsrd $1,28(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm5,112-128(%rax)
- vpaddd %xmm8,%xmm5,%xmm5
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm5,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovd 32(%r8),%xmm5
- vmovd 32(%r9),%xmm0
- vpinsrd $1,32(%r10),%xmm5,%xmm5
- vpinsrd $1,32(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,128-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovd 36(%r8),%xmm5
- vmovd 36(%r9),%xmm0
- vpinsrd $1,36(%r10),%xmm5,%xmm5
- vpinsrd $1,36(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm5,144-128(%rax)
- vpaddd %xmm14,%xmm5,%xmm5
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm5,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovd 40(%r8),%xmm5
- vmovd 40(%r9),%xmm0
- vpinsrd $1,40(%r10),%xmm5,%xmm5
- vpinsrd $1,40(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,160-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovd 44(%r8),%xmm5
- vmovd 44(%r9),%xmm0
- vpinsrd $1,44(%r10),%xmm5,%xmm5
- vpinsrd $1,44(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm5,176-128(%rax)
- vpaddd %xmm12,%xmm5,%xmm5
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm5,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovd 48(%r8),%xmm5
- vmovd 48(%r9),%xmm0
- vpinsrd $1,48(%r10),%xmm5,%xmm5
- vpinsrd $1,48(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,192-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovd 52(%r8),%xmm5
- vmovd 52(%r9),%xmm0
- vpinsrd $1,52(%r10),%xmm5,%xmm5
- vpinsrd $1,52(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm5,208-128(%rax)
- vpaddd %xmm10,%xmm5,%xmm5
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm5,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovd 56(%r8),%xmm5
- vmovd 56(%r9),%xmm0
- vpinsrd $1,56(%r10),%xmm5,%xmm5
- vpinsrd $1,56(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,224-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovd 60(%r8),%xmm5
- leaq 64(%r8),%r8
- vmovd 60(%r9),%xmm0
- leaq 64(%r9),%r9
- vpinsrd $1,60(%r10),%xmm5,%xmm5
- leaq 64(%r10),%r10
- vpinsrd $1,60(%r11),%xmm0,%xmm0
- leaq 64(%r11),%r11
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm5,240-128(%rax)
- vpaddd %xmm8,%xmm5,%xmm5
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- prefetcht0 63(%r8)
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
- prefetcht0 63(%r9)
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
- prefetcht0 63(%r10)
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
- prefetcht0 63(%r11)
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm5,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovdqu 0-128(%rax),%xmm5
- movl $3,%ecx
- jmp .Loop_16_xx_avx
-.align 32
-.Loop_16_xx_avx:
- vmovdqu 16-128(%rax),%xmm6
- vpaddd 144-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 224-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,0-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovdqu 32-128(%rax),%xmm5
- vpaddd 160-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 240-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm6,16-128(%rax)
- vpaddd %xmm14,%xmm6,%xmm6
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovdqu 48-128(%rax),%xmm6
- vpaddd 176-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 0-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,32-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovdqu 64-128(%rax),%xmm5
- vpaddd 192-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 16-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm6,48-128(%rax)
- vpaddd %xmm12,%xmm6,%xmm6
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm6,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovdqu 80-128(%rax),%xmm6
- vpaddd 208-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 32-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,64-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovdqu 96-128(%rax),%xmm5
- vpaddd 224-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 48-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm6,80-128(%rax)
- vpaddd %xmm10,%xmm6,%xmm6
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovdqu 112-128(%rax),%xmm6
- vpaddd 240-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 64-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,96-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovdqu 128-128(%rax),%xmm5
- vpaddd 0-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 80-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm6,112-128(%rax)
- vpaddd %xmm8,%xmm6,%xmm6
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovdqu 144-128(%rax),%xmm6
- vpaddd 16-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 96-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,128-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovdqu 160-128(%rax),%xmm5
- vpaddd 32-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 112-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm6,144-128(%rax)
- vpaddd %xmm14,%xmm6,%xmm6
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovdqu 176-128(%rax),%xmm6
- vpaddd 48-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 128-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,160-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovdqu 192-128(%rax),%xmm5
- vpaddd 64-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 144-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm6,176-128(%rax)
- vpaddd %xmm12,%xmm6,%xmm6
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm6,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovdqu 208-128(%rax),%xmm6
- vpaddd 80-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 160-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,192-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovdqu 224-128(%rax),%xmm5
- vpaddd 96-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 176-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm6,208-128(%rax)
- vpaddd %xmm10,%xmm6,%xmm6
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovdqu 240-128(%rax),%xmm6
- vpaddd 112-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 192-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,224-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovdqu 0-128(%rax),%xmm5
- vpaddd 128-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 208-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm6,240-128(%rax)
- vpaddd %xmm8,%xmm6,%xmm6
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- decl %ecx
- jnz .Loop_16_xx_avx
-
- movl $1,%ecx
- leaq K256+128(%rip),%rbp
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqa (%rbx),%xmm7
- vpxor %xmm0,%xmm0,%xmm0
- vmovdqa %xmm7,%xmm6
- vpcmpgtd %xmm0,%xmm6,%xmm6
- vpaddd %xmm6,%xmm7,%xmm7
-
- vmovdqu 0-128(%rdi),%xmm0
- vpand %xmm6,%xmm8,%xmm8
- vmovdqu 32-128(%rdi),%xmm1
- vpand %xmm6,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm2
- vpand %xmm6,%xmm10,%xmm10
- vmovdqu 96-128(%rdi),%xmm5
- vpand %xmm6,%xmm11,%xmm11
- vpaddd %xmm0,%xmm8,%xmm8
- vmovdqu 128-128(%rdi),%xmm0
- vpand %xmm6,%xmm12,%xmm12
- vpaddd %xmm1,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm1
- vpand %xmm6,%xmm13,%xmm13
- vpaddd %xmm2,%xmm10,%xmm10
- vmovdqu 192-128(%rdi),%xmm2
- vpand %xmm6,%xmm14,%xmm14
- vpaddd %xmm5,%xmm11,%xmm11
- vmovdqu 224-128(%rdi),%xmm5
- vpand %xmm6,%xmm15,%xmm15
- vpaddd %xmm0,%xmm12,%xmm12
- vpaddd %xmm1,%xmm13,%xmm13
- vmovdqu %xmm8,0-128(%rdi)
- vpaddd %xmm2,%xmm14,%xmm14
- vmovdqu %xmm9,32-128(%rdi)
- vpaddd %xmm5,%xmm15,%xmm15
- vmovdqu %xmm10,64-128(%rdi)
- vmovdqu %xmm11,96-128(%rdi)
- vmovdqu %xmm12,128-128(%rdi)
- vmovdqu %xmm13,160-128(%rdi)
- vmovdqu %xmm14,192-128(%rdi)
- vmovdqu %xmm15,224-128(%rdi)
-
- vmovdqu %xmm7,(%rbx)
- vmovdqu .Lpbswap(%rip),%xmm6
- decl %edx
- jnz .Loop_avx
-
- movl 280(%rsp),%edx
- leaq 16(%rdi),%rdi
- leaq 64(%rsi),%rsi
- decl %edx
- jnz .Loop_grande_avx
-
-.Ldone_avx:
- movq 272(%rsp),%rax
- vzeroupper
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.size sha256_multi_block_avx,.-sha256_multi_block_avx
-.type sha256_multi_block_avx2,@function
-.align 32
-sha256_multi_block_avx2:
-_avx2_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $576,%rsp
- andq $-256,%rsp
- movq %rax,544(%rsp)
-.Lbody_avx2:
- leaq K256+128(%rip),%rbp
- leaq 128(%rdi),%rdi
-
-.Loop_grande_avx2:
- movl %edx,552(%rsp)
- xorl %edx,%edx
- leaq 512(%rsp),%rbx
- movq 0(%rsi),%r12
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r12
- movq 16(%rsi),%r13
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r13
- movq 32(%rsi),%r14
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r14
- movq 48(%rsi),%r15
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r15
- movq 64(%rsi),%r8
- movl 72(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,16(%rbx)
- cmovleq %rbp,%r8
- movq 80(%rsi),%r9
- movl 88(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,20(%rbx)
- cmovleq %rbp,%r9
- movq 96(%rsi),%r10
- movl 104(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,24(%rbx)
- cmovleq %rbp,%r10
- movq 112(%rsi),%r11
- movl 120(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,28(%rbx)
- cmovleq %rbp,%r11
- vmovdqu 0-128(%rdi),%ymm8
- leaq 128(%rsp),%rax
- vmovdqu 32-128(%rdi),%ymm9
- leaq 256+128(%rsp),%rbx
- vmovdqu 64-128(%rdi),%ymm10
- vmovdqu 96-128(%rdi),%ymm11
- vmovdqu 128-128(%rdi),%ymm12
- vmovdqu 160-128(%rdi),%ymm13
- vmovdqu 192-128(%rdi),%ymm14
- vmovdqu 224-128(%rdi),%ymm15
- vmovdqu .Lpbswap(%rip),%ymm6
- jmp .Loop_avx2
-
-.align 32
-.Loop_avx2:
- vpxor %ymm9,%ymm10,%ymm4
- vmovd 0(%r12),%xmm5
- vmovd 0(%r8),%xmm0
- vmovd 0(%r13),%xmm1
- vmovd 0(%r9),%xmm2
- vpinsrd $1,0(%r14),%xmm5,%xmm5
- vpinsrd $1,0(%r10),%xmm0,%xmm0
- vpinsrd $1,0(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,0(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,0-128(%rax)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovd 4(%r12),%xmm5
- vmovd 4(%r8),%xmm0
- vmovd 4(%r13),%xmm1
- vmovd 4(%r9),%xmm2
- vpinsrd $1,4(%r14),%xmm5,%xmm5
- vpinsrd $1,4(%r10),%xmm0,%xmm0
- vpinsrd $1,4(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,4(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm5,32-128(%rax)
- vpaddd %ymm14,%ymm5,%ymm5
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm5,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovd 8(%r12),%xmm5
- vmovd 8(%r8),%xmm0
- vmovd 8(%r13),%xmm1
- vmovd 8(%r9),%xmm2
- vpinsrd $1,8(%r14),%xmm5,%xmm5
- vpinsrd $1,8(%r10),%xmm0,%xmm0
- vpinsrd $1,8(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,8(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,64-128(%rax)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovd 12(%r12),%xmm5
- vmovd 12(%r8),%xmm0
- vmovd 12(%r13),%xmm1
- vmovd 12(%r9),%xmm2
- vpinsrd $1,12(%r14),%xmm5,%xmm5
- vpinsrd $1,12(%r10),%xmm0,%xmm0
- vpinsrd $1,12(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,12(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm5,96-128(%rax)
- vpaddd %ymm12,%ymm5,%ymm5
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm5,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovd 16(%r12),%xmm5
- vmovd 16(%r8),%xmm0
- vmovd 16(%r13),%xmm1
- vmovd 16(%r9),%xmm2
- vpinsrd $1,16(%r14),%xmm5,%xmm5
- vpinsrd $1,16(%r10),%xmm0,%xmm0
- vpinsrd $1,16(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,16(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,128-128(%rax)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovd 20(%r12),%xmm5
- vmovd 20(%r8),%xmm0
- vmovd 20(%r13),%xmm1
- vmovd 20(%r9),%xmm2
- vpinsrd $1,20(%r14),%xmm5,%xmm5
- vpinsrd $1,20(%r10),%xmm0,%xmm0
- vpinsrd $1,20(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,20(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm5,160-128(%rax)
- vpaddd %ymm10,%ymm5,%ymm5
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm5,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovd 24(%r12),%xmm5
- vmovd 24(%r8),%xmm0
- vmovd 24(%r13),%xmm1
- vmovd 24(%r9),%xmm2
- vpinsrd $1,24(%r14),%xmm5,%xmm5
- vpinsrd $1,24(%r10),%xmm0,%xmm0
- vpinsrd $1,24(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,24(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,192-128(%rax)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovd 28(%r12),%xmm5
- vmovd 28(%r8),%xmm0
- vmovd 28(%r13),%xmm1
- vmovd 28(%r9),%xmm2
- vpinsrd $1,28(%r14),%xmm5,%xmm5
- vpinsrd $1,28(%r10),%xmm0,%xmm0
- vpinsrd $1,28(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,28(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm5,224-128(%rax)
- vpaddd %ymm8,%ymm5,%ymm5
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm5,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovd 32(%r12),%xmm5
- vmovd 32(%r8),%xmm0
- vmovd 32(%r13),%xmm1
- vmovd 32(%r9),%xmm2
- vpinsrd $1,32(%r14),%xmm5,%xmm5
- vpinsrd $1,32(%r10),%xmm0,%xmm0
- vpinsrd $1,32(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,32(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,256-256-128(%rbx)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovd 36(%r12),%xmm5
- vmovd 36(%r8),%xmm0
- vmovd 36(%r13),%xmm1
- vmovd 36(%r9),%xmm2
- vpinsrd $1,36(%r14),%xmm5,%xmm5
- vpinsrd $1,36(%r10),%xmm0,%xmm0
- vpinsrd $1,36(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,36(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm5,288-256-128(%rbx)
- vpaddd %ymm14,%ymm5,%ymm5
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm5,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovd 40(%r12),%xmm5
- vmovd 40(%r8),%xmm0
- vmovd 40(%r13),%xmm1
- vmovd 40(%r9),%xmm2
- vpinsrd $1,40(%r14),%xmm5,%xmm5
- vpinsrd $1,40(%r10),%xmm0,%xmm0
- vpinsrd $1,40(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,40(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,320-256-128(%rbx)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovd 44(%r12),%xmm5
- vmovd 44(%r8),%xmm0
- vmovd 44(%r13),%xmm1
- vmovd 44(%r9),%xmm2
- vpinsrd $1,44(%r14),%xmm5,%xmm5
- vpinsrd $1,44(%r10),%xmm0,%xmm0
- vpinsrd $1,44(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,44(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm5,352-256-128(%rbx)
- vpaddd %ymm12,%ymm5,%ymm5
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm5,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovd 48(%r12),%xmm5
- vmovd 48(%r8),%xmm0
- vmovd 48(%r13),%xmm1
- vmovd 48(%r9),%xmm2
- vpinsrd $1,48(%r14),%xmm5,%xmm5
- vpinsrd $1,48(%r10),%xmm0,%xmm0
- vpinsrd $1,48(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,48(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,384-256-128(%rbx)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovd 52(%r12),%xmm5
- vmovd 52(%r8),%xmm0
- vmovd 52(%r13),%xmm1
- vmovd 52(%r9),%xmm2
- vpinsrd $1,52(%r14),%xmm5,%xmm5
- vpinsrd $1,52(%r10),%xmm0,%xmm0
- vpinsrd $1,52(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,52(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm5,416-256-128(%rbx)
- vpaddd %ymm10,%ymm5,%ymm5
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm5,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovd 56(%r12),%xmm5
- vmovd 56(%r8),%xmm0
- vmovd 56(%r13),%xmm1
- vmovd 56(%r9),%xmm2
- vpinsrd $1,56(%r14),%xmm5,%xmm5
- vpinsrd $1,56(%r10),%xmm0,%xmm0
- vpinsrd $1,56(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,56(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,448-256-128(%rbx)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovd 60(%r12),%xmm5
- leaq 64(%r12),%r12
- vmovd 60(%r8),%xmm0
- leaq 64(%r8),%r8
- vmovd 60(%r13),%xmm1
- leaq 64(%r13),%r13
- vmovd 60(%r9),%xmm2
- leaq 64(%r9),%r9
- vpinsrd $1,60(%r14),%xmm5,%xmm5
- leaq 64(%r14),%r14
- vpinsrd $1,60(%r10),%xmm0,%xmm0
- leaq 64(%r10),%r10
- vpinsrd $1,60(%r15),%xmm1,%xmm1
- leaq 64(%r15),%r15
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,60(%r11),%xmm2,%xmm2
- leaq 64(%r11),%r11
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm5,480-256-128(%rbx)
- vpaddd %ymm8,%ymm5,%ymm5
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r12)
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
- prefetcht0 63(%r13)
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r14)
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
- prefetcht0 63(%r15)
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm9,%ymm1
- prefetcht0 63(%r8)
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
- prefetcht0 63(%r9)
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r10)
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm5,%ymm12,%ymm12
- prefetcht0 63(%r11)
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovdqu 0-128(%rax),%ymm5
- movl $3,%ecx
- jmp .Loop_16_xx_avx2
-.align 32
-.Loop_16_xx_avx2:
- vmovdqu 32-128(%rax),%ymm6
- vpaddd 288-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 448-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,0-128(%rax)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovdqu 64-128(%rax),%ymm5
- vpaddd 320-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 480-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm6,32-128(%rax)
- vpaddd %ymm14,%ymm6,%ymm6
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm6,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovdqu 96-128(%rax),%ymm6
- vpaddd 352-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 0-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,64-128(%rax)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovdqu 128-128(%rax),%ymm5
- vpaddd 384-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 32-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm6,96-128(%rax)
- vpaddd %ymm12,%ymm6,%ymm6
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm6,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovdqu 160-128(%rax),%ymm6
- vpaddd 416-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 64-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,128-128(%rax)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovdqu 192-128(%rax),%ymm5
- vpaddd 448-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 96-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm6,160-128(%rax)
- vpaddd %ymm10,%ymm6,%ymm6
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm6,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovdqu 224-128(%rax),%ymm6
- vpaddd 480-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 128-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,192-128(%rax)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovdqu 256-256-128(%rbx),%ymm5
- vpaddd 0-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 160-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm6,224-128(%rax)
- vpaddd %ymm8,%ymm6,%ymm6
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm6,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovdqu 288-256-128(%rbx),%ymm6
- vpaddd 32-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 192-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,256-256-128(%rbx)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovdqu 320-256-128(%rbx),%ymm5
- vpaddd 64-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 224-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm6,288-256-128(%rbx)
- vpaddd %ymm14,%ymm6,%ymm6
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm6,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovdqu 352-256-128(%rbx),%ymm6
- vpaddd 96-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 256-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,320-256-128(%rbx)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovdqu 384-256-128(%rbx),%ymm5
- vpaddd 128-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 288-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm6,352-256-128(%rbx)
- vpaddd %ymm12,%ymm6,%ymm6
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm6,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovdqu 416-256-128(%rbx),%ymm6
- vpaddd 160-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 320-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,384-256-128(%rbx)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovdqu 448-256-128(%rbx),%ymm5
- vpaddd 192-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 352-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm6,416-256-128(%rbx)
- vpaddd %ymm10,%ymm6,%ymm6
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm6,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovdqu 480-256-128(%rbx),%ymm6
- vpaddd 224-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 384-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,448-256-128(%rbx)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovdqu 0-128(%rax),%ymm5
- vpaddd 256-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 416-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm6,480-256-128(%rbx)
- vpaddd %ymm8,%ymm6,%ymm6
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm6,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- decl %ecx
- jnz .Loop_16_xx_avx2
-
- movl $1,%ecx
- leaq 512(%rsp),%rbx
- leaq K256+128(%rip),%rbp
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r12
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r13
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r14
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r15
- cmpl 16(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 20(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 24(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 28(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqa (%rbx),%ymm7
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqa %ymm7,%ymm6
- vpcmpgtd %ymm0,%ymm6,%ymm6
- vpaddd %ymm6,%ymm7,%ymm7
-
- vmovdqu 0-128(%rdi),%ymm0
- vpand %ymm6,%ymm8,%ymm8
- vmovdqu 32-128(%rdi),%ymm1
- vpand %ymm6,%ymm9,%ymm9
- vmovdqu 64-128(%rdi),%ymm2
- vpand %ymm6,%ymm10,%ymm10
- vmovdqu 96-128(%rdi),%ymm5
- vpand %ymm6,%ymm11,%ymm11
- vpaddd %ymm0,%ymm8,%ymm8
- vmovdqu 128-128(%rdi),%ymm0
- vpand %ymm6,%ymm12,%ymm12
- vpaddd %ymm1,%ymm9,%ymm9
- vmovdqu 160-128(%rdi),%ymm1
- vpand %ymm6,%ymm13,%ymm13
- vpaddd %ymm2,%ymm10,%ymm10
- vmovdqu 192-128(%rdi),%ymm2
- vpand %ymm6,%ymm14,%ymm14
- vpaddd %ymm5,%ymm11,%ymm11
- vmovdqu 224-128(%rdi),%ymm5
- vpand %ymm6,%ymm15,%ymm15
- vpaddd %ymm0,%ymm12,%ymm12
- vpaddd %ymm1,%ymm13,%ymm13
- vmovdqu %ymm8,0-128(%rdi)
- vpaddd %ymm2,%ymm14,%ymm14
- vmovdqu %ymm9,32-128(%rdi)
- vpaddd %ymm5,%ymm15,%ymm15
- vmovdqu %ymm10,64-128(%rdi)
- vmovdqu %ymm11,96-128(%rdi)
- vmovdqu %ymm12,128-128(%rdi)
- vmovdqu %ymm13,160-128(%rdi)
- vmovdqu %ymm14,192-128(%rdi)
- vmovdqu %ymm15,224-128(%rdi)
-
- vmovdqu %ymm7,(%rbx)
- leaq 256+128(%rsp),%rbx
- vmovdqu .Lpbswap(%rip),%ymm6
- decl %edx
- jnz .Loop_avx2
-
-
-
-
-
-
-
-.Ldone_avx2:
- movq 544(%rsp),%rax
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
.align 256
K256:
.long 1116352408,1116352408,1116352408,1116352408
diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s
index a2fbedaf8c..ab16a7b618 100644
--- a/deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s
@@ -11,14 +11,6 @@ sha256_block_data_order:
movl 8(%r11),%r11d
testl $536870912,%r11d
jnz _shaext_shortcut
- andl $296,%r11d
- cmpl $296,%r11d
- je .Lavx2_shortcut
- andl $1073741824,%r9d
- andl $268435968,%r10d
- orl %r9d,%r10d
- cmpl $1342177792,%r10d
- je .Lavx_shortcut
testl $512,%r10d
jnz .Lssse3_shortcut
pushq %rbx
@@ -1762,9 +1754,9 @@ _shaext_shortcut:
movdqu 16(%rdi),%xmm2
movdqa 512-128(%rcx),%xmm7
- pshufd $27,%xmm1,%xmm0
- pshufd $177,%xmm1,%xmm1
- pshufd $27,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
movdqa %xmm7,%xmm8
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
@@ -1783,7 +1775,7 @@ _shaext_shortcut:
.byte 102,15,56,0,231
movdqa %xmm2,%xmm10
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
nop
movdqa %xmm1,%xmm9
.byte 15,56,203,202
@@ -1792,7 +1784,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 102,15,56,0,239
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
leaq 64(%rsi),%rsi
.byte 15,56,204,220
.byte 15,56,203,202
@@ -1801,7 +1793,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 102,15,56,0,247
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1813,7 +1805,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1824,7 +1816,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1835,7 +1827,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1846,7 +1838,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1857,7 +1849,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1868,7 +1860,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1879,7 +1871,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1890,7 +1882,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1901,7 +1893,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1912,7 +1904,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1923,7 +1915,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
.byte 15,56,203,202
@@ -1932,7 +1924,7 @@ _shaext_shortcut:
movdqa 448-128(%rcx),%xmm0
paddd %xmm5,%xmm0
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
.byte 15,56,205,245
movdqa %xmm8,%xmm7
.byte 15,56,203,202
@@ -1941,7 +1933,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
nop
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
decq %rdx
nop
.byte 15,56,203,202
@@ -1950,9 +1942,9 @@ _shaext_shortcut:
paddd %xmm9,%xmm1
jnz .Loop_shaext
- pshufd $177,%xmm2,%xmm2
- pshufd $27,%xmm1,%xmm7
- pshufd $177,%xmm1,%xmm1
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
punpckhqdq %xmm2,%xmm1
.byte 102,15,58,15,215,8
@@ -3055,2304 +3047,3 @@ sha256_block_data_order_ssse3:
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
-.type sha256_block_data_order_avx,@function
-.align 64
-sha256_block_data_order_avx:
-.Lavx_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- shlq $4,%rdx
- subq $96,%rsp
- leaq (%rsi,%rdx,4),%rdx
- andq $-64,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
-.Lprologue_avx:
-
- vzeroupper
- movl 0(%rdi),%eax
- movl 4(%rdi),%ebx
- movl 8(%rdi),%ecx
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
- vmovdqa K256+512+32(%rip),%xmm8
- vmovdqa K256+512+64(%rip),%xmm9
- jmp .Lloop_avx
-.align 16
-.Lloop_avx:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi),%xmm0
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%edi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%edi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp .Lavx_00_47
-
-.align 16
-.Lavx_00_47:
- subq $-128,%rbp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm2,%xmm3,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- vpshufd $250,%xmm3,%xmm7
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm0,%xmm0
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpaddd %xmm6,%xmm0,%xmm0
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- vpshufd $80,%xmm0,%xmm7
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- vpaddd %xmm6,%xmm0,%xmm0
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpaddd 0(%rbp),%xmm0,%xmm6
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- vpshufd $250,%xmm0,%xmm7
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm1,%xmm1
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpaddd %xmm6,%xmm1,%xmm1
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- vpshufd $80,%xmm1,%xmm7
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- vpaddd %xmm6,%xmm1,%xmm1
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpaddd 32(%rbp),%xmm1,%xmm6
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm0,%xmm1,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- vpshufd $250,%xmm1,%xmm7
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm2,%xmm2
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpaddd %xmm6,%xmm2,%xmm2
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- vpshufd $80,%xmm2,%xmm7
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- vpaddd %xmm6,%xmm2,%xmm2
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpaddd 64(%rbp),%xmm2,%xmm6
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- vpshufd $250,%xmm2,%xmm7
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm3,%xmm3
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpaddd %xmm6,%xmm3,%xmm3
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- vpshufd $80,%xmm3,%xmm7
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- vpaddd %xmm6,%xmm3,%xmm3
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpaddd 96(%rbp),%xmm3,%xmm6
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- cmpb $0,131(%rbp)
- jne .Lavx_00_47
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%rdi
- movl %r14d,%eax
-
- addl 0(%rdi),%eax
- leaq 64(%rsi),%rsi
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
- jb .Lloop_avx
-
- movq 64+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
-.type sha256_block_data_order_avx2,@function
-.align 64
-sha256_block_data_order_avx2:
-.Lavx2_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $544,%rsp
- shlq $4,%rdx
- andq $-1024,%rsp
- leaq (%rsi,%rdx,4),%rdx
- addq $448,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
-.Lprologue_avx2:
-
- vzeroupper
- subq $-64,%rsi
- movl 0(%rdi),%eax
- movq %rsi,%r12
- movl 4(%rdi),%ebx
- cmpq %rdx,%rsi
- movl 8(%rdi),%ecx
- cmoveq %rsp,%r12
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
- vmovdqa K256+512+32(%rip),%ymm8
- vmovdqa K256+512+64(%rip),%ymm9
- jmp .Loop_avx2
-.align 16
-.Loop_avx2:
- vmovdqa K256+512(%rip),%ymm7
- vmovdqu -64+0(%rsi),%xmm0
- vmovdqu -64+16(%rsi),%xmm1
- vmovdqu -64+32(%rsi),%xmm2
- vmovdqu -64+48(%rsi),%xmm3
-
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm7,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm7,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
-
- leaq K256(%rip),%rbp
- vpshufb %ymm7,%ymm2,%ymm2
- vpaddd 0(%rbp),%ymm0,%ymm4
- vpshufb %ymm7,%ymm3,%ymm3
- vpaddd 32(%rbp),%ymm1,%ymm5
- vpaddd 64(%rbp),%ymm2,%ymm6
- vpaddd 96(%rbp),%ymm3,%ymm7
- vmovdqa %ymm4,0(%rsp)
- xorl %r14d,%r14d
- vmovdqa %ymm5,32(%rsp)
- leaq -64(%rsp),%rsp
- movl %ebx,%edi
- vmovdqa %ymm6,0(%rsp)
- xorl %ecx,%edi
- vmovdqa %ymm7,32(%rsp)
- movl %r9d,%r12d
- subq $-32*4,%rbp
- jmp .Lavx2_00_47
-
-.align 16
-.Lavx2_00_47:
- leaq -64(%rsp),%rsp
- vpalignr $4,%ymm0,%ymm1,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm2,%ymm3,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm0,%ymm0
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- vpshufd $250,%ymm3,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm0,%ymm0
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpshufd $80,%ymm0,%ymm7
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- vpaddd 0(%rbp),%ymm0,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm1,%ymm2,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm3,%ymm0,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm1,%ymm1
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- vpshufd $250,%ymm0,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm1,%ymm1
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpshufd $80,%ymm1,%ymm7
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- vpaddd 32(%rbp),%ymm1,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq -64(%rsp),%rsp
- vpalignr $4,%ymm2,%ymm3,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm0,%ymm1,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm2,%ymm2
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- vpshufd $250,%ymm1,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm2,%ymm2
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpshufd $80,%ymm2,%ymm7
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- vpaddd 64(%rbp),%ymm2,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm3,%ymm0,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm1,%ymm2,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm3,%ymm3
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- vpshufd $250,%ymm2,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm3,%ymm3
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpshufd $80,%ymm3,%ymm7
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- vpaddd 96(%rbp),%ymm3,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq 128(%rbp),%rbp
- cmpb $0,3(%rbp)
- jne .Lavx2_00_47
- addl 0+64(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4+64(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+64(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12+64(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+64(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36+64(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+64(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44+64(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- addl 0(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- movq 512(%rsp),%rdi
- addl %r14d,%eax
-
- leaq 448(%rsp),%rbp
-
- addl 0(%rdi),%eax
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
-
- cmpq 80(%rbp),%rsi
- je .Ldone_avx2
-
- xorl %r14d,%r14d
- movl %ebx,%edi
- xorl %ecx,%edi
- movl %r9d,%r12d
- jmp .Lower_avx2
-.align 16
-.Lower_avx2:
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- leaq -64(%rbp),%rbp
- cmpq %rsp,%rbp
- jae .Lower_avx2
-
- movq 512(%rsp),%rdi
- addl %r14d,%eax
-
- leaq 448(%rsp),%rsp
-
- addl 0(%rdi),%eax
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- leaq 128(%rsi),%rsi
- addl 24(%rdi),%r10d
- movq %rsi,%r12
- addl 28(%rdi),%r11d
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- cmoveq %rsp,%r12
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
-
- jbe .Loop_avx2
- leaq (%rsp),%rbp
-
-.Ldone_avx2:
- leaq (%rbp),%rsp
- movq 64+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2
diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s
index a1021c17a9..f6638db30e 100644
--- a/deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s
@@ -5,20 +5,6 @@
.type sha512_block_data_order,@function
.align 16
sha512_block_data_order:
- leaq OPENSSL_ia32cap_P(%rip),%r11
- movl 0(%r11),%r9d
- movl 4(%r11),%r10d
- movl 8(%r11),%r11d
- testl $2048,%r10d
- jnz .Lxop_shortcut
- andl $296,%r11d
- cmpl $296,%r11d
- je .Lavx2_shortcut
- andl $1073741824,%r9d
- andl $268435968,%r10d
- orl %r9d,%r10d
- cmpl $1342177792,%r10d
- je .Lavx_shortcut
pushq %rbx
pushq %rbp
pushq %r12
@@ -1795,3571 +1781,3 @@ K512:
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.type sha512_block_data_order_xop,@function
-.align 64
-sha512_block_data_order_xop:
-.Lxop_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
-.Lprologue_xop:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp .Lloop_xop
-.align 16
-.Lloop_xop:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm0,%xmm0
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,223,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm7,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm0,%xmm0
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm1,%xmm1
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,216,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm0,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm1,%xmm1
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm2,%xmm2
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,217,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm1,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm2,%xmm2
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm3,%xmm3
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,218,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm2,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm3,%xmm3
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm4,%xmm4
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,219,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm3,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm4,%xmm4
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm5,%xmm5
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,220,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm4,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm5,%xmm5
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm6,%xmm6
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,221,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm5,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm6,%xmm6
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm7,%xmm7
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,222,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm6,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm7,%xmm7
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne .Lxop_00_47
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb .Lloop_xop
-
- movq 128+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_xop:
- .byte 0xf3,0xc3
-.size sha512_block_data_order_xop,.-sha512_block_data_order_xop
-.type sha512_block_data_order_avx,@function
-.align 64
-sha512_block_data_order_avx:
-.Lavx_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
-.Lprologue_avx:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp .Lloop_avx
-.align 16
-.Lloop_avx:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp .Lavx_00_47
-
-.align 16
-.Lavx_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r8,%r13
- xorq %r10,%r12
- vpaddq %xmm11,%xmm0,%xmm0
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r8,%r12
- xorq %r8,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 0(%rsp),%r11
- movq %rax,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rbx,%r15
- addq %r12,%r11
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm7,%xmm11
- addq %r11,%rdx
- addq %rdi,%r11
- vpxor %xmm9,%xmm8,%xmm8
- movq %rdx,%r13
- addq %r11,%r14
- vpsllq $3,%xmm7,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %r8,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm7,%xmm9
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rdx,%r12
- xorq %rdx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 8(%rsp),%r10
- movq %r11,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rax,%rdi
- addq %r12,%r10
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm0,%xmm0
- xorq %r11,%r14
- addq %r13,%r10
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rcx,%r13
- xorq %r8,%r12
- vpaddq %xmm11,%xmm1,%xmm1
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rcx,%r12
- xorq %rcx,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 16(%rsp),%r9
- movq %r10,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r11,%r15
- addq %r12,%r9
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm0,%xmm11
- addq %r9,%rbx
- addq %rdi,%r9
- vpxor %xmm9,%xmm8,%xmm8
- movq %rbx,%r13
- addq %r9,%r14
- vpsllq $3,%xmm0,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm0,%xmm9
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rbx,%r12
- xorq %rbx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 24(%rsp),%r8
- movq %r9,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r10,%rdi
- addq %r12,%r8
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm1,%xmm1
- xorq %r9,%r14
- addq %r13,%r8
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rax,%r13
- xorq %rcx,%r12
- vpaddq %xmm11,%xmm2,%xmm2
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rax,%r12
- xorq %rax,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 32(%rsp),%rdx
- movq %r8,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r9,%r15
- addq %r12,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm1,%xmm11
- addq %rdx,%r11
- addq %rdi,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r11,%r13
- addq %rdx,%r14
- vpsllq $3,%xmm1,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %rax,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm1,%xmm9
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r11,%r12
- xorq %r11,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r8,%rdi
- addq %r12,%rcx
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm2,%xmm2
- xorq %rdx,%r14
- addq %r13,%rcx
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r10,%r13
- xorq %rax,%r12
- vpaddq %xmm11,%xmm3,%xmm3
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r10,%r12
- xorq %r10,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rdx,%r15
- addq %r12,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm2,%xmm11
- addq %rbx,%r9
- addq %rdi,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r9,%r13
- addq %rbx,%r14
- vpsllq $3,%xmm2,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r10,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm2,%xmm9
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r9,%r12
- xorq %r9,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rcx,%rdi
- addq %r12,%rax
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm3,%xmm3
- xorq %rbx,%r14
- addq %r13,%rax
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r8,%r13
- xorq %r10,%r12
- vpaddq %xmm11,%xmm4,%xmm4
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r8,%r12
- xorq %r8,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 64(%rsp),%r11
- movq %rax,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rbx,%r15
- addq %r12,%r11
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm3,%xmm11
- addq %r11,%rdx
- addq %rdi,%r11
- vpxor %xmm9,%xmm8,%xmm8
- movq %rdx,%r13
- addq %r11,%r14
- vpsllq $3,%xmm3,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %r8,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm3,%xmm9
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rdx,%r12
- xorq %rdx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 72(%rsp),%r10
- movq %r11,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rax,%rdi
- addq %r12,%r10
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm4,%xmm4
- xorq %r11,%r14
- addq %r13,%r10
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rcx,%r13
- xorq %r8,%r12
- vpaddq %xmm11,%xmm5,%xmm5
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rcx,%r12
- xorq %rcx,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 80(%rsp),%r9
- movq %r10,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r11,%r15
- addq %r12,%r9
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm4,%xmm11
- addq %r9,%rbx
- addq %rdi,%r9
- vpxor %xmm9,%xmm8,%xmm8
- movq %rbx,%r13
- addq %r9,%r14
- vpsllq $3,%xmm4,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm4,%xmm9
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rbx,%r12
- xorq %rbx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 88(%rsp),%r8
- movq %r9,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r10,%rdi
- addq %r12,%r8
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm5,%xmm5
- xorq %r9,%r14
- addq %r13,%r8
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rax,%r13
- xorq %rcx,%r12
- vpaddq %xmm11,%xmm6,%xmm6
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rax,%r12
- xorq %rax,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 96(%rsp),%rdx
- movq %r8,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r9,%r15
- addq %r12,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm5,%xmm11
- addq %rdx,%r11
- addq %rdi,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r11,%r13
- addq %rdx,%r14
- vpsllq $3,%xmm5,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %rax,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm5,%xmm9
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r11,%r12
- xorq %r11,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r8,%rdi
- addq %r12,%rcx
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm6,%xmm6
- xorq %rdx,%r14
- addq %r13,%rcx
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r10,%r13
- xorq %rax,%r12
- vpaddq %xmm11,%xmm7,%xmm7
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r10,%r12
- xorq %r10,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rdx,%r15
- addq %r12,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm6,%xmm11
- addq %rbx,%r9
- addq %rdi,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r9,%r13
- addq %rbx,%r14
- vpsllq $3,%xmm6,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r10,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm6,%xmm9
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r9,%r12
- xorq %r9,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rcx,%rdi
- addq %r12,%rax
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm7,%xmm7
- xorq %rbx,%r14
- addq %r13,%rax
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne .Lavx_00_47
- shrdq $23,%r13,%r13
- movq %r14,%rax
- movq %r9,%r12
- shrdq $5,%r14,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r11
- movq %r8,%r12
- shrdq $5,%r14,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r10
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- xorq %r11,%r15
- addq %r12,%r9
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r9
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r8
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- movq %rax,%r12
- shrdq $5,%r14,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- movq %r11,%r12
- shrdq $5,%r14,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- movq %r10,%r12
- shrdq $5,%r14,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rax
- movq %r9,%r12
- shrdq $5,%r14,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r11
- movq %r8,%r12
- shrdq $5,%r14,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r10
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- xorq %r11,%r15
- addq %r12,%r9
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r9
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r8
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- movq %rax,%r12
- shrdq $5,%r14,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- movq %r11,%r12
- shrdq $5,%r14,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- movq %r10,%r12
- shrdq $5,%r14,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb .Lloop_avx
-
- movq 128+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
-.type sha512_block_data_order_avx2,@function
-.align 64
-sha512_block_data_order_avx2:
-.Lavx2_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $1312,%rsp
- shlq $4,%rdx
- andq $-2048,%rsp
- leaq (%rsi,%rdx,8),%rdx
- addq $1152,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
-.Lprologue_avx2:
-
- vzeroupper
- subq $-128,%rsi
- movq 0(%rdi),%rax
- movq %rsi,%r12
- movq 8(%rdi),%rbx
- cmpq %rdx,%rsi
- movq 16(%rdi),%rcx
- cmoveq %rsp,%r12
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp .Loop_avx2
-.align 16
-.Loop_avx2:
- vmovdqu -128(%rsi),%xmm0
- vmovdqu -128+16(%rsi),%xmm1
- vmovdqu -128+32(%rsi),%xmm2
- leaq K512+128(%rip),%rbp
- vmovdqu -128+48(%rsi),%xmm3
- vmovdqu -128+64(%rsi),%xmm4
- vmovdqu -128+80(%rsi),%xmm5
- vmovdqu -128+96(%rsi),%xmm6
- vmovdqu -128+112(%rsi),%xmm7
-
- vmovdqa 1152(%rbp),%ymm10
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm10,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm10,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
- vpshufb %ymm10,%ymm2,%ymm2
- vinserti128 $1,64(%r12),%ymm4,%ymm4
- vpshufb %ymm10,%ymm3,%ymm3
- vinserti128 $1,80(%r12),%ymm5,%ymm5
- vpshufb %ymm10,%ymm4,%ymm4
- vinserti128 $1,96(%r12),%ymm6,%ymm6
- vpshufb %ymm10,%ymm5,%ymm5
- vinserti128 $1,112(%r12),%ymm7,%ymm7
-
- vpaddq -128(%rbp),%ymm0,%ymm8
- vpshufb %ymm10,%ymm6,%ymm6
- vpaddq -96(%rbp),%ymm1,%ymm9
- vpshufb %ymm10,%ymm7,%ymm7
- vpaddq -64(%rbp),%ymm2,%ymm10
- vpaddq -32(%rbp),%ymm3,%ymm11
- vmovdqa %ymm8,0(%rsp)
- vpaddq 0(%rbp),%ymm4,%ymm8
- vmovdqa %ymm9,32(%rsp)
- vpaddq 32(%rbp),%ymm5,%ymm9
- vmovdqa %ymm10,64(%rsp)
- vpaddq 64(%rbp),%ymm6,%ymm10
- vmovdqa %ymm11,96(%rsp)
- leaq -128(%rsp),%rsp
- vpaddq 96(%rbp),%ymm7,%ymm11
- vmovdqa %ymm8,0(%rsp)
- xorq %r14,%r14
- vmovdqa %ymm9,32(%rsp)
- movq %rbx,%rdi
- vmovdqa %ymm10,64(%rsp)
- xorq %rcx,%rdi
- vmovdqa %ymm11,96(%rsp)
- movq %r9,%r12
- addq $32*8,%rbp
- jmp .Lavx2_00_47
-
-.align 16
-.Lavx2_00_47:
- leaq -128(%rsp),%rsp
- vpalignr $8,%ymm0,%ymm1,%ymm8
- addq 0+256(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- vpalignr $8,%ymm4,%ymm5,%ymm11
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- vpsrlq $1,%ymm8,%ymm10
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- vpaddq %ymm11,%ymm0,%ymm0
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- vpsrlq $6,%ymm7,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- vpsllq $3,%ymm7,%ymm10
- vpaddq %ymm8,%ymm0,%ymm0
- addq 8+256(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- vpsrlq $19,%ymm7,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- vpaddq %ymm11,%ymm0,%ymm0
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- vpaddq -128(%rbp),%ymm0,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- vmovdqa %ymm10,0(%rsp)
- vpalignr $8,%ymm1,%ymm2,%ymm8
- addq 32+256(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- vpalignr $8,%ymm5,%ymm6,%ymm11
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- vpsrlq $1,%ymm8,%ymm10
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- vpaddq %ymm11,%ymm1,%ymm1
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- vpsrlq $6,%ymm0,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- vpsllq $3,%ymm0,%ymm10
- vpaddq %ymm8,%ymm1,%ymm1
- addq 40+256(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- vpsrlq $19,%ymm0,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- vpaddq %ymm11,%ymm1,%ymm1
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- vpaddq -96(%rbp),%ymm1,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- vmovdqa %ymm10,32(%rsp)
- vpalignr $8,%ymm2,%ymm3,%ymm8
- addq 64+256(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- vpalignr $8,%ymm6,%ymm7,%ymm11
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- vpaddq %ymm11,%ymm2,%ymm2
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- vpsrlq $6,%ymm1,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- vpsllq $3,%ymm1,%ymm10
- vpaddq %ymm8,%ymm2,%ymm2
- addq 72+256(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- vpsrlq $19,%ymm1,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- vpaddq %ymm11,%ymm2,%ymm2
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- vpaddq -64(%rbp),%ymm2,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- vmovdqa %ymm10,64(%rsp)
- vpalignr $8,%ymm3,%ymm4,%ymm8
- addq 96+256(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- vpalignr $8,%ymm7,%ymm0,%ymm11
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- vpaddq %ymm11,%ymm3,%ymm3
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- vpsrlq $6,%ymm2,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- vpsllq $3,%ymm2,%ymm10
- vpaddq %ymm8,%ymm3,%ymm3
- addq 104+256(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- vpsrlq $19,%ymm2,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- vpaddq %ymm11,%ymm3,%ymm3
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- vpaddq -32(%rbp),%ymm3,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- vmovdqa %ymm10,96(%rsp)
- leaq -128(%rsp),%rsp
- vpalignr $8,%ymm4,%ymm5,%ymm8
- addq 0+256(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- vpalignr $8,%ymm0,%ymm1,%ymm11
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- vpsrlq $1,%ymm8,%ymm10
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- vpaddq %ymm11,%ymm4,%ymm4
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- vpsrlq $6,%ymm3,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- vpsllq $3,%ymm3,%ymm10
- vpaddq %ymm8,%ymm4,%ymm4
- addq 8+256(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- vpsrlq $19,%ymm3,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- vpaddq %ymm11,%ymm4,%ymm4
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- vpaddq 0(%rbp),%ymm4,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- vmovdqa %ymm10,0(%rsp)
- vpalignr $8,%ymm5,%ymm6,%ymm8
- addq 32+256(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- vpalignr $8,%ymm1,%ymm2,%ymm11
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- vpsrlq $1,%ymm8,%ymm10
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- vpaddq %ymm11,%ymm5,%ymm5
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- vpsrlq $6,%ymm4,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- vpsllq $3,%ymm4,%ymm10
- vpaddq %ymm8,%ymm5,%ymm5
- addq 40+256(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- vpsrlq $19,%ymm4,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- vpaddq %ymm11,%ymm5,%ymm5
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- vpaddq 32(%rbp),%ymm5,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- vmovdqa %ymm10,32(%rsp)
- vpalignr $8,%ymm6,%ymm7,%ymm8
- addq 64+256(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- vpalignr $8,%ymm2,%ymm3,%ymm11
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- vpaddq %ymm11,%ymm6,%ymm6
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- vpsrlq $6,%ymm5,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- vpsllq $3,%ymm5,%ymm10
- vpaddq %ymm8,%ymm6,%ymm6
- addq 72+256(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- vpsrlq $19,%ymm5,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- vpaddq %ymm11,%ymm6,%ymm6
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- vpaddq 64(%rbp),%ymm6,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- vmovdqa %ymm10,64(%rsp)
- vpalignr $8,%ymm7,%ymm0,%ymm8
- addq 96+256(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- vpalignr $8,%ymm3,%ymm4,%ymm11
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- vpaddq %ymm11,%ymm7,%ymm7
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- vpsrlq $6,%ymm6,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- vpsllq $3,%ymm6,%ymm10
- vpaddq %ymm8,%ymm7,%ymm7
- addq 104+256(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- vpsrlq $19,%ymm6,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- vpaddq %ymm11,%ymm7,%ymm7
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- vpaddq 96(%rbp),%ymm7,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- vmovdqa %ymm10,96(%rsp)
- leaq 256(%rbp),%rbp
- cmpb $0,-121(%rbp)
- jne .Lavx2_00_47
- addq 0+128(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8+128(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32+128(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40+128(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64+128(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72+128(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96+128(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104+128(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- addq 0(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- movq 1280(%rsp),%rdi
- addq %r14,%rax
-
- leaq 1152(%rsp),%rbp
-
- addq 0(%rdi),%rax
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
-
- cmpq 144(%rbp),%rsi
- je .Ldone_avx2
-
- xorq %r14,%r14
- movq %rbx,%rdi
- xorq %rcx,%rdi
- movq %r9,%r12
- jmp .Lower_avx2
-.align 16
-.Lower_avx2:
- addq 0+16(%rbp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8+16(%rbp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32+16(%rbp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40+16(%rbp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64+16(%rbp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72+16(%rbp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96+16(%rbp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104+16(%rbp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- leaq -128(%rbp),%rbp
- cmpq %rsp,%rbp
- jae .Lower_avx2
-
- movq 1280(%rsp),%rdi
- addq %r14,%rax
-
- leaq 1152(%rsp),%rsp
-
- addq 0(%rdi),%rax
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- leaq 256(%rsi),%rsi
- addq 48(%rdi),%r10
- movq %rsi,%r12
- addq 56(%rdi),%r11
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- cmoveq %rsp,%r12
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
-
- jbe .Loop_avx2
- leaq (%rsp),%rbp
-
-.Ldone_avx2:
- leaq (%rbp),%rsp
- movq 128+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2
diff --git a/deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s
index ccd3c70900..e45c622a52 100644
--- a/deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s
@@ -6,14 +6,6 @@
.p2align 5
_aesni_multi_cbc_encrypt:
- cmpl $2,%edx
- jb L$enc_non_avx
- movl _OPENSSL_ia32cap_P+4(%rip),%ecx
- testl $268435456,%ecx
- jnz _avx_cbc_enc_shortcut
- jmp L$enc_non_avx
-.p2align 4
-L$enc_non_avx:
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -270,14 +262,6 @@ L$enc4x_epilogue:
.p2align 5
_aesni_multi_cbc_decrypt:
- cmpl $2,%edx
- jb L$dec_non_avx
- movl _OPENSSL_ia32cap_P+4(%rip),%ecx
- testl $268435456,%ecx
- jnz _avx_cbc_dec_shortcut
- jmp L$dec_non_avx
-.p2align 4
-L$dec_non_avx:
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -519,916 +503,3 @@ L$dec4x_done:
leaq (%rax),%rsp
L$dec4x_epilogue:
.byte 0xf3,0xc3
-
-
-.p2align 5
-aesni_multi_cbc_encrypt_avx:
-_avx_cbc_enc_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
-
-
-
-
-
-
-
- subq $192,%rsp
- andq $-128,%rsp
- movq %rax,16(%rsp)
-
-L$enc8x_body:
- vzeroupper
- vmovdqu (%rsi),%xmm15
- leaq 120(%rsi),%rsi
- leaq 160(%rdi),%rdi
- shrl $1,%edx
-
-L$enc8x_loop_grande:
-
- xorl %edx,%edx
- movl -144(%rdi),%ecx
- movq -160(%rdi),%r8
- cmpl %edx,%ecx
- movq -152(%rdi),%rbx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -136(%rdi),%xmm2
- movl %ecx,32(%rsp)
- cmovleq %rsp,%r8
- subq %r8,%rbx
- movq %rbx,64(%rsp)
- movl -104(%rdi),%ecx
- movq -120(%rdi),%r9
- cmpl %edx,%ecx
- movq -112(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -96(%rdi),%xmm3
- movl %ecx,36(%rsp)
- cmovleq %rsp,%r9
- subq %r9,%rbp
- movq %rbp,72(%rsp)
- movl -64(%rdi),%ecx
- movq -80(%rdi),%r10
- cmpl %edx,%ecx
- movq -72(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -56(%rdi),%xmm4
- movl %ecx,40(%rsp)
- cmovleq %rsp,%r10
- subq %r10,%rbp
- movq %rbp,80(%rsp)
- movl -24(%rdi),%ecx
- movq -40(%rdi),%r11
- cmpl %edx,%ecx
- movq -32(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -16(%rdi),%xmm5
- movl %ecx,44(%rsp)
- cmovleq %rsp,%r11
- subq %r11,%rbp
- movq %rbp,88(%rsp)
- movl 16(%rdi),%ecx
- movq 0(%rdi),%r12
- cmpl %edx,%ecx
- movq 8(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 24(%rdi),%xmm6
- movl %ecx,48(%rsp)
- cmovleq %rsp,%r12
- subq %r12,%rbp
- movq %rbp,96(%rsp)
- movl 56(%rdi),%ecx
- movq 40(%rdi),%r13
- cmpl %edx,%ecx
- movq 48(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 64(%rdi),%xmm7
- movl %ecx,52(%rsp)
- cmovleq %rsp,%r13
- subq %r13,%rbp
- movq %rbp,104(%rsp)
- movl 96(%rdi),%ecx
- movq 80(%rdi),%r14
- cmpl %edx,%ecx
- movq 88(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 104(%rdi),%xmm8
- movl %ecx,56(%rsp)
- cmovleq %rsp,%r14
- subq %r14,%rbp
- movq %rbp,112(%rsp)
- movl 136(%rdi),%ecx
- movq 120(%rdi),%r15
- cmpl %edx,%ecx
- movq 128(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 144(%rdi),%xmm9
- movl %ecx,60(%rsp)
- cmovleq %rsp,%r15
- subq %r15,%rbp
- movq %rbp,120(%rsp)
- testl %edx,%edx
- jz L$enc8x_done
-
- vmovups 16-120(%rsi),%xmm1
- vmovups 32-120(%rsi),%xmm0
- movl 240-120(%rsi),%eax
-
- vpxor (%r8),%xmm15,%xmm10
- leaq 128(%rsp),%rbp
- vpxor (%r9),%xmm15,%xmm11
- vpxor (%r10),%xmm15,%xmm12
- vpxor (%r11),%xmm15,%xmm13
- vpxor %xmm10,%xmm2,%xmm2
- vpxor (%r12),%xmm15,%xmm10
- vpxor %xmm11,%xmm3,%xmm3
- vpxor (%r13),%xmm15,%xmm11
- vpxor %xmm12,%xmm4,%xmm4
- vpxor (%r14),%xmm15,%xmm12
- vpxor %xmm13,%xmm5,%xmm5
- vpxor (%r15),%xmm15,%xmm13
- vpxor %xmm10,%xmm6,%xmm6
- movl $1,%ecx
- vpxor %xmm11,%xmm7,%xmm7
- vpxor %xmm12,%xmm8,%xmm8
- vpxor %xmm13,%xmm9,%xmm9
- jmp L$oop_enc8x
-
-.p2align 5
-L$oop_enc8x:
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+0(%rsp),%ecx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r8)
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r8,%rbx,1),%rbx
- cmovgeq %rsp,%r8
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r8,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r8),%xmm15,%xmm10
- movq %rbx,64+0(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -72(%rsi),%xmm1
- leaq 16(%r8,%rbx,1),%r8
- vmovdqu %xmm10,0(%rbp)
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+4(%rsp),%ecx
- movq 64+8(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r9)
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r9,%rbx,1),%rbx
- cmovgeq %rsp,%r9
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r9,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r9),%xmm15,%xmm11
- movq %rbx,64+8(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups -56(%rsi),%xmm0
- leaq 16(%r9,%rbx,1),%r9
- vmovdqu %xmm11,16(%rbp)
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+8(%rsp),%ecx
- movq 64+16(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r10)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r8)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r10,%rbx,1),%rbx
- cmovgeq %rsp,%r10
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r10,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r10),%xmm15,%xmm12
- movq %rbx,64+16(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -40(%rsi),%xmm1
- leaq 16(%r10,%rbx,1),%r10
- vmovdqu %xmm12,32(%rbp)
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+12(%rsp),%ecx
- movq 64+24(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r11)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r9)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r11,%rbx,1),%rbx
- cmovgeq %rsp,%r11
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r11,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r11),%xmm15,%xmm13
- movq %rbx,64+24(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups -24(%rsi),%xmm0
- leaq 16(%r11,%rbx,1),%r11
- vmovdqu %xmm13,48(%rbp)
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+16(%rsp),%ecx
- movq 64+32(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r12)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r10)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r12,%rbx,1),%rbx
- cmovgeq %rsp,%r12
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r12,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r12),%xmm15,%xmm10
- movq %rbx,64+32(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -8(%rsi),%xmm1
- leaq 16(%r12,%rbx,1),%r12
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+20(%rsp),%ecx
- movq 64+40(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r13)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r11)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%rbx,%r13,1),%rbx
- cmovgeq %rsp,%r13
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r13,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r13),%xmm15,%xmm11
- movq %rbx,64+40(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 8(%rsi),%xmm0
- leaq 16(%r13,%rbx,1),%r13
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+24(%rsp),%ecx
- movq 64+48(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r14)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r12)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r14,%rbx,1),%rbx
- cmovgeq %rsp,%r14
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r14,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r14),%xmm15,%xmm12
- movq %rbx,64+48(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 24(%rsi),%xmm1
- leaq 16(%r14,%rbx,1),%r14
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+28(%rsp),%ecx
- movq 64+56(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r15)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r13)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r15,%rbx,1),%rbx
- cmovgeq %rsp,%r15
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r15,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r15),%xmm15,%xmm13
- movq %rbx,64+56(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 40(%rsi),%xmm0
- leaq 16(%r15,%rbx,1),%r15
- vmovdqu 32(%rsp),%xmm14
- prefetcht0 15(%r14)
- prefetcht0 15(%r15)
- cmpl $11,%eax
- jb L$enc8x_tail
-
- vaesenc %xmm1,%xmm2,%xmm2
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 176-120(%rsi),%xmm1
-
- vaesenc %xmm0,%xmm2,%xmm2
- vaesenc %xmm0,%xmm3,%xmm3
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- vaesenc %xmm0,%xmm6,%xmm6
- vaesenc %xmm0,%xmm7,%xmm7
- vaesenc %xmm0,%xmm8,%xmm8
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 192-120(%rsi),%xmm0
- je L$enc8x_tail
-
- vaesenc %xmm1,%xmm2,%xmm2
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 208-120(%rsi),%xmm1
-
- vaesenc %xmm0,%xmm2,%xmm2
- vaesenc %xmm0,%xmm3,%xmm3
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- vaesenc %xmm0,%xmm6,%xmm6
- vaesenc %xmm0,%xmm7,%xmm7
- vaesenc %xmm0,%xmm8,%xmm8
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 224-120(%rsi),%xmm0
-
-L$enc8x_tail:
- vaesenc %xmm1,%xmm2,%xmm2
- vpxor %xmm15,%xmm15,%xmm15
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vpaddd %xmm14,%xmm15,%xmm15
- vmovdqu 48(%rsp),%xmm14
- vaesenc %xmm1,%xmm7,%xmm7
- movq 64(%rsp),%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 16-120(%rsi),%xmm1
-
- vaesenclast %xmm0,%xmm2,%xmm2
- vmovdqa %xmm15,32(%rsp)
- vpxor %xmm15,%xmm15,%xmm15
- vaesenclast %xmm0,%xmm3,%xmm3
- vaesenclast %xmm0,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesenclast %xmm0,%xmm5,%xmm5
- vaesenclast %xmm0,%xmm6,%xmm6
- vpaddd %xmm15,%xmm14,%xmm14
- vmovdqu -120(%rsi),%xmm15
- vaesenclast %xmm0,%xmm7,%xmm7
- vaesenclast %xmm0,%xmm8,%xmm8
- vmovdqa %xmm14,48(%rsp)
- vaesenclast %xmm0,%xmm9,%xmm9
- vmovups 32-120(%rsi),%xmm0
-
- vmovups %xmm2,-16(%r8)
- subq %rbx,%r8
- vpxor 0(%rbp),%xmm2,%xmm2
- vmovups %xmm3,-16(%r9)
- subq 72(%rsp),%r9
- vpxor 16(%rbp),%xmm3,%xmm3
- vmovups %xmm4,-16(%r10)
- subq 80(%rsp),%r10
- vpxor 32(%rbp),%xmm4,%xmm4
- vmovups %xmm5,-16(%r11)
- subq 88(%rsp),%r11
- vpxor 48(%rbp),%xmm5,%xmm5
- vmovups %xmm6,-16(%r12)
- subq 96(%rsp),%r12
- vpxor %xmm10,%xmm6,%xmm6
- vmovups %xmm7,-16(%r13)
- subq 104(%rsp),%r13
- vpxor %xmm11,%xmm7,%xmm7
- vmovups %xmm8,-16(%r14)
- subq 112(%rsp),%r14
- vpxor %xmm12,%xmm8,%xmm8
- vmovups %xmm9,-16(%r15)
- subq 120(%rsp),%r15
- vpxor %xmm13,%xmm9,%xmm9
-
- decl %edx
- jnz L$oop_enc8x
-
- movq 16(%rsp),%rax
-
-
-
-
-
-L$enc8x_done:
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$enc8x_epilogue:
- .byte 0xf3,0xc3
-
-
-
-.p2align 5
-aesni_multi_cbc_decrypt_avx:
-_avx_cbc_dec_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
-
-
-
-
-
-
-
-
- subq $256,%rsp
- andq $-256,%rsp
- subq $192,%rsp
- movq %rax,16(%rsp)
-
-L$dec8x_body:
- vzeroupper
- vmovdqu (%rsi),%xmm15
- leaq 120(%rsi),%rsi
- leaq 160(%rdi),%rdi
- shrl $1,%edx
-
-L$dec8x_loop_grande:
-
- xorl %edx,%edx
- movl -144(%rdi),%ecx
- movq -160(%rdi),%r8
- cmpl %edx,%ecx
- movq -152(%rdi),%rbx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -136(%rdi),%xmm2
- movl %ecx,32(%rsp)
- cmovleq %rsp,%r8
- subq %r8,%rbx
- movq %rbx,64(%rsp)
- vmovdqu %xmm2,192(%rsp)
- movl -104(%rdi),%ecx
- movq -120(%rdi),%r9
- cmpl %edx,%ecx
- movq -112(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -96(%rdi),%xmm3
- movl %ecx,36(%rsp)
- cmovleq %rsp,%r9
- subq %r9,%rbp
- movq %rbp,72(%rsp)
- vmovdqu %xmm3,208(%rsp)
- movl -64(%rdi),%ecx
- movq -80(%rdi),%r10
- cmpl %edx,%ecx
- movq -72(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -56(%rdi),%xmm4
- movl %ecx,40(%rsp)
- cmovleq %rsp,%r10
- subq %r10,%rbp
- movq %rbp,80(%rsp)
- vmovdqu %xmm4,224(%rsp)
- movl -24(%rdi),%ecx
- movq -40(%rdi),%r11
- cmpl %edx,%ecx
- movq -32(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu -16(%rdi),%xmm5
- movl %ecx,44(%rsp)
- cmovleq %rsp,%r11
- subq %r11,%rbp
- movq %rbp,88(%rsp)
- vmovdqu %xmm5,240(%rsp)
- movl 16(%rdi),%ecx
- movq 0(%rdi),%r12
- cmpl %edx,%ecx
- movq 8(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 24(%rdi),%xmm6
- movl %ecx,48(%rsp)
- cmovleq %rsp,%r12
- subq %r12,%rbp
- movq %rbp,96(%rsp)
- vmovdqu %xmm6,256(%rsp)
- movl 56(%rdi),%ecx
- movq 40(%rdi),%r13
- cmpl %edx,%ecx
- movq 48(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 64(%rdi),%xmm7
- movl %ecx,52(%rsp)
- cmovleq %rsp,%r13
- subq %r13,%rbp
- movq %rbp,104(%rsp)
- vmovdqu %xmm7,272(%rsp)
- movl 96(%rdi),%ecx
- movq 80(%rdi),%r14
- cmpl %edx,%ecx
- movq 88(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 104(%rdi),%xmm8
- movl %ecx,56(%rsp)
- cmovleq %rsp,%r14
- subq %r14,%rbp
- movq %rbp,112(%rsp)
- vmovdqu %xmm8,288(%rsp)
- movl 136(%rdi),%ecx
- movq 120(%rdi),%r15
- cmpl %edx,%ecx
- movq 128(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- vmovdqu 144(%rdi),%xmm9
- movl %ecx,60(%rsp)
- cmovleq %rsp,%r15
- subq %r15,%rbp
- movq %rbp,120(%rsp)
- vmovdqu %xmm9,304(%rsp)
- testl %edx,%edx
- jz L$dec8x_done
-
- vmovups 16-120(%rsi),%xmm1
- vmovups 32-120(%rsi),%xmm0
- movl 240-120(%rsi),%eax
- leaq 192+128(%rsp),%rbp
-
- vmovdqu (%r8),%xmm2
- vmovdqu (%r9),%xmm3
- vmovdqu (%r10),%xmm4
- vmovdqu (%r11),%xmm5
- vmovdqu (%r12),%xmm6
- vmovdqu (%r13),%xmm7
- vmovdqu (%r14),%xmm8
- vmovdqu (%r15),%xmm9
- vmovdqu %xmm2,0(%rbp)
- vpxor %xmm15,%xmm2,%xmm2
- vmovdqu %xmm3,16(%rbp)
- vpxor %xmm15,%xmm3,%xmm3
- vmovdqu %xmm4,32(%rbp)
- vpxor %xmm15,%xmm4,%xmm4
- vmovdqu %xmm5,48(%rbp)
- vpxor %xmm15,%xmm5,%xmm5
- vmovdqu %xmm6,64(%rbp)
- vpxor %xmm15,%xmm6,%xmm6
- vmovdqu %xmm7,80(%rbp)
- vpxor %xmm15,%xmm7,%xmm7
- vmovdqu %xmm8,96(%rbp)
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu %xmm9,112(%rbp)
- vpxor %xmm15,%xmm9,%xmm9
- xorq $128,%rbp
- movl $1,%ecx
- jmp L$oop_dec8x
-
-.p2align 5
-L$oop_dec8x:
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+0(%rsp),%ecx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r8)
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r8,%rbx,1),%rbx
- cmovgeq %rsp,%r8
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r8,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r8),%xmm10
- movq %rbx,64+0(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -72(%rsi),%xmm1
- leaq 16(%r8,%rbx,1),%r8
- vmovdqu %xmm10,128(%rsp)
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+4(%rsp),%ecx
- movq 64+8(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r9)
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r9,%rbx,1),%rbx
- cmovgeq %rsp,%r9
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r9,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r9),%xmm11
- movq %rbx,64+8(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups -56(%rsi),%xmm0
- leaq 16(%r9,%rbx,1),%r9
- vmovdqu %xmm11,144(%rsp)
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+8(%rsp),%ecx
- movq 64+16(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r10)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r8)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r10,%rbx,1),%rbx
- cmovgeq %rsp,%r10
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r10,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r10),%xmm12
- movq %rbx,64+16(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -40(%rsi),%xmm1
- leaq 16(%r10,%rbx,1),%r10
- vmovdqu %xmm12,160(%rsp)
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+12(%rsp),%ecx
- movq 64+24(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r11)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r9)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r11,%rbx,1),%rbx
- cmovgeq %rsp,%r11
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r11,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r11),%xmm13
- movq %rbx,64+24(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups -24(%rsi),%xmm0
- leaq 16(%r11,%rbx,1),%r11
- vmovdqu %xmm13,176(%rsp)
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+16(%rsp),%ecx
- movq 64+32(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r12)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r10)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r12,%rbx,1),%rbx
- cmovgeq %rsp,%r12
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r12,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r12),%xmm10
- movq %rbx,64+32(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -8(%rsi),%xmm1
- leaq 16(%r12,%rbx,1),%r12
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+20(%rsp),%ecx
- movq 64+40(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r13)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r11)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%rbx,%r13,1),%rbx
- cmovgeq %rsp,%r13
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r13,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r13),%xmm11
- movq %rbx,64+40(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 8(%rsi),%xmm0
- leaq 16(%r13,%rbx,1),%r13
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+24(%rsp),%ecx
- movq 64+48(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r14)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r12)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r14,%rbx,1),%rbx
- cmovgeq %rsp,%r14
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r14,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r14),%xmm12
- movq %rbx,64+48(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 24(%rsi),%xmm1
- leaq 16(%r14,%rbx,1),%r14
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+28(%rsp),%ecx
- movq 64+56(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r15)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r13)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r15,%rbx,1),%rbx
- cmovgeq %rsp,%r15
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r15,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r15),%xmm13
- movq %rbx,64+56(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 40(%rsi),%xmm0
- leaq 16(%r15,%rbx,1),%r15
- vmovdqu 32(%rsp),%xmm14
- prefetcht0 15(%r14)
- prefetcht0 15(%r15)
- cmpl $11,%eax
- jb L$dec8x_tail
-
- vaesdec %xmm1,%xmm2,%xmm2
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vaesdec %xmm1,%xmm7,%xmm7
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 176-120(%rsi),%xmm1
-
- vaesdec %xmm0,%xmm2,%xmm2
- vaesdec %xmm0,%xmm3,%xmm3
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- vaesdec %xmm0,%xmm6,%xmm6
- vaesdec %xmm0,%xmm7,%xmm7
- vaesdec %xmm0,%xmm8,%xmm8
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 192-120(%rsi),%xmm0
- je L$dec8x_tail
-
- vaesdec %xmm1,%xmm2,%xmm2
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vaesdec %xmm1,%xmm7,%xmm7
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 208-120(%rsi),%xmm1
-
- vaesdec %xmm0,%xmm2,%xmm2
- vaesdec %xmm0,%xmm3,%xmm3
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- vaesdec %xmm0,%xmm6,%xmm6
- vaesdec %xmm0,%xmm7,%xmm7
- vaesdec %xmm0,%xmm8,%xmm8
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 224-120(%rsi),%xmm0
-
-L$dec8x_tail:
- vaesdec %xmm1,%xmm2,%xmm2
- vpxor %xmm15,%xmm15,%xmm15
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vpaddd %xmm14,%xmm15,%xmm15
- vmovdqu 48(%rsp),%xmm14
- vaesdec %xmm1,%xmm7,%xmm7
- movq 64(%rsp),%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 16-120(%rsi),%xmm1
-
- vaesdeclast %xmm0,%xmm2,%xmm2
- vmovdqa %xmm15,32(%rsp)
- vpxor %xmm15,%xmm15,%xmm15
- vaesdeclast %xmm0,%xmm3,%xmm3
- vpxor 0(%rbp),%xmm2,%xmm2
- vaesdeclast %xmm0,%xmm4,%xmm4
- vpxor 16(%rbp),%xmm3,%xmm3
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesdeclast %xmm0,%xmm5,%xmm5
- vpxor 32(%rbp),%xmm4,%xmm4
- vaesdeclast %xmm0,%xmm6,%xmm6
- vpxor 48(%rbp),%xmm5,%xmm5
- vpaddd %xmm15,%xmm14,%xmm14
- vmovdqu -120(%rsi),%xmm15
- vaesdeclast %xmm0,%xmm7,%xmm7
- vpxor 64(%rbp),%xmm6,%xmm6
- vaesdeclast %xmm0,%xmm8,%xmm8
- vpxor 80(%rbp),%xmm7,%xmm7
- vmovdqa %xmm14,48(%rsp)
- vaesdeclast %xmm0,%xmm9,%xmm9
- vpxor 96(%rbp),%xmm8,%xmm8
- vmovups 32-120(%rsi),%xmm0
-
- vmovups %xmm2,-16(%r8)
- subq %rbx,%r8
- vmovdqu 128+0(%rsp),%xmm2
- vpxor 112(%rbp),%xmm9,%xmm9
- vmovups %xmm3,-16(%r9)
- subq 72(%rsp),%r9
- vmovdqu %xmm2,0(%rbp)
- vpxor %xmm15,%xmm2,%xmm2
- vmovdqu 128+16(%rsp),%xmm3
- vmovups %xmm4,-16(%r10)
- subq 80(%rsp),%r10
- vmovdqu %xmm3,16(%rbp)
- vpxor %xmm15,%xmm3,%xmm3
- vmovdqu 128+32(%rsp),%xmm4
- vmovups %xmm5,-16(%r11)
- subq 88(%rsp),%r11
- vmovdqu %xmm4,32(%rbp)
- vpxor %xmm15,%xmm4,%xmm4
- vmovdqu 128+48(%rsp),%xmm5
- vmovups %xmm6,-16(%r12)
- subq 96(%rsp),%r12
- vmovdqu %xmm5,48(%rbp)
- vpxor %xmm15,%xmm5,%xmm5
- vmovdqu %xmm10,64(%rbp)
- vpxor %xmm10,%xmm15,%xmm6
- vmovups %xmm7,-16(%r13)
- subq 104(%rsp),%r13
- vmovdqu %xmm11,80(%rbp)
- vpxor %xmm11,%xmm15,%xmm7
- vmovups %xmm8,-16(%r14)
- subq 112(%rsp),%r14
- vmovdqu %xmm12,96(%rbp)
- vpxor %xmm12,%xmm15,%xmm8
- vmovups %xmm9,-16(%r15)
- subq 120(%rsp),%r15
- vmovdqu %xmm13,112(%rbp)
- vpxor %xmm13,%xmm15,%xmm9
-
- xorq $128,%rbp
- decl %edx
- jnz L$oop_dec8x
-
- movq 16(%rsp),%rax
-
-
-
-
-
-L$dec8x_done:
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$dec8x_epilogue:
- .byte 0xf3,0xc3
diff --git a/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s
index c7606aec49..970a12149b 100644
--- a/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s
@@ -10,11 +10,6 @@ _aesni_cbc_sha1_enc:
movq _OPENSSL_ia32cap_P+4(%rip),%r11
btq $61,%r11
jc aesni_cbc_sha1_enc_shaext
- andl $268435456,%r11d
- andl $1073741824,%r10d
- orl %r11d,%r10d
- cmpl $1342177280,%r10d
- je aesni_cbc_sha1_enc_avx
jmp aesni_cbc_sha1_enc_ssse3
.byte 0xf3,0xc3
@@ -1372,1304 +1367,6 @@ L$aesenclast5:
L$epilogue_ssse3:
.byte 0xf3,0xc3
-
-.p2align 5
-aesni_cbc_sha1_enc_avx:
- movq 8(%rsp),%r10
-
-
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- leaq -104(%rsp),%rsp
-
-
- vzeroall
- movq %rdi,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- leaq 112(%rcx),%r15
- vmovdqu (%r8),%xmm12
- movq %r8,88(%rsp)
- shlq $6,%r14
- subq %r12,%r13
- movl 240-112(%r15),%r8d
- addq %r10,%r14
-
- leaq K_XX_XX(%rip),%r11
- movl 0(%r9),%eax
- movl 4(%r9),%ebx
- movl 8(%r9),%ecx
- movl 12(%r9),%edx
- movl %ebx,%esi
- movl 16(%r9),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r11),%xmm6
- vmovdqa 0(%r11),%xmm10
- vmovdqu 0(%r10),%xmm0
- vmovdqu 16(%r10),%xmm1
- vmovdqu 32(%r10),%xmm2
- vmovdqu 48(%r10),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r10
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm10,%xmm0,%xmm4
- vpaddd %xmm10,%xmm1,%xmm5
- vpaddd %xmm10,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- jmp L$oop_avx
-.p2align 5
-L$oop_avx:
- shrdl $2,%ebx,%ebx
- vmovdqu 0(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm10,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- vpxor %xmm8,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm8
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm9
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpor %xmm8,%xmm4,%xmm4
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm4,%xmm4
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 12(%rsp),%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- vpxor %xmm9,%xmm4,%xmm4
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%edi
- addl 16(%rsp),%eax
- vpaddd %xmm4,%xmm10,%xmm9
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm8,%xmm8
- shrdl $7,%ebx,%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %edx,%edi
- movl %eax,%esi
- addl 20(%rsp),%ebp
- vpxor %xmm8,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ebp
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm8
- xorl %ecx,%ebx
- addl %eax,%ebp
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm9
- vpaddd %xmm5,%xmm5,%xmm5
- movl %ebp,%edi
- addl 24(%rsp),%edx
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpor %xmm8,%xmm5,%xmm5
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- andl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm5,%xmm5
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- movl %edx,%esi
- addl 28(%rsp),%ecx
- vpxor %xmm9,%xmm5,%xmm5
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vmovdqa 16(%r11),%xmm10
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%edi
- addl 32(%rsp),%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- vpaddd %xmm5,%xmm10,%xmm9
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm8
- addl %esi,%ebx
- andl %edx,%edi
- vpxor %xmm2,%xmm6,%xmm6
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm8,%xmm8
- shrdl $7,%ecx,%ecx
- xorl %ebp,%edi
- movl %ebx,%esi
- addl 36(%rsp),%eax
- vpxor %xmm8,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm8
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm9
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%edi
- addl 40(%rsp),%ebp
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpor %xmm8,%xmm6,%xmm6
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 44(%rsp),%edx
- vpxor %xmm9,%xmm6,%xmm6
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- andl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%edi
- addl 48(%rsp),%ecx
- vpaddd %xmm6,%xmm10,%xmm9
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%ebp
- addl %edx,%ecx
- vpxor %xmm5,%xmm8,%xmm8
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 52(%rsp),%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm8
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpslldq $12,%xmm7,%xmm9
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%edi
- addl 56(%rsp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpor %xmm8,%xmm7,%xmm7
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- cmpl $11,%r8d
- jb L$vaesenclast6
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je L$vaesenclast6
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-L$vaesenclast6:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %edx,%edi
- movl %eax,%esi
- addl 60(%rsp),%ebp
- vpxor %xmm9,%xmm7,%xmm7
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %edi,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %ebp,%edi
- addl 0(%rsp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpaddd %xmm7,%xmm10,%xmm9
- addl %esi,%edx
- vmovdqu 16(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,0(%r12,%r13,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- andl %eax,%edi
- vpxor %xmm8,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- movl %edx,%esi
- addl 4(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%edi
- addl 8(%rsp),%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- vpor %xmm8,%xmm0,%xmm0
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- andl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 12(%rsp),%eax
- xorl %ebp,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm0,%xmm10,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm1,%xmm1
- addl 20(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm1,%xmm1
- addl 28(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- addl %esi,%eax
- xorl %edx,%edi
- vpaddd %xmm1,%xmm10,%xmm9
- vmovdqa 32(%r11),%xmm10
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm8,%xmm2,%xmm2
- addl 36(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm2,%xmm2
- addl 40(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpor %xmm8,%xmm2,%xmm2
- addl 44(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebx
- xorl %ebp,%edi
- vpaddd %xmm2,%xmm10,%xmm9
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm8
- vpxor %xmm0,%xmm4,%xmm4
- addl 0(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- vpaddd %xmm3,%xmm10,%xmm9
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpxor %xmm8,%xmm4,%xmm4
- addl 4(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm8
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm8,%xmm4,%xmm4
- addl 12(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm3,%xmm4,%xmm8
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpxor %xmm6,%xmm5,%xmm5
- addl %esi,%edx
- xorl %ebx,%edi
- vpaddd %xmm4,%xmm10,%xmm9
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpxor %xmm8,%xmm5,%xmm5
- addl 20(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ecx
- cmpl $11,%r8d
- jb L$vaesenclast7
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je L$vaesenclast7
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-L$vaesenclast7:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm8,%xmm5,%xmm5
- addl 28(%rsp),%eax
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm8
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%rsp),%ebp
- vmovdqu 32(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,16(%r13,%r12,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- movl %eax,%edi
- xorl %ecx,%esi
- vpaddd %xmm5,%xmm10,%xmm9
- shldl $5,%eax,%eax
- addl %esi,%ebp
- vpxor %xmm8,%xmm6,%xmm6
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 36(%rsp),%edx
- vpsrld $30,%xmm6,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 40(%rsp),%ecx
- andl %eax,%esi
- vpor %xmm8,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%edi
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 44(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- movl %ecx,%esi
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm8
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- movl %ebx,%edi
- xorl %edx,%esi
- vpaddd %xmm6,%xmm10,%xmm9
- vmovdqa 48(%r11),%xmm10
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%rsp),%ebp
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- vpsrld $30,%xmm7,%xmm8
- vmovdqa %xmm9,32(%rsp)
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 56(%rsp),%edx
- andl %ebx,%esi
- vpor %xmm8,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%edi
- xorl %ebx,%esi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 60(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- addl 0(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- vpxor %xmm1,%xmm0,%xmm0
- movl %ecx,%edi
- xorl %ebp,%esi
- vpaddd %xmm7,%xmm10,%xmm9
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm8,%xmm0,%xmm0
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 4(%rsp),%eax
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- andl %ecx,%esi
- vpor %xmm8,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%edi
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 12(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- vpxor %xmm2,%xmm1,%xmm1
- movl %edx,%edi
- xorl %eax,%esi
- vpaddd %xmm0,%xmm10,%xmm9
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 20(%rsp),%ebx
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 24(%rsp),%eax
- andl %edx,%esi
- vpor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%edi
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%rsp),%ebp
- cmpl $11,%r8d
- jb L$vaesenclast8
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je L$vaesenclast8
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-L$vaesenclast8:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- movl %ebp,%edi
- xorl %ebx,%esi
- vpaddd %xmm1,%xmm10,%xmm9
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vmovdqu 48(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,32(%r13,%r12,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- vpxor %xmm8,%xmm2,%xmm2
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 36(%rsp),%ecx
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 40(%rsp),%ebx
- andl %ebp,%esi
- vpor %xmm8,%xmm2,%xmm2
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- movl %ecx,%edi
- xorl %ebp,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 44(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm2,%xmm10,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 0(%rsp),%eax
- vpaddd %xmm3,%xmm10,%xmm9
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm9,48(%rsp)
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 8(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 12(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- cmpq %r14,%r10
- je L$done_avx
- vmovdqa 64(%r11),%xmm9
- vmovdqa 0(%r11),%xmm10
- vmovdqu 0(%r10),%xmm0
- vmovdqu 16(%r10),%xmm1
- vmovdqu 32(%r10),%xmm2
- vmovdqu 48(%r10),%xmm3
- vpshufb %xmm9,%xmm0,%xmm0
- addq $64,%r10
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- vpshufb %xmm9,%xmm1,%xmm1
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpaddd %xmm10,%xmm0,%xmm8
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm8,0(%rsp)
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- vpshufb %xmm9,%xmm2,%xmm2
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpaddd %xmm10,%xmm1,%xmm8
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vmovdqa %xmm8,16(%rsp)
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- vpshufb %xmm9,%xmm3,%xmm3
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpaddd %xmm10,%xmm2,%xmm8
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vmovdqa %xmm8,32(%rsp)
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- cmpl $11,%r8d
- jb L$vaesenclast9
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je L$vaesenclast9
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-L$vaesenclast9:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vmovups %xmm12,48(%r13,%r12,1)
- leaq 64(%r12),%r12
-
- addl 0(%r9),%eax
- addl 4(%r9),%esi
- addl 8(%r9),%ecx
- addl 12(%r9),%edx
- movl %eax,0(%r9)
- addl 16(%r9),%ebp
- movl %esi,4(%r9)
- movl %esi,%ebx
- movl %ecx,8(%r9)
- movl %ecx,%edi
- movl %edx,12(%r9)
- xorl %edx,%edi
- movl %ebp,16(%r9)
- andl %edi,%esi
- jmp L$oop_avx
-
-L$done_avx:
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- cmpl $11,%r8d
- jb L$vaesenclast10
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je L$vaesenclast10
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-L$vaesenclast10:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vmovups %xmm12,48(%r13,%r12,1)
- movq 88(%rsp),%r8
-
- addl 0(%r9),%eax
- addl 4(%r9),%esi
- addl 8(%r9),%ecx
- movl %eax,0(%r9)
- addl 12(%r9),%edx
- movl %esi,4(%r9)
- addl 16(%r9),%ebp
- movl %ecx,8(%r9)
- movl %edx,12(%r9)
- movl %ebp,16(%r9)
- vmovups %xmm12,(%r8)
- vzeroall
- leaq 104(%rsp),%rsi
- movq 0(%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_avx:
- .byte 0xf3,0xc3
-
.p2align 6
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -2695,8 +1392,8 @@ aesni_cbc_sha1_enc_shaext:
movups 16(%rcx),%xmm0
leaq 112(%rcx),%rcx
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
jmp L$oop_shaext
.p2align 4
@@ -2759,17 +1456,17 @@ L$oop_shaext:
pxor %xmm3,%xmm5
.byte 15,56,201,243
cmpl $11,%r11d
- jb L$aesenclast11
+ jb L$aesenclast6
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je L$aesenclast11
+ je L$aesenclast6
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-L$aesenclast11:
+L$aesenclast6:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -2825,17 +1522,17 @@ L$aesenclast11:
pxor %xmm4,%xmm6
.byte 15,56,201,220
cmpl $11,%r11d
- jb L$aesenclast12
+ jb L$aesenclast7
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je L$aesenclast12
+ je L$aesenclast7
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-L$aesenclast12:
+L$aesenclast7:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm9
@@ -2891,17 +1588,17 @@ L$aesenclast12:
pxor %xmm5,%xmm3
.byte 15,56,201,229
cmpl $11,%r11d
- jb L$aesenclast13
+ jb L$aesenclast8
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je L$aesenclast13
+ je L$aesenclast8
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-L$aesenclast13:
+L$aesenclast8:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -2955,17 +1652,17 @@ L$aesenclast13:
movups 48(%rcx),%xmm1
.byte 102,15,56,220,208
cmpl $11,%r11d
- jb L$aesenclast14
+ jb L$aesenclast9
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je L$aesenclast14
+ je L$aesenclast9
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-L$aesenclast14:
+L$aesenclast9:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
decq %rdx
@@ -2975,8 +1672,8 @@ L$aesenclast14:
leaq 64(%rdi),%rdi
jnz L$oop_shaext
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
movups %xmm2,(%r8)
movdqu %xmm8,(%r9)
movd %xmm9,16(%r9)
diff --git a/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s
index 7d8bdff634..53307da358 100644
--- a/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s
@@ -5,25 +5,6 @@
.p2align 4
_aesni_cbc_sha256_enc:
- leaq _OPENSSL_ia32cap_P(%rip),%r11
- movl $1,%eax
- cmpq $0,%rdi
- je L$probe
- movl 0(%r11),%eax
- movq 4(%r11),%r10
- btq $61,%r10
- jc aesni_cbc_sha256_enc_shaext
- movq %r10,%r11
- shrq $32,%r11
-
- testl $2048,%r10d
- jnz aesni_cbc_sha256_enc_xop
- andl $296,%r11d
- cmpl $296,%r11d
- je aesni_cbc_sha256_enc_avx2
- andl $268435456,%r10d
- jnz aesni_cbc_sha256_enc_avx
- ud2
xorl %eax,%eax
cmpq $0,%rdi
je L$probe
@@ -74,4280 +55,3 @@ K256:
.long 0,0,0,0, 0,0,0,0
.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
-
-.p2align 6
-aesni_cbc_sha256_enc_xop:
-L$xop_shortcut:
- movq 8(%rsp),%r10
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $128,%rsp
- andq $-64,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
-L$prologue_xop:
- vzeroall
-
- movq %rdi,%r12
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r13
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- subq $9,%r14
-
- movl 0(%r15),%eax
- movl 4(%r15),%ebx
- movl 8(%r15),%ecx
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
-
- vmovdqa 0(%r13,%r14,8),%xmm14
- vmovdqa 16(%r13,%r14,8),%xmm13
- vmovdqa 32(%r13,%r14,8),%xmm12
- vmovdqu 0-128(%rdi),%xmm10
- jmp L$loop_xop
-.p2align 4
-L$loop_xop:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi,%r12,1),%xmm0
- vmovdqu 16(%rsi,%r12,1),%xmm1
- vmovdqu 32(%rsi,%r12,1),%xmm2
- vmovdqu 48(%rsi,%r12,1),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%esi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%esi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp L$xop_00_47
-
-.p2align 4
-L$xop_00_47:
- subq $-32*4,%rbp
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- vpalignr $4,%xmm0,%xmm1,%xmm4
- rorl $14,%r13d
- movl %r14d,%eax
- vpalignr $4,%xmm2,%xmm3,%xmm7
- movl %r9d,%r12d
- xorl %r8d,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %r10d,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %eax,%r14d
- vpaddd %xmm7,%xmm0,%xmm0
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %r10d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
-.byte 143,232,120,194,251,13
- xorl %eax,%r14d
- addl %r13d,%r11d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%esi
- addl %r11d,%edx
- vpsrld $10,%xmm3,%xmm6
- rorl $2,%r14d
- addl %esi,%r11d
- vpaddd %xmm4,%xmm0,%xmm0
- movl %edx,%r13d
- addl %r11d,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%r11d
- vpxor %xmm6,%xmm7,%xmm7
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
-.byte 143,232,120,194,248,13
- xorl %r11d,%r14d
- addl %r13d,%r10d
- vpsrld $10,%xmm0,%xmm6
- xorl %eax,%r15d
- addl %r10d,%ecx
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%r10d
- vpxor %xmm6,%xmm7,%xmm7
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- vpxor %xmm5,%xmm7,%xmm7
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- vpaddd %xmm7,%xmm0,%xmm0
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- vpaddd 0(%rbp),%xmm0,%xmm6
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- rorl $14,%r13d
- movl %r14d,%r8d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- movl %ebx,%r12d
- xorl %eax,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %ecx,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %r8d,%r14d
- vpaddd %xmm7,%xmm1,%xmm1
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %ecx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
-.byte 143,232,120,194,248,13
- xorl %r8d,%r14d
- addl %r13d,%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r9d,%esi
- addl %edx,%r11d
- vpsrld $10,%xmm0,%xmm6
- rorl $2,%r14d
- addl %esi,%edx
- vpaddd %xmm4,%xmm1,%xmm1
- movl %r11d,%r13d
- addl %edx,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%edx
- vpxor %xmm6,%xmm7,%xmm7
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 20(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
-.byte 143,232,120,194,249,13
- xorl %edx,%r14d
- addl %r13d,%ecx
- vpsrld $10,%xmm1,%xmm6
- xorl %r8d,%r15d
- addl %ecx,%r10d
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%ecx
- vpxor %xmm6,%xmm7,%xmm7
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- vpxor %xmm5,%xmm7,%xmm7
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- vpaddd %xmm7,%xmm1,%xmm1
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- vpaddd 32(%rbp),%xmm1,%xmm6
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- rorl $14,%r13d
- movl %r14d,%eax
- vpalignr $4,%xmm0,%xmm1,%xmm7
- movl %r9d,%r12d
- xorl %r8d,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %r10d,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %eax,%r14d
- vpaddd %xmm7,%xmm2,%xmm2
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %r10d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
-.byte 143,232,120,194,249,13
- xorl %eax,%r14d
- addl %r13d,%r11d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%esi
- addl %r11d,%edx
- vpsrld $10,%xmm1,%xmm6
- rorl $2,%r14d
- addl %esi,%r11d
- vpaddd %xmm4,%xmm2,%xmm2
- movl %edx,%r13d
- addl %r11d,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%r11d
- vpxor %xmm6,%xmm7,%xmm7
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
-.byte 143,232,120,194,250,13
- xorl %r11d,%r14d
- addl %r13d,%r10d
- vpsrld $10,%xmm2,%xmm6
- xorl %eax,%r15d
- addl %r10d,%ecx
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%r10d
- vpxor %xmm6,%xmm7,%xmm7
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- vpxor %xmm5,%xmm7,%xmm7
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- vpaddd %xmm7,%xmm2,%xmm2
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- vpaddd 64(%rbp),%xmm2,%xmm6
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- rorl $14,%r13d
- movl %r14d,%r8d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- movl %ebx,%r12d
- xorl %eax,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %ecx,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %r8d,%r14d
- vpaddd %xmm7,%xmm3,%xmm3
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %ecx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
-.byte 143,232,120,194,250,13
- xorl %r8d,%r14d
- addl %r13d,%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r9d,%esi
- addl %edx,%r11d
- vpsrld $10,%xmm2,%xmm6
- rorl $2,%r14d
- addl %esi,%edx
- vpaddd %xmm4,%xmm3,%xmm3
- movl %r11d,%r13d
- addl %edx,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%edx
- vpxor %xmm6,%xmm7,%xmm7
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 52(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
-.byte 143,232,120,194,251,13
- xorl %edx,%r14d
- addl %r13d,%ecx
- vpsrld $10,%xmm3,%xmm6
- xorl %r8d,%r15d
- addl %ecx,%r10d
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%ecx
- vpxor %xmm6,%xmm7,%xmm7
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- vpxor %xmm5,%xmm7,%xmm7
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- vpaddd %xmm7,%xmm3,%xmm3
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- vpaddd 96(%rbp),%xmm3,%xmm6
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- movq 64+0(%rsp),%r12
- vpand %xmm14,%xmm11,%xmm11
- movq 64+8(%rsp),%r15
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r12,1)
- leaq 16(%r12),%r12
- cmpb $0,131(%rbp)
- jne L$xop_00_47
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- rorl $14,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- rorl $11,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- rorl $2,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- rorl $14,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- rorl $2,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- rorl $14,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- rorl $11,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- rorl $2,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- rorl $14,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- rorl $2,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- rorl $14,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- rorl $11,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- rorl $2,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- rorl $14,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- rorl $2,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- rorl $14,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- rorl $11,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- rorl $2,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- rorl $14,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- rorl $2,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%r12
- movq 64+8(%rsp),%r13
- movq 64+40(%rsp),%r15
- movq 64+48(%rsp),%rsi
-
- vpand %xmm14,%xmm11,%xmm11
- movl %r14d,%eax
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r12),%r12
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r12
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- jb L$loop_xop
-
- movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_xop:
- .byte 0xf3,0xc3
-
-
-.p2align 6
-aesni_cbc_sha256_enc_avx:
-L$avx_shortcut:
- movq 8(%rsp),%r10
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $128,%rsp
- andq $-64,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
-L$prologue_avx:
- vzeroall
-
- movq %rdi,%r12
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r13
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- subq $9,%r14
-
- movl 0(%r15),%eax
- movl 4(%r15),%ebx
- movl 8(%r15),%ecx
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
-
- vmovdqa 0(%r13,%r14,8),%xmm14
- vmovdqa 16(%r13,%r14,8),%xmm13
- vmovdqa 32(%r13,%r14,8),%xmm12
- vmovdqu 0-128(%rdi),%xmm10
- jmp L$loop_avx
-.p2align 4
-L$loop_avx:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi,%r12,1),%xmm0
- vmovdqu 16(%rsi,%r12,1),%xmm1
- vmovdqu 32(%rsi,%r12,1),%xmm2
- vmovdqu 48(%rsi,%r12,1),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%esi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%esi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp L$avx_00_47
-
-.p2align 4
-L$avx_00_47:
- subq $-32*4,%rbp
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- vpalignr $4,%xmm0,%xmm1,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm2,%xmm3,%xmm7
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- vpshufd $250,%xmm3,%xmm7
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm0,%xmm0
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- vpaddd %xmm6,%xmm0,%xmm0
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- vpshufd $80,%xmm0,%xmm7
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpsrlq $17,%xmm7,%xmm7
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- vpaddd %xmm6,%xmm0,%xmm0
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- vpaddd 0(%rbp),%xmm0,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- vpshufd $250,%xmm0,%xmm7
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 20(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm1,%xmm1
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- vpaddd %xmm6,%xmm1,%xmm1
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- vpshufd $80,%xmm1,%xmm7
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpsrlq $17,%xmm7,%xmm7
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- vpaddd %xmm6,%xmm1,%xmm1
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- vpaddd 32(%rbp),%xmm1,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm0,%xmm1,%xmm7
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- vpshufd $250,%xmm1,%xmm7
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm2,%xmm2
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- vpaddd %xmm6,%xmm2,%xmm2
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- vpshufd $80,%xmm2,%xmm7
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpsrlq $17,%xmm7,%xmm7
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- vpaddd %xmm6,%xmm2,%xmm2
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- vpaddd 64(%rbp),%xmm2,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- vpshufd $250,%xmm2,%xmm7
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 52(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm3,%xmm3
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- vpaddd %xmm6,%xmm3,%xmm3
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- vpshufd $80,%xmm3,%xmm7
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpsrlq $17,%xmm7,%xmm7
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpslldq $8,%xmm6,%xmm6
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- vpaddd %xmm6,%xmm3,%xmm3
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- vpaddd 96(%rbp),%xmm3,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- movq 64+0(%rsp),%r12
- vpand %xmm14,%xmm11,%xmm11
- movq 64+8(%rsp),%r15
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r12,1)
- leaq 16(%r12),%r12
- cmpb $0,131(%rbp)
- jne L$avx_00_47
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%r12
- movq 64+8(%rsp),%r13
- movq 64+40(%rsp),%r15
- movq 64+48(%rsp),%rsi
-
- vpand %xmm14,%xmm11,%xmm11
- movl %r14d,%eax
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r12),%r12
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r12
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
- jb L$loop_avx
-
- movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_avx:
- .byte 0xf3,0xc3
-
-
-.p2align 6
-aesni_cbc_sha256_enc_avx2:
-L$avx2_shortcut:
- movq 8(%rsp),%r10
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $576,%rsp
- andq $-1024,%rsp
- addq $448,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
-
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
-L$prologue_avx2:
- vzeroall
-
- movq %rdi,%r13
- vpinsrq $1,%rsi,%xmm15,%xmm15
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r12
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- leaq -9(%r14),%r14
-
- vmovdqa 0(%r12,%r14,8),%xmm14
- vmovdqa 16(%r12,%r14,8),%xmm13
- vmovdqa 32(%r12,%r14,8),%xmm12
-
- subq $-64,%r13
- movl 0(%r15),%eax
- leaq (%rsi,%r13,1),%r12
- movl 4(%r15),%ebx
- cmpq %rdx,%r13
- movl 8(%r15),%ecx
- cmoveq %rsp,%r12
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
- vmovdqu 0-128(%rdi),%xmm10
- jmp L$oop_avx2
-.p2align 4
-L$oop_avx2:
- vmovdqa K256+512(%rip),%ymm7
- vmovdqu -64+0(%rsi,%r13,1),%xmm0
- vmovdqu -64+16(%rsi,%r13,1),%xmm1
- vmovdqu -64+32(%rsi,%r13,1),%xmm2
- vmovdqu -64+48(%rsi,%r13,1),%xmm3
-
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm7,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm7,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
-
- leaq K256(%rip),%rbp
- vpshufb %ymm7,%ymm2,%ymm2
- leaq -64(%r13),%r13
- vpaddd 0(%rbp),%ymm0,%ymm4
- vpshufb %ymm7,%ymm3,%ymm3
- vpaddd 32(%rbp),%ymm1,%ymm5
- vpaddd 64(%rbp),%ymm2,%ymm6
- vpaddd 96(%rbp),%ymm3,%ymm7
- vmovdqa %ymm4,0(%rsp)
- xorl %r14d,%r14d
- vmovdqa %ymm5,32(%rsp)
- leaq -64(%rsp),%rsp
- movl %ebx,%esi
- vmovdqa %ymm6,0(%rsp)
- xorl %ecx,%esi
- vmovdqa %ymm7,32(%rsp)
- movl %r9d,%r12d
- subq $-32*4,%rbp
- jmp L$avx2_00_47
-
-.p2align 4
-L$avx2_00_47:
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- leaq -64(%rsp),%rsp
- vpalignr $4,%ymm0,%ymm1,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm2,%ymm3,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm0,%ymm0
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- vpshufd $250,%ymm3,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm0,%ymm0
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpshufd $80,%ymm0,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpsrlq $2,%ymm7,%ymm7
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- vpaddd %ymm6,%ymm0,%ymm0
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- vpaddd 0(%rbp),%ymm0,%ymm6
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm1,%ymm2,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm3,%ymm0,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm1,%ymm1
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- vpshufd $250,%ymm0,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm1,%ymm1
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpshufd $80,%ymm1,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpsrlq $2,%ymm7,%ymm7
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- vpaddd %ymm6,%ymm1,%ymm1
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- vpaddd 32(%rbp),%ymm1,%ymm6
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq -64(%rsp),%rsp
- vpalignr $4,%ymm2,%ymm3,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm0,%ymm1,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm2,%ymm2
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- vpshufd $250,%ymm1,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm2,%ymm2
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpshufd $80,%ymm2,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpsrlq $2,%ymm7,%ymm7
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- vpaddd %ymm6,%ymm2,%ymm2
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- vpaddd 64(%rbp),%ymm2,%ymm6
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm3,%ymm0,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm1,%ymm2,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm3,%ymm3
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- vpshufd $250,%ymm2,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm3,%ymm3
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpshufd $80,%ymm3,%ymm7
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpsrlq $2,%ymm7,%ymm7
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- vpaddd %ymm6,%ymm3,%ymm3
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- vpaddd 96(%rbp),%ymm3,%ymm6
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- vmovq %xmm15,%r13
- vpextrq $1,%xmm15,%r15
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r13,1)
- leaq 16(%r13),%r13
- leaq 128(%rbp),%rbp
- cmpb $0,3(%rbp)
- jne L$avx2_00_47
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- addl 0+64(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+64(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+64(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+64(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+64(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+64(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+64(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+64(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- addl 0(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vpextrq $1,%xmm15,%r12
- vmovq %xmm15,%r13
- movq 552(%rsp),%r15
- addl %r14d,%eax
- leaq 448(%rsp),%rbp
-
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r13),%r13
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- cmpq 80(%rbp),%r13
- je L$done_avx2
-
- xorl %r14d,%r14d
- movl %ebx,%esi
- movl %r9d,%r12d
- xorl %ecx,%esi
- jmp L$ower_avx2
-.p2align 4
-L$ower_avx2:
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- leaq -64(%rbp),%rbp
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovq %xmm15,%r13
- vpextrq $1,%xmm15,%r15
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- leaq -64(%rbp),%rbp
- vmovdqu %xmm8,(%r15,%r13,1)
- leaq 16(%r13),%r13
- cmpq %rsp,%rbp
- jae L$ower_avx2
-
- movq 552(%rsp),%r15
- leaq 64(%r13),%r13
- movq 560(%rsp),%rsi
- addl %r14d,%eax
- leaq 448(%rsp),%rsp
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- leaq (%rsi,%r13,1),%r12
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r13
-
- movl %eax,0(%r15)
- cmoveq %rsp,%r12
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- jbe L$oop_avx2
- leaq (%rsp),%rbp
-
-L$done_avx2:
- leaq (%rbp),%rsp
- movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_avx2:
- .byte 0xf3,0xc3
-
-
-.p2align 5
-aesni_cbc_sha256_enc_shaext:
- movq 8(%rsp),%r10
- leaq K256+128(%rip),%rax
- movdqu (%r9),%xmm1
- movdqu 16(%r9),%xmm2
- movdqa 512-128(%rax),%xmm3
-
- movl 240(%rcx),%r11d
- subq %rdi,%rsi
- movups (%rcx),%xmm15
- movups 16(%rcx),%xmm4
- leaq 112(%rcx),%rcx
-
- pshufd $27,%xmm1,%xmm0
- pshufd $177,%xmm1,%xmm1
- pshufd $27,%xmm2,%xmm2
- movdqa %xmm3,%xmm7
-.byte 102,15,58,15,202,8
- punpcklqdq %xmm0,%xmm2
-
- jmp L$oop_shaext
-
-.p2align 4
-L$oop_shaext:
- movdqu (%r10),%xmm10
- movdqu 16(%r10),%xmm11
- movdqu 32(%r10),%xmm12
-.byte 102,68,15,56,0,211
- movdqu 48(%r10),%xmm13
-
- movdqa 0-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 102,68,15,56,0,219
- movdqa %xmm2,%xmm9
- movdqa %xmm1,%xmm8
- movups 0(%rdi),%xmm14
- xorps %xmm15,%xmm14
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 32-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 102,68,15,56,0,227
- leaq 64(%r10),%r10
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 64-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 102,68,15,56,0,235
-.byte 69,15,56,204,211
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 96-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
-.byte 15,56,203,202
- movdqa 128-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- cmpl $11,%r11d
- jb L$aesenclast1
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je L$aesenclast1
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-L$aesenclast1:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,202
- movups 16(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,0(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movdqa 160-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 192-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
-.byte 69,15,56,204,211
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 224-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 256-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- cmpl $11,%r11d
- jb L$aesenclast2
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je L$aesenclast2
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-L$aesenclast2:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,202
- movups 32(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,16(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movdqa 288-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 320-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
-.byte 69,15,56,204,211
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 352-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 384-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 416-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- cmpl $11,%r11d
- jb L$aesenclast3
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je L$aesenclast3
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-L$aesenclast3:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups 48(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,32(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 448-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
- movdqa %xmm7,%xmm3
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 480-128(%rax),%xmm0
- paddd %xmm13,%xmm0
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
-
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- cmpl $11,%r11d
- jb L$aesenclast4
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je L$aesenclast4
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-L$aesenclast4:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-
- paddd %xmm9,%xmm2
- paddd %xmm8,%xmm1
-
- decq %rdx
- movups %xmm6,48(%rsi,%rdi,1)
- leaq 64(%rdi),%rdi
- jnz L$oop_shaext
-
- pshufd $177,%xmm2,%xmm2
- pshufd $27,%xmm1,%xmm3
- pshufd $177,%xmm1,%xmm1
- punpckhqdq %xmm2,%xmm1
-.byte 102,15,58,15,211,8
-
- movups %xmm6,(%r8)
- movdqu %xmm1,(%r9)
- movdqu %xmm2,16(%r9)
- .byte 0xf3,0xc3
diff --git a/deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s b/deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s
index 1819757f0b..02f7f562ea 100644
--- a/deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s
+++ b/deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s
@@ -1,1632 +1,24 @@
.text
-.globl _rsaz_1024_sqr_avx2
-
-.p2align 6
-_rsaz_1024_sqr_avx2:
- leaq (%rsp),%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- vzeroupper
- movq %rax,%rbp
- movq %rdx,%r13
- subq $832,%rsp
- movq %r13,%r15
- subq $-128,%rdi
- subq $-128,%rsi
- subq $-128,%r13
-
- andq $4095,%r15
- addq $320,%r15
- shrq $12,%r15
- vpxor %ymm9,%ymm9,%ymm9
- jz L$sqr_1024_no_n_copy
-
-
-
-
-
- subq $320,%rsp
- vmovdqu 0-128(%r13),%ymm0
- andq $-2048,%rsp
- vmovdqu 32-128(%r13),%ymm1
- vmovdqu 64-128(%r13),%ymm2
- vmovdqu 96-128(%r13),%ymm3
- vmovdqu 128-128(%r13),%ymm4
- vmovdqu 160-128(%r13),%ymm5
- vmovdqu 192-128(%r13),%ymm6
- vmovdqu 224-128(%r13),%ymm7
- vmovdqu 256-128(%r13),%ymm8
- leaq 832+128(%rsp),%r13
- vmovdqu %ymm0,0-128(%r13)
- vmovdqu %ymm1,32-128(%r13)
- vmovdqu %ymm2,64-128(%r13)
- vmovdqu %ymm3,96-128(%r13)
- vmovdqu %ymm4,128-128(%r13)
- vmovdqu %ymm5,160-128(%r13)
- vmovdqu %ymm6,192-128(%r13)
- vmovdqu %ymm7,224-128(%r13)
- vmovdqu %ymm8,256-128(%r13)
- vmovdqu %ymm9,288-128(%r13)
-
-L$sqr_1024_no_n_copy:
- andq $-1024,%rsp
-
- vmovdqu 32-128(%rsi),%ymm1
- vmovdqu 64-128(%rsi),%ymm2
- vmovdqu 96-128(%rsi),%ymm3
- vmovdqu 128-128(%rsi),%ymm4
- vmovdqu 160-128(%rsi),%ymm5
- vmovdqu 192-128(%rsi),%ymm6
- vmovdqu 224-128(%rsi),%ymm7
- vmovdqu 256-128(%rsi),%ymm8
-
- leaq 192(%rsp),%rbx
- vpbroadcastq L$and_mask(%rip),%ymm15
- jmp L$OOP_GRANDE_SQR_1024
-
-.p2align 5
-L$OOP_GRANDE_SQR_1024:
- leaq 576+128(%rsp),%r9
- leaq 448(%rsp),%r12
-
-
-
-
- vpaddq %ymm1,%ymm1,%ymm1
- vpbroadcastq 0-128(%rsi),%ymm10
- vpaddq %ymm2,%ymm2,%ymm2
- vmovdqa %ymm1,0-128(%r9)
- vpaddq %ymm3,%ymm3,%ymm3
- vmovdqa %ymm2,32-128(%r9)
- vpaddq %ymm4,%ymm4,%ymm4
- vmovdqa %ymm3,64-128(%r9)
- vpaddq %ymm5,%ymm5,%ymm5
- vmovdqa %ymm4,96-128(%r9)
- vpaddq %ymm6,%ymm6,%ymm6
- vmovdqa %ymm5,128-128(%r9)
- vpaddq %ymm7,%ymm7,%ymm7
- vmovdqa %ymm6,160-128(%r9)
- vpaddq %ymm8,%ymm8,%ymm8
- vmovdqa %ymm7,192-128(%r9)
- vpxor %ymm9,%ymm9,%ymm9
- vmovdqa %ymm8,224-128(%r9)
-
- vpmuludq 0-128(%rsi),%ymm10,%ymm0
- vpbroadcastq 32-128(%rsi),%ymm11
- vmovdqu %ymm9,288-192(%rbx)
- vpmuludq %ymm10,%ymm1,%ymm1
- vmovdqu %ymm9,320-448(%r12)
- vpmuludq %ymm10,%ymm2,%ymm2
- vmovdqu %ymm9,352-448(%r12)
- vpmuludq %ymm10,%ymm3,%ymm3
- vmovdqu %ymm9,384-448(%r12)
- vpmuludq %ymm10,%ymm4,%ymm4
- vmovdqu %ymm9,416-448(%r12)
- vpmuludq %ymm10,%ymm5,%ymm5
- vmovdqu %ymm9,448-448(%r12)
- vpmuludq %ymm10,%ymm6,%ymm6
- vmovdqu %ymm9,480-448(%r12)
- vpmuludq %ymm10,%ymm7,%ymm7
- vmovdqu %ymm9,512-448(%r12)
- vpmuludq %ymm10,%ymm8,%ymm8
- vpbroadcastq 64-128(%rsi),%ymm10
- vmovdqu %ymm9,544-448(%r12)
-
- movq %rsi,%r15
- movl $4,%r14d
- jmp L$sqr_entry_1024
-.p2align 5
-L$OOP_SQR_1024:
- vpbroadcastq 32-128(%r15),%ymm11
- vpmuludq 0-128(%rsi),%ymm10,%ymm0
- vpaddq 0-192(%rbx),%ymm0,%ymm0
- vpmuludq 0-128(%r9),%ymm10,%ymm1
- vpaddq 32-192(%rbx),%ymm1,%ymm1
- vpmuludq 32-128(%r9),%ymm10,%ymm2
- vpaddq 64-192(%rbx),%ymm2,%ymm2
- vpmuludq 64-128(%r9),%ymm10,%ymm3
- vpaddq 96-192(%rbx),%ymm3,%ymm3
- vpmuludq 96-128(%r9),%ymm10,%ymm4
- vpaddq 128-192(%rbx),%ymm4,%ymm4
- vpmuludq 128-128(%r9),%ymm10,%ymm5
- vpaddq 160-192(%rbx),%ymm5,%ymm5
- vpmuludq 160-128(%r9),%ymm10,%ymm6
- vpaddq 192-192(%rbx),%ymm6,%ymm6
- vpmuludq 192-128(%r9),%ymm10,%ymm7
- vpaddq 224-192(%rbx),%ymm7,%ymm7
- vpmuludq 224-128(%r9),%ymm10,%ymm8
- vpbroadcastq 64-128(%r15),%ymm10
- vpaddq 256-192(%rbx),%ymm8,%ymm8
-L$sqr_entry_1024:
- vmovdqu %ymm0,0-192(%rbx)
- vmovdqu %ymm1,32-192(%rbx)
-
- vpmuludq 32-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 32-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq 64-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 96-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 128-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq 160-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 192-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 224-128(%r9),%ymm11,%ymm0
- vpbroadcastq 96-128(%r15),%ymm11
- vpaddq 288-192(%rbx),%ymm0,%ymm0
-
- vmovdqu %ymm2,64-192(%rbx)
- vmovdqu %ymm3,96-192(%rbx)
-
- vpmuludq 64-128(%rsi),%ymm10,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 64-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 96-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq 128-128(%r9),%ymm10,%ymm13
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 160-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 192-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm0,%ymm0
- vpmuludq 224-128(%r9),%ymm10,%ymm1
- vpbroadcastq 128-128(%r15),%ymm10
- vpaddq 320-448(%r12),%ymm1,%ymm1
-
- vmovdqu %ymm4,128-192(%rbx)
- vmovdqu %ymm5,160-192(%rbx)
-
- vpmuludq 96-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm6,%ymm6
- vpmuludq 96-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm7,%ymm7
- vpmuludq 128-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm8,%ymm8
- vpmuludq 160-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm0,%ymm0
- vpmuludq 192-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm1,%ymm1
- vpmuludq 224-128(%r9),%ymm11,%ymm2
- vpbroadcastq 160-128(%r15),%ymm11
- vpaddq 352-448(%r12),%ymm2,%ymm2
-
- vmovdqu %ymm6,192-192(%rbx)
- vmovdqu %ymm7,224-192(%rbx)
-
- vpmuludq 128-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 128-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm0,%ymm0
- vpmuludq 160-128(%r9),%ymm10,%ymm13
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 192-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 224-128(%r9),%ymm10,%ymm3
- vpbroadcastq 192-128(%r15),%ymm10
- vpaddq 384-448(%r12),%ymm3,%ymm3
-
- vmovdqu %ymm8,256-192(%rbx)
- vmovdqu %ymm0,288-192(%rbx)
- leaq 8(%rbx),%rbx
-
- vpmuludq 160-128(%rsi),%ymm11,%ymm13
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 160-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 192-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq 224-128(%r9),%ymm11,%ymm4
- vpbroadcastq 224-128(%r15),%ymm11
- vpaddq 416-448(%r12),%ymm4,%ymm4
-
- vmovdqu %ymm1,320-448(%r12)
- vmovdqu %ymm2,352-448(%r12)
-
- vpmuludq 192-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm3,%ymm3
- vpmuludq 192-128(%r9),%ymm10,%ymm14
- vpbroadcastq 256-128(%r15),%ymm0
- vpaddq %ymm14,%ymm4,%ymm4
- vpmuludq 224-128(%r9),%ymm10,%ymm5
- vpbroadcastq 0+8-128(%r15),%ymm10
- vpaddq 448-448(%r12),%ymm5,%ymm5
-
- vmovdqu %ymm3,384-448(%r12)
- vmovdqu %ymm4,416-448(%r12)
- leaq 8(%r15),%r15
-
- vpmuludq 224-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 224-128(%r9),%ymm11,%ymm6
- vpaddq 480-448(%r12),%ymm6,%ymm6
-
- vpmuludq 256-128(%rsi),%ymm0,%ymm7
- vmovdqu %ymm5,448-448(%r12)
- vpaddq 512-448(%r12),%ymm7,%ymm7
- vmovdqu %ymm6,480-448(%r12)
- vmovdqu %ymm7,512-448(%r12)
- leaq 8(%r12),%r12
-
- decl %r14d
- jnz L$OOP_SQR_1024
-
- vmovdqu 256(%rsp),%ymm8
- vmovdqu 288(%rsp),%ymm1
- vmovdqu 320(%rsp),%ymm2
- leaq 192(%rsp),%rbx
-
- vpsrlq $29,%ymm8,%ymm14
- vpand %ymm15,%ymm8,%ymm8
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
-
- vpermq $147,%ymm14,%ymm14
- vpxor %ymm9,%ymm9,%ymm9
- vpermq $147,%ymm11,%ymm11
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm8,%ymm8
- vpblendd $3,%ymm11,%ymm9,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vpaddq %ymm11,%ymm2,%ymm2
- vmovdqu %ymm1,288-192(%rbx)
- vmovdqu %ymm2,320-192(%rbx)
-
- movq (%rsp),%rax
- movq 8(%rsp),%r10
- movq 16(%rsp),%r11
- movq 24(%rsp),%r12
- vmovdqu 32(%rsp),%ymm1
- vmovdqu 64-192(%rbx),%ymm2
- vmovdqu 96-192(%rbx),%ymm3
- vmovdqu 128-192(%rbx),%ymm4
- vmovdqu 160-192(%rbx),%ymm5
- vmovdqu 192-192(%rbx),%ymm6
- vmovdqu 224-192(%rbx),%ymm7
-
- movq %rax,%r9
- imull %ecx,%eax
- andl $536870911,%eax
- vmovd %eax,%xmm12
-
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpbroadcastq %xmm12,%ymm12
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- shrq $29,%r9
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- addq %r9,%r10
- addq %rax,%r11
- imulq 24-128(%r13),%rdx
- addq %rdx,%r12
-
- movq %r10,%rax
- imull %ecx,%eax
- andl $536870911,%eax
-
- movl $9,%r14d
- jmp L$OOP_REDUCE_1024
-
-.p2align 5
-L$OOP_REDUCE_1024:
- vmovd %eax,%xmm13
- vpbroadcastq %xmm13,%ymm13
-
- vpmuludq 32-128(%r13),%ymm12,%ymm10
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpaddq %ymm10,%ymm1,%ymm1
- addq %rax,%r10
- vpmuludq 64-128(%r13),%ymm12,%ymm14
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- vpaddq %ymm14,%ymm2,%ymm2
- vpmuludq 96-128(%r13),%ymm12,%ymm11
-.byte 0x67
- addq %rax,%r11
-.byte 0x67
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- shrq $29,%r10
- vpaddq %ymm11,%ymm3,%ymm3
- vpmuludq 128-128(%r13),%ymm12,%ymm10
- addq %rax,%r12
- addq %r10,%r11
- vpaddq %ymm10,%ymm4,%ymm4
- vpmuludq 160-128(%r13),%ymm12,%ymm14
- movq %r11,%rax
- imull %ecx,%eax
- vpaddq %ymm14,%ymm5,%ymm5
- vpmuludq 192-128(%r13),%ymm12,%ymm11
- andl $536870911,%eax
- vpaddq %ymm11,%ymm6,%ymm6
- vpmuludq 224-128(%r13),%ymm12,%ymm10
- vpaddq %ymm10,%ymm7,%ymm7
- vpmuludq 256-128(%r13),%ymm12,%ymm14
- vmovd %eax,%xmm12
-
- vpaddq %ymm14,%ymm8,%ymm8
-
- vpbroadcastq %xmm12,%ymm12
-
- vpmuludq 32-8-128(%r13),%ymm13,%ymm11
- vmovdqu 96-8-128(%r13),%ymm14
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpaddq %ymm11,%ymm1,%ymm1
- vpmuludq 64-8-128(%r13),%ymm13,%ymm10
- vmovdqu 128-8-128(%r13),%ymm11
- addq %rax,%r11
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- vpaddq %ymm10,%ymm2,%ymm2
- addq %r12,%rax
- shrq $29,%r11
- vpmuludq %ymm13,%ymm14,%ymm14
- vmovdqu 160-8-128(%r13),%ymm10
- addq %r11,%rax
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq %ymm13,%ymm11,%ymm11
- vmovdqu 192-8-128(%r13),%ymm14
-.byte 0x67
- movq %rax,%r12
- imull %ecx,%eax
- vpaddq %ymm11,%ymm4,%ymm4
- vpmuludq %ymm13,%ymm10,%ymm10
-.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
- andl $536870911,%eax
- vpaddq %ymm10,%ymm5,%ymm5
- vpmuludq %ymm13,%ymm14,%ymm14
- vmovdqu 256-8-128(%r13),%ymm10
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq %ymm13,%ymm11,%ymm11
- vmovdqu 288-8-128(%r13),%ymm9
- vmovd %eax,%xmm0
- imulq -128(%r13),%rax
- vpaddq %ymm11,%ymm7,%ymm7
- vpmuludq %ymm13,%ymm10,%ymm10
- vmovdqu 32-16-128(%r13),%ymm14
- vpbroadcastq %xmm0,%ymm0
- vpaddq %ymm10,%ymm8,%ymm8
- vpmuludq %ymm13,%ymm9,%ymm9
- vmovdqu 64-16-128(%r13),%ymm11
- addq %rax,%r12
-
- vmovdqu 32-24-128(%r13),%ymm13
- vpmuludq %ymm12,%ymm14,%ymm14
- vmovdqu 96-16-128(%r13),%ymm10
- vpaddq %ymm14,%ymm1,%ymm1
- vpmuludq %ymm0,%ymm13,%ymm13
- vpmuludq %ymm12,%ymm11,%ymm11
-.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
- vpaddq %ymm1,%ymm13,%ymm13
- vpaddq %ymm11,%ymm2,%ymm2
- vpmuludq %ymm12,%ymm10,%ymm10
- vmovdqu 160-16-128(%r13),%ymm11
-.byte 0x67
- vmovq %xmm13,%rax
- vmovdqu %ymm13,(%rsp)
- vpaddq %ymm10,%ymm3,%ymm3
- vpmuludq %ymm12,%ymm14,%ymm14
- vmovdqu 192-16-128(%r13),%ymm10
- vpaddq %ymm14,%ymm4,%ymm4
- vpmuludq %ymm12,%ymm11,%ymm11
- vmovdqu 224-16-128(%r13),%ymm14
- vpaddq %ymm11,%ymm5,%ymm5
- vpmuludq %ymm12,%ymm10,%ymm10
- vmovdqu 256-16-128(%r13),%ymm11
- vpaddq %ymm10,%ymm6,%ymm6
- vpmuludq %ymm12,%ymm14,%ymm14
- shrq $29,%r12
- vmovdqu 288-16-128(%r13),%ymm10
- addq %r12,%rax
- vpaddq %ymm14,%ymm7,%ymm7
- vpmuludq %ymm12,%ymm11,%ymm11
-
- movq %rax,%r9
- imull %ecx,%eax
- vpaddq %ymm11,%ymm8,%ymm8
- vpmuludq %ymm12,%ymm10,%ymm10
- andl $536870911,%eax
- vmovd %eax,%xmm12
- vmovdqu 96-24-128(%r13),%ymm11
-.byte 0x67
- vpaddq %ymm10,%ymm9,%ymm9
- vpbroadcastq %xmm12,%ymm12
-
- vpmuludq 64-24-128(%r13),%ymm0,%ymm14
- vmovdqu 128-24-128(%r13),%ymm10
- movq %rax,%rdx
- imulq -128(%r13),%rax
- movq 8(%rsp),%r10
- vpaddq %ymm14,%ymm2,%ymm1
- vpmuludq %ymm0,%ymm11,%ymm11
- vmovdqu 160-24-128(%r13),%ymm14
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
-.byte 0x67
- shrq $29,%r9
- movq 16(%rsp),%r11
- vpaddq %ymm11,%ymm3,%ymm2
- vpmuludq %ymm0,%ymm10,%ymm10
- vmovdqu 192-24-128(%r13),%ymm11
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- vpaddq %ymm10,%ymm4,%ymm3
- vpmuludq %ymm0,%ymm14,%ymm14
- vmovdqu 224-24-128(%r13),%ymm10
- imulq 24-128(%r13),%rdx
- addq %rax,%r11
- leaq (%r9,%r10,1),%rax
- vpaddq %ymm14,%ymm5,%ymm4
- vpmuludq %ymm0,%ymm11,%ymm11
- vmovdqu 256-24-128(%r13),%ymm14
- movq %rax,%r10
- imull %ecx,%eax
- vpmuludq %ymm0,%ymm10,%ymm10
- vpaddq %ymm11,%ymm6,%ymm5
- vmovdqu 288-24-128(%r13),%ymm11
- andl $536870911,%eax
- vpaddq %ymm10,%ymm7,%ymm6
- vpmuludq %ymm0,%ymm14,%ymm14
- addq 24(%rsp),%rdx
- vpaddq %ymm14,%ymm8,%ymm7
- vpmuludq %ymm0,%ymm11,%ymm11
- vpaddq %ymm11,%ymm9,%ymm8
- vmovq %r12,%xmm9
- movq %rdx,%r12
-
- decl %r14d
- jnz L$OOP_REDUCE_1024
- leaq 448(%rsp),%r12
- vpaddq %ymm9,%ymm13,%ymm0
- vpxor %ymm9,%ymm9,%ymm9
-
- vpaddq 288-192(%rbx),%ymm0,%ymm0
- vpaddq 320-448(%r12),%ymm1,%ymm1
- vpaddq 352-448(%r12),%ymm2,%ymm2
- vpaddq 384-448(%r12),%ymm3,%ymm3
- vpaddq 416-448(%r12),%ymm4,%ymm4
- vpaddq 448-448(%r12),%ymm5,%ymm5
- vpaddq 480-448(%r12),%ymm6,%ymm6
- vpaddq 512-448(%r12),%ymm7,%ymm7
- vpaddq 544-448(%r12),%ymm8,%ymm8
-
- vpsrlq $29,%ymm0,%ymm14
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm12
- vpermq $147,%ymm14,%ymm14
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm13
- vpermq $147,%ymm11,%ymm11
- vpand %ymm15,%ymm3,%ymm3
- vpermq $147,%ymm12,%ymm12
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $147,%ymm13,%ymm13
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm0,%ymm0
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm2,%ymm2
- vpblendd $3,%ymm13,%ymm9,%ymm13
- vpaddq %ymm12,%ymm3,%ymm3
- vpaddq %ymm13,%ymm4,%ymm4
-
- vpsrlq $29,%ymm0,%ymm14
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm12
- vpermq $147,%ymm14,%ymm14
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm13
- vpermq $147,%ymm11,%ymm11
- vpand %ymm15,%ymm3,%ymm3
- vpermq $147,%ymm12,%ymm12
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $147,%ymm13,%ymm13
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm0,%ymm0
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vmovdqu %ymm0,0-128(%rdi)
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm2,%ymm2
- vmovdqu %ymm1,32-128(%rdi)
- vpblendd $3,%ymm13,%ymm9,%ymm13
- vpaddq %ymm12,%ymm3,%ymm3
- vmovdqu %ymm2,64-128(%rdi)
- vpaddq %ymm13,%ymm4,%ymm4
- vmovdqu %ymm3,96-128(%rdi)
- vpsrlq $29,%ymm4,%ymm14
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm11
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm12
- vpermq $147,%ymm14,%ymm14
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm13
- vpermq $147,%ymm11,%ymm11
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm8,%ymm8
- vpermq $147,%ymm13,%ymm13
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $147,%ymm0,%ymm0
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm4,%ymm4
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm5,%ymm5
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm6,%ymm6
- vpblendd $3,%ymm13,%ymm0,%ymm13
- vpaddq %ymm12,%ymm7,%ymm7
- vpaddq %ymm13,%ymm8,%ymm8
-
- vpsrlq $29,%ymm4,%ymm14
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm11
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm12
- vpermq $147,%ymm14,%ymm14
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm13
- vpermq $147,%ymm11,%ymm11
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm8,%ymm8
- vpermq $147,%ymm13,%ymm13
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $147,%ymm0,%ymm0
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm4,%ymm4
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm5,%ymm5
- vmovdqu %ymm4,128-128(%rdi)
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm6,%ymm6
- vmovdqu %ymm5,160-128(%rdi)
- vpblendd $3,%ymm13,%ymm0,%ymm13
- vpaddq %ymm12,%ymm7,%ymm7
- vmovdqu %ymm6,192-128(%rdi)
- vpaddq %ymm13,%ymm8,%ymm8
- vmovdqu %ymm7,224-128(%rdi)
- vmovdqu %ymm8,256-128(%rdi)
-
- movq %rdi,%rsi
- decl %r8d
- jne L$OOP_GRANDE_SQR_1024
-
- vzeroall
- movq %rbp,%rax
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$sqr_1024_epilogue:
- .byte 0xf3,0xc3
-
-.globl _rsaz_1024_mul_avx2
-
-.p2align 6
-_rsaz_1024_mul_avx2:
- leaq (%rsp),%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rax,%rbp
- vzeroall
- movq %rdx,%r13
- subq $64,%rsp
-
-
-
-
-
-
-.byte 0x67,0x67
- movq %rsi,%r15
- andq $4095,%r15
- addq $320,%r15
- shrq $12,%r15
- movq %rsi,%r15
- cmovnzq %r13,%rsi
- cmovnzq %r15,%r13
-
- movq %rcx,%r15
- subq $-128,%rsi
- subq $-128,%rcx
- subq $-128,%rdi
-
- andq $4095,%r15
- addq $320,%r15
-.byte 0x67,0x67
- shrq $12,%r15
- jz L$mul_1024_no_n_copy
-
-
-
-
-
- subq $320,%rsp
- vmovdqu 0-128(%rcx),%ymm0
- andq $-512,%rsp
- vmovdqu 32-128(%rcx),%ymm1
- vmovdqu 64-128(%rcx),%ymm2
- vmovdqu 96-128(%rcx),%ymm3
- vmovdqu 128-128(%rcx),%ymm4
- vmovdqu 160-128(%rcx),%ymm5
- vmovdqu 192-128(%rcx),%ymm6
- vmovdqu 224-128(%rcx),%ymm7
- vmovdqu 256-128(%rcx),%ymm8
- leaq 64+128(%rsp),%rcx
- vmovdqu %ymm0,0-128(%rcx)
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm1,32-128(%rcx)
- vpxor %ymm1,%ymm1,%ymm1
- vmovdqu %ymm2,64-128(%rcx)
- vpxor %ymm2,%ymm2,%ymm2
- vmovdqu %ymm3,96-128(%rcx)
- vpxor %ymm3,%ymm3,%ymm3
- vmovdqu %ymm4,128-128(%rcx)
- vpxor %ymm4,%ymm4,%ymm4
- vmovdqu %ymm5,160-128(%rcx)
- vpxor %ymm5,%ymm5,%ymm5
- vmovdqu %ymm6,192-128(%rcx)
- vpxor %ymm6,%ymm6,%ymm6
- vmovdqu %ymm7,224-128(%rcx)
- vpxor %ymm7,%ymm7,%ymm7
- vmovdqu %ymm8,256-128(%rcx)
- vmovdqa %ymm0,%ymm8
- vmovdqu %ymm9,288-128(%rcx)
-L$mul_1024_no_n_copy:
- andq $-64,%rsp
-
- movq (%r13),%rbx
- vpbroadcastq (%r13),%ymm10
- vmovdqu %ymm0,(%rsp)
- xorq %r9,%r9
-.byte 0x67
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r12,%r12
-
- vmovdqu L$and_mask(%rip),%ymm15
- movl $9,%r14d
- vmovdqu %ymm9,288-128(%rdi)
- jmp L$oop_mul_1024
-
-.p2align 5
-L$oop_mul_1024:
- vpsrlq $29,%ymm3,%ymm9
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %r9,%rax
- movq %rbx,%r10
- imulq 8-128(%rsi),%r10
- addq 8(%rsp),%r10
-
- movq %rax,%r9
- imull %r8d,%eax
- andl $536870911,%eax
-
- movq %rbx,%r11
- imulq 16-128(%rsi),%r11
- addq 16(%rsp),%r11
-
- movq %rbx,%r12
- imulq 24-128(%rsi),%r12
- addq 24(%rsp),%r12
- vpmuludq 32-128(%rsi),%ymm10,%ymm0
- vmovd %eax,%xmm11
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq 64-128(%rsi),%ymm10,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 96-128(%rsi),%ymm10,%ymm13
- vpand %ymm15,%ymm3,%ymm3
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq 128-128(%rsi),%ymm10,%ymm0
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq 160-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 192-128(%rsi),%ymm10,%ymm13
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq 224-128(%rsi),%ymm10,%ymm0
- vpermq $147,%ymm9,%ymm9
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq 256-128(%rsi),%ymm10,%ymm12
- vpbroadcastq 8(%r13),%ymm10
- vpaddq %ymm12,%ymm8,%ymm8
-
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%rcx),%rax
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%rcx),%rax
- addq %rax,%r11
- shrq $29,%r9
- imulq 24-128(%rcx),%rdx
- addq %rdx,%r12
- addq %r9,%r10
-
- vpmuludq 32-128(%rcx),%ymm11,%ymm13
- vmovq %xmm10,%rbx
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 64-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm0,%ymm2,%ymm2
- vpmuludq 96-128(%rcx),%ymm11,%ymm12
- vpaddq %ymm12,%ymm3,%ymm3
- vpmuludq 128-128(%rcx),%ymm11,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 160-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm0,%ymm5,%ymm5
- vpmuludq 192-128(%rcx),%ymm11,%ymm12
- vpaddq %ymm12,%ymm6,%ymm6
- vpmuludq 224-128(%rcx),%ymm11,%ymm13
- vpblendd $3,%ymm14,%ymm9,%ymm9
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 256-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm9,%ymm3,%ymm3
- vpaddq %ymm0,%ymm8,%ymm8
-
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %rax,%r10
- vmovdqu -8+32-128(%rsi),%ymm12
- movq %rbx,%rax
- imulq 8-128(%rsi),%rax
- addq %rax,%r11
- vmovdqu -8+64-128(%rsi),%ymm13
-
- movq %r10,%rax
- imull %r8d,%eax
- andl $536870911,%eax
-
- imulq 16-128(%rsi),%rbx
- addq %rbx,%r12
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovd %eax,%xmm11
- vmovdqu -8+96-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -8+128-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -8+160-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -8+192-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -8+224-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -8+256-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -8+288-128(%rsi),%ymm9
- vpaddq %ymm12,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm13,%ymm13
- vpaddq %ymm13,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm9,%ymm9
- vpbroadcastq 16(%r13),%ymm10
-
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r10
- vmovdqu -8+32-128(%rcx),%ymm0
- movq %rdx,%rax
- imulq 8-128(%rcx),%rax
- addq %rax,%r11
- vmovdqu -8+64-128(%rcx),%ymm12
- shrq $29,%r10
- imulq 16-128(%rcx),%rdx
- addq %rdx,%r12
- addq %r10,%r11
-
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -8+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -8+128-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -8+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -8+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -8+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -8+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -8+288-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm11,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm11,%ymm13,%ymm13
- vpaddq %ymm13,%ymm9,%ymm9
-
- vmovdqu -16+32-128(%rsi),%ymm0
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %r11,%rax
-
- vmovdqu -16+64-128(%rsi),%ymm12
- movq %rax,%r11
- imull %r8d,%eax
- andl $536870911,%eax
-
- imulq 8-128(%rsi),%rbx
- addq %rbx,%r12
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovd %eax,%xmm11
- vmovdqu -16+96-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm12,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -16+128-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -16+160-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -16+192-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -16+224-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -16+256-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -16+288-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq 24(%r13),%ymm10
- vpaddq %ymm13,%ymm9,%ymm9
-
- vmovdqu -16+32-128(%rcx),%ymm0
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r11
- vmovdqu -16+64-128(%rcx),%ymm12
- imulq 8-128(%rcx),%rdx
- addq %rdx,%r12
- shrq $29,%r11
-
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -16+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -16+128-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -16+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -16+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -16+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -16+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -16+288-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -24+32-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+64-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm9,%ymm9
-
- addq %r11,%r12
- imulq -128(%rsi),%rbx
- addq %rbx,%r12
-
- movq %r12,%rax
- imull %r8d,%eax
- andl $536870911,%eax
-
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovd %eax,%xmm11
- vmovdqu -24+96-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm12,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -24+128-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -24+160-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -24+192-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -24+224-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -24+256-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -24+288-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq 32(%r13),%ymm10
- vpaddq %ymm13,%ymm9,%ymm9
- addq $32,%r13
-
- vmovdqu -24+32-128(%rcx),%ymm0
- imulq -128(%rcx),%rax
- addq %rax,%r12
- shrq $29,%r12
-
- vmovdqu -24+64-128(%rcx),%ymm12
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -24+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm0
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu %ymm0,(%rsp)
- vpaddq %ymm12,%ymm2,%ymm1
- vmovdqu -24+128-128(%rcx),%ymm0
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm2
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -24+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm3
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -24+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm4
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm5
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -24+288-128(%rcx),%ymm13
- movq %r12,%r9
- vpaddq %ymm0,%ymm7,%ymm6
- vpmuludq %ymm11,%ymm12,%ymm12
- addq (%rsp),%r9
- vpaddq %ymm12,%ymm8,%ymm7
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovq %r12,%xmm12
- vpaddq %ymm13,%ymm9,%ymm8
-
- decl %r14d
- jnz L$oop_mul_1024
- vpermq $0,%ymm15,%ymm15
- vpaddq (%rsp),%ymm12,%ymm0
-
- vpsrlq $29,%ymm0,%ymm12
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm13
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm10
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm11
- vpermq $147,%ymm13,%ymm13
- vpand %ymm15,%ymm3,%ymm3
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $147,%ymm10,%ymm10
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpermq $147,%ymm11,%ymm11
- vpaddq %ymm9,%ymm0,%ymm0
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm1,%ymm1
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm2,%ymm2
- vpblendd $3,%ymm11,%ymm14,%ymm11
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm11,%ymm4,%ymm4
-
- vpsrlq $29,%ymm0,%ymm12
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm13
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm10
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm11
- vpermq $147,%ymm13,%ymm13
- vpand %ymm15,%ymm3,%ymm3
- vpermq $147,%ymm10,%ymm10
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $147,%ymm11,%ymm11
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm0,%ymm0
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm1,%ymm1
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm2,%ymm2
- vpblendd $3,%ymm11,%ymm14,%ymm11
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm11,%ymm4,%ymm4
-
- vmovdqu %ymm0,0-128(%rdi)
- vmovdqu %ymm1,32-128(%rdi)
- vmovdqu %ymm2,64-128(%rdi)
- vmovdqu %ymm3,96-128(%rdi)
- vpsrlq $29,%ymm4,%ymm12
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm13
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm10
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm11
- vpermq $147,%ymm13,%ymm13
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $147,%ymm10,%ymm10
- vpand %ymm15,%ymm8,%ymm8
- vpermq $147,%ymm11,%ymm11
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $147,%ymm0,%ymm0
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm4,%ymm4
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm5,%ymm5
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm6,%ymm6
- vpblendd $3,%ymm11,%ymm0,%ymm11
- vpaddq %ymm10,%ymm7,%ymm7
- vpaddq %ymm11,%ymm8,%ymm8
-
- vpsrlq $29,%ymm4,%ymm12
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm13
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm10
- vpermq $147,%ymm12,%ymm12
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm11
- vpermq $147,%ymm13,%ymm13
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $147,%ymm10,%ymm10
- vpand %ymm15,%ymm8,%ymm8
- vpermq $147,%ymm11,%ymm11
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $147,%ymm0,%ymm0
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm4,%ymm4
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm5,%ymm5
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm6,%ymm6
- vpblendd $3,%ymm11,%ymm0,%ymm11
- vpaddq %ymm10,%ymm7,%ymm7
- vpaddq %ymm11,%ymm8,%ymm8
-
- vmovdqu %ymm4,128-128(%rdi)
- vmovdqu %ymm5,160-128(%rdi)
- vmovdqu %ymm6,192-128(%rdi)
- vmovdqu %ymm7,224-128(%rdi)
- vmovdqu %ymm8,256-128(%rdi)
- vzeroupper
-
- movq %rbp,%rax
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$mul_1024_epilogue:
- .byte 0xf3,0xc3
-
-.globl _rsaz_1024_red2norm_avx2
+.globl _rsaz_avx2_eligible
-.p2align 5
-_rsaz_1024_red2norm_avx2:
- subq $-128,%rsi
- xorq %rax,%rax
- movq -128(%rsi),%r8
- movq -120(%rsi),%r9
- movq -112(%rsi),%r10
- shlq $0,%r8
- shlq $29,%r9
- movq %r10,%r11
- shlq $58,%r10
- shrq $6,%r11
- addq %r8,%rax
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,0(%rdi)
- movq %r11,%rax
- movq -104(%rsi),%r8
- movq -96(%rsi),%r9
- shlq $23,%r8
- movq %r9,%r10
- shlq $52,%r9
- shrq $12,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,8(%rdi)
- movq %r10,%rax
- movq -88(%rsi),%r11
- movq -80(%rsi),%r8
- shlq $17,%r11
- movq %r8,%r9
- shlq $46,%r8
- shrq $18,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,16(%rdi)
- movq %r9,%rax
- movq -72(%rsi),%r10
- movq -64(%rsi),%r11
- shlq $11,%r10
- movq %r11,%r8
- shlq $40,%r11
- shrq $24,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,24(%rdi)
- movq %r8,%rax
- movq -56(%rsi),%r9
- movq -48(%rsi),%r10
- movq -40(%rsi),%r11
- shlq $5,%r9
- shlq $34,%r10
- movq %r11,%r8
- shlq $63,%r11
- shrq $1,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,32(%rdi)
- movq %r8,%rax
- movq -32(%rsi),%r9
- movq -24(%rsi),%r10
- shlq $28,%r9
- movq %r10,%r11
- shlq $57,%r10
- shrq $7,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,40(%rdi)
- movq %r11,%rax
- movq -16(%rsi),%r8
- movq -8(%rsi),%r9
- shlq $22,%r8
- movq %r9,%r10
- shlq $51,%r9
- shrq $13,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,48(%rdi)
- movq %r10,%rax
- movq 0(%rsi),%r11
- movq 8(%rsi),%r8
- shlq $16,%r11
- movq %r8,%r9
- shlq $45,%r8
- shrq $19,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,56(%rdi)
- movq %r9,%rax
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- shlq $10,%r10
- movq %r11,%r8
- shlq $39,%r11
- shrq $25,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,64(%rdi)
- movq %r8,%rax
- movq 32(%rsi),%r9
- movq 40(%rsi),%r10
- movq 48(%rsi),%r11
- shlq $4,%r9
- shlq $33,%r10
- movq %r11,%r8
- shlq $62,%r11
- shrq $2,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,72(%rdi)
- movq %r8,%rax
- movq 56(%rsi),%r9
- movq 64(%rsi),%r10
- shlq $27,%r9
- movq %r10,%r11
- shlq $56,%r10
- shrq $8,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,80(%rdi)
- movq %r11,%rax
- movq 72(%rsi),%r8
- movq 80(%rsi),%r9
- shlq $21,%r8
- movq %r9,%r10
- shlq $50,%r9
- shrq $14,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,88(%rdi)
- movq %r10,%rax
- movq 88(%rsi),%r11
- movq 96(%rsi),%r8
- shlq $15,%r11
- movq %r8,%r9
- shlq $44,%r8
- shrq $20,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,96(%rdi)
- movq %r9,%rax
- movq 104(%rsi),%r10
- movq 112(%rsi),%r11
- shlq $9,%r10
- movq %r11,%r8
- shlq $38,%r11
- shrq $26,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,104(%rdi)
- movq %r8,%rax
- movq 120(%rsi),%r9
- movq 128(%rsi),%r10
- movq 136(%rsi),%r11
- shlq $3,%r9
- shlq $32,%r10
- movq %r11,%r8
- shlq $61,%r11
- shrq $3,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,112(%rdi)
- movq %r8,%rax
- movq 144(%rsi),%r9
- movq 152(%rsi),%r10
- shlq $26,%r9
- movq %r10,%r11
- shlq $55,%r10
- shrq $9,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,120(%rdi)
- movq %r11,%rax
+_rsaz_avx2_eligible:
+ xorl %eax,%eax
.byte 0xf3,0xc3
+.globl _rsaz_1024_sqr_avx2
+.globl _rsaz_1024_mul_avx2
.globl _rsaz_1024_norm2red_avx2
-
-.p2align 5
-_rsaz_1024_norm2red_avx2:
- subq $-128,%rdi
- movq (%rsi),%r8
- movl $536870911,%eax
- movq 8(%rsi),%r9
- movq %r8,%r11
- shrq $0,%r11
- andq %rax,%r11
- movq %r11,-128(%rdi)
- movq %r8,%r10
- shrq $29,%r10
- andq %rax,%r10
- movq %r10,-120(%rdi)
- shrdq $58,%r9,%r8
- andq %rax,%r8
- movq %r8,-112(%rdi)
- movq 16(%rsi),%r10
- movq %r9,%r8
- shrq $23,%r8
- andq %rax,%r8
- movq %r8,-104(%rdi)
- shrdq $52,%r10,%r9
- andq %rax,%r9
- movq %r9,-96(%rdi)
- movq 24(%rsi),%r11
- movq %r10,%r9
- shrq $17,%r9
- andq %rax,%r9
- movq %r9,-88(%rdi)
- shrdq $46,%r11,%r10
- andq %rax,%r10
- movq %r10,-80(%rdi)
- movq 32(%rsi),%r8
- movq %r11,%r10
- shrq $11,%r10
- andq %rax,%r10
- movq %r10,-72(%rdi)
- shrdq $40,%r8,%r11
- andq %rax,%r11
- movq %r11,-64(%rdi)
- movq 40(%rsi),%r9
- movq %r8,%r11
- shrq $5,%r11
- andq %rax,%r11
- movq %r11,-56(%rdi)
- movq %r8,%r10
- shrq $34,%r10
- andq %rax,%r10
- movq %r10,-48(%rdi)
- shrdq $63,%r9,%r8
- andq %rax,%r8
- movq %r8,-40(%rdi)
- movq 48(%rsi),%r10
- movq %r9,%r8
- shrq $28,%r8
- andq %rax,%r8
- movq %r8,-32(%rdi)
- shrdq $57,%r10,%r9
- andq %rax,%r9
- movq %r9,-24(%rdi)
- movq 56(%rsi),%r11
- movq %r10,%r9
- shrq $22,%r9
- andq %rax,%r9
- movq %r9,-16(%rdi)
- shrdq $51,%r11,%r10
- andq %rax,%r10
- movq %r10,-8(%rdi)
- movq 64(%rsi),%r8
- movq %r11,%r10
- shrq $16,%r10
- andq %rax,%r10
- movq %r10,0(%rdi)
- shrdq $45,%r8,%r11
- andq %rax,%r11
- movq %r11,8(%rdi)
- movq 72(%rsi),%r9
- movq %r8,%r11
- shrq $10,%r11
- andq %rax,%r11
- movq %r11,16(%rdi)
- shrdq $39,%r9,%r8
- andq %rax,%r8
- movq %r8,24(%rdi)
- movq 80(%rsi),%r10
- movq %r9,%r8
- shrq $4,%r8
- andq %rax,%r8
- movq %r8,32(%rdi)
- movq %r9,%r11
- shrq $33,%r11
- andq %rax,%r11
- movq %r11,40(%rdi)
- shrdq $62,%r10,%r9
- andq %rax,%r9
- movq %r9,48(%rdi)
- movq 88(%rsi),%r11
- movq %r10,%r9
- shrq $27,%r9
- andq %rax,%r9
- movq %r9,56(%rdi)
- shrdq $56,%r11,%r10
- andq %rax,%r10
- movq %r10,64(%rdi)
- movq 96(%rsi),%r8
- movq %r11,%r10
- shrq $21,%r10
- andq %rax,%r10
- movq %r10,72(%rdi)
- shrdq $50,%r8,%r11
- andq %rax,%r11
- movq %r11,80(%rdi)
- movq 104(%rsi),%r9
- movq %r8,%r11
- shrq $15,%r11
- andq %rax,%r11
- movq %r11,88(%rdi)
- shrdq $44,%r9,%r8
- andq %rax,%r8
- movq %r8,96(%rdi)
- movq 112(%rsi),%r10
- movq %r9,%r8
- shrq $9,%r8
- andq %rax,%r8
- movq %r8,104(%rdi)
- shrdq $38,%r10,%r9
- andq %rax,%r9
- movq %r9,112(%rdi)
- movq 120(%rsi),%r11
- movq %r10,%r9
- shrq $3,%r9
- andq %rax,%r9
- movq %r9,120(%rdi)
- movq %r10,%r8
- shrq $32,%r8
- andq %rax,%r8
- movq %r8,128(%rdi)
- shrdq $61,%r11,%r10
- andq %rax,%r10
- movq %r10,136(%rdi)
- xorq %r8,%r8
- movq %r11,%r10
- shrq $26,%r10
- andq %rax,%r10
- movq %r10,144(%rdi)
- shrdq $55,%r8,%r11
- andq %rax,%r11
- movq %r11,152(%rdi)
- movq %r8,160(%rdi)
- movq %r8,168(%rdi)
- movq %r8,176(%rdi)
- movq %r8,184(%rdi)
- .byte 0xf3,0xc3
-
+.globl _rsaz_1024_red2norm_avx2
.globl _rsaz_1024_scatter5_avx2
-
-.p2align 5
-_rsaz_1024_scatter5_avx2:
- vzeroupper
- vmovdqu L$scatter_permd(%rip),%ymm5
- shll $4,%edx
- leaq (%rdi,%rdx,1),%rdi
- movl $9,%eax
- jmp L$oop_scatter_1024
-
-.p2align 5
-L$oop_scatter_1024:
- vmovdqu (%rsi),%ymm0
- leaq 32(%rsi),%rsi
- vpermd %ymm0,%ymm5,%ymm0
- vmovdqu %xmm0,(%rdi)
- leaq 512(%rdi),%rdi
- decl %eax
- jnz L$oop_scatter_1024
-
- vzeroupper
- .byte 0xf3,0xc3
-
-
.globl _rsaz_1024_gather5_avx2
-.p2align 5
+_rsaz_1024_sqr_avx2:
+_rsaz_1024_mul_avx2:
+_rsaz_1024_norm2red_avx2:
+_rsaz_1024_red2norm_avx2:
+_rsaz_1024_scatter5_avx2:
_rsaz_1024_gather5_avx2:
- leaq L$gather_table(%rip),%r11
- movl %edx,%eax
- andl $3,%edx
- shrl $2,%eax
- shll $4,%edx
-
- vmovdqu -32(%r11),%ymm7
- vpbroadcastb 8(%r11,%rax,1),%xmm8
- vpbroadcastb 7(%r11,%rax,1),%xmm9
- vpbroadcastb 6(%r11,%rax,1),%xmm10
- vpbroadcastb 5(%r11,%rax,1),%xmm11
- vpbroadcastb 4(%r11,%rax,1),%xmm12
- vpbroadcastb 3(%r11,%rax,1),%xmm13
- vpbroadcastb 2(%r11,%rax,1),%xmm14
- vpbroadcastb 1(%r11,%rax,1),%xmm15
-
- leaq 64(%rsi,%rdx,1),%rsi
- movq $64,%r11
- movl $9,%eax
- jmp L$oop_gather_1024
-
-.p2align 5
-L$oop_gather_1024:
- vpand -64(%rsi),%xmm8,%xmm0
- vpand (%rsi),%xmm9,%xmm1
- vpand 64(%rsi),%xmm10,%xmm2
- vpand (%rsi,%r11,2),%xmm11,%xmm3
- vpor %xmm0,%xmm1,%xmm1
- vpand 64(%rsi,%r11,2),%xmm12,%xmm4
- vpor %xmm2,%xmm3,%xmm3
- vpand (%rsi,%r11,4),%xmm13,%xmm5
- vpor %xmm1,%xmm3,%xmm3
- vpand 64(%rsi,%r11,4),%xmm14,%xmm6
- vpor %xmm4,%xmm5,%xmm5
- vpand -128(%rsi,%r11,8),%xmm15,%xmm2
- leaq (%rsi,%r11,8),%rsi
- vpor %xmm3,%xmm5,%xmm5
- vpor %xmm2,%xmm6,%xmm6
- vpor %xmm5,%xmm6,%xmm6
- vpermd %ymm6,%ymm7,%ymm6
- vmovdqu %ymm6,(%rdi)
- leaq 32(%rdi),%rdi
- decl %eax
- jnz L$oop_gather_1024
-
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm0,(%rdi)
- vzeroupper
+.byte 0x0f,0x0b
.byte 0xf3,0xc3
-
-
-.globl _rsaz_avx2_eligible
-
-.p2align 5
-_rsaz_avx2_eligible:
- movl _OPENSSL_ia32cap_P+8(%rip),%eax
- movl $524544,%ecx
- movl $0,%edx
- andl %eax,%ecx
- cmpl $524544,%ecx
- cmovel %edx,%eax
- andl $32,%eax
- shrl $5,%eax
- .byte 0xf3,0xc3
-
-
-.p2align 6
-L$and_mask:
-.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
-L$scatter_permd:
-.long 0,2,4,6,7,7,7,7
-L$gather_permd:
-.long 0,7,1,7,2,7,3,7
-L$gather_table:
-.byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
-.p2align 6
diff --git a/deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s b/deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s
index 23c540d3ac..b92f098e73 100644
--- a/deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s
@@ -19,10 +19,6 @@ L$sqr_body:
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
- movl $524544,%r11d
- andl _OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $524544,%r11d
- je L$oop_sqrx
jmp L$oop_sqr
.p2align 5
@@ -386,276 +382,6 @@ L$oop_sqr:
decl %r8d
jnz L$oop_sqr
- jmp L$sqr_tail
-
-.p2align 5
-L$oop_sqrx:
- movl %r8d,128+8(%rsp)
-.byte 102,72,15,110,199
-.byte 102,72,15,110,205
-
- mulxq %rax,%r8,%r9
-
- mulxq 16(%rsi),%rcx,%r10
- xorq %rbp,%rbp
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rcx,%r9
-
- mulxq 32(%rsi),%rcx,%r12
- adcxq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rcx,%r11
-
-.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r12
- adcxq %rcx,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
- adcxq %rax,%r14
- adcxq %rbp,%r15
-
- movq %r9,%rcx
- shldq $1,%r8,%r9
- shlq $1,%r8
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rdx,%r8
- movq 8(%rsi),%rdx
- adcxq %rbp,%r9
-
- movq %rax,(%rsp)
- movq %r8,8(%rsp)
-
-
- mulxq 16(%rsi),%rax,%rbx
- adoxq %rax,%r10
- adcxq %rbx,%r11
-
-.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00
- adoxq %rdi,%r11
- adcxq %r8,%r12
-
- mulxq 32(%rsi),%rax,%rbx
- adoxq %rax,%r12
- adcxq %rbx,%r13
-
- mulxq 40(%rsi),%rdi,%r8
- adoxq %rdi,%r13
- adcxq %r8,%r14
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adoxq %rax,%r14
- adcxq %rbx,%r15
-
-.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
- adoxq %rdi,%r15
- adcxq %rbp,%r8
- adoxq %rbp,%r8
-
- movq %r11,%rbx
- shldq $1,%r10,%r11
- shldq $1,%rcx,%r10
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rcx
- movq 16(%rsi),%rdx
- adcxq %rax,%r9
- adcxq %rcx,%r10
- adcxq %rbp,%r11
-
- movq %r9,16(%rsp)
-.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
-
-
-.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00
- adoxq %rdi,%r12
- adcxq %r9,%r13
-
- mulxq 32(%rsi),%rax,%rcx
- adoxq %rax,%r13
- adcxq %rcx,%r14
-
- mulxq 40(%rsi),%rdi,%r9
- adoxq %rdi,%r14
- adcxq %r9,%r15
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
- adoxq %rax,%r15
- adcxq %rcx,%r8
-
-.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00
- adoxq %rdi,%r8
- adcxq %rbp,%r9
- adoxq %rbp,%r9
-
- movq %r13,%rcx
- shldq $1,%r12,%r13
- shldq $1,%rbx,%r12
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r11
- adcxq %rdx,%r12
- movq 24(%rsi),%rdx
- adcxq %rbp,%r13
-
- movq %r11,32(%rsp)
-.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00
-
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00
- adoxq %rax,%r14
- adcxq %rbx,%r15
-
- mulxq 40(%rsi),%rdi,%r10
- adoxq %rdi,%r15
- adcxq %r10,%r8
-
- mulxq 48(%rsi),%rax,%rbx
- adoxq %rax,%r8
- adcxq %rbx,%r9
-
- mulxq 56(%rsi),%rdi,%r10
- adoxq %rdi,%r9
- adcxq %rbp,%r10
- adoxq %rbp,%r10
-
-.byte 0x66
- movq %r15,%rbx
- shldq $1,%r14,%r15
- shldq $1,%rcx,%r14
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r13
- adcxq %rdx,%r14
- movq 32(%rsi),%rdx
- adcxq %rbp,%r15
-
- movq %r13,48(%rsp)
- movq %r14,56(%rsp)
-
-
-.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00
- adoxq %rdi,%r8
- adcxq %r11,%r9
-
- mulxq 48(%rsi),%rax,%rcx
- adoxq %rax,%r9
- adcxq %rcx,%r10
-
- mulxq 56(%rsi),%rdi,%r11
- adoxq %rdi,%r10
- adcxq %rbp,%r11
- adoxq %rbp,%r11
-
- movq %r9,%rcx
- shldq $1,%r8,%r9
- shldq $1,%rbx,%r8
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r15
- adcxq %rdx,%r8
- movq 40(%rsi),%rdx
- adcxq %rbp,%r9
-
- movq %r15,64(%rsp)
- movq %r8,72(%rsp)
-
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adoxq %rax,%r10
- adcxq %rbx,%r11
-
-.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
- adoxq %rdi,%r11
- adcxq %rbp,%r12
- adoxq %rbp,%r12
-
- movq %r11,%rbx
- shldq $1,%r10,%r11
- shldq $1,%rcx,%r10
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r9
- adcxq %rdx,%r10
- movq 48(%rsi),%rdx
- adcxq %rbp,%r11
-
- movq %r9,80(%rsp)
- movq %r10,88(%rsp)
-
-
-.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
- adoxq %rax,%r12
- adoxq %rbp,%r13
-
- xorq %r14,%r14
- shldq $1,%r13,%r14
- shldq $1,%r12,%r13
- shldq $1,%rbx,%r12
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r11
- adcxq %rdx,%r12
- movq 56(%rsi),%rdx
- adcxq %rbp,%r13
-
-.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
-.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
-
-
- mulxq %rdx,%rax,%rdx
- adoxq %rax,%r13
- adoxq %rbp,%rdx
-
-.byte 0x66
- addq %rdx,%r14
-
- movq %r13,112(%rsp)
- movq %r14,120(%rsp)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
- addq 64(%rsp),%r8
- adcq 72(%rsp),%r9
- adcq 80(%rsp),%r10
- adcq 88(%rsp),%r11
- adcq 96(%rsp),%r12
- adcq 104(%rsp),%r13
- adcq 112(%rsp),%r14
- adcq 120(%rsp),%r15
- sbbq %rcx,%rcx
-
- call __rsaz_512_subtract
-
- movq %r8,%rdx
- movq %r9,%rax
- movl 128+8(%rsp),%r8d
- movq %rdi,%rsi
-
- decl %r8d
- jnz L$oop_sqrx
-
-L$sqr_tail:
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -684,10 +410,6 @@ L$mul_body:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
- movl $524544,%r11d
- andl _OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $524544,%r11d
- je L$mulx
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul
@@ -705,29 +427,6 @@ L$mul_body:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp L$mul_tail
-
-.p2align 5
-L$mulx:
- movq %rdx,%rbp
- movq (%rdx),%rdx
- call __rsaz_512_mulx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-L$mul_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -762,52 +461,94 @@ _rsaz_512_mul_gather4:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- subq $128+24,%rsp
+ subq $152,%rsp
L$mul_gather4_body:
- movl $524544,%r11d
- andl _OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $524544,%r11d
- je L$mulx_gather
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- movl (%rdx,%r9,4),%ebx
-.byte 102,72,15,110,201
+ movd %r9d,%xmm8
+ movdqa L$inc+16(%rip),%xmm1
+ movdqa L$inc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 0(%rdx),%xmm8
+ movdqa 16(%rdx),%xmm9
+ movdqa 32(%rdx),%xmm10
+ movdqa 48(%rdx),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rdx),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rdx),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rdx),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rdx),%xmm15
+ leaq 128(%rdx),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
- shlq $32,%rax
- orq %rax,%rbx
movq (%rsi),%rax
movq 8(%rsi),%rcx
- leaq 128(%rdx,%r9,4),%rbp
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
@@ -820,14 +561,12 @@ L$mul_gather4_body:
adcq $0,%r13
mulq %rbx
- leaq 128(%rbp),%rbp
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
@@ -839,6 +578,35 @@ L$mul_gather4_body:
.p2align 5
L$oop_mul_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
@@ -847,7 +615,6 @@ L$oop_mul_gather:
adcq $0,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
@@ -856,7 +623,6 @@ L$oop_mul_gather:
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
@@ -865,7 +631,6 @@ L$oop_mul_gather:
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
@@ -874,7 +639,6 @@ L$oop_mul_gather:
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -899,7 +663,6 @@ L$oop_mul_gather:
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
@@ -907,7 +670,6 @@ L$oop_mul_gather:
movq %rdx,%r15
adcq $0,%r15
- leaq 128(%rbp),%rbp
leaq 8(%rdi),%rdi
decl %ecx
@@ -922,8 +684,8 @@ L$oop_mul_gather:
movq %r14,48(%rdi)
movq %r15,56(%rdi)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
@@ -935,126 +697,6 @@ L$oop_mul_gather:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp L$mul_gather_tail
-
-.p2align 5
-L$mulx_gather:
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- leaq 128(%rdx,%r9,4),%rbp
- movl (%rdx,%r9,4),%edx
-.byte 102,72,15,110,201
- movq %r8,128(%rsp)
-
- shlq $32,%rax
- orq %rax,%rdx
- mulxq (%rsi),%rbx,%r8
- movq %rbx,(%rsp)
- xorl %edi,%edi
-
- mulxq 8(%rsi),%rax,%r9
- movd (%rbp),%xmm4
-
- mulxq 16(%rsi),%rbx,%r10
- movd 64(%rbp),%xmm5
- adcxq %rax,%r8
-
- mulxq 24(%rsi),%rax,%r11
- pslldq $4,%xmm5
- adcxq %rbx,%r9
-
- mulxq 32(%rsi),%rbx,%r12
- por %xmm5,%xmm4
- adcxq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rbx,%r11
-
- mulxq 48(%rsi),%rbx,%r14
- leaq 128(%rbp),%rbp
- adcxq %rax,%r12
-
- mulxq 56(%rsi),%rax,%r15
-.byte 102,72,15,126,226
- adcxq %rbx,%r13
- adcxq %rax,%r14
- movq %r8,%rbx
- adcxq %rdi,%r15
-
- movq $-7,%rcx
- jmp L$oop_mulx_gather
-
-.p2align 5
-L$oop_mulx_gather:
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rsi),%rax,%r9
-.byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rsi),%rax,%r10
- movd 64(%rbp),%xmm5
- leaq 128(%rbp),%rbp
- adcxq %rax,%r9
- adoxq %r11,%r10
-
-.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
- pslldq $4,%xmm5
- por %xmm5,%xmm4
- adcxq %rax,%r10
- adoxq %r12,%r11
-
- mulxq 32(%rsi),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rsi),%rax,%r15
-.byte 102,72,15,126,226
- movq %rbx,64(%rsp,%rcx,8)
- adcxq %rax,%r14
- adoxq %rdi,%r15
- movq %r8,%rbx
- adcxq %rdi,%r15
-
- incq %rcx
- jnz L$oop_mulx_gather
-
- movq %r8,64(%rsp)
- movq %r9,64+8(%rsp)
- movq %r10,64+16(%rsp)
- movq %r11,64+24(%rsp)
- movq %r12,64+32(%rsp)
- movq %r13,64+40(%rsp)
- movq %r14,64+48(%rsp)
- movq %r15,64+56(%rsp)
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
-L$mul_gather_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -1092,17 +734,13 @@ _rsaz_512_mul_scatter4:
movl %r9d,%r9d
subq $128+24,%rsp
L$mul_scatter4_body:
- leaq (%r8,%r9,4),%r8
+ leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
movq %rcx,128(%rsp)
movq %rdi,%rbp
- movl $524544,%r11d
- andl _OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $524544,%r11d
- je L$mulx_scatter
movq (%rdi),%rbx
call __rsaz_512_mul
@@ -1119,29 +757,6 @@ L$mul_scatter4_body:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp L$mul_scatter_tail
-
-.p2align 5
-L$mulx_scatter:
- movq (%rdi),%rdx
- call __rsaz_512_mulx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
-L$mul_scatter_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -1155,30 +770,14 @@ L$mul_scatter_tail:
call __rsaz_512_subtract
- movl %r8d,0(%rsi)
- shrq $32,%r8
- movl %r9d,128(%rsi)
- shrq $32,%r9
- movl %r10d,256(%rsi)
- shrq $32,%r10
- movl %r11d,384(%rsi)
- shrq $32,%r11
- movl %r12d,512(%rsi)
- shrq $32,%r12
- movl %r13d,640(%rsi)
- shrq $32,%r13
- movl %r14d,768(%rsi)
- shrq $32,%r14
- movl %r15d,896(%rsi)
- shrq $32,%r15
- movl %r8d,64(%rsi)
- movl %r9d,192(%rsi)
- movl %r10d,320(%rsi)
- movl %r11d,448(%rsi)
- movl %r12d,576(%rsi)
- movl %r13d,704(%rsi)
- movl %r14d,832(%rsi)
- movl %r15d,960(%rsi)
+ movq %r8,0(%rsi)
+ movq %r9,128(%rsi)
+ movq %r10,256(%rsi)
+ movq %r11,384(%rsi)
+ movq %r12,512(%rsi)
+ movq %r13,640(%rsi)
+ movq %r14,768(%rsi)
+ movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -1204,7 +803,6 @@ _rsaz_512_mul_by_one:
subq $128+24,%rsp
L$mul_by_one_body:
- movl _OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
movq %rcx,128(%rsp)
@@ -1225,16 +823,7 @@ L$mul_by_one_body:
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
- andl $524544,%eax
- cmpl $524544,%eax
- je L$by_one_callx
call __rsaz_512_reduce
- jmp L$by_one_tail
-.p2align 5
-L$by_one_callx:
- movq 128(%rsp),%rdx
- call __rsaz_512_reducex
-L$by_one_tail:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
@@ -1340,62 +929,6 @@ L$reduction_loop:
.p2align 5
-__rsaz_512_reducex:
-
- imulq %r8,%rdx
- xorq %rsi,%rsi
- movl $8,%ecx
- jmp L$reduction_loopx
-
-.p2align 5
-L$reduction_loopx:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rbx,%rax
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rbx,%r10
- adcxq %rbx,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rbx,%r11
- adcxq %rbx,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
- movq %rdx,%rax
- movq %r8,%rdx
- adcxq %rbx,%r11
- adoxq %r13,%r12
-
- mulxq 128+8(%rsp),%rbx,%rdx
- movq %rax,%rdx
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rbp),%rax,%r15
- movq %rbx,%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- adcxq %rsi,%r15
-
- decl %ecx
- jne L$reduction_loopx
-
- .byte 0xf3,0xc3
-
-
-.p2align 5
__rsaz_512_subtract:
movq %r8,(%rdi)
movq %r9,8(%rdi)
@@ -1593,140 +1126,18 @@ L$oop_mul:
.byte 0xf3,0xc3
-
-.p2align 5
-__rsaz_512_mulx:
- mulxq (%rsi),%rbx,%r8
- movq $-6,%rcx
-
- mulxq 8(%rsi),%rax,%r9
- movq %rbx,8(%rsp)
-
- mulxq 16(%rsi),%rbx,%r10
- adcq %rax,%r8
-
- mulxq 24(%rsi),%rax,%r11
- adcq %rbx,%r9
-
- mulxq 32(%rsi),%rbx,%r12
- adcq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcq %rbx,%r11
-
- mulxq 48(%rsi),%rbx,%r14
- adcq %rax,%r12
-
- mulxq 56(%rsi),%rax,%r15
- movq 8(%rbp),%rdx
- adcq %rbx,%r13
- adcq %rax,%r14
- adcq $0,%r15
-
- xorq %rdi,%rdi
- jmp L$oop_mulx
-
-.p2align 5
-L$oop_mulx:
- movq %r8,%rbx
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rsi),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rsi),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rsi),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rsi),%rax,%r15
- movq 64(%rbp,%rcx,8),%rdx
- movq %rbx,8+64-8(%rsp,%rcx,8)
- adcxq %rax,%r14
- adoxq %rdi,%r15
- adcxq %rdi,%r15
-
- incq %rcx
- jnz L$oop_mulx
-
- movq %r8,%rbx
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
-.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
- adcxq %rax,%r8
- adoxq %r10,%r9
-
-.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
- mulxq 32(%rsi),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
- adcxq %rax,%r14
- adoxq %rdi,%r15
- adcxq %rdi,%r15
-
- movq %rbx,8+64-8(%rsp)
- movq %r8,8+64(%rsp)
- movq %r9,8+64+8(%rsp)
- movq %r10,8+64+16(%rsp)
- movq %r11,8+64+24(%rsp)
- movq %r12,8+64+32(%rsp)
- movq %r13,8+64+40(%rsp)
- movq %r14,8+64+48(%rsp)
- movq %r15,8+64+56(%rsp)
-
- .byte 0xf3,0xc3
-
.globl _rsaz_512_scatter4
.p2align 4
_rsaz_512_scatter4:
- leaq (%rdi,%rdx,4),%rdi
+ leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp L$oop_scatter
.p2align 4
L$oop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
- movl %eax,(%rdi)
- shrq $32,%rax
- movl %eax,64(%rdi)
+ movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz L$oop_scatter
@@ -1737,18 +1148,72 @@ L$oop_scatter:
.p2align 4
_rsaz_512_gather4:
- leaq (%rsi,%rdx,4),%rsi
+ movd %edx,%xmm8
+ movdqa L$inc+16(%rip),%xmm1
+ movdqa L$inc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp L$oop_gather
.p2align 4
L$oop_gather:
- movl (%rsi),%eax
- movl 64(%rsi),%r8d
+ movdqa 0(%rsi),%xmm8
+ movdqa 16(%rsi),%xmm9
+ movdqa 32(%rsi),%xmm10
+ movdqa 48(%rsi),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rsi),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rsi),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rsi),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
- shlq $32,%r8
- orq %r8,%rax
- movq %rax,(%rdi)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz L$oop_gather
.byte 0xf3,0xc3
+L$SEH_end_rsaz_512_gather4:
+
+
+.p2align 6
+L$inc:
+.long 0,0, 1,1
+.long 2,2, 2,2
diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s
index 040c324c49..c0f0b4bd68 100644
--- a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s
+++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s
@@ -242,7 +242,7 @@ L$body_mul_2x2:
movq %rcx,56(%rsp)
movq %r8,64(%rsp)
- movq $15,%r8
+ movq $0xf,%r8
movq %rsi,%rax
movq %rcx,%rbp
call _mul_1x1
diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s
index 03b9c7d949..9b49555a4d 100644
--- a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s
+++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s
@@ -10,7 +10,6 @@ _bn_mul_mont:
jnz L$mul_enter
cmpl $8,%r9d
jb L$mul_enter
- movl _OPENSSL_ia32cap_P+8(%rip),%r11d
cmpq %rsi,%rdx
jne L$mul4x_enter
testl $7,%r9d
@@ -216,9 +215,6 @@ L$mul_epilogue:
.p2align 4
bn_mul4x_mont:
L$mul4x_enter:
- andl $524544,%r11d
- cmpl $524544,%r11d
- je L$mulx4x_enter
pushq %rbx
pushq %rbp
pushq %r12
@@ -616,7 +612,6 @@ L$mul4x_epilogue:
-
.p2align 5
bn_sqr8x_mont:
L$sqr8x_enter:
@@ -638,20 +633,20 @@ L$sqr8x_enter:
- leaq -64(%rsp,%r9,4),%r11
+ leaq -64(%rsp,%r9,2),%r11
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$sqr8x_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,4),%rsp
+ leaq -64(%rsp,%r9,2),%rsp
jmp L$sqr8x_sp_done
.p2align 5
L$sqr8x_sp_alt:
- leaq 4096-64(,%r9,4),%r10
- leaq -64(%rsp,%r9,4),%rsp
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -661,385 +656,81 @@ L$sqr8x_sp_done:
movq %r9,%r10
negq %r9
- leaq 64(%rsp,%r9,2),%r11
movq %r8,32(%rsp)
movq %rax,40(%rsp)
L$sqr8x_body:
- movq %r9,%rbp
-.byte 102,73,15,110,211
- shrq $3+2,%rbp
- movl _OPENSSL_ia32cap_P+8(%rip),%eax
- jmp L$sqr8x_copy_n
-
-.p2align 5
-L$sqr8x_copy_n:
- movq 0(%rcx),%xmm0
- movq 8(%rcx),%xmm1
- movq 16(%rcx),%xmm3
- movq 24(%rcx),%xmm4
- leaq 32(%rcx),%rcx
- movdqa %xmm0,0(%r11)
- movdqa %xmm1,16(%r11)
- movdqa %xmm3,32(%r11)
- movdqa %xmm4,48(%r11)
- leaq 64(%r11),%r11
- decq %rbp
- jnz L$sqr8x_copy_n
-
+.byte 102,72,15,110,209
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
- andl $524544,%eax
- cmpl $524544,%eax
- jne L$sqr8x_nox
-
- call _bn_sqrx8x_internal
-
- pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- leaq 64(%rsp,%r9,2),%rdx
- shrq $3+2,%r9
- movq 40(%rsp),%rsi
- jmp L$sqr8x_zero
-
-.p2align 5
-L$sqr8x_nox:
call _bn_sqr8x_internal
- pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- leaq 64(%rsp,%r9,2),%rdx
- shrq $3+2,%r9
- movq 40(%rsp),%rsi
- jmp L$sqr8x_zero
-
-.p2align 5
-L$sqr8x_zero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- movdqa %xmm0,32(%rax)
- movdqa %xmm0,48(%rax)
- leaq 64(%rax),%rax
- movdqa %xmm0,0(%rdx)
- movdqa %xmm0,16(%rdx)
- movdqa %xmm0,32(%rdx)
- movdqa %xmm0,48(%rdx)
- leaq 64(%rdx),%rdx
- decq %r9
- jnz L$sqr8x_zero
-
- movq $1,%rax
- movq -48(%rsi),%r15
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-L$sqr8x_epilogue:
- .byte 0xf3,0xc3
-
-
-.p2align 5
-bn_mulx4x_mont:
-L$mulx4x_enter:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- shll $3,%r9d
-.byte 0x67
- xorq %r10,%r10
- subq %r9,%r10
- movq (%r8),%r8
- leaq -72(%rsp,%r10,1),%rsp
- leaq (%rdx,%r9,1),%r10
- andq $-128,%rsp
-
-
-
-
-
-
-
-
-
-
-
-
- movq %r9,0(%rsp)
- shrq $5,%r9
- movq %r10,16(%rsp)
- subq $1,%r9
- movq %r8,24(%rsp)
- movq %rdi,32(%rsp)
- movq %rax,40(%rsp)
- movq %r9,48(%rsp)
- jmp L$mulx4x_body
-
-.p2align 5
-L$mulx4x_body:
- leaq 8(%rdx),%rdi
- movq (%rdx),%rdx
- leaq 64+32(%rsp),%rbx
- movq %rdx,%r9
-
- mulxq 0(%rsi),%r8,%rax
- mulxq 8(%rsi),%r11,%r14
- addq %rax,%r11
- movq %rdi,8(%rsp)
- mulxq 16(%rsi),%r12,%r13
- adcq %r14,%r12
- adcq $0,%r13
-
- movq %r8,%rdi
- imulq 24(%rsp),%r8
- xorq %rbp,%rbp
-
- mulxq 24(%rsi),%rax,%r14
- movq %r8,%rdx
- leaq 32(%rsi),%rsi
- adcxq %rax,%r13
- adcxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%rdi
- adoxq %r11,%r10
- mulxq 8(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
- movq 48(%rsp),%rdi
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r11,-24(%rbx)
- adcxq %rax,%r12
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r12,-16(%rbx)
-
- jmp L$mulx4x_1st
-
-.p2align 5
-L$mulx4x_1st:
- adcxq %rbp,%r15
- mulxq 0(%rsi),%r10,%rax
- adcxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
-.byte 0x67,0x67
- movq %r8,%rdx
- adcxq %rax,%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 8(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 16(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- movq %r11,-32(%rbx)
- adoxq %r15,%r13
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r13,-16(%rbx)
- decq %rdi
- jnz L$mulx4x_1st
-
- movq 0(%rsp),%rax
- movq 8(%rsp),%rdi
- adcq %rbp,%r15
- addq %r15,%r14
- sbbq %r15,%r15
- movq %r14,-8(%rbx)
- jmp L$mulx4x_outer
-.p2align 5
-L$mulx4x_outer:
- movq (%rdi),%rdx
- leaq 8(%rdi),%rdi
- subq %rax,%rsi
- movq %r15,(%rbx)
- leaq 64+32(%rsp),%rbx
- subq %rax,%rcx
-
- mulxq 0(%rsi),%r8,%r11
- xorl %ebp,%ebp
- movq %rdx,%r9
- mulxq 8(%rsi),%r14,%r12
- adoxq -32(%rbx),%r8
- adcxq %r14,%r11
- mulxq 16(%rsi),%r15,%r13
- adoxq -24(%rbx),%r11
- adcxq %r15,%r12
- adoxq %rbp,%r12
- adcxq %rbp,%r13
-
- movq %rdi,8(%rsp)
-.byte 0x67
- movq %r8,%r15
- imulq 24(%rsp),%r8
- xorl %ebp,%ebp
-
- mulxq 24(%rsi),%rax,%r14
- movq %r8,%rdx
- adoxq -16(%rbx),%r12
- adcxq %rax,%r13
- adoxq -8(%rbx),%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- adoxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 8(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 16(%rcx),%rax,%r12
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 24(%rcx),%rax,%r15
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
movq %r9,%rdx
- movq %r11,-24(%rbx)
- leaq 32(%rcx),%rcx
- adcxq %rax,%r12
- adoxq %rbp,%r15
- movq 48(%rsp),%rdi
- movq %r12,-16(%rbx)
-
- jmp L$mulx4x_inner
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp L$sqr8x_sub
.p2align 5
-L$mulx4x_inner:
- mulxq 0(%rsi),%r10,%rax
- adcxq %rbp,%r15
- adoxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq 0(%rbx),%r10
- adoxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq 8(%rbx),%r11
- adoxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
- movq %r8,%rdx
- adcxq 16(%rbx),%r12
- adoxq %rax,%r13
- adcxq 24(%rbx),%r13
- adoxq %rbp,%r14
- leaq 32(%rsi),%rsi
+L$sqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
leaq 32(%rbx),%rbx
- adcxq %rbp,%r14
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 8(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 16(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- adoxq %r15,%r13
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r11,-32(%rbx)
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz L$mulx4x_inner
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz L$sqr8x_sub
- movq 0(%rsp),%rax
- movq 8(%rsp),%rdi
- adcq %rbp,%r15
- subq 0(%rbx),%rbp
- adcq %r15,%r14
- movq -8(%rcx),%r8
- sbbq %r15,%r15
- movq %r14,-8(%rbx)
-
- cmpq 16(%rsp),%rdi
- jne L$mulx4x_outer
-
- subq %r14,%r8
- sbbq %r8,%r8
- orq %r8,%r15
-
- negq %rax
- xorq %rdx,%rdx
- movq 32(%rsp),%rdi
- leaq 64(%rsp),%rbx
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+.byte 102,72,15,110,200
pxor %xmm0,%xmm0
- movq 0(%rcx,%rax,1),%r8
- movq 8(%rcx,%rax,1),%r9
- negq %r8
- jmp L$mulx4x_sub_entry
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+ jmp L$sqr8x_cond_copy
.p2align 5
-L$mulx4x_sub:
- movq 0(%rcx,%rax,1),%r8
- movq 8(%rcx,%rax,1),%r9
- notq %r8
-L$mulx4x_sub_entry:
- movq 16(%rcx,%rax,1),%r10
- notq %r9
- andq %r15,%r8
- movq 24(%rcx,%rax,1),%r11
- notq %r10
- andq %r15,%r9
- notq %r11
- andq %r15,%r10
- andq %r15,%r11
-
- negq %rdx
- adcq 0(%rbx),%r8
- adcq 8(%rbx),%r9
- movdqa %xmm0,(%rbx)
- adcq 16(%rbx),%r10
- adcq 24(%rbx),%r11
- movdqa %xmm0,16(%rbx)
+L$sqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
leaq 32(%rbx),%rbx
- sbbq %rdx,%rdx
-
- movq %r8,0(%rdi)
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz L$sqr8x_cond_copy
- addq $32,%rax
- jnz L$mulx4x_sub
-
- movq 40(%rsp),%rsi
movq $1,%rax
movq -48(%rsi),%r15
movq -40(%rsi),%r14
@@ -1048,7 +739,7 @@ L$mulx4x_sub_entry:
movq -16(%rsi),%rbp
movq -8(%rsi),%rbx
leaq (%rsi),%rsp
-L$mulx4x_epilogue:
+L$sqr8x_epilogue:
.byte 0xf3,0xc3
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s
index 99a16f8503..c9731e162d 100644
--- a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s
+++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s
@@ -8,53 +8,157 @@
_bn_mul_mont_gather5:
testl $7,%r9d
jnz L$mul_enter
- movl _OPENSSL_ia32cap_P+8(%rip),%r11d
jmp L$mul4x_enter
.p2align 4
L$mul_enter:
movl %r9d,%r9d
movq %rsp,%rax
- movl 8(%rsp),%r10d
+ movd 8(%rsp),%xmm5
+ leaq L$inc(%rip),%r10
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+
leaq 2(%r9),%r11
negq %r11
- leaq (%rsp,%r11,8),%rsp
+ leaq -264(%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
L$mul_body:
- movq %rdx,%r12
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq L$magic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%r12,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
-
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
.byte 102,72,15,126,195
movq (%r8),%r8
@@ -63,29 +167,14 @@ L$mul_body:
xorq %r14,%r14
xorq %r15,%r15
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -118,14 +207,12 @@ L$1st_enter:
cmpq %r9,%r15
jne L$1st
-.byte 102,72,15,126,195
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
movq %r10,%r11
@@ -139,33 +226,78 @@ L$1st_enter:
jmp L$outer
.p2align 4
L$outer:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -201,15 +333,12 @@ L$inner_enter:
cmpq %r9,%r15
jne L$inner
-.byte 102,72,15,126,195
-
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
- movq (%rsp,%r15,8),%r10
+ movq (%rsp,%r9,8),%r10
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
xorq %rdx,%rdx
@@ -256,6 +385,7 @@ L$copy:
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -270,9 +400,6 @@ L$mul_epilogue:
.p2align 5
bn_mul4x_mont_gather5:
L$mul4x_enter:
- andl $524544,%r11d
- cmpl $524544,%r11d
- je L$mulx4x_enter
.byte 0x67
movq %rsp,%rax
pushq %rbx
@@ -281,10 +408,10 @@ L$mul4x_enter:
pushq %r13
pushq %r14
pushq %r15
+
.byte 0x67
- movl %r9d,%r10d
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
@@ -294,19 +421,21 @@ L$mul4x_enter:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$mul4xsp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp L$mul4xsp_done
.p2align 5
L$mul4xsp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -322,6 +451,7 @@ L$mul4x_body:
movq 40(%rsp),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -337,47 +467,141 @@ L$mul4x_epilogue:
.p2align 5
mul4x_internal:
shlq $5,%r9
- movl 8(%rax),%r10d
- leaq 256(%rdx,%r9,1),%r13
+ movd 8(%rax),%xmm5
+ leaq L$inc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
shrq $5,%r9
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq L$magic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%rdx,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- addq $7,%r11
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
- andq $7,%r11
-
- movq -96(%r12),%xmm0
- leaq 256(%r12),%r14
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
-.byte 0x67
- por %xmm1,%xmm0
- movq -96(%r14),%xmm1
-.byte 0x67
- pand %xmm7,%xmm3
-.byte 0x67
- por %xmm2,%xmm0
- movq -32(%r14),%xmm2
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq 32(%r14),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
.byte 102,72,15,126,195
- movq 96(%r14),%xmm0
+
movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)
@@ -391,26 +615,10 @@ mul4x_internal:
movq %rax,%r10
movq (%rcx),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq %r10,%rbp
-
-
-
-
-
-
-
- leaq 64+8(%rsp,%r11,8),%r14
+ leaq 64+8(%rsp),%r14
movq %rdx,%r11
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- leaq 512(%r12),%r12
- por %xmm1,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi,%r9,1),%rax
@@ -419,7 +627,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -429,7 +637,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -439,7 +647,7 @@ mul4x_internal:
L$1st4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -455,7 +663,7 @@ L$1st4x:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -485,7 +693,7 @@ L$1st4x:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -494,7 +702,7 @@ L$1st4x:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -504,7 +712,7 @@ L$1st4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -520,7 +728,7 @@ L$1st4x:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -533,8 +741,7 @@ L$1st4x:
movq %rdi,-16(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -545,6 +752,63 @@ L$1st4x:
.p2align 5
L$outer4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
movq (%r14,%r9,1),%r10
movq %r8,%rbp
mulq %rbx
@@ -552,25 +816,11 @@ L$outer4x:
movq (%rcx),%rax
adcq $0,%rdx
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
-
imulq %r10,%rbp
-.byte 0x67
movq %rdx,%r11
movq %rdi,(%r14)
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
leaq (%r14,%r9,1),%r14
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
@@ -580,7 +830,7 @@ L$outer4x:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -592,7 +842,7 @@ L$outer4x:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdx,%r13
jmp L$inner4x
@@ -601,7 +851,7 @@ L$outer4x:
L$inner4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -619,7 +869,7 @@ L$inner4x:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -653,7 +903,7 @@ L$inner4x:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -664,7 +914,7 @@ L$inner4x:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %r13,-8(%r14)
movq %rdx,%r13
@@ -674,7 +924,7 @@ L$inner4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -693,7 +943,7 @@ L$inner4x:
mulq %rbx
addq %rax,%r11
movq %rbp,%rax
- movq -16(%rcx),%rbp
+ movq -8(%rcx),%rbp
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -708,9 +958,8 @@ L$inner4x:
movq %r13,-24(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
movq %rdi,-16(%r14)
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -721,25 +970,28 @@ L$inner4x:
cmpq 16+8(%rsp),%r12
jb L$outer4x
+ xorq %rax,%rax
subq %r13,%rbp
adcq %r15,%r15
orq %r15,%rdi
- xorq $1,%rdi
+ subq %rdi,%rax
leaq (%r14,%r9,1),%rbx
- leaq (%rcx,%rdi,8),%rbp
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
movq %r9,%rcx
sarq $3+2,%rcx
movq 56+8(%rsp),%rdi
- jmp L$sqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp L$sqr4x_sub_entry
.globl _bn_power5
.p2align 5
_bn_power5:
- movl _OPENSSL_ia32cap_P+8(%rip),%r11d
- andl $524544,%r11d
- cmpl $524544,%r11d
- je L$powerx5_enter
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -747,9 +999,9 @@ _bn_power5:
pushq %r13
pushq %r14
pushq %r15
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leal (%r9,%r9,2),%r10d
negq %r9
movq (%r8),%r8
@@ -759,19 +1011,20 @@ _bn_power5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$pwr_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp L$pwr_sp_done
.p2align 5
L$pwr_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -799,10 +1052,15 @@ L$power5_body:
.byte 102,72,15,110,226
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
.byte 102,72,15,126,209
.byte 102,72,15,126,226
@@ -1346,9 +1604,9 @@ L$sqr4x_shift_n_add:
movq %rbx,-16(%rdi)
movq %r8,-8(%rdi)
.byte 102,72,15,126,213
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xorq %rax,%rax
- leaq (%rbp,%r9,2),%rcx
+ leaq (%r9,%rbp,1),%rcx
leaq 48+8(%rsp,%r9,2),%rdx
movq %rcx,0+8(%rsp)
leaq 48+8(%rsp,%r9,1),%rdi
@@ -1381,14 +1639,14 @@ L$8x_reduction_loop:
.p2align 5
L$8x_reduce:
mulq %rbx
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
negq %r8
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rbx,48-8+8(%rsp,%rcx,8)
@@ -1397,7 +1655,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq 32+8(%rsp),%rsi
@@ -1406,7 +1664,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi
addq %r11,%r10
@@ -1415,7 +1673,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1423,7 +1681,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1431,7 +1689,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1449,7 +1707,7 @@ L$8x_reduce:
decl %ecx
jnz L$8x_reduce
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
xorq %rax,%rax
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
@@ -1475,14 +1733,14 @@ L$8x_reduce:
L$8x_tail:
mulq %rbx
addq %rax,%r8
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
leaq 8(%rdi),%rdi
@@ -1491,7 +1749,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
@@ -1499,7 +1757,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
@@ -1507,7 +1765,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1515,7 +1773,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1523,7 +1781,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1541,7 +1799,7 @@ L$8x_tail:
decl %ecx
jnz L$8x_tail
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
jae L$8x_tail_done
@@ -1587,7 +1845,7 @@ L$8x_no_tail:
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax
- movq -16(%rbp),%rcx
+ movq -8(%rbp),%rcx
xorq %rsi,%rsi
.byte 102,72,15,126,213
@@ -1605,40 +1863,58 @@ L$8x_no_tail:
cmpq %rdx,%rdi
jb L$8x_reduction_loop
+ .byte 0xf3,0xc3
+
- subq %r15,%rcx
+.p2align 5
+__bn_post4x_internal:
+ movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx
- adcq %rsi,%rsi
movq %r9,%rcx
- orq %rsi,%rax
.byte 102,72,15,126,207
- xorq $1,%rax
+ negq %rax
.byte 102,72,15,126,206
- leaq (%rbp,%rax,8),%rbp
sarq $3+2,%rcx
- jmp L$sqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp L$sqr4x_sub_entry
-.p2align 5
+.p2align 4
L$sqr4x_sub:
-.byte 0x66
- movq 0(%rbx),%r12
- movq 8(%rbx),%r13
- sbbq 0(%rbp),%r12
- movq 16(%rbx),%r14
- sbbq 16(%rbp),%r13
- movq 24(%rbx),%r15
- leaq 32(%rbx),%rbx
- sbbq 32(%rbp),%r14
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+L$sqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
movq %r12,0(%rdi)
- sbbq 48(%rbp),%r15
- leaq 64(%rbp),%rbp
+ leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
+ sbbq %r10,%r10
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi
incq %rcx
jnz L$sqr4x_sub
+
movq %r9,%r10
negq %r9
.byte 0xf3,0xc3
@@ -1664,10 +1940,9 @@ bn_from_mont8x:
pushq %r13
pushq %r14
pushq %r15
-.byte 0x67
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
movq (%r8),%r8
@@ -1677,19 +1952,20 @@ bn_from_mont8x:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$from_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp L$from_sp_done
.p2align 5
L$from_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -1740,22 +2016,8 @@ L$mul_by_1:
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218
- movl _OPENSSL_ia32cap_P+8(%rip),%r11d
- andl $524544,%r11d
- cmpl $524544,%r11d
- jne L$from_mont_nox
-
- leaq (%rax,%r9,1),%rdi
- call sqrx8x_reduction
-
- pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- movq 40(%rsp),%rsi
- jmp L$from_mont_zero
-
-.p2align 5
-L$from_mont_nox:
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
@@ -1783,1119 +2045,6 @@ L$from_mont_zero:
L$from_epilogue:
.byte 0xf3,0xc3
-
-.p2align 5
-bn_mulx4x_mont_gather5:
-L$mulx4x_enter:
-.byte 0x67
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-.byte 0x67
- movl %r9d,%r10d
- shll $3,%r9d
- shll $3+2,%r10d
- negq %r9
- movq (%r8),%r8
-
-
-
-
-
-
-
-
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
- andq $4095,%r11
- cmpq %r11,%r10
- jb L$mulx4xsp_alt
- subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
- jmp L$mulx4xsp_done
-
-.p2align 5
-L$mulx4xsp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
- subq %r10,%r11
- movq $0,%r10
- cmovcq %r10,%r11
- subq %r11,%rsp
-L$mulx4xsp_done:
- andq $-64,%rsp
-
-
-
-
-
-
-
-
-
-
-
-
- movq %r8,32(%rsp)
- movq %rax,40(%rsp)
-L$mulx4x_body:
- call mulx4x_internal
-
- movq 40(%rsp),%rsi
- movq $1,%rax
- movq -48(%rsi),%r15
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-L$mulx4x_epilogue:
- .byte 0xf3,0xc3
-
-
-
-.p2align 5
-mulx4x_internal:
-.byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00
-.byte 0x67
- negq %r9
- shlq $5,%r9
- leaq 256(%rdx,%r9,1),%r13
- shrq $5+5,%r9
- movl 8(%rax),%r10d
- subq $1,%r9
- movq %r13,16+8(%rsp)
- movq %r9,24+8(%rsp)
- movq %rdi,56+8(%rsp)
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq L$magic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%rdx,%r11,8),%rdi
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- addq $7,%r11
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
- andq $7,%r11
-
- movq -96(%rdi),%xmm0
- leaq 256(%rdi),%rbx
- movq -32(%rdi),%xmm1
- pand %xmm4,%xmm0
- movq 32(%rdi),%xmm2
- pand %xmm5,%xmm1
- movq 96(%rdi),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- movq -96(%rbx),%xmm1
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
- movq -32(%rbx),%xmm2
- por %xmm3,%xmm0
-.byte 0x67,0x67
- pand %xmm4,%xmm1
- movq 32(%rbx),%xmm3
-
-.byte 102,72,15,126,194
- movq 96(%rbx),%xmm0
- leaq 512(%rdi),%rdi
- pand %xmm5,%xmm2
-.byte 0x67,0x67
- pand %xmm6,%xmm3
-
-
-
-
-
-
-
- leaq 64+32+8(%rsp,%r11,8),%rbx
-
- movq %rdx,%r9
- mulxq 0(%rsi),%r8,%rax
- mulxq 8(%rsi),%r11,%r12
- addq %rax,%r11
- mulxq 16(%rsi),%rax,%r13
- adcq %rax,%r12
- adcq $0,%r13
- mulxq 24(%rsi),%rax,%r14
-
- movq %r8,%r15
- imulq 32+8(%rsp),%r8
- xorq %rbp,%rbp
- movq %r8,%rdx
-
- por %xmm2,%xmm1
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- movq %rdi,8+8(%rsp)
- por %xmm1,%xmm0
-
-.byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00
- adcxq %rax,%r13
- adcxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 16(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 32(%rcx),%rax,%r12
- movq 24+8(%rsp),%rdi
-.byte 0x66
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 48(%rcx),%rax,%r15
-.byte 0x67,0x67
- movq %r9,%rdx
- movq %r11,-24(%rbx)
- adcxq %rax,%r12
- adoxq %rbp,%r15
-.byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00
- movq %r12,-16(%rbx)
-
-
-.p2align 5
-L$mulx4x_1st:
- adcxq %rbp,%r15
- mulxq 0(%rsi),%r10,%rax
- adcxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
-.byte 0x67,0x67
- movq %r8,%rdx
- adcxq %rax,%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 16(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 32(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- movq %r11,-32(%rbx)
- adoxq %r15,%r13
- mulxq 48(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 64(%rcx),%rcx
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz L$mulx4x_1st
-
- movq 8(%rsp),%rax
-.byte 102,72,15,126,194
- adcq %rbp,%r15
- leaq (%rsi,%rax,1),%rsi
- addq %r15,%r14
- movq 8+8(%rsp),%rdi
- adcq %rbp,%rbp
- movq %r14,-8(%rbx)
- jmp L$mulx4x_outer
-
-.p2align 5
-L$mulx4x_outer:
- movq %rbp,(%rbx)
- leaq 32(%rbx,%rax,1),%rbx
- mulxq 0(%rsi),%r8,%r11
- xorq %rbp,%rbp
- movq %rdx,%r9
- mulxq 8(%rsi),%r14,%r12
- adoxq -32(%rbx),%r8
- adcxq %r14,%r11
- mulxq 16(%rsi),%r15,%r13
- adoxq -24(%rbx),%r11
- adcxq %r15,%r12
- mulxq 24(%rsi),%rdx,%r14
- adoxq -16(%rbx),%r12
- adcxq %rdx,%r13
- leaq (%rcx,%rax,2),%rcx
- leaq 32(%rsi),%rsi
- adoxq -8(%rbx),%r13
- adcxq %rbp,%r14
- adoxq %rbp,%r14
-
-.byte 0x67
- movq %r8,%r15
- imulq 32+8(%rsp),%r8
-
- movq -96(%rdi),%xmm0
-.byte 0x67,0x67
- movq %r8,%rdx
- movq -32(%rdi),%xmm1
-.byte 0x67
- pand %xmm4,%xmm0
- movq 32(%rdi),%xmm2
-.byte 0x67
- pand %xmm5,%xmm1
- movq 96(%rdi),%xmm3
- addq $256,%rdi
-.byte 0x67
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- xorq %rbp,%rbp
- movq %rdi,8+8(%rsp)
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 16(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 32(%rcx),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 48(%rcx),%rax,%r15
- movq %r9,%rdx
- por %xmm2,%xmm0
- movq 24+8(%rsp),%rdi
- movq %r10,-32(%rbx)
- por %xmm3,%xmm0
- adcxq %rax,%r12
- movq %r11,-24(%rbx)
- adoxq %rbp,%r15
- movq %r12,-16(%rbx)
- leaq 64(%rcx),%rcx
- jmp L$mulx4x_inner
-
-.p2align 5
-L$mulx4x_inner:
- mulxq 0(%rsi),%r10,%rax
- adcxq %rbp,%r15
- adoxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq 0(%rbx),%r10
- adoxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq 8(%rbx),%r11
- adoxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
- movq %r8,%rdx
- adcxq 16(%rbx),%r12
- adoxq %rax,%r13
- adcxq 24(%rbx),%r13
- adoxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
- adcxq %rbp,%r14
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 16(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 32(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- adoxq %r15,%r13
- movq %r11,-32(%rbx)
- mulxq 48(%rcx),%rax,%r15
- movq %r9,%rdx
- leaq 64(%rcx),%rcx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz L$mulx4x_inner
-
- movq 0+8(%rsp),%rax
-.byte 102,72,15,126,194
- adcq %rbp,%r15
- subq 0(%rbx),%rdi
- movq 8+8(%rsp),%rdi
- movq 16+8(%rsp),%r10
- adcq %r15,%r14
- leaq (%rsi,%rax,1),%rsi
- adcq %rbp,%rbp
- movq %r14,-8(%rbx)
-
- cmpq %r10,%rdi
- jb L$mulx4x_outer
-
- movq -16(%rcx),%r10
- xorq %r15,%r15
- subq %r14,%r10
- adcq %r15,%r15
- orq %r15,%rbp
- xorq $1,%rbp
- leaq (%rbx,%rax,1),%rdi
- leaq (%rcx,%rax,2),%rcx
-.byte 0x67,0x67
- sarq $3+2,%rax
- leaq (%rcx,%rbp,8),%rbp
- movq 56+8(%rsp),%rdx
- movq %rax,%rcx
- jmp L$sqrx4x_sub
-
-
-.p2align 5
-bn_powerx5:
-L$powerx5_enter:
-.byte 0x67
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-.byte 0x67
- movl %r9d,%r10d
- shll $3,%r9d
- shll $3+2,%r10d
- negq %r9
- movq (%r8),%r8
-
-
-
-
-
-
-
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
- andq $4095,%r11
- cmpq %r11,%r10
- jb L$pwrx_sp_alt
- subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
- jmp L$pwrx_sp_done
-
-.p2align 5
-L$pwrx_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
- subq %r10,%r11
- movq $0,%r10
- cmovcq %r10,%r11
- subq %r11,%rsp
-L$pwrx_sp_done:
- andq $-64,%rsp
- movq %r9,%r10
- negq %r9
-
-
-
-
-
-
-
-
-
-
-
-
- pxor %xmm0,%xmm0
-.byte 102,72,15,110,207
-.byte 102,72,15,110,209
-.byte 102,73,15,110,218
-.byte 102,72,15,110,226
- movq %r8,32(%rsp)
- movq %rax,40(%rsp)
-L$powerx5_body:
-
- call __bn_sqrx8x_internal
- call __bn_sqrx8x_internal
- call __bn_sqrx8x_internal
- call __bn_sqrx8x_internal
- call __bn_sqrx8x_internal
-
- movq %r10,%r9
- movq %rsi,%rdi
-.byte 102,72,15,126,209
-.byte 102,72,15,126,226
- movq 40(%rsp),%rax
-
- call mulx4x_internal
-
- movq 40(%rsp),%rsi
- movq $1,%rax
- movq -48(%rsi),%r15
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-L$powerx5_epilogue:
- .byte 0xf3,0xc3
-
-
-.globl _bn_sqrx8x_internal
-.private_extern _bn_sqrx8x_internal
-
-.p2align 5
-_bn_sqrx8x_internal:
-__bn_sqrx8x_internal:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- leaq 48+8(%rsp),%rdi
- leaq (%rsi,%r9,1),%rbp
- movq %r9,0+8(%rsp)
- movq %rbp,8+8(%rsp)
- jmp L$sqr8x_zero_start
-
-.p2align 5
-.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
-L$sqrx8x_zero:
-.byte 0x3e
- movdqa %xmm0,0(%rdi)
- movdqa %xmm0,16(%rdi)
- movdqa %xmm0,32(%rdi)
- movdqa %xmm0,48(%rdi)
-L$sqr8x_zero_start:
- movdqa %xmm0,64(%rdi)
- movdqa %xmm0,80(%rdi)
- movdqa %xmm0,96(%rdi)
- movdqa %xmm0,112(%rdi)
- leaq 128(%rdi),%rdi
- subq $64,%r9
- jnz L$sqrx8x_zero
-
- movq 0(%rsi),%rdx
-
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r12,%r12
- xorq %r13,%r13
- xorq %r14,%r14
- xorq %r15,%r15
- leaq 48+8(%rsp),%rdi
- xorq %rbp,%rbp
- jmp L$sqrx8x_outer_loop
-
-.p2align 5
-L$sqrx8x_outer_loop:
- mulxq 8(%rsi),%r8,%rax
- adcxq %r9,%r8
- adoxq %rax,%r10
- mulxq 16(%rsi),%r9,%rax
- adcxq %r10,%r9
- adoxq %rax,%r11
-.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
- adcxq %r11,%r10
- adoxq %rax,%r12
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
- adcxq %r12,%r11
- adoxq %rax,%r13
- mulxq 40(%rsi),%r12,%rax
- adcxq %r13,%r12
- adoxq %rax,%r14
- mulxq 48(%rsi),%r13,%rax
- adcxq %r14,%r13
- adoxq %r15,%rax
- mulxq 56(%rsi),%r14,%r15
- movq 8(%rsi),%rdx
- adcxq %rax,%r14
- adoxq %rbp,%r15
- adcq 64(%rdi),%r15
- movq %r8,8(%rdi)
- movq %r9,16(%rdi)
- sbbq %rcx,%rcx
- xorq %rbp,%rbp
-
-
- mulxq 16(%rsi),%r8,%rbx
- mulxq 24(%rsi),%r9,%rax
- adcxq %r10,%r8
- adoxq %rbx,%r9
- mulxq 32(%rsi),%r10,%rbx
- adcxq %r11,%r9
- adoxq %rax,%r10
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
- adcxq %r12,%r10
- adoxq %rbx,%r11
-.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
- adcxq %r13,%r11
- adoxq %r14,%r12
-.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
- movq 16(%rsi),%rdx
- adcxq %rax,%r12
- adoxq %rbx,%r13
- adcxq %r15,%r13
- adoxq %rbp,%r14
- adcxq %rbp,%r14
-
- movq %r8,24(%rdi)
- movq %r9,32(%rdi)
-
- mulxq 24(%rsi),%r8,%rbx
- mulxq 32(%rsi),%r9,%rax
- adcxq %r10,%r8
- adoxq %rbx,%r9
- mulxq 40(%rsi),%r10,%rbx
- adcxq %r11,%r9
- adoxq %rax,%r10
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
- adcxq %r12,%r10
- adoxq %r13,%r11
-.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
-.byte 0x3e
- movq 24(%rsi),%rdx
- adcxq %rbx,%r11
- adoxq %rax,%r12
- adcxq %r14,%r12
- movq %r8,40(%rdi)
- movq %r9,48(%rdi)
- mulxq 32(%rsi),%r8,%rax
- adoxq %rbp,%r13
- adcxq %rbp,%r13
-
- mulxq 40(%rsi),%r9,%rbx
- adcxq %r10,%r8
- adoxq %rax,%r9
- mulxq 48(%rsi),%r10,%rax
- adcxq %r11,%r9
- adoxq %r12,%r10
- mulxq 56(%rsi),%r11,%r12
- movq 32(%rsi),%rdx
- movq 40(%rsi),%r14
- adcxq %rbx,%r10
- adoxq %rax,%r11
- movq 48(%rsi),%r15
- adcxq %r13,%r11
- adoxq %rbp,%r12
- adcxq %rbp,%r12
-
- movq %r8,56(%rdi)
- movq %r9,64(%rdi)
-
- mulxq %r14,%r9,%rax
- movq 56(%rsi),%r8
- adcxq %r10,%r9
- mulxq %r15,%r10,%rbx
- adoxq %rax,%r10
- adcxq %r11,%r10
- mulxq %r8,%r11,%rax
- movq %r14,%rdx
- adoxq %rbx,%r11
- adcxq %r12,%r11
-
- adcxq %rbp,%rax
-
- mulxq %r15,%r14,%rbx
- mulxq %r8,%r12,%r13
- movq %r15,%rdx
- leaq 64(%rsi),%rsi
- adcxq %r14,%r11
- adoxq %rbx,%r12
- adcxq %rax,%r12
- adoxq %rbp,%r13
-
-.byte 0x67,0x67
- mulxq %r8,%r8,%r14
- adcxq %r8,%r13
- adcxq %rbp,%r14
-
- cmpq 8+8(%rsp),%rsi
- je L$sqrx8x_outer_break
-
- negq %rcx
- movq $-8,%rcx
- movq %rbp,%r15
- movq 64(%rdi),%r8
- adcxq 72(%rdi),%r9
- adcxq 80(%rdi),%r10
- adcxq 88(%rdi),%r11
- adcq 96(%rdi),%r12
- adcq 104(%rdi),%r13
- adcq 112(%rdi),%r14
- adcq 120(%rdi),%r15
- leaq (%rsi),%rbp
- leaq 128(%rdi),%rdi
- sbbq %rax,%rax
-
- movq -64(%rsi),%rdx
- movq %rax,16+8(%rsp)
- movq %rdi,24+8(%rsp)
-
-
- xorl %eax,%eax
- jmp L$sqrx8x_loop
-
-.p2align 5
-L$sqrx8x_loop:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rbp),%rax,%r14
- movq %rbx,(%rdi,%rcx,8)
- movl $0,%ebx
- adcxq %rax,%r13
- adoxq %r15,%r14
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
- movq 8(%rsi,%rcx,8),%rdx
- adcxq %rax,%r14
- adoxq %rbx,%r15
- adcxq %rbx,%r15
-
-.byte 0x67
- incq %rcx
- jnz L$sqrx8x_loop
-
- leaq 64(%rbp),%rbp
- movq $-8,%rcx
- cmpq 8+8(%rsp),%rbp
- je L$sqrx8x_break
-
- subq 16+8(%rsp),%rbx
-.byte 0x66
- movq -64(%rsi),%rdx
- adcxq 0(%rdi),%r8
- adcxq 8(%rdi),%r9
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
-.byte 0x67
- sbbq %rax,%rax
- xorl %ebx,%ebx
- movq %rax,16+8(%rsp)
- jmp L$sqrx8x_loop
-
-.p2align 5
-L$sqrx8x_break:
- subq 16+8(%rsp),%r8
- movq 24+8(%rsp),%rcx
- movq 0(%rsi),%rdx
- xorl %ebp,%ebp
- movq %r8,0(%rdi)
- cmpq %rcx,%rdi
- je L$sqrx8x_outer_loop
-
- movq %r9,8(%rdi)
- movq 8(%rcx),%r9
- movq %r10,16(%rdi)
- movq 16(%rcx),%r10
- movq %r11,24(%rdi)
- movq 24(%rcx),%r11
- movq %r12,32(%rdi)
- movq 32(%rcx),%r12
- movq %r13,40(%rdi)
- movq 40(%rcx),%r13
- movq %r14,48(%rdi)
- movq 48(%rcx),%r14
- movq %r15,56(%rdi)
- movq 56(%rcx),%r15
- movq %rcx,%rdi
- jmp L$sqrx8x_outer_loop
-
-.p2align 5
-L$sqrx8x_outer_break:
- movq %r9,72(%rdi)
-.byte 102,72,15,126,217
- movq %r10,80(%rdi)
- movq %r11,88(%rdi)
- movq %r12,96(%rdi)
- movq %r13,104(%rdi)
- movq %r14,112(%rdi)
- leaq 48+8(%rsp),%rdi
- movq (%rsi,%rcx,1),%rdx
-
- movq 8(%rdi),%r11
- xorq %r10,%r10
- movq 0+8(%rsp),%r9
- adoxq %r11,%r11
- movq 16(%rdi),%r12
- movq 24(%rdi),%r13
-
-
-.p2align 5
-L$sqrx4x_shift_n_add:
- mulxq %rdx,%rax,%rbx
- adoxq %r12,%r12
- adcxq %r10,%rax
-.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
-.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
- adoxq %r13,%r13
- adcxq %r11,%rbx
- movq 40(%rdi),%r11
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r10,%r10
- adcxq %r12,%rax
- movq 16(%rsi,%rcx,1),%rdx
- movq 48(%rdi),%r12
- adoxq %r11,%r11
- adcxq %r13,%rbx
- movq 56(%rdi),%r13
- movq %rax,16(%rdi)
- movq %rbx,24(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r12,%r12
- adcxq %r10,%rax
- movq 24(%rsi,%rcx,1),%rdx
- leaq 32(%rcx),%rcx
- movq 64(%rdi),%r10
- adoxq %r13,%r13
- adcxq %r11,%rbx
- movq 72(%rdi),%r11
- movq %rax,32(%rdi)
- movq %rbx,40(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r10,%r10
- adcxq %r12,%rax
- jrcxz L$sqrx4x_shift_n_add_break
-.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
- adoxq %r11,%r11
- adcxq %r13,%rbx
- movq 80(%rdi),%r12
- movq 88(%rdi),%r13
- movq %rax,48(%rdi)
- movq %rbx,56(%rdi)
- leaq 64(%rdi),%rdi
- nop
- jmp L$sqrx4x_shift_n_add
-
-.p2align 5
-L$sqrx4x_shift_n_add_break:
- adcxq %r13,%rbx
- movq %rax,48(%rdi)
- movq %rbx,56(%rdi)
- leaq 64(%rdi),%rdi
-.byte 102,72,15,126,213
-sqrx8x_reduction:
- xorl %eax,%eax
- movq 32+8(%rsp),%rbx
- movq 48+8(%rsp),%rdx
- leaq -128(%rbp,%r9,2),%rcx
-
- movq %rcx,0+8(%rsp)
- movq %rdi,8+8(%rsp)
-
- leaq 48+8(%rsp),%rdi
- jmp L$sqrx8x_reduction_loop
-
-.p2align 5
-L$sqrx8x_reduction_loop:
- movq 8(%rdi),%r9
- movq 16(%rdi),%r10
- movq 24(%rdi),%r11
- movq 32(%rdi),%r12
- movq %rdx,%r8
- imulq %rbx,%rdx
- movq 40(%rdi),%r13
- movq 48(%rdi),%r14
- movq 56(%rdi),%r15
- movq %rax,24+8(%rsp)
-
- leaq 64(%rdi),%rdi
- xorq %rsi,%rsi
- movq $-8,%rcx
- jmp L$sqrx8x_reduce
-
-.p2align 5
-L$sqrx8x_reduce:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rbx,%rax
- adoxq %r9,%r8
-
- mulxq 16(%rbp),%rbx,%r9
- adcxq %rbx,%r8
- adoxq %r10,%r9
-
- mulxq 32(%rbp),%rbx,%r10
- adcxq %rbx,%r9
- adoxq %r11,%r10
-
- mulxq 48(%rbp),%rbx,%r11
- adcxq %rbx,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00
- movq %rdx,%rax
- movq %r8,%rdx
- adcxq %rbx,%r11
- adoxq %r13,%r12
-
- mulxq 32+8(%rsp),%rbx,%rdx
- movq %rax,%rdx
- movq %rax,64+48+8(%rsp,%rcx,8)
-
- mulxq 80(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 96(%rbp),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 112(%rbp),%rax,%r15
- movq %rbx,%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- adcxq %rsi,%r15
-
-.byte 0x67,0x67,0x67
- incq %rcx
- jnz L$sqrx8x_reduce
-
- movq %rsi,%rax
- cmpq 0+8(%rsp),%rbp
- jae L$sqrx8x_no_tail
-
- movq 48+8(%rsp),%rdx
- addq 0(%rdi),%r8
- leaq 128(%rbp),%rbp
- movq $-8,%rcx
- adcxq 8(%rdi),%r9
- adcxq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
- sbbq %rax,%rax
-
- xorq %rsi,%rsi
- movq %rax,16+8(%rsp)
- jmp L$sqrx8x_tail
-
-.p2align 5
-L$sqrx8x_tail:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 16(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 32(%rbp),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 48(%rbp),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 80(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 96(%rbp),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 112(%rbp),%rax,%r15
- movq 72+48+8(%rsp,%rcx,8),%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- movq %rbx,(%rdi,%rcx,8)
- movq %r8,%rbx
- adcxq %rsi,%r15
-
- incq %rcx
- jnz L$sqrx8x_tail
-
- cmpq 0+8(%rsp),%rbp
- jae L$sqrx8x_tail_done
-
- subq 16+8(%rsp),%rsi
- movq 48+8(%rsp),%rdx
- leaq 128(%rbp),%rbp
- adcq 0(%rdi),%r8
- adcq 8(%rdi),%r9
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
- sbbq %rax,%rax
- subq $8,%rcx
-
- xorq %rsi,%rsi
- movq %rax,16+8(%rsp)
- jmp L$sqrx8x_tail
-
-.p2align 5
-L$sqrx8x_tail_done:
- addq 24+8(%rsp),%r8
- adcq $0,%r9
- adcq $0,%r10
- adcq $0,%r11
- adcq $0,%r12
- adcq $0,%r13
- adcq $0,%r14
- adcq $0,%r15
-
-
- movq %rsi,%rax
-
- subq 16+8(%rsp),%rsi
-L$sqrx8x_no_tail:
- adcq 0(%rdi),%r8
-.byte 102,72,15,126,217
- adcq 8(%rdi),%r9
- movq 112(%rbp),%rsi
-.byte 102,72,15,126,213
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- adcq %rax,%rax
-
- movq 32+8(%rsp),%rbx
- movq 64(%rdi,%rcx,1),%rdx
-
- movq %r8,0(%rdi)
- leaq 64(%rdi),%r8
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
- movq %r12,32(%rdi)
- movq %r13,40(%rdi)
- movq %r14,48(%rdi)
- movq %r15,56(%rdi)
-
- leaq 64(%rdi,%rcx,1),%rdi
- cmpq 8+8(%rsp),%r8
- jb L$sqrx8x_reduction_loop
- xorl %ebx,%ebx
- subq %r15,%rsi
- adcq %rbx,%rbx
- movq %rcx,%r10
- orq %rbx,%rax
- movq %rcx,%r9
- xorq $1,%rax
- sarq $3+2,%rcx
-
- leaq (%rbp,%rax,8),%rbp
-.byte 102,72,15,126,202
-.byte 102,72,15,126,206
- jmp L$sqrx4x_sub
-
-.p2align 5
-L$sqrx4x_sub:
-.byte 0x66
- movq 0(%rdi),%r12
- movq 8(%rdi),%r13
- sbbq 0(%rbp),%r12
- movq 16(%rdi),%r14
- sbbq 16(%rbp),%r13
- movq 24(%rdi),%r15
- leaq 32(%rdi),%rdi
- sbbq 32(%rbp),%r14
- movq %r12,0(%rdx)
- sbbq 48(%rbp),%r15
- leaq 64(%rbp),%rbp
- movq %r13,8(%rdx)
- movq %r14,16(%rdx)
- movq %r15,24(%rdx)
- leaq 32(%rdx),%rdx
-
- incq %rcx
- jnz L$sqrx4x_sub
- negq %r9
-
- .byte 0xf3,0xc3
-
.globl _bn_get_bits5
.p2align 4
@@ -2935,45 +2084,169 @@ L$scatter_epilogue:
.globl _bn_gather5
-.p2align 4
+.p2align 5
_bn_gather5:
- movl %ecx,%r11d
- shrl $3,%ecx
- andq $7,%r11
- notl %ecx
- leaq L$magic_masks(%rip),%rax
- andl $3,%ecx
- leaq 128(%rdx,%r11,8),%rdx
- movq 0(%rax,%rcx,8),%xmm4
- movq 8(%rax,%rcx,8),%xmm5
- movq 16(%rax,%rcx,8),%xmm6
- movq 24(%rax,%rcx,8),%xmm7
+L$SEH_begin_bn_gather5:
+
+.byte 0x4c,0x8d,0x14,0x24
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq L$inc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
jmp L$gather
-.p2align 4
-L$gather:
- movq -128(%rdx),%xmm0
- movq -64(%rdx),%xmm1
- pand %xmm4,%xmm0
- movq 0(%rdx),%xmm2
- pand %xmm5,%xmm1
- movq 64(%rdx),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-.byte 0x67,0x67
- por %xmm2,%xmm0
- leaq 256(%rdx),%rdx
- por %xmm3,%xmm0
+.p2align 5
+L$gather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subl $1,%esi
jnz L$gather
+
+ leaq (%r10),%rsp
.byte 0xf3,0xc3
L$SEH_end_bn_gather5:
.p2align 6
-L$magic_masks:
-.long 0,0, 0,0, 0,0, -1,-1
-.long 0,0, 0,0, 0,0, 0,0
+L$inc:
+.long 0,0, 1,1
+.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s b/deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s
index f2eb8554e8..30456b900f 100644
--- a/deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s
@@ -332,8 +332,6 @@ _ecp_nistz256_neg:
.p2align 5
_ecp_nistz256_to_mont:
- movl $524544,%ecx
- andl _OPENSSL_ia32cap_P+8(%rip),%ecx
leaq L$RR(%rip),%rdx
jmp L$mul_mont
@@ -348,8 +346,6 @@ _ecp_nistz256_to_mont:
.p2align 5
_ecp_nistz256_mul_mont:
- movl $524544,%ecx
- andl _OPENSSL_ia32cap_P+8(%rip),%ecx
L$mul_mont:
pushq %rbp
pushq %rbx
@@ -357,8 +353,6 @@ L$mul_mont:
pushq %r13
pushq %r14
pushq %r15
- cmpl $524544,%ecx
- je L$mul_montx
movq %rdx,%rbx
movq 0(%rdx),%rax
movq 0(%rsi),%r9
@@ -367,19 +361,6 @@ L$mul_mont:
movq 24(%rsi),%r12
call __ecp_nistz256_mul_montq
- jmp L$mul_mont_done
-
-.p2align 5
-L$mul_montx:
- movq %rdx,%rbx
- movq 0(%rdx),%rdx
- movq 0(%rsi),%r9
- movq 8(%rsi),%r10
- movq 16(%rsi),%r11
- movq 24(%rsi),%r12
- leaq -128(%rsi),%rsi
-
- call __ecp_nistz256_mul_montx
L$mul_mont_done:
popq %r15
popq %r14
@@ -617,33 +598,18 @@ __ecp_nistz256_mul_montq:
.p2align 5
_ecp_nistz256_sqr_mont:
- movl $524544,%ecx
- andl _OPENSSL_ia32cap_P+8(%rip),%ecx
pushq %rbp
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- cmpl $524544,%ecx
- je L$sqr_montx
movq 0(%rsi),%rax
movq 8(%rsi),%r14
movq 16(%rsi),%r15
movq 24(%rsi),%r8
call __ecp_nistz256_sqr_montq
- jmp L$sqr_mont_done
-
-.p2align 5
-L$sqr_montx:
- movq 0(%rsi),%rdx
- movq 8(%rsi),%r14
- movq 16(%rsi),%r15
- movq 24(%rsi),%r8
- leaq -128(%rsi),%rsi
-
- call __ecp_nistz256_sqr_montx
L$sqr_mont_done:
popq %r15
popq %r14
@@ -816,304 +782,6 @@ __ecp_nistz256_sqr_montq:
.byte 0xf3,0xc3
-.p2align 5
-__ecp_nistz256_mul_montx:
-
-
- mulxq %r9,%r8,%r9
- mulxq %r10,%rcx,%r10
- movq $32,%r14
- xorq %r13,%r13
- mulxq %r11,%rbp,%r11
- movq L$poly+24(%rip),%r15
- adcq %rcx,%r9
- mulxq %r12,%rcx,%r12
- movq %r8,%rdx
- adcq %rbp,%r10
- shlxq %r14,%r8,%rbp
- adcq %rcx,%r11
- shrxq %r14,%r8,%rcx
- adcq $0,%r12
-
-
-
- addq %rbp,%r9
- adcq %rcx,%r10
-
- mulxq %r15,%rcx,%rbp
- movq 8(%rbx),%rdx
- adcq %rcx,%r11
- adcq %rbp,%r12
- adcq $0,%r13
- xorq %r8,%r8
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r9,%rdx
- adcxq %rcx,%r12
- shlxq %r14,%r9,%rcx
- adoxq %rbp,%r13
- shrxq %r14,%r9,%rbp
-
- adcxq %r8,%r13
- adoxq %r8,%r8
- adcq $0,%r8
-
-
-
- addq %rcx,%r10
- adcq %rbp,%r11
-
- mulxq %r15,%rcx,%rbp
- movq 16(%rbx),%rdx
- adcq %rcx,%r12
- adcq %rbp,%r13
- adcq $0,%r8
- xorq %r9,%r9
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r10,%rdx
- adcxq %rcx,%r13
- shlxq %r14,%r10,%rcx
- adoxq %rbp,%r8
- shrxq %r14,%r10,%rbp
-
- adcxq %r9,%r8
- adoxq %r9,%r9
- adcq $0,%r9
-
-
-
- addq %rcx,%r11
- adcq %rbp,%r12
-
- mulxq %r15,%rcx,%rbp
- movq 24(%rbx),%rdx
- adcq %rcx,%r13
- adcq %rbp,%r8
- adcq $0,%r9
- xorq %r10,%r10
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r11,%rdx
- adcxq %rcx,%r8
- shlxq %r14,%r11,%rcx
- adoxq %rbp,%r9
- shrxq %r14,%r11,%rbp
-
- adcxq %r10,%r9
- adoxq %r10,%r10
- adcq $0,%r10
-
-
-
- addq %rcx,%r12
- adcq %rbp,%r13
-
- mulxq %r15,%rcx,%rbp
- movq %r12,%rbx
- movq L$poly+8(%rip),%r14
- adcq %rcx,%r8
- movq %r13,%rdx
- adcq %rbp,%r9
- adcq $0,%r10
-
-
-
- xorl %eax,%eax
- movq %r8,%rcx
- sbbq $-1,%r12
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%rbp
- sbbq %r15,%r9
- sbbq $0,%r10
-
- cmovcq %rbx,%r12
- cmovcq %rdx,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %rbp,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-
-
-
-.p2align 5
-__ecp_nistz256_sqr_montx:
- mulxq %r14,%r9,%r10
- mulxq %r15,%rcx,%r11
- xorl %eax,%eax
- adcq %rcx,%r10
- mulxq %r8,%rbp,%r12
- movq %r14,%rdx
- adcq %rbp,%r11
- adcq $0,%r12
- xorq %r13,%r13
-
-
- mulxq %r15,%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq %r8,%rcx,%rbp
- movq %r15,%rdx
- adcxq %rcx,%r12
- adoxq %rbp,%r13
- adcq $0,%r13
-
-
- mulxq %r8,%rcx,%r14
- movq 0+128(%rsi),%rdx
- xorq %r15,%r15
- adcxq %r9,%r9
- adoxq %rcx,%r13
- adcxq %r10,%r10
- adoxq %r15,%r14
-
- mulxq %rdx,%r8,%rbp
- movq 8+128(%rsi),%rdx
- adcxq %r11,%r11
- adoxq %rbp,%r9
- adcxq %r12,%r12
- mulxq %rdx,%rcx,%rax
- movq 16+128(%rsi),%rdx
- adcxq %r13,%r13
- adoxq %rcx,%r10
- adcxq %r14,%r14
-.byte 0x67
- mulxq %rdx,%rcx,%rbp
- movq 24+128(%rsi),%rdx
- adoxq %rax,%r11
- adcxq %r15,%r15
- adoxq %rcx,%r12
- movq $32,%rsi
- adoxq %rbp,%r13
-.byte 0x67,0x67
- mulxq %rdx,%rcx,%rax
- movq %r8,%rdx
- adoxq %rcx,%r14
- shlxq %rsi,%r8,%rcx
- adoxq %rax,%r15
- shrxq %rsi,%r8,%rax
- movq L$poly+24(%rip),%rbp
-
-
- addq %rcx,%r9
- adcq %rax,%r10
-
- mulxq %rbp,%rcx,%r8
- movq %r9,%rdx
- adcq %rcx,%r11
- shlxq %rsi,%r9,%rcx
- adcq $0,%r8
- shrxq %rsi,%r9,%rax
-
-
- addq %rcx,%r10
- adcq %rax,%r11
-
- mulxq %rbp,%rcx,%r9
- movq %r10,%rdx
- adcq %rcx,%r8
- shlxq %rsi,%r10,%rcx
- adcq $0,%r9
- shrxq %rsi,%r10,%rax
-
-
- addq %rcx,%r11
- adcq %rax,%r8
-
- mulxq %rbp,%rcx,%r10
- movq %r11,%rdx
- adcq %rcx,%r9
- shlxq %rsi,%r11,%rcx
- adcq $0,%r10
- shrxq %rsi,%r11,%rax
-
-
- addq %rcx,%r8
- adcq %rax,%r9
-
- mulxq %rbp,%rcx,%r11
- adcq %rcx,%r10
- adcq $0,%r11
-
- xorq %rdx,%rdx
- adcq %r8,%r12
- movq L$poly+8(%rip),%rsi
- adcq %r9,%r13
- movq %r12,%r8
- adcq %r10,%r14
- adcq %r11,%r15
- movq %r13,%r9
- adcq $0,%rdx
-
- xorl %eax,%eax
- sbbq $-1,%r12
- movq %r14,%r10
- sbbq %rsi,%r13
- sbbq $0,%r14
- movq %r15,%r11
- sbbq %rbp,%r15
- sbbq $0,%rdx
-
- cmovcq %r8,%r12
- cmovcq %r9,%r13
- movq %r12,0(%rdi)
- cmovcq %r10,%r14
- movq %r13,8(%rdi)
- cmovcq %r11,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
-
- .byte 0xf3,0xc3
-
-
@@ -1215,9 +883,6 @@ _ecp_nistz256_from_mont:
.p2align 5
_ecp_nistz256_select_w5:
- movl _OPENSSL_ia32cap_P+8(%rip),%eax
- testl $32,%eax
- jnz L$avx2_select_w5
movdqa L$One(%rip),%xmm0
movd %edx,%xmm1
@@ -1277,9 +942,6 @@ L$select_loop_sse_w5:
.p2align 5
_ecp_nistz256_select_w7:
- movl _OPENSSL_ia32cap_P+8(%rip),%eax
- testl $32,%eax
- jnz L$avx2_select_w7
movdqa L$One(%rip),%xmm8
movd %edx,%xmm1
@@ -1321,141 +983,11 @@ L$select_loop_sse_w7:
movdqu %xmm5,48(%rdi)
.byte 0xf3,0xc3
-
-
-
-.p2align 5
-ecp_nistz256_avx2_select_w5:
-L$avx2_select_w5:
- vzeroupper
- vmovdqa L$Two(%rip),%ymm0
-
- vpxor %ymm2,%ymm2,%ymm2
- vpxor %ymm3,%ymm3,%ymm3
- vpxor %ymm4,%ymm4,%ymm4
-
- vmovdqa L$One(%rip),%ymm5
- vmovdqa L$Two(%rip),%ymm10
-
- vmovd %edx,%xmm1
- vpermd %ymm1,%ymm2,%ymm1
-
- movq $8,%rax
-L$select_loop_avx2_w5:
-
- vmovdqa 0(%rsi),%ymm6
- vmovdqa 32(%rsi),%ymm7
- vmovdqa 64(%rsi),%ymm8
-
- vmovdqa 96(%rsi),%ymm11
- vmovdqa 128(%rsi),%ymm12
- vmovdqa 160(%rsi),%ymm13
-
- vpcmpeqd %ymm1,%ymm5,%ymm9
- vpcmpeqd %ymm1,%ymm10,%ymm14
-
- vpaddd %ymm0,%ymm5,%ymm5
- vpaddd %ymm0,%ymm10,%ymm10
- leaq 192(%rsi),%rsi
-
- vpand %ymm9,%ymm6,%ymm6
- vpand %ymm9,%ymm7,%ymm7
- vpand %ymm9,%ymm8,%ymm8
- vpand %ymm14,%ymm11,%ymm11
- vpand %ymm14,%ymm12,%ymm12
- vpand %ymm14,%ymm13,%ymm13
-
- vpxor %ymm6,%ymm2,%ymm2
- vpxor %ymm7,%ymm3,%ymm3
- vpxor %ymm8,%ymm4,%ymm4
- vpxor %ymm11,%ymm2,%ymm2
- vpxor %ymm12,%ymm3,%ymm3
- vpxor %ymm13,%ymm4,%ymm4
-
- decq %rax
- jnz L$select_loop_avx2_w5
-
- vmovdqu %ymm2,0(%rdi)
- vmovdqu %ymm3,32(%rdi)
- vmovdqu %ymm4,64(%rdi)
- vzeroupper
- .byte 0xf3,0xc3
-
-
-
-
.globl _ecp_nistz256_avx2_select_w7
.p2align 5
_ecp_nistz256_avx2_select_w7:
-L$avx2_select_w7:
- vzeroupper
- vmovdqa L$Three(%rip),%ymm0
-
- vpxor %ymm2,%ymm2,%ymm2
- vpxor %ymm3,%ymm3,%ymm3
-
- vmovdqa L$One(%rip),%ymm4
- vmovdqa L$Two(%rip),%ymm8
- vmovdqa L$Three(%rip),%ymm12
-
- vmovd %edx,%xmm1
- vpermd %ymm1,%ymm2,%ymm1
-
-
- movq $21,%rax
-L$select_loop_avx2_w7:
-
- vmovdqa 0(%rsi),%ymm5
- vmovdqa 32(%rsi),%ymm6
-
- vmovdqa 64(%rsi),%ymm9
- vmovdqa 96(%rsi),%ymm10
-
- vmovdqa 128(%rsi),%ymm13
- vmovdqa 160(%rsi),%ymm14
-
- vpcmpeqd %ymm1,%ymm4,%ymm7
- vpcmpeqd %ymm1,%ymm8,%ymm11
- vpcmpeqd %ymm1,%ymm12,%ymm15
-
- vpaddd %ymm0,%ymm4,%ymm4
- vpaddd %ymm0,%ymm8,%ymm8
- vpaddd %ymm0,%ymm12,%ymm12
- leaq 192(%rsi),%rsi
-
- vpand %ymm7,%ymm5,%ymm5
- vpand %ymm7,%ymm6,%ymm6
- vpand %ymm11,%ymm9,%ymm9
- vpand %ymm11,%ymm10,%ymm10
- vpand %ymm15,%ymm13,%ymm13
- vpand %ymm15,%ymm14,%ymm14
-
- vpxor %ymm5,%ymm2,%ymm2
- vpxor %ymm6,%ymm3,%ymm3
- vpxor %ymm9,%ymm2,%ymm2
- vpxor %ymm10,%ymm3,%ymm3
- vpxor %ymm13,%ymm2,%ymm2
- vpxor %ymm14,%ymm3,%ymm3
-
- decq %rax
- jnz L$select_loop_avx2_w7
-
-
- vmovdqa 0(%rsi),%ymm5
- vmovdqa 32(%rsi),%ymm6
-
- vpcmpeqd %ymm1,%ymm4,%ymm7
-
- vpand %ymm7,%ymm5,%ymm5
- vpand %ymm7,%ymm6,%ymm6
-
- vpxor %ymm5,%ymm2,%ymm2
- vpxor %ymm6,%ymm3,%ymm3
-
- vmovdqu %ymm2,0(%rdi)
- vmovdqu %ymm3,32(%rdi)
- vzeroupper
+.byte 0x0f,0x0b
.byte 0xf3,0xc3
@@ -1581,10 +1113,6 @@ __ecp_nistz256_mul_by_2q:
.p2align 5
_ecp_nistz256_point_double:
- movl $524544,%ecx
- andl _OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $524544,%ecx
- je L$point_doublex
pushq %rbp
pushq %rbx
pushq %r12
@@ -1593,6 +1121,7 @@ _ecp_nistz256_point_double:
pushq %r15
subq $160+8,%rsp
+L$point_double_shortcutq:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
movdqu 16(%rsi),%xmm1
@@ -1786,10 +1315,6 @@ _ecp_nistz256_point_double:
.p2align 5
_ecp_nistz256_point_add:
- movl $524544,%ecx
- andl _OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $524544,%ecx
- je L$point_addx
pushq %rbp
pushq %rbx
pushq %r12
@@ -1817,7 +1342,7 @@ _ecp_nistz256_point_add:
por %xmm1,%xmm3
movdqu 0(%rsi),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
por %xmm3,%xmm5
@@ -1827,7 +1352,7 @@ _ecp_nistz256_point_add:
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
movdqa %xmm0,480(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,480+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1847,10 +1372,10 @@ _ecp_nistz256_point_add:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
por %xmm3,%xmm4
pxor %xmm3,%xmm3
pcmpeqd %xmm3,%xmm4
@@ -1859,6 +1384,7 @@ _ecp_nistz256_point_add:
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -1950,7 +1476,7 @@ _ecp_nistz256_point_add:
testq %r8,%r8
jnz L$add_proceedq
testq %r9,%r9
- jz L$add_proceedq
+ jz L$add_doubleq
.byte 102,72,15,126,199
pxor %xmm0,%xmm0
@@ -1963,6 +1489,13 @@ _ecp_nistz256_point_add:
jmp L$add_doneq
.p2align 5
+L$add_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+ jmp L$point_double_shortcutq
+
+.p2align 5
L$add_proceedq:
movq 0+64(%rsp),%rax
movq 8+64(%rsp),%r14
@@ -2179,10 +1712,6 @@ L$add_doneq:
.p2align 5
_ecp_nistz256_point_add_affine:
- movl $524544,%ecx
- andl _OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $524544,%ecx
- je L$point_add_affinex
pushq %rbp
pushq %rbx
pushq %r12
@@ -2213,13 +1742,13 @@ _ecp_nistz256_point_add_affine:
por %xmm1,%xmm3
movdqu 0(%rbx),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rbx),%xmm1
movdqu 32(%rbx),%xmm2
por %xmm3,%xmm5
movdqu 48(%rbx),%xmm3
movdqa %xmm0,416(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -2235,13 +1764,13 @@ _ecp_nistz256_point_add_affine:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
movq 0(%rbx),%rax
movq %r12,%r9
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
movq %r13,%r10
por %xmm3,%xmm4
pxor %xmm3,%xmm3
@@ -2481,1023 +2010,3 @@ _ecp_nistz256_point_add_affine:
popq %rbx
popq %rbp
.byte 0xf3,0xc3
-
-
-.p2align 5
-__ecp_nistz256_add_tox:
- xorq %r11,%r11
- adcq 0(%rbx),%r12
- adcq 8(%rbx),%r13
- movq %r12,%rax
- adcq 16(%rbx),%r8
- adcq 24(%rbx),%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- xorq %r10,%r10
- sbbq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
-
- btq $0,%r11
- cmovncq %rax,%r12
- cmovncq %rbp,%r13
- movq %r12,0(%rdi)
- cmovncq %rcx,%r8
- movq %r13,8(%rdi)
- cmovncq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-
-
-
-.p2align 5
-__ecp_nistz256_sub_fromx:
- xorq %r11,%r11
- sbbq 0(%rbx),%r12
- sbbq 8(%rbx),%r13
- movq %r12,%rax
- sbbq 16(%rbx),%r8
- sbbq 24(%rbx),%r9
- movq %r13,%rbp
- sbbq $0,%r11
-
- xorq %r10,%r10
- adcq $-1,%r12
- movq %r8,%rcx
- adcq %r14,%r13
- adcq $0,%r8
- movq %r9,%r10
- adcq %r15,%r9
-
- btq $0,%r11
- cmovncq %rax,%r12
- cmovncq %rbp,%r13
- movq %r12,0(%rdi)
- cmovncq %rcx,%r8
- movq %r13,8(%rdi)
- cmovncq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-
-
-
-.p2align 5
-__ecp_nistz256_subx:
- xorq %r11,%r11
- sbbq %r12,%rax
- sbbq %r13,%rbp
- movq %rax,%r12
- sbbq %r8,%rcx
- sbbq %r9,%r10
- movq %rbp,%r13
- sbbq $0,%r11
-
- xorq %r9,%r9
- adcq $-1,%rax
- movq %rcx,%r8
- adcq %r14,%rbp
- adcq $0,%rcx
- movq %r10,%r9
- adcq %r15,%r10
-
- btq $0,%r11
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- cmovcq %rcx,%r8
- cmovcq %r10,%r9
-
- .byte 0xf3,0xc3
-
-
-
-.p2align 5
-__ecp_nistz256_mul_by_2x:
- xorq %r11,%r11
- adcq %r12,%r12
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- xorq %r10,%r10
- sbbq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
-
- btq $0,%r11
- cmovncq %rax,%r12
- cmovncq %rbp,%r13
- movq %r12,0(%rdi)
- cmovncq %rcx,%r8
- movq %r13,8(%rdi)
- cmovncq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-
-
-.p2align 5
-ecp_nistz256_point_doublex:
-L$point_doublex:
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $160+8,%rsp
-
- movdqu 0(%rsi),%xmm0
- movq %rsi,%rbx
- movdqu 16(%rsi),%xmm1
- movq 32+0(%rsi),%r12
- movq 32+8(%rsi),%r13
- movq 32+16(%rsi),%r8
- movq 32+24(%rsi),%r9
- movq L$poly+8(%rip),%r14
- movq L$poly+24(%rip),%r15
- movdqa %xmm0,96(%rsp)
- movdqa %xmm1,96+16(%rsp)
- leaq 32(%rdi),%r10
- leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
-
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- leaq 64-128(%rsi),%rsi
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 32(%rbx),%rdx
- movq 64+0(%rbx),%r9
- movq 64+8(%rbx),%r10
- movq 64+16(%rbx),%r11
- movq 64+24(%rbx),%r12
- leaq 64-128(%rbx),%rsi
- leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
- call __ecp_nistz256_mul_montx
- call __ecp_nistz256_mul_by_2x
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_tox
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
- call __ecp_nistz256_sqr_montx
- xorq %r9,%r9
- movq %r12,%rax
- addq $-1,%r12
- movq %r13,%r10
- adcq %rsi,%r13
- movq %r14,%rcx
- adcq $0,%r14
- movq %r15,%r8
- adcq %rbp,%r15
- adcq $0,%r9
- xorq %rsi,%rsi
- testq $1,%rax
-
- cmovzq %rax,%r12
- cmovzq %r10,%r13
- cmovzq %rcx,%r14
- cmovzq %r8,%r15
- cmovzq %rsi,%r9
-
- movq %r13,%rax
- shrq $1,%r12
- shlq $63,%rax
- movq %r14,%r10
- shrq $1,%r13
- orq %rax,%r12
- shlq $63,%r10
- movq %r15,%rcx
- shrq $1,%r14
- orq %r10,%r13
- shlq $63,%rcx
- movq %r12,0(%rdi)
- shrq $1,%r15
- movq %r13,8(%rdi)
- shlq $63,%r9
- orq %rcx,%r14
- orq %r9,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
- movq 64(%rsp),%rdx
- leaq 64(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- leaq 32(%rsp),%rbx
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_tox
-
- movq 96(%rsp),%rdx
- leaq 96(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- movq 0+32(%rsp),%rdx
- movq 8+32(%rsp),%r14
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r15
- movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
- call __ecp_nistz256_sqr_montx
-
- leaq 128(%rsp),%rbx
- movq %r14,%r8
- movq %r15,%r9
- movq %rsi,%r14
- movq %rbp,%r15
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_subx
-
- movq 32(%rsp),%rdx
- leaq 32(%rsp),%rbx
- movq %r12,%r14
- xorl %ecx,%ecx
- movq %r12,0+0(%rsp)
- movq %r13,%r10
- movq %r13,0+8(%rsp)
- cmovzq %r8,%r11
- movq %r8,0+16(%rsp)
- leaq 0-128(%rsp),%rsi
- cmovzq %r9,%r12
- movq %r9,0+24(%rsp)
- movq %r14,%r9
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
- call __ecp_nistz256_sub_fromx
-
- addq $160+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- .byte 0xf3,0xc3
-
-
-.p2align 5
-ecp_nistz256_point_addx:
-L$point_addx:
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $576+8,%rsp
-
- movdqu 0(%rsi),%xmm0
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq %rsi,%rbx
- movq %rdx,%rsi
- movdqa %xmm0,384(%rsp)
- movdqa %xmm1,384+16(%rsp)
- por %xmm0,%xmm1
- movdqa %xmm2,416(%rsp)
- movdqa %xmm3,416+16(%rsp)
- por %xmm2,%xmm3
- movdqa %xmm4,448(%rsp)
- movdqa %xmm5,448+16(%rsp)
- por %xmm1,%xmm3
-
- movdqu 0(%rsi),%xmm0
- pshufd $177,%xmm3,%xmm5
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rsi),%xmm3
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,480(%rsp)
- pshufd $30,%xmm5,%xmm4
- movdqa %xmm1,480+16(%rsp)
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
- movdqa %xmm2,512(%rsp)
- movdqa %xmm3,512+16(%rsp)
- por %xmm2,%xmm3
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm1,%xmm3
-
- leaq 64-128(%rsi),%rsi
- movq %rdx,544+0(%rsp)
- movq %r14,544+8(%rsp)
- movq %r15,544+16(%rsp)
- movq %r8,544+24(%rsp)
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
- por %xmm3,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
- movq 64+0(%rbx),%rdx
- movq 64+8(%rbx),%r14
- movq 64+16(%rbx),%r15
- movq 64+24(%rbx),%r8
-
- leaq 64-128(%rbx),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 544(%rsp),%rdx
- leaq 544(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 416(%rsp),%rdx
- leaq 416(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq -128+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 512(%rsp),%rdx
- leaq 512(%rsp),%rbx
- movq 0+256(%rsp),%r9
- movq 8+256(%rsp),%r10
- leaq -128+256(%rsp),%rsi
- movq 16+256(%rsp),%r11
- movq 24+256(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 224(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- orq %r13,%r12
- movdqa %xmm4,%xmm2
- orq %r8,%r12
- orq %r9,%r12
- por %xmm5,%xmm2
-.byte 102,73,15,110,220
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 480(%rsp),%rdx
- leaq 480(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 160(%rsp),%rbx
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- orq %r13,%r12
- orq %r8,%r12
- orq %r9,%r12
-
-.byte 0x3e
- jnz L$add_proceedx
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
- testq %r8,%r8
- jnz L$add_proceedx
- testq %r9,%r9
- jz L$add_proceedx
-
-.byte 102,72,15,126,199
- pxor %xmm0,%xmm0
- movdqu %xmm0,0(%rdi)
- movdqu %xmm0,16(%rdi)
- movdqu %xmm0,32(%rdi)
- movdqu %xmm0,48(%rdi)
- movdqu %xmm0,64(%rdi)
- movdqu %xmm0,80(%rdi)
- jmp L$add_donex
-
-.p2align 5
-L$add_proceedx:
- movq 0+64(%rsp),%rdx
- movq 8+64(%rsp),%r14
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 544(%rsp),%rdx
- leaq 544(%rsp),%rbx
- movq 0+352(%rsp),%r9
- movq 8+352(%rsp),%r10
- leaq -128+352(%rsp),%rsi
- movq 16+352(%rsp),%r11
- movq 24+352(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 0(%rsp),%rdx
- leaq 0(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 160(%rsp),%rdx
- leaq 160(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-
-
-
- addq %r12,%r12
- leaq 96(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- sbbq %r11,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- testq %r11,%r11
-
- cmovzq %rax,%r12
- movq 0(%rsi),%rax
- cmovzq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovzq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovzq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subx
-
- leaq 128(%rsp),%rbx
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 192+0(%rsp),%rax
- movq 192+8(%rsp),%rbp
- movq 192+16(%rsp),%rcx
- movq 192+24(%rsp),%r10
- leaq 320(%rsp),%rdi
-
- call __ecp_nistz256_subx
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 128(%rsp),%rdx
- leaq 128(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq -128+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 320(%rsp),%rdx
- leaq 320(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 256(%rsp),%rbx
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 352(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 352+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 544(%rsp),%xmm2
- pand 544+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 480(%rsp),%xmm2
- pand 480+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 320(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 320+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 512(%rsp),%xmm2
- pand 512+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
-L$add_donex:
- addq $576+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- .byte 0xf3,0xc3
-
-
-.p2align 5
-ecp_nistz256_point_add_affinex:
-L$point_add_affinex:
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $480+8,%rsp
-
- movdqu 0(%rsi),%xmm0
- movq %rdx,%rbx
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,320(%rsp)
- movdqa %xmm1,320+16(%rsp)
- por %xmm0,%xmm1
- movdqa %xmm2,352(%rsp)
- movdqa %xmm3,352+16(%rsp)
- por %xmm2,%xmm3
- movdqa %xmm4,384(%rsp)
- movdqa %xmm5,384+16(%rsp)
- por %xmm1,%xmm3
-
- movdqu 0(%rbx),%xmm0
- pshufd $177,%xmm3,%xmm5
- movdqu 16(%rbx),%xmm1
- movdqu 32(%rbx),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rbx),%xmm3
- movdqa %xmm0,416(%rsp)
- pshufd $30,%xmm5,%xmm4
- movdqa %xmm1,416+16(%rsp)
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
- movdqa %xmm2,448(%rsp)
- movdqa %xmm3,448+16(%rsp)
- por %xmm2,%xmm3
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm1,%xmm3
-
- leaq 64-128(%rsi),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
- movq 0(%rbx),%rdx
-
- movq %r12,%r9
- por %xmm3,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
- movq %r13,%r10
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- movq %r14,%r11
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
-
- leaq 32-128(%rsp),%rsi
- movq %r15,%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 320(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 352(%rsp),%rbx
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+64(%rsp),%rdx
- movq 8+64(%rsp),%r14
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 0+96(%rsp),%rdx
- movq 8+96(%rsp),%r14
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r15
- movq 24+96(%rsp),%r8
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 128(%rsp),%rdx
- leaq 128(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 320(%rsp),%rdx
- leaq 320(%rsp),%rbx
- movq 0+128(%rsp),%r9
- movq 8+128(%rsp),%r10
- leaq -128+128(%rsp),%rsi
- movq 16+128(%rsp),%r11
- movq 24+128(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-
-
-
- addq %r12,%r12
- leaq 192(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- sbbq %r11,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- testq %r11,%r11
-
- cmovzq %rax,%r12
- movq 0(%rsi),%rax
- cmovzq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovzq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovzq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subx
-
- leaq 160(%rsp),%rbx
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 64(%rsp),%rdi
-
- call __ecp_nistz256_subx
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 352(%rsp),%rdx
- leaq 352(%rsp),%rbx
- movq 0+160(%rsp),%r9
- movq 8+160(%rsp),%r10
- leaq -128+160(%rsp),%rsi
- movq 16+160(%rsp),%r11
- movq 24+160(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 96(%rsp),%rdx
- leaq 96(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 32(%rsp),%rbx
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand L$ONE_mont(%rip),%xmm2
- pand L$ONE_mont+16(%rip),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 224(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 224+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 320(%rsp),%xmm2
- pand 320+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 256(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 256+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 352(%rsp),%xmm2
- pand 352+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
- addq $480+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- .byte 0xf3,0xc3
diff --git a/deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s b/deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s
index 86665d6e99..e2bf1bb53a 100644
--- a/deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s
@@ -1,753 +1,14 @@
.text
-
-.p2align 5
-_aesni_ctr32_ghash_6x:
- vmovdqu 32(%r11),%xmm2
- subq $6,%rdx
- vpxor %xmm4,%xmm4,%xmm4
- vmovdqu 0-128(%rcx),%xmm15
- vpaddb %xmm2,%xmm1,%xmm10
- vpaddb %xmm2,%xmm10,%xmm11
- vpaddb %xmm2,%xmm11,%xmm12
- vpaddb %xmm2,%xmm12,%xmm13
- vpaddb %xmm2,%xmm13,%xmm14
- vpxor %xmm15,%xmm1,%xmm9
- vmovdqu %xmm4,16+8(%rsp)
- jmp L$oop6x
-
-.p2align 5
-L$oop6x:
- addl $100663296,%ebx
- jc L$handle_ctr32
- vmovdqu 0-32(%r9),%xmm3
- vpaddb %xmm2,%xmm14,%xmm1
- vpxor %xmm15,%xmm10,%xmm10
- vpxor %xmm15,%xmm11,%xmm11
-
-L$resume_ctr32:
- vmovdqu %xmm1,(%r8)
- vpclmulqdq $16,%xmm3,%xmm7,%xmm5
- vpxor %xmm15,%xmm12,%xmm12
- vmovups 16-128(%rcx),%xmm2
- vpclmulqdq $1,%xmm3,%xmm7,%xmm6
- xorq %r12,%r12
- cmpq %r14,%r15
-
- vaesenc %xmm2,%xmm9,%xmm9
- vmovdqu 48+8(%rsp),%xmm0
- vpxor %xmm15,%xmm13,%xmm13
- vpclmulqdq $0,%xmm3,%xmm7,%xmm1
- vaesenc %xmm2,%xmm10,%xmm10
- vpxor %xmm15,%xmm14,%xmm14
- setnc %r12b
- vpclmulqdq $17,%xmm3,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vmovdqu 16-32(%r9),%xmm3
- negq %r12
- vaesenc %xmm2,%xmm12,%xmm12
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $0,%xmm3,%xmm0,%xmm5
- vpxor %xmm4,%xmm8,%xmm8
- vaesenc %xmm2,%xmm13,%xmm13
- vpxor %xmm5,%xmm1,%xmm4
- andq $96,%r12
- vmovups 32-128(%rcx),%xmm15
- vpclmulqdq $16,%xmm3,%xmm0,%xmm1
- vaesenc %xmm2,%xmm14,%xmm14
-
- vpclmulqdq $1,%xmm3,%xmm0,%xmm2
- leaq (%r14,%r12,1),%r14
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor 16+8(%rsp),%xmm8,%xmm8
- vpclmulqdq $17,%xmm3,%xmm0,%xmm3
- vmovdqu 64+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 88(%r14),%r13
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 80(%r14),%r12
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,32+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,40+8(%rsp)
- vmovdqu 48-32(%r9),%xmm5
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 48-128(%rcx),%xmm15
- vpxor %xmm1,%xmm6,%xmm6
- vpclmulqdq $0,%xmm5,%xmm0,%xmm1
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm2,%xmm6,%xmm6
- vpclmulqdq $16,%xmm5,%xmm0,%xmm2
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor %xmm3,%xmm7,%xmm7
- vpclmulqdq $1,%xmm5,%xmm0,%xmm3
- vaesenc %xmm15,%xmm11,%xmm11
- vpclmulqdq $17,%xmm5,%xmm0,%xmm5
- vmovdqu 80+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqu 64-32(%r9),%xmm1
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 64-128(%rcx),%xmm15
- vpxor %xmm2,%xmm6,%xmm6
- vpclmulqdq $0,%xmm1,%xmm0,%xmm2
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm3,%xmm6,%xmm6
- vpclmulqdq $16,%xmm1,%xmm0,%xmm3
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 72(%r14),%r13
- vpxor %xmm5,%xmm7,%xmm7
- vpclmulqdq $1,%xmm1,%xmm0,%xmm5
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 64(%r14),%r12
- vpclmulqdq $17,%xmm1,%xmm0,%xmm1
- vmovdqu 96+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,48+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,56+8(%rsp)
- vpxor %xmm2,%xmm4,%xmm4
- vmovdqu 96-32(%r9),%xmm2
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 80-128(%rcx),%xmm15
- vpxor %xmm3,%xmm6,%xmm6
- vpclmulqdq $0,%xmm2,%xmm0,%xmm3
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $16,%xmm2,%xmm0,%xmm5
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 56(%r14),%r13
- vpxor %xmm1,%xmm7,%xmm7
- vpclmulqdq $1,%xmm2,%xmm0,%xmm1
- vpxor 112+8(%rsp),%xmm8,%xmm8
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 48(%r14),%r12
- vpclmulqdq $17,%xmm2,%xmm0,%xmm2
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,64+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,72+8(%rsp)
- vpxor %xmm3,%xmm4,%xmm4
- vmovdqu 112-32(%r9),%xmm3
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 96-128(%rcx),%xmm15
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $16,%xmm3,%xmm8,%xmm5
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm1,%xmm6,%xmm6
- vpclmulqdq $1,%xmm3,%xmm8,%xmm1
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 40(%r14),%r13
- vpxor %xmm2,%xmm7,%xmm7
- vpclmulqdq $0,%xmm3,%xmm8,%xmm2
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 32(%r14),%r12
- vpclmulqdq $17,%xmm3,%xmm8,%xmm8
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,80+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,88+8(%rsp)
- vpxor %xmm5,%xmm6,%xmm6
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor %xmm1,%xmm6,%xmm6
-
- vmovups 112-128(%rcx),%xmm15
- vpslldq $8,%xmm6,%xmm5
- vpxor %xmm2,%xmm4,%xmm4
- vmovdqu 16(%r11),%xmm3
-
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm8,%xmm7,%xmm7
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor %xmm5,%xmm4,%xmm4
- movbeq 24(%r14),%r13
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 16(%r14),%r12
- vpalignr $8,%xmm4,%xmm4,%xmm0
- vpclmulqdq $16,%xmm3,%xmm4,%xmm4
- movq %r13,96+8(%rsp)
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r12,104+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- vmovups 128-128(%rcx),%xmm1
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 144-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm10,%xmm10
- vpsrldq $8,%xmm6,%xmm6
- vaesenc %xmm1,%xmm11,%xmm11
- vpxor %xmm6,%xmm7,%xmm7
- vaesenc %xmm1,%xmm12,%xmm12
- vpxor %xmm0,%xmm4,%xmm4
- movbeq 8(%r14),%r13
- vaesenc %xmm1,%xmm13,%xmm13
- movbeq 0(%r14),%r12
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 160-128(%rcx),%xmm1
- cmpl $11,%ebp
- jb L$enc_tail
-
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
- vmovups 176-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 192-128(%rcx),%xmm1
- je L$enc_tail
-
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
- vmovups 208-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 224-128(%rcx),%xmm1
- jmp L$enc_tail
-
-.p2align 5
-L$handle_ctr32:
- vmovdqu (%r11),%xmm0
- vpshufb %xmm0,%xmm1,%xmm6
- vmovdqu 48(%r11),%xmm5
- vpaddd 64(%r11),%xmm6,%xmm10
- vpaddd %xmm5,%xmm6,%xmm11
- vmovdqu 0-32(%r9),%xmm3
- vpaddd %xmm5,%xmm10,%xmm12
- vpshufb %xmm0,%xmm10,%xmm10
- vpaddd %xmm5,%xmm11,%xmm13
- vpshufb %xmm0,%xmm11,%xmm11
- vpxor %xmm15,%xmm10,%xmm10
- vpaddd %xmm5,%xmm12,%xmm14
- vpshufb %xmm0,%xmm12,%xmm12
- vpxor %xmm15,%xmm11,%xmm11
- vpaddd %xmm5,%xmm13,%xmm1
- vpshufb %xmm0,%xmm13,%xmm13
- vpshufb %xmm0,%xmm14,%xmm14
- vpshufb %xmm0,%xmm1,%xmm1
- jmp L$resume_ctr32
-
-.p2align 5
-L$enc_tail:
- vaesenc %xmm15,%xmm9,%xmm9
- vmovdqu %xmm7,16+8(%rsp)
- vpalignr $8,%xmm4,%xmm4,%xmm8
- vaesenc %xmm15,%xmm10,%xmm10
- vpclmulqdq $16,%xmm3,%xmm4,%xmm4
- vpxor 0(%rdi),%xmm1,%xmm2
- vaesenc %xmm15,%xmm11,%xmm11
- vpxor 16(%rdi),%xmm1,%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- vpxor 32(%rdi),%xmm1,%xmm5
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor 48(%rdi),%xmm1,%xmm6
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor 64(%rdi),%xmm1,%xmm7
- vpxor 80(%rdi),%xmm1,%xmm3
- vmovdqu (%r8),%xmm1
-
- vaesenclast %xmm2,%xmm9,%xmm9
- vmovdqu 32(%r11),%xmm2
- vaesenclast %xmm0,%xmm10,%xmm10
- vpaddb %xmm2,%xmm1,%xmm0
- movq %r13,112+8(%rsp)
- leaq 96(%rdi),%rdi
- vaesenclast %xmm5,%xmm11,%xmm11
- vpaddb %xmm2,%xmm0,%xmm5
- movq %r12,120+8(%rsp)
- leaq 96(%rsi),%rsi
- vmovdqu 0-128(%rcx),%xmm15
- vaesenclast %xmm6,%xmm12,%xmm12
- vpaddb %xmm2,%xmm5,%xmm6
- vaesenclast %xmm7,%xmm13,%xmm13
- vpaddb %xmm2,%xmm6,%xmm7
- vaesenclast %xmm3,%xmm14,%xmm14
- vpaddb %xmm2,%xmm7,%xmm3
-
- addq $96,%r10
- subq $6,%rdx
- jc L$6x_done
-
- vmovups %xmm9,-96(%rsi)
- vpxor %xmm15,%xmm1,%xmm9
- vmovups %xmm10,-80(%rsi)
- vmovdqa %xmm0,%xmm10
- vmovups %xmm11,-64(%rsi)
- vmovdqa %xmm5,%xmm11
- vmovups %xmm12,-48(%rsi)
- vmovdqa %xmm6,%xmm12
- vmovups %xmm13,-32(%rsi)
- vmovdqa %xmm7,%xmm13
- vmovups %xmm14,-16(%rsi)
- vmovdqa %xmm3,%xmm14
- vmovdqu 32+8(%rsp),%xmm7
- jmp L$oop6x
-
-L$6x_done:
- vpxor 16+8(%rsp),%xmm8,%xmm8
- vpxor %xmm4,%xmm8,%xmm8
-
- .byte 0xf3,0xc3
-
-.globl _aesni_gcm_decrypt
-
-.p2align 5
-_aesni_gcm_decrypt:
- xorq %r10,%r10
- cmpq $96,%rdx
- jb L$gcm_dec_abort
-
- leaq (%rsp),%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- vzeroupper
-
- vmovdqu (%r8),%xmm1
- addq $-128,%rsp
- movl 12(%r8),%ebx
- leaq L$bswap_mask(%rip),%r11
- leaq -128(%rcx),%r14
- movq $3968,%r15
- vmovdqu (%r9),%xmm8
- andq $-128,%rsp
- vmovdqu (%r11),%xmm0
- leaq 128(%rcx),%rcx
- leaq 32+32(%r9),%r9
- movl 240-128(%rcx),%ebp
- vpshufb %xmm0,%xmm8,%xmm8
-
- andq %r15,%r14
- andq %rsp,%r15
- subq %r14,%r15
- jc L$dec_no_key_aliasing
- cmpq $768,%r15
- jnc L$dec_no_key_aliasing
- subq %r15,%rsp
-L$dec_no_key_aliasing:
-
- vmovdqu 80(%rdi),%xmm7
- leaq (%rdi),%r14
- vmovdqu 64(%rdi),%xmm4
- leaq -192(%rdi,%rdx,1),%r15
- vmovdqu 48(%rdi),%xmm5
- shrq $4,%rdx
- xorq %r10,%r10
- vmovdqu 32(%rdi),%xmm6
- vpshufb %xmm0,%xmm7,%xmm7
- vmovdqu 16(%rdi),%xmm2
- vpshufb %xmm0,%xmm4,%xmm4
- vmovdqu (%rdi),%xmm3
- vpshufb %xmm0,%xmm5,%xmm5
- vmovdqu %xmm4,48(%rsp)
- vpshufb %xmm0,%xmm6,%xmm6
- vmovdqu %xmm5,64(%rsp)
- vpshufb %xmm0,%xmm2,%xmm2
- vmovdqu %xmm6,80(%rsp)
- vpshufb %xmm0,%xmm3,%xmm3
- vmovdqu %xmm2,96(%rsp)
- vmovdqu %xmm3,112(%rsp)
-
- call _aesni_ctr32_ghash_6x
-
- vmovups %xmm9,-96(%rsi)
- vmovups %xmm10,-80(%rsi)
- vmovups %xmm11,-64(%rsi)
- vmovups %xmm12,-48(%rsi)
- vmovups %xmm13,-32(%rsi)
- vmovups %xmm14,-16(%rsi)
-
- vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,-64(%r9)
-
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$gcm_dec_abort:
- movq %r10,%rax
- .byte 0xf3,0xc3
-
-
-.p2align 5
-_aesni_ctr32_6x:
- vmovdqu 0-128(%rcx),%xmm4
- vmovdqu 32(%r11),%xmm2
- leaq -1(%rbp),%r13
- vmovups 16-128(%rcx),%xmm15
- leaq 32-128(%rcx),%r12
- vpxor %xmm4,%xmm1,%xmm9
- addl $100663296,%ebx
- jc L$handle_ctr32_2
- vpaddb %xmm2,%xmm1,%xmm10
- vpaddb %xmm2,%xmm10,%xmm11
- vpxor %xmm4,%xmm10,%xmm10
- vpaddb %xmm2,%xmm11,%xmm12
- vpxor %xmm4,%xmm11,%xmm11
- vpaddb %xmm2,%xmm12,%xmm13
- vpxor %xmm4,%xmm12,%xmm12
- vpaddb %xmm2,%xmm13,%xmm14
- vpxor %xmm4,%xmm13,%xmm13
- vpaddb %xmm2,%xmm14,%xmm1
- vpxor %xmm4,%xmm14,%xmm14
- jmp L$oop_ctr32
-
-.p2align 4
-L$oop_ctr32:
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
- vmovups (%r12),%xmm15
- leaq 16(%r12),%r12
- decl %r13d
- jnz L$oop_ctr32
-
- vmovdqu (%r12),%xmm3
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor 0(%rdi),%xmm3,%xmm4
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor 16(%rdi),%xmm3,%xmm5
- vaesenc %xmm15,%xmm11,%xmm11
- vpxor 32(%rdi),%xmm3,%xmm6
- vaesenc %xmm15,%xmm12,%xmm12
- vpxor 48(%rdi),%xmm3,%xmm8
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor 64(%rdi),%xmm3,%xmm2
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor 80(%rdi),%xmm3,%xmm3
- leaq 96(%rdi),%rdi
-
- vaesenclast %xmm4,%xmm9,%xmm9
- vaesenclast %xmm5,%xmm10,%xmm10
- vaesenclast %xmm6,%xmm11,%xmm11
- vaesenclast %xmm8,%xmm12,%xmm12
- vaesenclast %xmm2,%xmm13,%xmm13
- vaesenclast %xmm3,%xmm14,%xmm14
- vmovups %xmm9,0(%rsi)
- vmovups %xmm10,16(%rsi)
- vmovups %xmm11,32(%rsi)
- vmovups %xmm12,48(%rsi)
- vmovups %xmm13,64(%rsi)
- vmovups %xmm14,80(%rsi)
- leaq 96(%rsi),%rsi
-
- .byte 0xf3,0xc3
-.p2align 5
-L$handle_ctr32_2:
- vpshufb %xmm0,%xmm1,%xmm6
- vmovdqu 48(%r11),%xmm5
- vpaddd 64(%r11),%xmm6,%xmm10
- vpaddd %xmm5,%xmm6,%xmm11
- vpaddd %xmm5,%xmm10,%xmm12
- vpshufb %xmm0,%xmm10,%xmm10
- vpaddd %xmm5,%xmm11,%xmm13
- vpshufb %xmm0,%xmm11,%xmm11
- vpxor %xmm4,%xmm10,%xmm10
- vpaddd %xmm5,%xmm12,%xmm14
- vpshufb %xmm0,%xmm12,%xmm12
- vpxor %xmm4,%xmm11,%xmm11
- vpaddd %xmm5,%xmm13,%xmm1
- vpshufb %xmm0,%xmm13,%xmm13
- vpxor %xmm4,%xmm12,%xmm12
- vpshufb %xmm0,%xmm14,%xmm14
- vpxor %xmm4,%xmm13,%xmm13
- vpshufb %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm14,%xmm14
- jmp L$oop_ctr32
-
-
.globl _aesni_gcm_encrypt
-.p2align 5
_aesni_gcm_encrypt:
- xorq %r10,%r10
- cmpq $288,%rdx
- jb L$gcm_enc_abort
-
- leaq (%rsp),%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- vzeroupper
-
- vmovdqu (%r8),%xmm1
- addq $-128,%rsp
- movl 12(%r8),%ebx
- leaq L$bswap_mask(%rip),%r11
- leaq -128(%rcx),%r14
- movq $3968,%r15
- leaq 128(%rcx),%rcx
- vmovdqu (%r11),%xmm0
- andq $-128,%rsp
- movl 240-128(%rcx),%ebp
-
- andq %r15,%r14
- andq %rsp,%r15
- subq %r14,%r15
- jc L$enc_no_key_aliasing
- cmpq $768,%r15
- jnc L$enc_no_key_aliasing
- subq %r15,%rsp
-L$enc_no_key_aliasing:
-
- leaq (%rsi),%r14
- leaq -192(%rsi,%rdx,1),%r15
- shrq $4,%rdx
-
- call _aesni_ctr32_6x
- vpshufb %xmm0,%xmm9,%xmm8
- vpshufb %xmm0,%xmm10,%xmm2
- vmovdqu %xmm8,112(%rsp)
- vpshufb %xmm0,%xmm11,%xmm4
- vmovdqu %xmm2,96(%rsp)
- vpshufb %xmm0,%xmm12,%xmm5
- vmovdqu %xmm4,80(%rsp)
- vpshufb %xmm0,%xmm13,%xmm6
- vmovdqu %xmm5,64(%rsp)
- vpshufb %xmm0,%xmm14,%xmm7
- vmovdqu %xmm6,48(%rsp)
-
- call _aesni_ctr32_6x
-
- vmovdqu (%r9),%xmm8
- leaq 32+32(%r9),%r9
- subq $12,%rdx
- movq $192,%r10
- vpshufb %xmm0,%xmm8,%xmm8
-
- call _aesni_ctr32_ghash_6x
- vmovdqu 32(%rsp),%xmm7
- vmovdqu (%r11),%xmm0
- vmovdqu 0-32(%r9),%xmm3
- vpunpckhqdq %xmm7,%xmm7,%xmm1
- vmovdqu 32-32(%r9),%xmm15
- vmovups %xmm9,-96(%rsi)
- vpshufb %xmm0,%xmm9,%xmm9
- vpxor %xmm7,%xmm1,%xmm1
- vmovups %xmm10,-80(%rsi)
- vpshufb %xmm0,%xmm10,%xmm10
- vmovups %xmm11,-64(%rsi)
- vpshufb %xmm0,%xmm11,%xmm11
- vmovups %xmm12,-48(%rsi)
- vpshufb %xmm0,%xmm12,%xmm12
- vmovups %xmm13,-32(%rsi)
- vpshufb %xmm0,%xmm13,%xmm13
- vmovups %xmm14,-16(%rsi)
- vpshufb %xmm0,%xmm14,%xmm14
- vmovdqu %xmm9,16(%rsp)
- vmovdqu 48(%rsp),%xmm6
- vmovdqu 16-32(%r9),%xmm0
- vpunpckhqdq %xmm6,%xmm6,%xmm2
- vpclmulqdq $0,%xmm3,%xmm7,%xmm5
- vpxor %xmm6,%xmm2,%xmm2
- vpclmulqdq $17,%xmm3,%xmm7,%xmm7
- vpclmulqdq $0,%xmm15,%xmm1,%xmm1
-
- vmovdqu 64(%rsp),%xmm9
- vpclmulqdq $0,%xmm0,%xmm6,%xmm4
- vmovdqu 48-32(%r9),%xmm3
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm9,%xmm9,%xmm5
- vpclmulqdq $17,%xmm0,%xmm6,%xmm6
- vpxor %xmm9,%xmm5,%xmm5
- vpxor %xmm7,%xmm6,%xmm6
- vpclmulqdq $16,%xmm15,%xmm2,%xmm2
- vmovdqu 80-32(%r9),%xmm15
- vpxor %xmm1,%xmm2,%xmm2
-
- vmovdqu 80(%rsp),%xmm1
- vpclmulqdq $0,%xmm3,%xmm9,%xmm7
- vmovdqu 64-32(%r9),%xmm0
- vpxor %xmm4,%xmm7,%xmm7
- vpunpckhqdq %xmm1,%xmm1,%xmm4
- vpclmulqdq $17,%xmm3,%xmm9,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpxor %xmm6,%xmm9,%xmm9
- vpclmulqdq $0,%xmm15,%xmm5,%xmm5
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 96(%rsp),%xmm2
- vpclmulqdq $0,%xmm0,%xmm1,%xmm6
- vmovdqu 96-32(%r9),%xmm3
- vpxor %xmm7,%xmm6,%xmm6
- vpunpckhqdq %xmm2,%xmm2,%xmm7
- vpclmulqdq $17,%xmm0,%xmm1,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpxor %xmm9,%xmm1,%xmm1
- vpclmulqdq $16,%xmm15,%xmm4,%xmm4
- vmovdqu 128-32(%r9),%xmm15
- vpxor %xmm5,%xmm4,%xmm4
-
- vpxor 112(%rsp),%xmm8,%xmm8
- vpclmulqdq $0,%xmm3,%xmm2,%xmm5
- vmovdqu 112-32(%r9),%xmm0
- vpunpckhqdq %xmm8,%xmm8,%xmm9
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $17,%xmm3,%xmm2,%xmm2
- vpxor %xmm8,%xmm9,%xmm9
- vpxor %xmm1,%xmm2,%xmm2
- vpclmulqdq $0,%xmm15,%xmm7,%xmm7
- vpxor %xmm4,%xmm7,%xmm4
-
- vpclmulqdq $0,%xmm0,%xmm8,%xmm6
- vmovdqu 0-32(%r9),%xmm3
- vpunpckhqdq %xmm14,%xmm14,%xmm1
- vpclmulqdq $17,%xmm0,%xmm8,%xmm8
- vpxor %xmm14,%xmm1,%xmm1
- vpxor %xmm5,%xmm6,%xmm5
- vpclmulqdq $16,%xmm15,%xmm9,%xmm9
- vmovdqu 32-32(%r9),%xmm15
- vpxor %xmm2,%xmm8,%xmm7
- vpxor %xmm4,%xmm9,%xmm6
-
- vmovdqu 16-32(%r9),%xmm0
- vpxor %xmm5,%xmm7,%xmm9
- vpclmulqdq $0,%xmm3,%xmm14,%xmm4
- vpxor %xmm9,%xmm6,%xmm6
- vpunpckhqdq %xmm13,%xmm13,%xmm2
- vpclmulqdq $17,%xmm3,%xmm14,%xmm14
- vpxor %xmm13,%xmm2,%xmm2
- vpslldq $8,%xmm6,%xmm9
- vpclmulqdq $0,%xmm15,%xmm1,%xmm1
- vpxor %xmm9,%xmm5,%xmm8
- vpsrldq $8,%xmm6,%xmm6
- vpxor %xmm6,%xmm7,%xmm7
-
- vpclmulqdq $0,%xmm0,%xmm13,%xmm5
- vmovdqu 48-32(%r9),%xmm3
- vpxor %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm12,%xmm12,%xmm9
- vpclmulqdq $17,%xmm0,%xmm13,%xmm13
- vpxor %xmm12,%xmm9,%xmm9
- vpxor %xmm14,%xmm13,%xmm13
- vpalignr $8,%xmm8,%xmm8,%xmm14
- vpclmulqdq $16,%xmm15,%xmm2,%xmm2
- vmovdqu 80-32(%r9),%xmm15
- vpxor %xmm1,%xmm2,%xmm2
-
- vpclmulqdq $0,%xmm3,%xmm12,%xmm4
- vmovdqu 64-32(%r9),%xmm0
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm11,%xmm11,%xmm1
- vpclmulqdq $17,%xmm3,%xmm12,%xmm12
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm13,%xmm12,%xmm12
- vxorps 16(%rsp),%xmm7,%xmm7
- vpclmulqdq $0,%xmm15,%xmm9,%xmm9
- vpxor %xmm2,%xmm9,%xmm9
-
- vpclmulqdq $16,16(%r11),%xmm8,%xmm8
- vxorps %xmm14,%xmm8,%xmm8
-
- vpclmulqdq $0,%xmm0,%xmm11,%xmm5
- vmovdqu 96-32(%r9),%xmm3
- vpxor %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm10,%xmm10,%xmm2
- vpclmulqdq $17,%xmm0,%xmm11,%xmm11
- vpxor %xmm10,%xmm2,%xmm2
- vpalignr $8,%xmm8,%xmm8,%xmm14
- vpxor %xmm12,%xmm11,%xmm11
- vpclmulqdq $16,%xmm15,%xmm1,%xmm1
- vmovdqu 128-32(%r9),%xmm15
- vpxor %xmm9,%xmm1,%xmm1
-
- vxorps %xmm7,%xmm14,%xmm14
- vpclmulqdq $16,16(%r11),%xmm8,%xmm8
- vxorps %xmm14,%xmm8,%xmm8
-
- vpclmulqdq $0,%xmm3,%xmm10,%xmm4
- vmovdqu 112-32(%r9),%xmm0
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm8,%xmm8,%xmm9
- vpclmulqdq $17,%xmm3,%xmm10,%xmm10
- vpxor %xmm8,%xmm9,%xmm9
- vpxor %xmm11,%xmm10,%xmm10
- vpclmulqdq $0,%xmm15,%xmm2,%xmm2
- vpxor %xmm1,%xmm2,%xmm2
-
- vpclmulqdq $0,%xmm0,%xmm8,%xmm5
- vpclmulqdq $17,%xmm0,%xmm8,%xmm7
- vpxor %xmm4,%xmm5,%xmm5
- vpclmulqdq $16,%xmm15,%xmm9,%xmm6
- vpxor %xmm10,%xmm7,%xmm7
- vpxor %xmm2,%xmm6,%xmm6
-
- vpxor %xmm5,%xmm7,%xmm4
- vpxor %xmm4,%xmm6,%xmm6
- vpslldq $8,%xmm6,%xmm1
- vmovdqu 16(%r11),%xmm3
- vpsrldq $8,%xmm6,%xmm6
- vpxor %xmm1,%xmm5,%xmm8
- vpxor %xmm6,%xmm7,%xmm7
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
- vpalignr $8,%xmm8,%xmm8,%xmm2
- vpclmulqdq $16,%xmm3,%xmm8,%xmm8
- vpxor %xmm2,%xmm8,%xmm8
- vpalignr $8,%xmm8,%xmm8,%xmm2
- vpclmulqdq $16,%xmm3,%xmm8,%xmm8
- vpxor %xmm7,%xmm2,%xmm2
- vpxor %xmm2,%xmm8,%xmm8
- vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,-64(%r9)
+.globl _aesni_gcm_decrypt
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$gcm_enc_abort:
- movq %r10,%rax
+_aesni_gcm_decrypt:
+ xorl %eax,%eax
.byte 0xf3,0xc3
-
-.p2align 6
-L$bswap_mask:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-L$poly:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-L$one_msb:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
-L$two_lsb:
-.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-L$one_lsb:
-.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align 6
diff --git a/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s b/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s
index 09ac73bc97..77fddf934a 100644
--- a/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s
@@ -20,14 +20,14 @@ L$gmult_prologue:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp L$oop1
.p2align 4
L$oop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -43,13 +43,13 @@ L$oop1:
js L$break1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -58,19 +58,19 @@ L$oop1:
.p2align 4
L$break1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -661,10 +661,10 @@ L$ghash_epilogue:
_gcm_init_clmul:
L$_init_clmul:
movdqu (%rsi),%xmm2
- pshufd $78,%xmm2,%xmm2
+ pshufd $0b01001110,%xmm2,%xmm2
- pshufd $255,%xmm2,%xmm4
+ pshufd $0b11111111,%xmm2,%xmm4
movdqa %xmm2,%xmm3
psllq $1,%xmm2
pxor %xmm5,%xmm5
@@ -678,11 +678,11 @@ L$_init_clmul:
pxor %xmm5,%xmm2
- pshufd $78,%xmm2,%xmm6
+ pshufd $0b01001110,%xmm2,%xmm6
movdqa %xmm2,%xmm0
pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -718,8 +718,8 @@ L$_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm2,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm2,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm2,%xmm3
movdqu %xmm2,0(%rdi)
pxor %xmm0,%xmm4
@@ -727,7 +727,7 @@ L$_init_clmul:
.byte 102,15,58,15,227,8
movdqu %xmm4,32(%rdi)
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -765,7 +765,7 @@ L$_init_clmul:
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm5
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -801,8 +801,8 @@ L$_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm5,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm5,%xmm3
movdqu %xmm5,48(%rdi)
pxor %xmm0,%xmm4
@@ -822,7 +822,7 @@ L$_gmult_clmul:
movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -874,20 +874,20 @@ L$_ghash_clmul:
movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz L$odd_tail
movdqu 16(%rsi),%xmm6
movl _OPENSSL_ia32cap_P+4(%rip),%eax
- cmpq $48,%rcx
+ cmpq $0x30,%rcx
jb L$skip4x
andl $71303168,%eax
cmpl $4194304,%eax
je L$skip4x
- subq $48,%rcx
- movq $11547335547999543296,%rax
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
@@ -899,14 +899,14 @@ L$_ghash_clmul:
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
@@ -921,12 +921,12 @@ L$_ghash_clmul:
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
@@ -934,7 +934,7 @@ L$_ghash_clmul:
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc L$tail4x
jmp L$mod4_loop
@@ -949,14 +949,14 @@ L$mod4_loop:
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rsi),%xmm7
xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm0,%xmm8
movdqa %xmm3,%xmm5
@@ -1000,7 +1000,7 @@ L$mod4_loop:
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
@@ -1010,14 +1010,14 @@ L$mod4_loop:
movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc L$mod4_loop
L$tail4x:
@@ -1061,10 +1061,10 @@ L$tail4x:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz L$done
movdqu 32(%rsi),%xmm7
- subq $16,%rcx
+ subq $0x10,%rcx
jz L$odd_tail
L$skip4x:
@@ -1079,7 +1079,7 @@ L$skip4x:
pxor %xmm8,%xmm0
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
@@ -1087,7 +1087,7 @@ L$skip4x:
leaq 32(%rdx),%rdx
nop
- subq $32,%rcx
+ subq $0x20,%rcx
jbe L$even_tail
nop
jmp L$mod_loop
@@ -1096,7 +1096,7 @@ L$skip4x:
L$mod_loop:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1134,7 +1134,7 @@ L$mod_loop:
pslldq $8,%xmm0
psrldq $8,%xmm8
pxor %xmm9,%xmm0
- pshufd $78,%xmm5,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm4
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
@@ -1150,13 +1150,13 @@ L$mod_loop:
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- subq $32,%rcx
+ subq $0x20,%rcx
ja L$mod_loop
L$even_tail:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1204,7 +1204,7 @@ L$odd_tail:
.byte 102,69,15,56,0,194
pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -1249,108 +1249,7 @@ L$done:
.p2align 5
_gcm_init_avx:
- vzeroupper
-
- vmovdqu (%rsi),%xmm2
- vpshufd $78,%xmm2,%xmm2
-
-
- vpshufd $255,%xmm2,%xmm4
- vpsrlq $63,%xmm2,%xmm3
- vpsllq $1,%xmm2,%xmm2
- vpxor %xmm5,%xmm5,%xmm5
- vpcmpgtd %xmm4,%xmm5,%xmm5
- vpslldq $8,%xmm3,%xmm3
- vpor %xmm3,%xmm2,%xmm2
-
-
- vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5
- vpxor %xmm5,%xmm2,%xmm2
-
- vpunpckhqdq %xmm2,%xmm2,%xmm6
- vmovdqa %xmm2,%xmm0
- vpxor %xmm2,%xmm6,%xmm6
- movq $4,%r10
- jmp L$init_start_avx
-.p2align 5
-L$init_loop_avx:
- vpalignr $8,%xmm3,%xmm4,%xmm5
- vmovdqu %xmm5,-16(%rdi)
- vpunpckhqdq %xmm0,%xmm0,%xmm3
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0,%xmm2,%xmm0,%xmm0
- vpclmulqdq $0,%xmm6,%xmm3,%xmm3
- vpxor %xmm0,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
-
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- vpsllq $57,%xmm0,%xmm3
- vpsllq $62,%xmm0,%xmm4
- vpxor %xmm3,%xmm4,%xmm4
- vpsllq $63,%xmm0,%xmm3
- vpxor %xmm3,%xmm4,%xmm4
- vpslldq $8,%xmm4,%xmm3
- vpsrldq $8,%xmm4,%xmm4
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrlq $1,%xmm0,%xmm4
- vpxor %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $5,%xmm4,%xmm4
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $1,%xmm0,%xmm0
- vpxor %xmm1,%xmm0,%xmm0
-L$init_start_avx:
- vmovdqa %xmm0,%xmm5
- vpunpckhqdq %xmm0,%xmm0,%xmm3
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0,%xmm2,%xmm0,%xmm0
- vpclmulqdq $0,%xmm6,%xmm3,%xmm3
- vpxor %xmm0,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
-
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- vpsllq $57,%xmm0,%xmm3
- vpsllq $62,%xmm0,%xmm4
- vpxor %xmm3,%xmm4,%xmm4
- vpsllq $63,%xmm0,%xmm3
- vpxor %xmm3,%xmm4,%xmm4
- vpslldq $8,%xmm4,%xmm3
- vpsrldq $8,%xmm4,%xmm4
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrlq $1,%xmm0,%xmm4
- vpxor %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $5,%xmm4,%xmm4
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $1,%xmm0,%xmm0
- vpxor %xmm1,%xmm0,%xmm0
- vpshufd $78,%xmm5,%xmm3
- vpshufd $78,%xmm0,%xmm4
- vpxor %xmm5,%xmm3,%xmm3
- vmovdqu %xmm5,0(%rdi)
- vpxor %xmm0,%xmm4,%xmm4
- vmovdqu %xmm0,16(%rdi)
- leaq 48(%rdi),%rdi
- subq $1,%r10
- jnz L$init_loop_avx
-
- vpalignr $8,%xmm4,%xmm3,%xmm5
- vmovdqu %xmm5,-16(%rdi)
-
- vzeroupper
- .byte 0xf3,0xc3
+ jmp L$_init_clmul
.globl _gcm_gmult_avx
@@ -1362,377 +1261,7 @@ _gcm_gmult_avx:
.p2align 5
_gcm_ghash_avx:
- vzeroupper
-
- vmovdqu (%rdi),%xmm10
- leaq L$0x1c2_polynomial(%rip),%r10
- leaq 64(%rsi),%rsi
- vmovdqu L$bswap_mask(%rip),%xmm13
- vpshufb %xmm13,%xmm10,%xmm10
- cmpq $128,%rcx
- jb L$short_avx
- subq $128,%rcx
-
- vmovdqu 112(%rdx),%xmm14
- vmovdqu 0-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm14
- vmovdqu 32-64(%rsi),%xmm7
-
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vmovdqu 96(%rdx),%xmm15
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm14,%xmm9,%xmm9
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 16-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vmovdqu 80(%rdx),%xmm14
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 48-64(%rsi),%xmm6
- vpxor %xmm14,%xmm9,%xmm9
- vmovdqu 64(%rdx),%xmm15
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 80-64(%rsi),%xmm7
-
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vmovdqu 48(%rdx),%xmm14
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 96-64(%rsi),%xmm6
- vpxor %xmm5,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 128-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu 32(%rdx),%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vmovdqu 16(%rdx),%xmm14
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 144-64(%rsi),%xmm6
- vpxor %xmm5,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 176-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu (%rdx),%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 160-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $16,%xmm7,%xmm9,%xmm2
-
- leaq 128(%rdx),%rdx
- cmpq $128,%rcx
- jb L$tail_avx
-
- vpxor %xmm10,%xmm15,%xmm15
- subq $128,%rcx
- jmp L$oop8x_avx
-
-.p2align 5
-L$oop8x_avx:
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vmovdqu 112(%rdx),%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpxor %xmm15,%xmm8,%xmm8
- vpclmulqdq $0,%xmm6,%xmm15,%xmm10
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm11
- vmovdqu 0-64(%rsi),%xmm6
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm12
- vmovdqu 32-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu 96(%rdx),%xmm15
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpxor %xmm3,%xmm10,%xmm10
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vxorps %xmm4,%xmm11,%xmm11
- vmovdqu 16-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm5,%xmm12,%xmm12
- vxorps %xmm15,%xmm8,%xmm8
-
- vmovdqu 80(%rdx),%xmm14
- vpxor %xmm10,%xmm12,%xmm12
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpxor %xmm11,%xmm12,%xmm12
- vpslldq $8,%xmm12,%xmm9
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vpsrldq $8,%xmm12,%xmm12
- vpxor %xmm9,%xmm10,%xmm10
- vmovdqu 48-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm14
- vxorps %xmm12,%xmm11,%xmm11
- vpxor %xmm1,%xmm4,%xmm4
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 80-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 64(%rdx),%xmm15
- vpalignr $8,%xmm10,%xmm10,%xmm12
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vxorps %xmm15,%xmm8,%xmm8
- vpxor %xmm5,%xmm2,%xmm2
-
- vmovdqu 48(%rdx),%xmm14
- vpclmulqdq $16,(%r10),%xmm10,%xmm10
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 96-64(%rsi),%xmm6
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 128-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 32(%rdx),%xmm15
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm5,%xmm2,%xmm2
- vxorps %xmm12,%xmm10,%xmm10
-
- vmovdqu 16(%rdx),%xmm14
- vpalignr $8,%xmm10,%xmm10,%xmm12
- vpclmulqdq $0,%xmm6,%xmm15,%xmm3
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $17,%xmm6,%xmm15,%xmm4
- vmovdqu 144-64(%rsi),%xmm6
- vpclmulqdq $16,(%r10),%xmm10,%xmm10
- vxorps %xmm11,%xmm12,%xmm12
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $16,%xmm7,%xmm8,%xmm5
- vmovdqu 176-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu (%rdx),%xmm15
- vpclmulqdq $0,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $17,%xmm6,%xmm14,%xmm1
- vmovdqu 160-64(%rsi),%xmm6
- vpxor %xmm12,%xmm15,%xmm15
- vpclmulqdq $16,%xmm7,%xmm9,%xmm2
- vpxor %xmm10,%xmm15,%xmm15
-
- leaq 128(%rdx),%rdx
- subq $128,%rcx
- jnc L$oop8x_avx
-
- addq $128,%rcx
- jmp L$tail_no_xor_avx
-
-.p2align 5
-L$short_avx:
- vmovdqu -16(%rdx,%rcx,1),%xmm14
- leaq (%rdx,%rcx,1),%rdx
- vmovdqu 0-64(%rsi),%xmm6
- vmovdqu 32-64(%rsi),%xmm7
- vpshufb %xmm13,%xmm14,%xmm15
-
- vmovdqa %xmm0,%xmm3
- vmovdqa %xmm1,%xmm4
- vmovdqa %xmm2,%xmm5
- subq $16,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -32(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 16-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $16,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -48(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 48-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vmovdqu 80-64(%rsi),%xmm7
- subq $16,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -64(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $16,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -80(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 96-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vmovdqu 128-64(%rsi),%xmm7
- subq $16,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -96(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $16,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -112(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vmovdqu 144-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
- vmovq 184-64(%rsi),%xmm7
- subq $16,%rcx
- jmp L$tail_avx
-
-.p2align 5
-L$tail_avx:
- vpxor %xmm10,%xmm15,%xmm15
-L$tail_no_xor_avx:
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $17,%xmm6,%xmm15,%xmm1
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0,%xmm7,%xmm8,%xmm2
-
- vmovdqu (%r10),%xmm12
-
- vpxor %xmm0,%xmm3,%xmm10
- vpxor %xmm1,%xmm4,%xmm11
- vpxor %xmm2,%xmm5,%xmm5
-
- vpxor %xmm10,%xmm5,%xmm5
- vpxor %xmm11,%xmm5,%xmm5
- vpslldq $8,%xmm5,%xmm9
- vpsrldq $8,%xmm5,%xmm5
- vpxor %xmm9,%xmm10,%xmm10
- vpxor %xmm5,%xmm11,%xmm11
-
- vpclmulqdq $16,%xmm12,%xmm10,%xmm9
- vpalignr $8,%xmm10,%xmm10,%xmm10
- vpxor %xmm9,%xmm10,%xmm10
-
- vpclmulqdq $16,%xmm12,%xmm10,%xmm9
- vpalignr $8,%xmm10,%xmm10,%xmm10
- vpxor %xmm11,%xmm10,%xmm10
- vpxor %xmm9,%xmm10,%xmm10
-
- cmpq $0,%rcx
- jne L$short_avx
-
- vpshufb %xmm13,%xmm10,%xmm10
- vmovdqu %xmm10,(%rdi)
- vzeroupper
- .byte 0xf3,0xc3
+ jmp L$_ghash_clmul
.p2align 6
L$bswap_mask:
diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s
index c164fc3c42..a0de51655d 100644
--- a/deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s
@@ -9,8 +9,6 @@ _sha1_multi_block:
movq _OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
- testl $268435456,%ecx
- jnz _avx_shortcut
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -2601,10 +2599,10 @@ L$oop_grande_shaext:
punpcklqdq %xmm5,%xmm0
punpckhqdq %xmm5,%xmm8
- pshufd $63,%xmm7,%xmm1
- pshufd $127,%xmm7,%xmm9
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00111111,%xmm7,%xmm1
+ pshufd $0b01111111,%xmm7,%xmm9
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
jmp L$oop_shaext
.p2align 5
@@ -2859,8 +2857,8 @@ L$oop_shaext:
.byte 69,15,58,204,193,3
.byte 69,15,56,200,214
- pshufd $0,%xmm6,%xmm11
- pshufd $85,%xmm6,%xmm12
+ pshufd $0x00,%xmm6,%xmm11
+ pshufd $0x55,%xmm6,%xmm12
movdqa %xmm6,%xmm7
pcmpgtd %xmm4,%xmm11
pcmpgtd %xmm4,%xmm12
@@ -2890,8 +2888,8 @@ L$oop_shaext:
movl 280(%rsp),%edx
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
movdqa %xmm0,%xmm6
punpckldq %xmm8,%xmm0
@@ -2920,4291 +2918,6 @@ L$epilogue_shaext:
.byte 0xf3,0xc3
-.p2align 5
-sha1_multi_block_avx:
-_avx_shortcut:
- shrq $32,%rcx
- cmpl $2,%edx
- jb L$avx
- testl $32,%ecx
- jnz _avx2_shortcut
- jmp L$avx
-.p2align 5
-L$avx:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- subq $288,%rsp
- andq $-256,%rsp
- movq %rax,272(%rsp)
-L$body_avx:
- leaq K_XX_XX(%rip),%rbp
- leaq 256(%rsp),%rbx
-
- vzeroupper
-L$oop_grande_avx:
- movl %edx,280(%rsp)
- xorl %edx,%edx
- movq 0(%rsi),%r8
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r8
- movq 16(%rsi),%r9
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r9
- movq 32(%rsi),%r10
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r10
- movq 48(%rsi),%r11
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r11
- testl %edx,%edx
- jz L$done_avx
-
- vmovdqu 0(%rdi),%xmm10
- leaq 128(%rsp),%rax
- vmovdqu 32(%rdi),%xmm11
- vmovdqu 64(%rdi),%xmm12
- vmovdqu 96(%rdi),%xmm13
- vmovdqu 128(%rdi),%xmm14
- vmovdqu 96(%rbp),%xmm5
- jmp L$oop_avx
-
-.p2align 5
-L$oop_avx:
- vmovdqa -32(%rbp),%xmm15
- vmovd (%r8),%xmm0
- leaq 64(%r8),%r8
- vmovd (%r9),%xmm2
- leaq 64(%r9),%r9
- vpinsrd $1,(%r10),%xmm0,%xmm0
- leaq 64(%r10),%r10
- vpinsrd $1,(%r11),%xmm2,%xmm2
- leaq 64(%r11),%r11
- vmovd -60(%r8),%xmm1
- vpunpckldq %xmm2,%xmm0,%xmm0
- vmovd -60(%r9),%xmm9
- vpshufb %xmm5,%xmm0,%xmm0
- vpinsrd $1,-60(%r10),%xmm1,%xmm1
- vpinsrd $1,-60(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,0-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -56(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -56(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-56(%r10),%xmm2,%xmm2
- vpinsrd $1,-56(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,16-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -52(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -52(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-52(%r10),%xmm3,%xmm3
- vpinsrd $1,-52(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,32-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -48(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -48(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-48(%r10),%xmm4,%xmm4
- vpinsrd $1,-48(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,48-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -44(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -44(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpinsrd $1,-44(%r10),%xmm0,%xmm0
- vpinsrd $1,-44(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,64-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -40(%r8),%xmm1
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -40(%r9),%xmm9
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpinsrd $1,-40(%r10),%xmm1,%xmm1
- vpinsrd $1,-40(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,80-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -36(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -36(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-36(%r10),%xmm2,%xmm2
- vpinsrd $1,-36(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,96-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -32(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -32(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-32(%r10),%xmm3,%xmm3
- vpinsrd $1,-32(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,112-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -28(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -28(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-28(%r10),%xmm4,%xmm4
- vpinsrd $1,-28(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,128-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -24(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -24(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpinsrd $1,-24(%r10),%xmm0,%xmm0
- vpinsrd $1,-24(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,144-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -20(%r8),%xmm1
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -20(%r9),%xmm9
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpinsrd $1,-20(%r10),%xmm1,%xmm1
- vpinsrd $1,-20(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,160-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -16(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -16(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-16(%r10),%xmm2,%xmm2
- vpinsrd $1,-16(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,176-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -12(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -12(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-12(%r10),%xmm3,%xmm3
- vpinsrd $1,-12(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,192-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -8(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -8(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-8(%r10),%xmm4,%xmm4
- vpinsrd $1,-8(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,208-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -4(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -4(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vmovdqa 0-128(%rax),%xmm1
- vpinsrd $1,-4(%r10),%xmm0,%xmm0
- vpinsrd $1,-4(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- prefetcht0 63(%r8)
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,224-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- prefetcht0 63(%r9)
- vpxor %xmm7,%xmm6,%xmm6
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- prefetcht0 63(%r10)
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- prefetcht0 63(%r11)
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 16-128(%rax),%xmm2
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 32-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
-
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,240-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 128-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
-
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 48-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
-
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,0-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 144-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
-
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 64-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
-
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,16-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 160-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
-
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 80-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
-
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,32-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 176-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
-
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 96-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
-
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,48-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 192-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
-
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 0(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 112-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,64-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 208-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 128-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,80-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 224-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 144-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,96-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 240-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 160-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,112-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 0-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 176-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,128-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 16-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 192-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,144-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 32-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 208-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,160-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 48-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 224-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,176-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 64-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 240-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,192-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 80-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 0-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,208-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 96-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 16-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,224-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 112-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 32-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,240-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 128-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 48-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,0-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 144-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 64-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,16-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 160-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 80-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,32-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 176-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 96-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,48-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 192-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 112-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,64-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 208-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 128-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,80-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 224-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 144-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,96-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 240-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 160-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,112-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 0-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 32(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 176-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 16-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,128-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 192-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 32-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,144-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 208-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 48-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,160-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 224-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 64-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,176-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 240-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 80-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,192-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 0-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 96-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,208-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 16-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 112-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,224-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 32-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 128-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,240-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 48-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 144-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,0-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 64-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 160-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,16-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 80-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 176-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,32-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 96-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 192-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,48-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 112-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 208-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,64-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 128-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 224-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,80-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 144-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 240-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,96-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 160-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 0-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,112-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 176-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 16-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,128-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 192-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 32-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,144-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 208-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 48-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,160-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 224-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 64-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,176-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 64(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 240-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,192-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 80-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 0-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,208-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 96-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 16-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,224-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 112-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 32-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,240-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 128-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 48-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,0-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 144-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 64-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,16-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 160-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 80-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,32-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 176-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 96-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,48-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 192-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 112-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,64-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 208-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 128-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,80-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 224-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 144-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,96-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 240-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 160-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,112-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 0-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 176-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 16-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 192-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 32-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 208-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 48-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 224-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 64-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 240-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 80-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 0-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 96-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 16-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 112-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
-
- vpsrld $27,%xmm11,%xmm9
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor %xmm13,%xmm6,%xmm6
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm7,%xmm12,%xmm12
- movl $1,%ecx
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqu (%rbx),%xmm6
- vpxor %xmm8,%xmm8,%xmm8
- vmovdqa %xmm6,%xmm7
- vpcmpgtd %xmm8,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpand %xmm7,%xmm10,%xmm10
- vpand %xmm7,%xmm11,%xmm11
- vpaddd 0(%rdi),%xmm10,%xmm10
- vpand %xmm7,%xmm12,%xmm12
- vpaddd 32(%rdi),%xmm11,%xmm11
- vpand %xmm7,%xmm13,%xmm13
- vpaddd 64(%rdi),%xmm12,%xmm12
- vpand %xmm7,%xmm14,%xmm14
- vpaddd 96(%rdi),%xmm13,%xmm13
- vpaddd 128(%rdi),%xmm14,%xmm14
- vmovdqu %xmm10,0(%rdi)
- vmovdqu %xmm11,32(%rdi)
- vmovdqu %xmm12,64(%rdi)
- vmovdqu %xmm13,96(%rdi)
- vmovdqu %xmm14,128(%rdi)
-
- vmovdqu %xmm6,(%rbx)
- vmovdqu 96(%rbp),%xmm5
- decl %edx
- jnz L$oop_avx
-
- movl 280(%rsp),%edx
- leaq 16(%rdi),%rdi
- leaq 64(%rsi),%rsi
- decl %edx
- jnz L$oop_grande_avx
-
-L$done_avx:
- movq 272(%rsp),%rax
- vzeroupper
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$epilogue_avx:
- .byte 0xf3,0xc3
-
-
-.p2align 5
-sha1_multi_block_avx2:
-_avx2_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $576,%rsp
- andq $-256,%rsp
- movq %rax,544(%rsp)
-L$body_avx2:
- leaq K_XX_XX(%rip),%rbp
- shrl $1,%edx
-
- vzeroupper
-L$oop_grande_avx2:
- movl %edx,552(%rsp)
- xorl %edx,%edx
- leaq 512(%rsp),%rbx
- movq 0(%rsi),%r12
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r12
- movq 16(%rsi),%r13
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r13
- movq 32(%rsi),%r14
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r14
- movq 48(%rsi),%r15
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r15
- movq 64(%rsi),%r8
- movl 72(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,16(%rbx)
- cmovleq %rbp,%r8
- movq 80(%rsi),%r9
- movl 88(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,20(%rbx)
- cmovleq %rbp,%r9
- movq 96(%rsi),%r10
- movl 104(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,24(%rbx)
- cmovleq %rbp,%r10
- movq 112(%rsi),%r11
- movl 120(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,28(%rbx)
- cmovleq %rbp,%r11
- vmovdqu 0(%rdi),%ymm0
- leaq 128(%rsp),%rax
- vmovdqu 32(%rdi),%ymm1
- leaq 256+128(%rsp),%rbx
- vmovdqu 64(%rdi),%ymm2
- vmovdqu 96(%rdi),%ymm3
- vmovdqu 128(%rdi),%ymm4
- vmovdqu 96(%rbp),%ymm9
- jmp L$oop_avx2
-
-.p2align 5
-L$oop_avx2:
- vmovdqa -32(%rbp),%ymm15
- vmovd (%r12),%xmm10
- leaq 64(%r12),%r12
- vmovd (%r8),%xmm12
- leaq 64(%r8),%r8
- vmovd (%r13),%xmm7
- leaq 64(%r13),%r13
- vmovd (%r9),%xmm6
- leaq 64(%r9),%r9
- vpinsrd $1,(%r14),%xmm10,%xmm10
- leaq 64(%r14),%r14
- vpinsrd $1,(%r10),%xmm12,%xmm12
- leaq 64(%r10),%r10
- vpinsrd $1,(%r15),%xmm7,%xmm7
- leaq 64(%r15),%r15
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,(%r11),%xmm6,%xmm6
- leaq 64(%r11),%r11
- vpunpckldq %ymm6,%ymm12,%ymm12
- vmovd -60(%r12),%xmm11
- vinserti128 $1,%xmm12,%ymm10,%ymm10
- vmovd -60(%r8),%xmm8
- vpshufb %ymm9,%ymm10,%ymm10
- vmovd -60(%r13),%xmm7
- vmovd -60(%r9),%xmm6
- vpinsrd $1,-60(%r14),%xmm11,%xmm11
- vpinsrd $1,-60(%r10),%xmm8,%xmm8
- vpinsrd $1,-60(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-60(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,0-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -56(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -56(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -56(%r13),%xmm7
- vmovd -56(%r9),%xmm6
- vpinsrd $1,-56(%r14),%xmm12,%xmm12
- vpinsrd $1,-56(%r10),%xmm8,%xmm8
- vpinsrd $1,-56(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-56(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,32-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -52(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -52(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -52(%r13),%xmm7
- vmovd -52(%r9),%xmm6
- vpinsrd $1,-52(%r14),%xmm13,%xmm13
- vpinsrd $1,-52(%r10),%xmm8,%xmm8
- vpinsrd $1,-52(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-52(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,64-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -48(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -48(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -48(%r13),%xmm7
- vmovd -48(%r9),%xmm6
- vpinsrd $1,-48(%r14),%xmm14,%xmm14
- vpinsrd $1,-48(%r10),%xmm8,%xmm8
- vpinsrd $1,-48(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-48(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,96-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -44(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -44(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovd -44(%r13),%xmm7
- vmovd -44(%r9),%xmm6
- vpinsrd $1,-44(%r14),%xmm10,%xmm10
- vpinsrd $1,-44(%r10),%xmm8,%xmm8
- vpinsrd $1,-44(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-44(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,128-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -40(%r12),%xmm11
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -40(%r8),%xmm8
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovd -40(%r13),%xmm7
- vmovd -40(%r9),%xmm6
- vpinsrd $1,-40(%r14),%xmm11,%xmm11
- vpinsrd $1,-40(%r10),%xmm8,%xmm8
- vpinsrd $1,-40(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-40(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,160-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -36(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -36(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -36(%r13),%xmm7
- vmovd -36(%r9),%xmm6
- vpinsrd $1,-36(%r14),%xmm12,%xmm12
- vpinsrd $1,-36(%r10),%xmm8,%xmm8
- vpinsrd $1,-36(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-36(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,192-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -32(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -32(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -32(%r13),%xmm7
- vmovd -32(%r9),%xmm6
- vpinsrd $1,-32(%r14),%xmm13,%xmm13
- vpinsrd $1,-32(%r10),%xmm8,%xmm8
- vpinsrd $1,-32(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-32(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,224-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -28(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -28(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -28(%r13),%xmm7
- vmovd -28(%r9),%xmm6
- vpinsrd $1,-28(%r14),%xmm14,%xmm14
- vpinsrd $1,-28(%r10),%xmm8,%xmm8
- vpinsrd $1,-28(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-28(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,256-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -24(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -24(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovd -24(%r13),%xmm7
- vmovd -24(%r9),%xmm6
- vpinsrd $1,-24(%r14),%xmm10,%xmm10
- vpinsrd $1,-24(%r10),%xmm8,%xmm8
- vpinsrd $1,-24(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-24(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,288-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -20(%r12),%xmm11
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -20(%r8),%xmm8
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovd -20(%r13),%xmm7
- vmovd -20(%r9),%xmm6
- vpinsrd $1,-20(%r14),%xmm11,%xmm11
- vpinsrd $1,-20(%r10),%xmm8,%xmm8
- vpinsrd $1,-20(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-20(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,320-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -16(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -16(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -16(%r13),%xmm7
- vmovd -16(%r9),%xmm6
- vpinsrd $1,-16(%r14),%xmm12,%xmm12
- vpinsrd $1,-16(%r10),%xmm8,%xmm8
- vpinsrd $1,-16(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-16(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,352-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -12(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -12(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -12(%r13),%xmm7
- vmovd -12(%r9),%xmm6
- vpinsrd $1,-12(%r14),%xmm13,%xmm13
- vpinsrd $1,-12(%r10),%xmm8,%xmm8
- vpinsrd $1,-12(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-12(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,384-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -8(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -8(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -8(%r13),%xmm7
- vmovd -8(%r9),%xmm6
- vpinsrd $1,-8(%r14),%xmm14,%xmm14
- vpinsrd $1,-8(%r10),%xmm8,%xmm8
- vpinsrd $1,-8(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-8(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,416-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -4(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -4(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovdqa 0-128(%rax),%ymm11
- vmovd -4(%r13),%xmm7
- vmovd -4(%r9),%xmm6
- vpinsrd $1,-4(%r14),%xmm10,%xmm10
- vpinsrd $1,-4(%r10),%xmm8,%xmm8
- vpinsrd $1,-4(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-4(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- prefetcht0 63(%r12)
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,448-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- prefetcht0 63(%r13)
- vpxor %ymm6,%ymm5,%ymm5
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- prefetcht0 63(%r14)
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- prefetcht0 63(%r15)
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 32-128(%rax),%ymm12
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 64-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- prefetcht0 63(%r8)
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,480-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 256-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
- prefetcht0 63(%r9)
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- prefetcht0 63(%r10)
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- prefetcht0 63(%r11)
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 96-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
-
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,0-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 288-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
-
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 128-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
-
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,32-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 320-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
-
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 160-128(%rax),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
-
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,64-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 352-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
-
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 192-128(%rax),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
-
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,96-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 384-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
-
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 0(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 224-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,128-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 416-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 256-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,160-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 448-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 288-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,192-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 480-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 320-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,224-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 0-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 352-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,256-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 32-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 384-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,288-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 64-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 416-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,320-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 96-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 448-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,352-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 128-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 480-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,384-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 160-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 0-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,416-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 192-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 32-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,448-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 224-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 64-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,480-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 256-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 96-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,0-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 288-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 128-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,32-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 320-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 160-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,64-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 352-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 192-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,96-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 384-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 224-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,128-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 416-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 256-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,160-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 448-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 288-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,192-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 480-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 320-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,224-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 0-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 32(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 352-256-128(%rbx),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 32-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,256-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 384-256-128(%rbx),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 64-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,288-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 416-256-128(%rbx),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 96-128(%rax),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,320-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 448-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 128-128(%rax),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,352-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 480-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 160-128(%rax),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,384-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 0-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 192-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,416-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 32-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 224-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,448-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 64-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 256-256-128(%rbx),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,480-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 96-128(%rax),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 288-256-128(%rbx),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,0-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 128-128(%rax),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 320-256-128(%rbx),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,32-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 160-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 352-256-128(%rbx),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,64-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 192-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 384-256-128(%rbx),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,96-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 224-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 416-256-128(%rbx),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,128-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 256-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 448-256-128(%rbx),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,160-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 288-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 480-256-128(%rbx),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,192-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 320-256-128(%rbx),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 0-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,224-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 352-256-128(%rbx),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 32-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,256-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 384-256-128(%rbx),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 64-128(%rax),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,288-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 416-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 96-128(%rax),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,320-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 448-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 128-128(%rax),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,352-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 64(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 480-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,384-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 160-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 0-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,416-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 192-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 32-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,448-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 224-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 64-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,480-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 256-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 96-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,0-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 288-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 128-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,32-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 320-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 160-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,64-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 352-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 192-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,96-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 384-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 224-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,128-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 416-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 256-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,160-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 448-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 288-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,192-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 480-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 320-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,224-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 0-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 352-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 32-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 384-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 64-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 416-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 96-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 448-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 128-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 480-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 160-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 0-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 192-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 32-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 224-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
-
- vpsrld $27,%ymm1,%ymm8
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor %ymm3,%ymm5,%ymm5
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm6,%ymm2,%ymm2
- movl $1,%ecx
- leaq 512(%rsp),%rbx
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r12
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r13
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r14
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r15
- cmpl 16(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 20(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 24(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 28(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqu (%rbx),%ymm5
- vpxor %ymm7,%ymm7,%ymm7
- vmovdqa %ymm5,%ymm6
- vpcmpgtd %ymm7,%ymm6,%ymm6
- vpaddd %ymm6,%ymm5,%ymm5
-
- vpand %ymm6,%ymm0,%ymm0
- vpand %ymm6,%ymm1,%ymm1
- vpaddd 0(%rdi),%ymm0,%ymm0
- vpand %ymm6,%ymm2,%ymm2
- vpaddd 32(%rdi),%ymm1,%ymm1
- vpand %ymm6,%ymm3,%ymm3
- vpaddd 64(%rdi),%ymm2,%ymm2
- vpand %ymm6,%ymm4,%ymm4
- vpaddd 96(%rdi),%ymm3,%ymm3
- vpaddd 128(%rdi),%ymm4,%ymm4
- vmovdqu %ymm0,0(%rdi)
- vmovdqu %ymm1,32(%rdi)
- vmovdqu %ymm2,64(%rdi)
- vmovdqu %ymm3,96(%rdi)
- vmovdqu %ymm4,128(%rdi)
-
- vmovdqu %ymm5,(%rbx)
- leaq 256+128(%rsp),%rbx
- vmovdqu 96(%rbp),%ymm9
- decl %edx
- jnz L$oop_avx2
-
-
-
-
-
-
-
-L$done_avx2:
- movq 544(%rsp),%rax
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$epilogue_avx2:
- .byte 0xf3,0xc3
-
-
.p2align 8
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s
index c89ffe3df6..798ca0dc4d 100644
--- a/deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s
@@ -12,14 +12,6 @@ _sha1_block_data_order:
jz L$ialu
testl $536870912,%r10d
jnz _shaext_shortcut
- andl $296,%r10d
- cmpl $296,%r10d
- je _avx2_shortcut
- andl $268435456,%r8d
- andl $1073741824,%r9d
- orl %r9d,%r8d
- cmpl $1342177280,%r8d
- je _avx_shortcut
jmp _ssse3_shortcut
.p2align 4
@@ -1248,9 +1240,9 @@ _shaext_shortcut:
movdqa K_XX_XX+160(%rip),%xmm3
movdqu (%rsi),%xmm4
- pshufd $27,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm0,%xmm0
movdqu 16(%rsi),%xmm5
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu 32(%rsi),%xmm6
.byte 102,15,56,0,227
movdqu 48(%rsi),%xmm7
@@ -1400,8 +1392,8 @@ L$oop_shaext:
jnz L$oop_shaext
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu %xmm0,(%rdi)
movd %xmm1,16(%rdi)
.byte 0xf3,0xc3
@@ -2582,2803 +2574,6 @@ L$done_ssse3:
L$epilogue_ssse3:
.byte 0xf3,0xc3
-
-.p2align 4
-sha1_block_data_order_avx:
-_avx_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- leaq -64(%rsp),%rsp
- vzeroupper
- movq %rax,%r14
- andq $-64,%rsp
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- shlq $6,%r10
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
-
- movl 0(%r8),%eax
- movl 4(%r8),%ebx
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl %ebx,%esi
- movl 16(%r8),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm11,%xmm0,%xmm4
- vpaddd %xmm11,%xmm1,%xmm5
- vpaddd %xmm11,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- jmp L$oop_avx
-.p2align 4
-L$oop_avx:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- vpxor %xmm8,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%edx
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm8
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm10
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm4,%xmm4
- addl %esi,%ecx
- andl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm4,%xmm4
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 12(%rsp),%ebx
- vpxor %xmm10,%xmm4,%xmm4
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%edi
- addl 16(%rsp),%eax
- vpaddd %xmm4,%xmm11,%xmm9
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm8,%xmm8
- shrdl $7,%ebx,%ebx
- xorl %edx,%edi
- movl %eax,%esi
- addl 20(%rsp),%ebp
- vpxor %xmm8,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ebp
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm8
- xorl %ecx,%ebx
- addl %eax,%ebp
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm10
- vpaddd %xmm5,%xmm5,%xmm5
- movl %ebp,%edi
- addl 24(%rsp),%edx
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm5,%xmm5
- addl %esi,%edx
- andl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm5,%xmm5
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- movl %edx,%esi
- addl 28(%rsp),%ecx
- vpxor %xmm10,%xmm5,%xmm5
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vmovdqa -32(%r11),%xmm11
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%edi
- addl 32(%rsp),%ebx
- vpaddd %xmm5,%xmm11,%xmm9
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm8
- addl %esi,%ebx
- andl %edx,%edi
- vpxor %xmm2,%xmm6,%xmm6
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm8,%xmm8
- shrdl $7,%ecx,%ecx
- xorl %ebp,%edi
- movl %ebx,%esi
- addl 36(%rsp),%eax
- vpxor %xmm8,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm8
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm10
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%edi
- addl 40(%rsp),%ebp
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm6,%xmm6
- addl %esi,%ebp
- andl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 44(%rsp),%edx
- vpxor %xmm10,%xmm6,%xmm6
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- andl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%edi
- addl 48(%rsp),%ecx
- vpaddd %xmm6,%xmm11,%xmm9
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%ebp
- addl %edx,%ecx
- vpxor %xmm5,%xmm8,%xmm8
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 52(%rsp),%ebx
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm8
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpslldq $12,%xmm7,%xmm10
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%edi
- addl 56(%rsp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm7,%xmm7
- addl %esi,%eax
- andl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- xorl %edx,%edi
- movl %eax,%esi
- addl 60(%rsp),%ebp
- vpxor %xmm10,%xmm7,%xmm7
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %edi,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %ebp,%edi
- addl 0(%rsp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpaddd %xmm7,%xmm11,%xmm9
- addl %esi,%edx
- andl %eax,%edi
- vpxor %xmm8,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- movl %edx,%esi
- addl 4(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%edi
- addl 8(%rsp),%ebx
- vpor %xmm8,%xmm0,%xmm0
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- andl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 12(%rsp),%eax
- xorl %ebp,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm0,%xmm11,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm1,%xmm1
- addl 20(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm1,%xmm1
- addl 28(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- addl %esi,%eax
- xorl %edx,%edi
- vpaddd %xmm1,%xmm11,%xmm9
- vmovdqa 0(%r11),%xmm11
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm8,%xmm2,%xmm2
- addl 36(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm2,%xmm2
- addl 40(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpor %xmm8,%xmm2,%xmm2
- addl 44(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebx
- xorl %ebp,%edi
- vpaddd %xmm2,%xmm11,%xmm9
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm8
- vpxor %xmm0,%xmm4,%xmm4
- addl 0(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- addl %esi,%ecx
- xorl %eax,%edi
- vpaddd %xmm3,%xmm11,%xmm9
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpxor %xmm8,%xmm4,%xmm4
- addl 4(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm8
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm8,%xmm4,%xmm4
- addl 12(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm3,%xmm4,%xmm8
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpxor %xmm6,%xmm5,%xmm5
- addl %esi,%edx
- xorl %ebx,%edi
- vpaddd %xmm4,%xmm11,%xmm9
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpxor %xmm8,%xmm5,%xmm5
- addl 20(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm8,%xmm5,%xmm5
- addl 28(%rsp),%eax
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm8
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%rsp),%ebp
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- movl %eax,%edi
- xorl %ecx,%esi
- vpaddd %xmm5,%xmm11,%xmm9
- shldl $5,%eax,%eax
- addl %esi,%ebp
- vpxor %xmm8,%xmm6,%xmm6
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 36(%rsp),%edx
- vpsrld $30,%xmm6,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 40(%rsp),%ecx
- andl %eax,%esi
- vpor %xmm8,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%edi
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 44(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm8
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- movl %ebx,%edi
- xorl %edx,%esi
- vpaddd %xmm6,%xmm11,%xmm9
- vmovdqa 32(%r11),%xmm11
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%rsp),%ebp
- vpsrld $30,%xmm7,%xmm8
- vmovdqa %xmm9,32(%rsp)
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 56(%rsp),%edx
- andl %ebx,%esi
- vpor %xmm8,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%edi
- xorl %ebx,%esi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 60(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- addl 0(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vpxor %xmm1,%xmm0,%xmm0
- movl %ecx,%edi
- xorl %ebp,%esi
- vpaddd %xmm7,%xmm11,%xmm9
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm8,%xmm0,%xmm0
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 4(%rsp),%eax
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%rsp),%ebp
- andl %ecx,%esi
- vpor %xmm8,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%edi
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 12(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- vpxor %xmm2,%xmm1,%xmm1
- movl %edx,%edi
- xorl %eax,%esi
- vpaddd %xmm0,%xmm11,%xmm9
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 20(%rsp),%ebx
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 24(%rsp),%eax
- andl %edx,%esi
- vpor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%edi
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%rsp),%ebp
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- movl %ebp,%edi
- xorl %ebx,%esi
- vpaddd %xmm1,%xmm11,%xmm9
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vpxor %xmm8,%xmm2,%xmm2
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 36(%rsp),%ecx
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 40(%rsp),%ebx
- andl %ebp,%esi
- vpor %xmm8,%xmm2,%xmm2
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%edi
- xorl %ebp,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 44(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm2,%xmm11,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 0(%rsp),%eax
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm9,48(%rsp)
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 8(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 12(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- cmpq %r10,%r9
- je L$done_avx
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- vpshufb %xmm6,%xmm1,%xmm1
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpaddd %xmm11,%xmm0,%xmm4
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm4,0(%rsp)
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- vpshufb %xmm6,%xmm2,%xmm2
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpaddd %xmm11,%xmm1,%xmm5
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vmovdqa %xmm5,16(%rsp)
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- vpshufb %xmm6,%xmm3,%xmm3
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpaddd %xmm11,%xmm2,%xmm6
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vmovdqa %xmm6,32(%rsp)
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- addl 12(%r8),%edx
- movl %eax,0(%r8)
- addl 16(%r8),%ebp
- movl %esi,4(%r8)
- movl %esi,%ebx
- movl %ecx,8(%r8)
- movl %ecx,%edi
- movl %edx,12(%r8)
- xorl %edx,%edi
- movl %ebp,16(%r8)
- andl %edi,%esi
- jmp L$oop_avx
-
-.p2align 4
-L$done_avx:
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vzeroupper
-
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- movl %eax,0(%r8)
- addl 12(%r8),%edx
- movl %esi,4(%r8)
- addl 16(%r8),%ebp
- movl %ecx,8(%r8)
- movl %edx,12(%r8)
- movl %ebp,16(%r8)
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-L$epilogue_avx:
- .byte 0xf3,0xc3
-
-
-.p2align 4
-sha1_block_data_order_avx2:
-_avx2_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- vzeroupper
- movq %rax,%r14
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- leaq -640(%rsp),%rsp
- shlq $6,%r10
- leaq 64(%r9),%r13
- andq $-128,%rsp
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
-
- movl 0(%r8),%eax
- cmpq %r10,%r13
- cmovaeq %r9,%r13
- movl 4(%r8),%ebp
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl 16(%r8),%esi
- vmovdqu 64(%r11),%ymm6
-
- vmovdqu (%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- leaq 64(%r9),%r9
- vinserti128 $1,(%r13),%ymm0,%ymm0
- vinserti128 $1,16(%r13),%ymm1,%ymm1
- vpshufb %ymm6,%ymm0,%ymm0
- vinserti128 $1,32(%r13),%ymm2,%ymm2
- vpshufb %ymm6,%ymm1,%ymm1
- vinserti128 $1,48(%r13),%ymm3,%ymm3
- vpshufb %ymm6,%ymm2,%ymm2
- vmovdqu -64(%r11),%ymm11
- vpshufb %ymm6,%ymm3,%ymm3
-
- vpaddd %ymm11,%ymm0,%ymm4
- vpaddd %ymm11,%ymm1,%ymm5
- vmovdqu %ymm4,0(%rsp)
- vpaddd %ymm11,%ymm2,%ymm6
- vmovdqu %ymm5,32(%rsp)
- vpaddd %ymm11,%ymm3,%ymm7
- vmovdqu %ymm6,64(%rsp)
- vmovdqu %ymm7,96(%rsp)
- vpalignr $8,%ymm0,%ymm1,%ymm4
- vpsrldq $4,%ymm3,%ymm8
- vpxor %ymm0,%ymm4,%ymm4
- vpxor %ymm2,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $31,%ymm4,%ymm8
- vpslldq $12,%ymm4,%ymm10
- vpaddd %ymm4,%ymm4,%ymm4
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm4,%ymm4
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm4,%ymm4
- vpxor %ymm10,%ymm4,%ymm4
- vpaddd %ymm11,%ymm4,%ymm9
- vmovdqu %ymm9,128(%rsp)
- vpalignr $8,%ymm1,%ymm2,%ymm5
- vpsrldq $4,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm3,%ymm8,%ymm8
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r11),%ymm11
- vpslldq $12,%ymm5,%ymm10
- vpaddd %ymm5,%ymm5,%ymm5
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm5,%ymm5
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm10,%ymm5,%ymm5
- vpaddd %ymm11,%ymm5,%ymm9
- vmovdqu %ymm9,160(%rsp)
- vpalignr $8,%ymm2,%ymm3,%ymm6
- vpsrldq $4,%ymm5,%ymm8
- vpxor %ymm2,%ymm6,%ymm6
- vpxor %ymm4,%ymm8,%ymm8
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $31,%ymm6,%ymm8
- vpslldq $12,%ymm6,%ymm10
- vpaddd %ymm6,%ymm6,%ymm6
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm6,%ymm6
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm6,%ymm6
- vpxor %ymm10,%ymm6,%ymm6
- vpaddd %ymm11,%ymm6,%ymm9
- vmovdqu %ymm9,192(%rsp)
- vpalignr $8,%ymm3,%ymm4,%ymm7
- vpsrldq $4,%ymm6,%ymm8
- vpxor %ymm3,%ymm7,%ymm7
- vpxor %ymm5,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm7,%ymm8
- vpslldq $12,%ymm7,%ymm10
- vpaddd %ymm7,%ymm7,%ymm7
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm7,%ymm7
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm7,%ymm7
- vpxor %ymm10,%ymm7,%ymm7
- vpaddd %ymm11,%ymm7,%ymm9
- vmovdqu %ymm9,224(%rsp)
- leaq 128(%rsp),%r13
- jmp L$oop_avx2
-.p2align 5
-L$oop_avx2:
- rorxl $2,%ebp,%ebx
- andnl %edx,%ebp,%edi
- andl %ecx,%ebp
- xorl %edi,%ebp
- jmp L$align32_1
-.p2align 5
-L$align32_1:
- vpalignr $8,%ymm6,%ymm7,%ymm8
- vpxor %ymm4,%ymm0,%ymm0
- addl -128(%r13),%esi
- andnl %ecx,%eax,%edi
- vpxor %ymm1,%ymm0,%ymm0
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpxor %ymm8,%ymm0,%ymm0
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- vpsrld $30,%ymm0,%ymm8
- vpslld $2,%ymm0,%ymm0
- addl -124(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- vpor %ymm8,%ymm0,%ymm0
- addl %r12d,%edx
- xorl %edi,%esi
- addl -120(%r13),%ecx
- andnl %ebp,%edx,%edi
- vpaddd %ymm11,%ymm0,%ymm9
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- vmovdqu %ymm9,256(%rsp)
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -116(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -96(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- vpalignr $8,%ymm7,%ymm0,%ymm8
- vpxor %ymm5,%ymm1,%ymm1
- addl -92(%r13),%eax
- andnl %edx,%ebp,%edi
- vpxor %ymm2,%ymm1,%ymm1
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- vpxor %ymm8,%ymm1,%ymm1
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- vpsrld $30,%ymm1,%ymm8
- vpslld $2,%ymm1,%ymm1
- addl -88(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- vpor %ymm8,%ymm1,%ymm1
- addl %r12d,%esi
- xorl %edi,%eax
- addl -84(%r13),%edx
- andnl %ebx,%esi,%edi
- vpaddd %ymm11,%ymm1,%ymm9
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- vmovdqu %ymm9,288(%rsp)
- addl %r12d,%edx
- xorl %edi,%esi
- addl -64(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -60(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- vpalignr $8,%ymm0,%ymm1,%ymm8
- vpxor %ymm6,%ymm2,%ymm2
- addl -56(%r13),%ebp
- andnl %esi,%ebx,%edi
- vpxor %ymm3,%ymm2,%ymm2
- vmovdqu 0(%r11),%ymm11
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpxor %ymm8,%ymm2,%ymm2
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- vpsrld $30,%ymm2,%ymm8
- vpslld $2,%ymm2,%ymm2
- addl -52(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- vpor %ymm8,%ymm2,%ymm2
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -32(%r13),%esi
- andnl %ecx,%eax,%edi
- vpaddd %ymm11,%ymm2,%ymm9
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- vmovdqu %ymm9,320(%rsp)
- addl %r12d,%esi
- xorl %edi,%eax
- addl -28(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -24(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- vpalignr $8,%ymm1,%ymm2,%ymm8
- vpxor %ymm7,%ymm3,%ymm3
- addl -20(%r13),%ebx
- andnl %eax,%ecx,%edi
- vpxor %ymm4,%ymm3,%ymm3
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpxor %ymm8,%ymm3,%ymm3
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- vpsrld $30,%ymm3,%ymm8
- vpslld $2,%ymm3,%ymm3
- addl 0(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- vpor %ymm8,%ymm3,%ymm3
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl 4(%r13),%eax
- andnl %edx,%ebp,%edi
- vpaddd %ymm11,%ymm3,%ymm9
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- vmovdqu %ymm9,352(%rsp)
- addl %r12d,%eax
- xorl %edi,%ebp
- addl 8(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl 12(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vpalignr $8,%ymm2,%ymm3,%ymm8
- vpxor %ymm0,%ymm4,%ymm4
- addl 32(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpxor %ymm8,%ymm4,%ymm4
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 36(%r13),%ebx
- vpsrld $30,%ymm4,%ymm8
- vpslld $2,%ymm4,%ymm4
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vpor %ymm8,%ymm4,%ymm4
- addl 40(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpaddd %ymm11,%ymm4,%ymm9
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 44(%r13),%eax
- vmovdqu %ymm9,384(%rsp)
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpalignr $8,%ymm3,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- addl 68(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpxor %ymm6,%ymm5,%ymm5
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- vpxor %ymm8,%ymm5,%ymm5
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 72(%r13),%ecx
- vpsrld $30,%ymm5,%ymm8
- vpslld $2,%ymm5,%ymm5
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- vpor %ymm8,%ymm5,%ymm5
- addl 76(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpaddd %ymm11,%ymm5,%ymm9
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 96(%r13),%ebp
- vmovdqu %ymm9,416(%rsp)
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 100(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpalignr $8,%ymm4,%ymm5,%ymm8
- vpxor %ymm2,%ymm6,%ymm6
- addl 104(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- vpxor %ymm8,%ymm6,%ymm6
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 108(%r13),%edx
- leaq 256(%r13),%r13
- vpsrld $30,%ymm6,%ymm8
- vpslld $2,%ymm6,%ymm6
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vpor %ymm8,%ymm6,%ymm6
- addl -128(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpaddd %ymm11,%ymm6,%ymm9
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -124(%r13),%ebx
- vmovdqu %ymm9,448(%rsp)
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -120(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpalignr $8,%ymm5,%ymm6,%ymm8
- vpxor %ymm3,%ymm7,%ymm7
- addl -116(%r13),%eax
- leal (%rax,%rbx,1),%eax
- vpxor %ymm0,%ymm7,%ymm7
- vmovdqu 32(%r11),%ymm11
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- vpxor %ymm8,%ymm7,%ymm7
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -96(%r13),%esi
- vpsrld $30,%ymm7,%ymm8
- vpslld $2,%ymm7,%ymm7
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpor %ymm8,%ymm7,%ymm7
- addl -92(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpaddd %ymm11,%ymm7,%ymm9
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -88(%r13),%ecx
- vmovdqu %ymm9,480(%rsp)
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -84(%r13),%ebx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- jmp L$align32_2
-.p2align 5
-L$align32_2:
- vpalignr $8,%ymm6,%ymm7,%ymm8
- vpxor %ymm4,%ymm0,%ymm0
- addl -64(%r13),%ebp
- xorl %esi,%ecx
- vpxor %ymm1,%ymm0,%ymm0
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- vpxor %ymm8,%ymm0,%ymm0
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- vpsrld $30,%ymm0,%ymm8
- vpslld $2,%ymm0,%ymm0
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -60(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- vpor %ymm8,%ymm0,%ymm0
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- vpaddd %ymm11,%ymm0,%ymm9
- addl %r12d,%eax
- andl %edi,%ebp
- addl -56(%r13),%esi
- xorl %ecx,%ebp
- vmovdqu %ymm9,512(%rsp)
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl -52(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- addl -32(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- vpalignr $8,%ymm7,%ymm0,%ymm8
- vpxor %ymm5,%ymm1,%ymm1
- addl -28(%r13),%ebx
- xorl %eax,%edx
- vpxor %ymm2,%ymm1,%ymm1
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- vpxor %ymm8,%ymm1,%ymm1
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vpsrld $30,%ymm1,%ymm8
- vpslld $2,%ymm1,%ymm1
- addl %r12d,%ebx
- andl %edi,%ecx
- addl -24(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- vpor %ymm8,%ymm1,%ymm1
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- vpaddd %ymm11,%ymm1,%ymm9
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -20(%r13),%eax
- xorl %edx,%ebx
- vmovdqu %ymm9,544(%rsp)
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 0(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl 4(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- vpalignr $8,%ymm0,%ymm1,%ymm8
- vpxor %ymm6,%ymm2,%ymm2
- addl 8(%r13),%ecx
- xorl %ebp,%esi
- vpxor %ymm3,%ymm2,%ymm2
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- vpxor %ymm8,%ymm2,%ymm2
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpsrld $30,%ymm2,%ymm8
- vpslld $2,%ymm2,%ymm2
- addl %r12d,%ecx
- andl %edi,%edx
- addl 12(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- vpor %ymm8,%ymm2,%ymm2
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vpaddd %ymm11,%ymm2,%ymm9
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 32(%r13),%ebp
- xorl %esi,%ecx
- vmovdqu %ymm9,576(%rsp)
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 36(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 40(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- vpalignr $8,%ymm1,%ymm2,%ymm8
- vpxor %ymm7,%ymm3,%ymm3
- addl 44(%r13),%edx
- xorl %ebx,%eax
- vpxor %ymm4,%ymm3,%ymm3
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- vpxor %ymm8,%ymm3,%ymm3
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- vpsrld $30,%ymm3,%ymm8
- vpslld $2,%ymm3,%ymm3
- addl %r12d,%edx
- andl %edi,%esi
- addl 64(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- vpor %ymm8,%ymm3,%ymm3
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpaddd %ymm11,%ymm3,%ymm9
- addl %r12d,%ecx
- andl %edi,%edx
- addl 68(%r13),%ebx
- xorl %eax,%edx
- vmovdqu %ymm9,608(%rsp)
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 72(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 76(%r13),%eax
- xorl %edx,%ebx
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 100(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 104(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 108(%r13),%ebx
- leaq 256(%r13),%r13
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -128(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -124(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -120(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -116(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -96(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -92(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -88(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -84(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -60(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -56(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -52(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -32(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -28(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -24(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -20(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- addl %r12d,%edx
- leaq 128(%r9),%r13
- leaq 128(%r9),%rdi
- cmpq %r10,%r13
- cmovaeq %r9,%r13
-
-
- addl 0(%r8),%edx
- addl 4(%r8),%esi
- addl 8(%r8),%ebp
- movl %edx,0(%r8)
- addl 12(%r8),%ebx
- movl %esi,4(%r8)
- movl %edx,%eax
- addl 16(%r8),%ecx
- movl %ebp,%r12d
- movl %ebp,8(%r8)
- movl %ebx,%edx
-
- movl %ebx,12(%r8)
- movl %esi,%ebp
- movl %ecx,16(%r8)
-
- movl %ecx,%esi
- movl %r12d,%ecx
-
-
- cmpq %r10,%r9
- je L$done_avx2
- vmovdqu 64(%r11),%ymm6
- cmpq %r10,%rdi
- ja L$ast_avx2
-
- vmovdqu -64(%rdi),%xmm0
- vmovdqu -48(%rdi),%xmm1
- vmovdqu -32(%rdi),%xmm2
- vmovdqu -16(%rdi),%xmm3
- vinserti128 $1,0(%r13),%ymm0,%ymm0
- vinserti128 $1,16(%r13),%ymm1,%ymm1
- vinserti128 $1,32(%r13),%ymm2,%ymm2
- vinserti128 $1,48(%r13),%ymm3,%ymm3
- jmp L$ast_avx2
-
-.p2align 5
-L$ast_avx2:
- leaq 128+16(%rsp),%r13
- rorxl $2,%ebp,%ebx
- andnl %edx,%ebp,%edi
- andl %ecx,%ebp
- xorl %edi,%ebp
- subq $-128,%r9
- addl -128(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -124(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -120(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -116(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -96(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl -92(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -88(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -84(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -64(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -60(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -56(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl -52(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -32(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -28(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -24(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -20(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl 0(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl 4(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl 8(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl 12(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 32(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 36(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 40(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 44(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vmovdqu -64(%r11),%ymm11
- vpshufb %ymm6,%ymm0,%ymm0
- addl 68(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 72(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 76(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 96(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 100(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpshufb %ymm6,%ymm1,%ymm1
- vpaddd %ymm11,%ymm0,%ymm8
- addl 104(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 108(%r13),%edx
- leaq 256(%r13),%r13
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -128(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -124(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -120(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vmovdqu %ymm8,0(%rsp)
- vpshufb %ymm6,%ymm2,%ymm2
- vpaddd %ymm11,%ymm1,%ymm9
- addl -116(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -92(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -88(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -84(%r13),%ebx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- vmovdqu %ymm9,32(%rsp)
- vpshufb %ymm6,%ymm3,%ymm3
- vpaddd %ymm11,%ymm2,%ymm6
- addl -64(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -60(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl -56(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl -52(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- addl -32(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- jmp L$align32_3
-.p2align 5
-L$align32_3:
- vmovdqu %ymm6,64(%rsp)
- vpaddd %ymm11,%ymm3,%ymm7
- addl -28(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl -24(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -20(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 0(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl 4(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- vmovdqu %ymm7,96(%rsp)
- addl 8(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- addl 12(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 32(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 36(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 40(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- vpalignr $8,%ymm0,%ymm1,%ymm4
- addl 44(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- vpsrldq $4,%ymm3,%ymm8
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpxor %ymm0,%ymm4,%ymm4
- vpxor %ymm2,%ymm8,%ymm8
- xorl %ebp,%esi
- addl %r12d,%edx
- vpxor %ymm8,%ymm4,%ymm4
- andl %edi,%esi
- addl 64(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- vpsrld $31,%ymm4,%ymm8
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- vpslldq $12,%ymm4,%ymm10
- vpaddd %ymm4,%ymm4,%ymm4
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm4,%ymm4
- addl %r12d,%ecx
- andl %edi,%edx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm4,%ymm4
- addl 68(%r13),%ebx
- xorl %eax,%edx
- vpxor %ymm10,%ymm4,%ymm4
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- vpaddd %ymm11,%ymm4,%ymm9
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vmovdqu %ymm9,128(%rsp)
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 72(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 76(%r13),%eax
- xorl %edx,%ebx
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpalignr $8,%ymm1,%ymm2,%ymm5
- addl 96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpsrldq $4,%ymm4,%ymm8
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm3,%ymm8,%ymm8
- addl 100(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpxor %ymm8,%ymm5,%ymm5
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r11),%ymm11
- xorl %ebx,%esi
- addl 104(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- vpslldq $12,%ymm5,%ymm10
- vpaddd %ymm5,%ymm5,%ymm5
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm5,%ymm5
- xorl %eax,%edx
- addl %r12d,%ecx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm5,%ymm5
- xorl %ebp,%edx
- addl 108(%r13),%ebx
- leaq 256(%r13),%r13
- vpxor %ymm10,%ymm5,%ymm5
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpaddd %ymm11,%ymm5,%ymm9
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vmovdqu %ymm9,160(%rsp)
- addl -128(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpalignr $8,%ymm2,%ymm3,%ymm6
- addl -124(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- vpsrldq $4,%ymm5,%ymm8
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpxor %ymm2,%ymm6,%ymm6
- vpxor %ymm4,%ymm8,%ymm8
- addl -120(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpxor %ymm8,%ymm6,%ymm6
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- vpsrld $31,%ymm6,%ymm8
- xorl %ecx,%eax
- addl -116(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpslldq $12,%ymm6,%ymm10
- vpaddd %ymm6,%ymm6,%ymm6
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm6,%ymm6
- xorl %ebp,%esi
- addl %r12d,%edx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm6,%ymm6
- xorl %ebx,%esi
- addl -96(%r13),%ecx
- vpxor %ymm10,%ymm6,%ymm6
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpaddd %ymm11,%ymm6,%ymm9
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- vmovdqu %ymm9,192(%rsp)
- addl -92(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vpalignr $8,%ymm3,%ymm4,%ymm7
- addl -88(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpsrldq $4,%ymm6,%ymm8
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpxor %ymm3,%ymm7,%ymm7
- vpxor %ymm5,%ymm8,%ymm8
- addl -84(%r13),%eax
- leal (%rax,%rbx,1),%eax
- vpxor %ymm8,%ymm7,%ymm7
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- vpsrld $31,%ymm7,%ymm8
- xorl %edx,%ebp
- addl -64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpslldq $12,%ymm7,%ymm10
- vpaddd %ymm7,%ymm7,%ymm7
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm7,%ymm7
- xorl %ebx,%eax
- addl %r12d,%esi
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm7,%ymm7
- xorl %ecx,%eax
- addl -60(%r13),%edx
- vpxor %ymm10,%ymm7,%ymm7
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpaddd %ymm11,%ymm7,%ymm9
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vmovdqu %ymm9,224(%rsp)
- addl -56(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -52(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -32(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -28(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -24(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -20(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- addl %r12d,%edx
- leaq 128(%rsp),%r13
-
-
- addl 0(%r8),%edx
- addl 4(%r8),%esi
- addl 8(%r8),%ebp
- movl %edx,0(%r8)
- addl 12(%r8),%ebx
- movl %esi,4(%r8)
- movl %edx,%eax
- addl 16(%r8),%ecx
- movl %ebp,%r12d
- movl %ebp,8(%r8)
- movl %ebx,%edx
-
- movl %ebx,12(%r8)
- movl %esi,%ebp
- movl %ecx,16(%r8)
-
- movl %ecx,%esi
- movl %r12d,%ecx
-
-
- cmpq %r10,%r9
- jbe L$oop_avx2
-
-L$done_avx2:
- vzeroupper
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
-L$epilogue_avx2:
- .byte 0xf3,0xc3
-
.p2align 6
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s
index 77c24f1cf5..276322bec2 100644
--- a/deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s
@@ -9,8 +9,6 @@ _sha256_multi_block:
movq _OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
- testl $268435456,%ecx
- jnz _avx_shortcut
movq %rsp,%rax
pushq %rbx
pushq %rbp
@@ -2679,10 +2677,10 @@ L$oop_grande_shaext:
punpckhqdq %xmm8,%xmm14
punpckhqdq %xmm10,%xmm15
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
jmp L$oop_shaext
.p2align 5
@@ -2714,11 +2712,11 @@ L$oop_shaext:
movdqa %xmm2,%xmm0
movdqa %xmm15,112(%rsp)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pxor %xmm12,%xmm4
movdqa %xmm12,64(%rsp)
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pxor %xmm14,%xmm8
movdqa %xmm14,96(%rsp)
movdqa 16-128(%rbp),%xmm1
@@ -2736,11 +2734,11 @@ L$oop_shaext:
.byte 102,68,15,56,0,211
prefetcht0 127(%r9)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,68,15,56,0,219
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 32-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2753,14 +2751,14 @@ L$oop_shaext:
movdqa %xmm2,%xmm0
movdqa %xmm7,%xmm3
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,15,58,15,222,4
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 48-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2777,13 +2775,13 @@ L$oop_shaext:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 64-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2799,13 +2797,13 @@ L$oop_shaext:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 80-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2821,13 +2819,13 @@ L$oop_shaext:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 96-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2843,13 +2841,13 @@ L$oop_shaext:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 112-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2865,13 +2863,13 @@ L$oop_shaext:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 128-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2887,13 +2885,13 @@ L$oop_shaext:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 144-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2909,13 +2907,13 @@ L$oop_shaext:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 160-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2931,13 +2929,13 @@ L$oop_shaext:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 176-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2953,13 +2951,13 @@ L$oop_shaext:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 192-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2975,13 +2973,13 @@ L$oop_shaext:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 208-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2997,13 +2995,13 @@ L$oop_shaext:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 224-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -3020,13 +3018,13 @@ L$oop_shaext:
pxor %xmm6,%xmm6
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
movdqa 240-128(%rbp),%xmm1
paddd %xmm7,%xmm1
movq (%rbx),%xmm7
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 240-128(%rbp),%xmm2
paddd %xmm11,%xmm2
.byte 69,15,56,203,247
@@ -3036,17 +3034,17 @@ L$oop_shaext:
cmovgeq %rsp,%r8
cmpl 4(%rbx),%ecx
cmovgeq %rsp,%r9
- pshufd $0,%xmm7,%xmm9
+ pshufd $0x00,%xmm7,%xmm9
.byte 69,15,56,203,236
movdqa %xmm2,%xmm0
- pshufd $85,%xmm7,%xmm10
+ pshufd $0x55,%xmm7,%xmm10
movdqa %xmm7,%xmm11
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pcmpgtd %xmm6,%xmm9
pcmpgtd %xmm6,%xmm10
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pcmpgtd %xmm6,%xmm11
movdqa K256_shaext-16(%rip),%xmm3
.byte 69,15,56,203,247
@@ -3068,10 +3066,10 @@ L$oop_shaext:
movl 280(%rsp),%edx
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
movdqa %xmm12,%xmm5
movdqa %xmm13,%xmm6
@@ -3107,4648 +3105,6 @@ L$done_shaext:
L$epilogue_shaext:
.byte 0xf3,0xc3
-
-.p2align 5
-sha256_multi_block_avx:
-_avx_shortcut:
- shrq $32,%rcx
- cmpl $2,%edx
- jb L$avx
- testl $32,%ecx
- jnz _avx2_shortcut
- jmp L$avx
-.p2align 5
-L$avx:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- subq $288,%rsp
- andq $-256,%rsp
- movq %rax,272(%rsp)
-L$body_avx:
- leaq K256+128(%rip),%rbp
- leaq 256(%rsp),%rbx
- leaq 128(%rdi),%rdi
-
-L$oop_grande_avx:
- movl %edx,280(%rsp)
- xorl %edx,%edx
- movq 0(%rsi),%r8
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r8
- movq 16(%rsi),%r9
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r9
- movq 32(%rsi),%r10
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r10
- movq 48(%rsi),%r11
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r11
- testl %edx,%edx
- jz L$done_avx
-
- vmovdqu 0-128(%rdi),%xmm8
- leaq 128(%rsp),%rax
- vmovdqu 32-128(%rdi),%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- vmovdqu 96-128(%rdi),%xmm11
- vmovdqu 128-128(%rdi),%xmm12
- vmovdqu 160-128(%rdi),%xmm13
- vmovdqu 192-128(%rdi),%xmm14
- vmovdqu 224-128(%rdi),%xmm15
- vmovdqu L$pbswap(%rip),%xmm6
- jmp L$oop_avx
-
-.p2align 5
-L$oop_avx:
- vpxor %xmm9,%xmm10,%xmm4
- vmovd 0(%r8),%xmm5
- vmovd 0(%r9),%xmm0
- vpinsrd $1,0(%r10),%xmm5,%xmm5
- vpinsrd $1,0(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,0-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovd 4(%r8),%xmm5
- vmovd 4(%r9),%xmm0
- vpinsrd $1,4(%r10),%xmm5,%xmm5
- vpinsrd $1,4(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm5,16-128(%rax)
- vpaddd %xmm14,%xmm5,%xmm5
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm5,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovd 8(%r8),%xmm5
- vmovd 8(%r9),%xmm0
- vpinsrd $1,8(%r10),%xmm5,%xmm5
- vpinsrd $1,8(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,32-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovd 12(%r8),%xmm5
- vmovd 12(%r9),%xmm0
- vpinsrd $1,12(%r10),%xmm5,%xmm5
- vpinsrd $1,12(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm5,48-128(%rax)
- vpaddd %xmm12,%xmm5,%xmm5
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm5,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovd 16(%r8),%xmm5
- vmovd 16(%r9),%xmm0
- vpinsrd $1,16(%r10),%xmm5,%xmm5
- vpinsrd $1,16(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,64-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovd 20(%r8),%xmm5
- vmovd 20(%r9),%xmm0
- vpinsrd $1,20(%r10),%xmm5,%xmm5
- vpinsrd $1,20(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm5,80-128(%rax)
- vpaddd %xmm10,%xmm5,%xmm5
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm5,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovd 24(%r8),%xmm5
- vmovd 24(%r9),%xmm0
- vpinsrd $1,24(%r10),%xmm5,%xmm5
- vpinsrd $1,24(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,96-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovd 28(%r8),%xmm5
- vmovd 28(%r9),%xmm0
- vpinsrd $1,28(%r10),%xmm5,%xmm5
- vpinsrd $1,28(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm5,112-128(%rax)
- vpaddd %xmm8,%xmm5,%xmm5
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm5,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovd 32(%r8),%xmm5
- vmovd 32(%r9),%xmm0
- vpinsrd $1,32(%r10),%xmm5,%xmm5
- vpinsrd $1,32(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,128-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovd 36(%r8),%xmm5
- vmovd 36(%r9),%xmm0
- vpinsrd $1,36(%r10),%xmm5,%xmm5
- vpinsrd $1,36(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm5,144-128(%rax)
- vpaddd %xmm14,%xmm5,%xmm5
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm5,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovd 40(%r8),%xmm5
- vmovd 40(%r9),%xmm0
- vpinsrd $1,40(%r10),%xmm5,%xmm5
- vpinsrd $1,40(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,160-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovd 44(%r8),%xmm5
- vmovd 44(%r9),%xmm0
- vpinsrd $1,44(%r10),%xmm5,%xmm5
- vpinsrd $1,44(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm5,176-128(%rax)
- vpaddd %xmm12,%xmm5,%xmm5
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm5,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovd 48(%r8),%xmm5
- vmovd 48(%r9),%xmm0
- vpinsrd $1,48(%r10),%xmm5,%xmm5
- vpinsrd $1,48(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,192-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovd 52(%r8),%xmm5
- vmovd 52(%r9),%xmm0
- vpinsrd $1,52(%r10),%xmm5,%xmm5
- vpinsrd $1,52(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm5,208-128(%rax)
- vpaddd %xmm10,%xmm5,%xmm5
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm5,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovd 56(%r8),%xmm5
- vmovd 56(%r9),%xmm0
- vpinsrd $1,56(%r10),%xmm5,%xmm5
- vpinsrd $1,56(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,224-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovd 60(%r8),%xmm5
- leaq 64(%r8),%r8
- vmovd 60(%r9),%xmm0
- leaq 64(%r9),%r9
- vpinsrd $1,60(%r10),%xmm5,%xmm5
- leaq 64(%r10),%r10
- vpinsrd $1,60(%r11),%xmm0,%xmm0
- leaq 64(%r11),%r11
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm5,240-128(%rax)
- vpaddd %xmm8,%xmm5,%xmm5
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- prefetcht0 63(%r8)
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
- prefetcht0 63(%r9)
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
- prefetcht0 63(%r10)
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
- prefetcht0 63(%r11)
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm5,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovdqu 0-128(%rax),%xmm5
- movl $3,%ecx
- jmp L$oop_16_xx_avx
-.p2align 5
-L$oop_16_xx_avx:
- vmovdqu 16-128(%rax),%xmm6
- vpaddd 144-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 224-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,0-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovdqu 32-128(%rax),%xmm5
- vpaddd 160-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 240-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm6,16-128(%rax)
- vpaddd %xmm14,%xmm6,%xmm6
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovdqu 48-128(%rax),%xmm6
- vpaddd 176-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 0-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,32-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovdqu 64-128(%rax),%xmm5
- vpaddd 192-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 16-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm6,48-128(%rax)
- vpaddd %xmm12,%xmm6,%xmm6
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm6,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovdqu 80-128(%rax),%xmm6
- vpaddd 208-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 32-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,64-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovdqu 96-128(%rax),%xmm5
- vpaddd 224-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 48-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm6,80-128(%rax)
- vpaddd %xmm10,%xmm6,%xmm6
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovdqu 112-128(%rax),%xmm6
- vpaddd 240-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 64-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,96-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovdqu 128-128(%rax),%xmm5
- vpaddd 0-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 80-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm6,112-128(%rax)
- vpaddd %xmm8,%xmm6,%xmm6
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovdqu 144-128(%rax),%xmm6
- vpaddd 16-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 96-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,128-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovdqu 160-128(%rax),%xmm5
- vpaddd 32-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 112-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm6,144-128(%rax)
- vpaddd %xmm14,%xmm6,%xmm6
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovdqu 176-128(%rax),%xmm6
- vpaddd 48-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 128-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,160-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovdqu 192-128(%rax),%xmm5
- vpaddd 64-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 144-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm6,176-128(%rax)
- vpaddd %xmm12,%xmm6,%xmm6
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm6,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovdqu 208-128(%rax),%xmm6
- vpaddd 80-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 160-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,192-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovdqu 224-128(%rax),%xmm5
- vpaddd 96-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 176-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm6,208-128(%rax)
- vpaddd %xmm10,%xmm6,%xmm6
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovdqu 240-128(%rax),%xmm6
- vpaddd 112-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 192-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,224-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovdqu 0-128(%rax),%xmm5
- vpaddd 128-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 208-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm6,240-128(%rax)
- vpaddd %xmm8,%xmm6,%xmm6
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- decl %ecx
- jnz L$oop_16_xx_avx
-
- movl $1,%ecx
- leaq K256+128(%rip),%rbp
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqa (%rbx),%xmm7
- vpxor %xmm0,%xmm0,%xmm0
- vmovdqa %xmm7,%xmm6
- vpcmpgtd %xmm0,%xmm6,%xmm6
- vpaddd %xmm6,%xmm7,%xmm7
-
- vmovdqu 0-128(%rdi),%xmm0
- vpand %xmm6,%xmm8,%xmm8
- vmovdqu 32-128(%rdi),%xmm1
- vpand %xmm6,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm2
- vpand %xmm6,%xmm10,%xmm10
- vmovdqu 96-128(%rdi),%xmm5
- vpand %xmm6,%xmm11,%xmm11
- vpaddd %xmm0,%xmm8,%xmm8
- vmovdqu 128-128(%rdi),%xmm0
- vpand %xmm6,%xmm12,%xmm12
- vpaddd %xmm1,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm1
- vpand %xmm6,%xmm13,%xmm13
- vpaddd %xmm2,%xmm10,%xmm10
- vmovdqu 192-128(%rdi),%xmm2
- vpand %xmm6,%xmm14,%xmm14
- vpaddd %xmm5,%xmm11,%xmm11
- vmovdqu 224-128(%rdi),%xmm5
- vpand %xmm6,%xmm15,%xmm15
- vpaddd %xmm0,%xmm12,%xmm12
- vpaddd %xmm1,%xmm13,%xmm13
- vmovdqu %xmm8,0-128(%rdi)
- vpaddd %xmm2,%xmm14,%xmm14
- vmovdqu %xmm9,32-128(%rdi)
- vpaddd %xmm5,%xmm15,%xmm15
- vmovdqu %xmm10,64-128(%rdi)
- vmovdqu %xmm11,96-128(%rdi)
- vmovdqu %xmm12,128-128(%rdi)
- vmovdqu %xmm13,160-128(%rdi)
- vmovdqu %xmm14,192-128(%rdi)
- vmovdqu %xmm15,224-128(%rdi)
-
- vmovdqu %xmm7,(%rbx)
- vmovdqu L$pbswap(%rip),%xmm6
- decl %edx
- jnz L$oop_avx
-
- movl 280(%rsp),%edx
- leaq 16(%rdi),%rdi
- leaq 64(%rsi),%rsi
- decl %edx
- jnz L$oop_grande_avx
-
-L$done_avx:
- movq 272(%rsp),%rax
- vzeroupper
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$epilogue_avx:
- .byte 0xf3,0xc3
-
-
-.p2align 5
-sha256_multi_block_avx2:
-_avx2_shortcut:
- movq %rsp,%rax
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $576,%rsp
- andq $-256,%rsp
- movq %rax,544(%rsp)
-L$body_avx2:
- leaq K256+128(%rip),%rbp
- leaq 128(%rdi),%rdi
-
-L$oop_grande_avx2:
- movl %edx,552(%rsp)
- xorl %edx,%edx
- leaq 512(%rsp),%rbx
- movq 0(%rsi),%r12
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r12
- movq 16(%rsi),%r13
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r13
- movq 32(%rsi),%r14
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r14
- movq 48(%rsi),%r15
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r15
- movq 64(%rsi),%r8
- movl 72(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,16(%rbx)
- cmovleq %rbp,%r8
- movq 80(%rsi),%r9
- movl 88(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,20(%rbx)
- cmovleq %rbp,%r9
- movq 96(%rsi),%r10
- movl 104(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,24(%rbx)
- cmovleq %rbp,%r10
- movq 112(%rsi),%r11
- movl 120(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,28(%rbx)
- cmovleq %rbp,%r11
- vmovdqu 0-128(%rdi),%ymm8
- leaq 128(%rsp),%rax
- vmovdqu 32-128(%rdi),%ymm9
- leaq 256+128(%rsp),%rbx
- vmovdqu 64-128(%rdi),%ymm10
- vmovdqu 96-128(%rdi),%ymm11
- vmovdqu 128-128(%rdi),%ymm12
- vmovdqu 160-128(%rdi),%ymm13
- vmovdqu 192-128(%rdi),%ymm14
- vmovdqu 224-128(%rdi),%ymm15
- vmovdqu L$pbswap(%rip),%ymm6
- jmp L$oop_avx2
-
-.p2align 5
-L$oop_avx2:
- vpxor %ymm9,%ymm10,%ymm4
- vmovd 0(%r12),%xmm5
- vmovd 0(%r8),%xmm0
- vmovd 0(%r13),%xmm1
- vmovd 0(%r9),%xmm2
- vpinsrd $1,0(%r14),%xmm5,%xmm5
- vpinsrd $1,0(%r10),%xmm0,%xmm0
- vpinsrd $1,0(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,0(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,0-128(%rax)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovd 4(%r12),%xmm5
- vmovd 4(%r8),%xmm0
- vmovd 4(%r13),%xmm1
- vmovd 4(%r9),%xmm2
- vpinsrd $1,4(%r14),%xmm5,%xmm5
- vpinsrd $1,4(%r10),%xmm0,%xmm0
- vpinsrd $1,4(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,4(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm5,32-128(%rax)
- vpaddd %ymm14,%ymm5,%ymm5
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm5,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovd 8(%r12),%xmm5
- vmovd 8(%r8),%xmm0
- vmovd 8(%r13),%xmm1
- vmovd 8(%r9),%xmm2
- vpinsrd $1,8(%r14),%xmm5,%xmm5
- vpinsrd $1,8(%r10),%xmm0,%xmm0
- vpinsrd $1,8(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,8(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,64-128(%rax)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovd 12(%r12),%xmm5
- vmovd 12(%r8),%xmm0
- vmovd 12(%r13),%xmm1
- vmovd 12(%r9),%xmm2
- vpinsrd $1,12(%r14),%xmm5,%xmm5
- vpinsrd $1,12(%r10),%xmm0,%xmm0
- vpinsrd $1,12(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,12(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm5,96-128(%rax)
- vpaddd %ymm12,%ymm5,%ymm5
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm5,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovd 16(%r12),%xmm5
- vmovd 16(%r8),%xmm0
- vmovd 16(%r13),%xmm1
- vmovd 16(%r9),%xmm2
- vpinsrd $1,16(%r14),%xmm5,%xmm5
- vpinsrd $1,16(%r10),%xmm0,%xmm0
- vpinsrd $1,16(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,16(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,128-128(%rax)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovd 20(%r12),%xmm5
- vmovd 20(%r8),%xmm0
- vmovd 20(%r13),%xmm1
- vmovd 20(%r9),%xmm2
- vpinsrd $1,20(%r14),%xmm5,%xmm5
- vpinsrd $1,20(%r10),%xmm0,%xmm0
- vpinsrd $1,20(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,20(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm5,160-128(%rax)
- vpaddd %ymm10,%ymm5,%ymm5
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm5,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovd 24(%r12),%xmm5
- vmovd 24(%r8),%xmm0
- vmovd 24(%r13),%xmm1
- vmovd 24(%r9),%xmm2
- vpinsrd $1,24(%r14),%xmm5,%xmm5
- vpinsrd $1,24(%r10),%xmm0,%xmm0
- vpinsrd $1,24(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,24(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,192-128(%rax)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovd 28(%r12),%xmm5
- vmovd 28(%r8),%xmm0
- vmovd 28(%r13),%xmm1
- vmovd 28(%r9),%xmm2
- vpinsrd $1,28(%r14),%xmm5,%xmm5
- vpinsrd $1,28(%r10),%xmm0,%xmm0
- vpinsrd $1,28(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,28(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm5,224-128(%rax)
- vpaddd %ymm8,%ymm5,%ymm5
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm5,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovd 32(%r12),%xmm5
- vmovd 32(%r8),%xmm0
- vmovd 32(%r13),%xmm1
- vmovd 32(%r9),%xmm2
- vpinsrd $1,32(%r14),%xmm5,%xmm5
- vpinsrd $1,32(%r10),%xmm0,%xmm0
- vpinsrd $1,32(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,32(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,256-256-128(%rbx)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovd 36(%r12),%xmm5
- vmovd 36(%r8),%xmm0
- vmovd 36(%r13),%xmm1
- vmovd 36(%r9),%xmm2
- vpinsrd $1,36(%r14),%xmm5,%xmm5
- vpinsrd $1,36(%r10),%xmm0,%xmm0
- vpinsrd $1,36(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,36(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm5,288-256-128(%rbx)
- vpaddd %ymm14,%ymm5,%ymm5
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm5,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovd 40(%r12),%xmm5
- vmovd 40(%r8),%xmm0
- vmovd 40(%r13),%xmm1
- vmovd 40(%r9),%xmm2
- vpinsrd $1,40(%r14),%xmm5,%xmm5
- vpinsrd $1,40(%r10),%xmm0,%xmm0
- vpinsrd $1,40(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,40(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,320-256-128(%rbx)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovd 44(%r12),%xmm5
- vmovd 44(%r8),%xmm0
- vmovd 44(%r13),%xmm1
- vmovd 44(%r9),%xmm2
- vpinsrd $1,44(%r14),%xmm5,%xmm5
- vpinsrd $1,44(%r10),%xmm0,%xmm0
- vpinsrd $1,44(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,44(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm5,352-256-128(%rbx)
- vpaddd %ymm12,%ymm5,%ymm5
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm5,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovd 48(%r12),%xmm5
- vmovd 48(%r8),%xmm0
- vmovd 48(%r13),%xmm1
- vmovd 48(%r9),%xmm2
- vpinsrd $1,48(%r14),%xmm5,%xmm5
- vpinsrd $1,48(%r10),%xmm0,%xmm0
- vpinsrd $1,48(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,48(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,384-256-128(%rbx)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovd 52(%r12),%xmm5
- vmovd 52(%r8),%xmm0
- vmovd 52(%r13),%xmm1
- vmovd 52(%r9),%xmm2
- vpinsrd $1,52(%r14),%xmm5,%xmm5
- vpinsrd $1,52(%r10),%xmm0,%xmm0
- vpinsrd $1,52(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,52(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm5,416-256-128(%rbx)
- vpaddd %ymm10,%ymm5,%ymm5
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm5,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovd 56(%r12),%xmm5
- vmovd 56(%r8),%xmm0
- vmovd 56(%r13),%xmm1
- vmovd 56(%r9),%xmm2
- vpinsrd $1,56(%r14),%xmm5,%xmm5
- vpinsrd $1,56(%r10),%xmm0,%xmm0
- vpinsrd $1,56(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,56(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,448-256-128(%rbx)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovd 60(%r12),%xmm5
- leaq 64(%r12),%r12
- vmovd 60(%r8),%xmm0
- leaq 64(%r8),%r8
- vmovd 60(%r13),%xmm1
- leaq 64(%r13),%r13
- vmovd 60(%r9),%xmm2
- leaq 64(%r9),%r9
- vpinsrd $1,60(%r14),%xmm5,%xmm5
- leaq 64(%r14),%r14
- vpinsrd $1,60(%r10),%xmm0,%xmm0
- leaq 64(%r10),%r10
- vpinsrd $1,60(%r15),%xmm1,%xmm1
- leaq 64(%r15),%r15
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,60(%r11),%xmm2,%xmm2
- leaq 64(%r11),%r11
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm5,480-256-128(%rbx)
- vpaddd %ymm8,%ymm5,%ymm5
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r12)
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
- prefetcht0 63(%r13)
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r14)
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
- prefetcht0 63(%r15)
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm9,%ymm1
- prefetcht0 63(%r8)
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
- prefetcht0 63(%r9)
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r10)
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm5,%ymm12,%ymm12
- prefetcht0 63(%r11)
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovdqu 0-128(%rax),%ymm5
- movl $3,%ecx
- jmp L$oop_16_xx_avx2
-.p2align 5
-L$oop_16_xx_avx2:
- vmovdqu 32-128(%rax),%ymm6
- vpaddd 288-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 448-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,0-128(%rax)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovdqu 64-128(%rax),%ymm5
- vpaddd 320-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 480-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm6,32-128(%rax)
- vpaddd %ymm14,%ymm6,%ymm6
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm6,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovdqu 96-128(%rax),%ymm6
- vpaddd 352-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 0-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,64-128(%rax)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovdqu 128-128(%rax),%ymm5
- vpaddd 384-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 32-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm6,96-128(%rax)
- vpaddd %ymm12,%ymm6,%ymm6
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm6,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovdqu 160-128(%rax),%ymm6
- vpaddd 416-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 64-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,128-128(%rax)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovdqu 192-128(%rax),%ymm5
- vpaddd 448-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 96-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm6,160-128(%rax)
- vpaddd %ymm10,%ymm6,%ymm6
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm6,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovdqu 224-128(%rax),%ymm6
- vpaddd 480-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 128-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,192-128(%rax)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovdqu 256-256-128(%rbx),%ymm5
- vpaddd 0-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 160-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm6,224-128(%rax)
- vpaddd %ymm8,%ymm6,%ymm6
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm6,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovdqu 288-256-128(%rbx),%ymm6
- vpaddd 32-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 192-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,256-256-128(%rbx)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovdqu 320-256-128(%rbx),%ymm5
- vpaddd 64-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 224-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm6,288-256-128(%rbx)
- vpaddd %ymm14,%ymm6,%ymm6
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm6,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovdqu 352-256-128(%rbx),%ymm6
- vpaddd 96-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 256-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,320-256-128(%rbx)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovdqu 384-256-128(%rbx),%ymm5
- vpaddd 128-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 288-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm6,352-256-128(%rbx)
- vpaddd %ymm12,%ymm6,%ymm6
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm6,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovdqu 416-256-128(%rbx),%ymm6
- vpaddd 160-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 320-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,384-256-128(%rbx)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovdqu 448-256-128(%rbx),%ymm5
- vpaddd 192-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 352-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm6,416-256-128(%rbx)
- vpaddd %ymm10,%ymm6,%ymm6
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm6,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovdqu 480-256-128(%rbx),%ymm6
- vpaddd 224-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 384-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,448-256-128(%rbx)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovdqu 0-128(%rax),%ymm5
- vpaddd 256-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 416-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm6,480-256-128(%rbx)
- vpaddd %ymm8,%ymm6,%ymm6
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm6,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- decl %ecx
- jnz L$oop_16_xx_avx2
-
- movl $1,%ecx
- leaq 512(%rsp),%rbx
- leaq K256+128(%rip),%rbp
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r12
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r13
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r14
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r15
- cmpl 16(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 20(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 24(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 28(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqa (%rbx),%ymm7
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqa %ymm7,%ymm6
- vpcmpgtd %ymm0,%ymm6,%ymm6
- vpaddd %ymm6,%ymm7,%ymm7
-
- vmovdqu 0-128(%rdi),%ymm0
- vpand %ymm6,%ymm8,%ymm8
- vmovdqu 32-128(%rdi),%ymm1
- vpand %ymm6,%ymm9,%ymm9
- vmovdqu 64-128(%rdi),%ymm2
- vpand %ymm6,%ymm10,%ymm10
- vmovdqu 96-128(%rdi),%ymm5
- vpand %ymm6,%ymm11,%ymm11
- vpaddd %ymm0,%ymm8,%ymm8
- vmovdqu 128-128(%rdi),%ymm0
- vpand %ymm6,%ymm12,%ymm12
- vpaddd %ymm1,%ymm9,%ymm9
- vmovdqu 160-128(%rdi),%ymm1
- vpand %ymm6,%ymm13,%ymm13
- vpaddd %ymm2,%ymm10,%ymm10
- vmovdqu 192-128(%rdi),%ymm2
- vpand %ymm6,%ymm14,%ymm14
- vpaddd %ymm5,%ymm11,%ymm11
- vmovdqu 224-128(%rdi),%ymm5
- vpand %ymm6,%ymm15,%ymm15
- vpaddd %ymm0,%ymm12,%ymm12
- vpaddd %ymm1,%ymm13,%ymm13
- vmovdqu %ymm8,0-128(%rdi)
- vpaddd %ymm2,%ymm14,%ymm14
- vmovdqu %ymm9,32-128(%rdi)
- vpaddd %ymm5,%ymm15,%ymm15
- vmovdqu %ymm10,64-128(%rdi)
- vmovdqu %ymm11,96-128(%rdi)
- vmovdqu %ymm12,128-128(%rdi)
- vmovdqu %ymm13,160-128(%rdi)
- vmovdqu %ymm14,192-128(%rdi)
- vmovdqu %ymm15,224-128(%rdi)
-
- vmovdqu %ymm7,(%rbx)
- leaq 256+128(%rsp),%rbx
- vmovdqu L$pbswap(%rip),%ymm6
- decl %edx
- jnz L$oop_avx2
-
-
-
-
-
-
-
-L$done_avx2:
- movq 544(%rsp),%rax
- vzeroupper
- movq -48(%rax),%r15
- movq -40(%rax),%r14
- movq -32(%rax),%r13
- movq -24(%rax),%r12
- movq -16(%rax),%rbp
- movq -8(%rax),%rbx
- leaq (%rax),%rsp
-L$epilogue_avx2:
- .byte 0xf3,0xc3
-
.p2align 8
K256:
.long 1116352408,1116352408,1116352408,1116352408
diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s
index b66bd34406..5566d58761 100644
--- a/deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s
@@ -11,14 +11,6 @@ _sha256_block_data_order:
movl 8(%r11),%r11d
testl $536870912,%r11d
jnz _shaext_shortcut
- andl $296,%r11d
- cmpl $296,%r11d
- je L$avx2_shortcut
- andl $1073741824,%r9d
- andl $268435968,%r10d
- orl %r9d,%r10d
- cmpl $1342177792,%r10d
- je L$avx_shortcut
testl $512,%r10d
jnz L$ssse3_shortcut
pushq %rbx
@@ -1762,9 +1754,9 @@ _shaext_shortcut:
movdqu 16(%rdi),%xmm2
movdqa 512-128(%rcx),%xmm7
- pshufd $27,%xmm1,%xmm0
- pshufd $177,%xmm1,%xmm1
- pshufd $27,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
movdqa %xmm7,%xmm8
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
@@ -1783,7 +1775,7 @@ L$oop_shaext:
.byte 102,15,56,0,231
movdqa %xmm2,%xmm10
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
nop
movdqa %xmm1,%xmm9
.byte 15,56,203,202
@@ -1792,7 +1784,7 @@ L$oop_shaext:
paddd %xmm4,%xmm0
.byte 102,15,56,0,239
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
leaq 64(%rsi),%rsi
.byte 15,56,204,220
.byte 15,56,203,202
@@ -1801,7 +1793,7 @@ L$oop_shaext:
paddd %xmm5,%xmm0
.byte 102,15,56,0,247
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1813,7 +1805,7 @@ L$oop_shaext:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1824,7 +1816,7 @@ L$oop_shaext:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1835,7 +1827,7 @@ L$oop_shaext:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1846,7 +1838,7 @@ L$oop_shaext:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1857,7 +1849,7 @@ L$oop_shaext:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1868,7 +1860,7 @@ L$oop_shaext:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1879,7 +1871,7 @@ L$oop_shaext:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1890,7 +1882,7 @@ L$oop_shaext:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1901,7 +1893,7 @@ L$oop_shaext:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1912,7 +1904,7 @@ L$oop_shaext:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1923,7 +1915,7 @@ L$oop_shaext:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
.byte 15,56,203,202
@@ -1932,7 +1924,7 @@ L$oop_shaext:
movdqa 448-128(%rcx),%xmm0
paddd %xmm5,%xmm0
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
.byte 15,56,205,245
movdqa %xmm8,%xmm7
.byte 15,56,203,202
@@ -1941,7 +1933,7 @@ L$oop_shaext:
paddd %xmm6,%xmm0
nop
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
decq %rdx
nop
.byte 15,56,203,202
@@ -1950,9 +1942,9 @@ L$oop_shaext:
paddd %xmm9,%xmm1
jnz L$oop_shaext
- pshufd $177,%xmm2,%xmm2
- pshufd $27,%xmm1,%xmm7
- pshufd $177,%xmm1,%xmm1
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
punpckhqdq %xmm2,%xmm1
.byte 102,15,58,15,215,8
@@ -3054,2304 +3046,3 @@ L$ssse3_00_47:
leaq 48(%rsi),%rsp
L$epilogue_ssse3:
.byte 0xf3,0xc3
-
-
-.p2align 6
-sha256_block_data_order_avx:
-L$avx_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- shlq $4,%rdx
- subq $96,%rsp
- leaq (%rsi,%rdx,4),%rdx
- andq $-64,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
-L$prologue_avx:
-
- vzeroupper
- movl 0(%rdi),%eax
- movl 4(%rdi),%ebx
- movl 8(%rdi),%ecx
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
- vmovdqa K256+512+32(%rip),%xmm8
- vmovdqa K256+512+64(%rip),%xmm9
- jmp L$loop_avx
-.p2align 4
-L$loop_avx:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi),%xmm0
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%edi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%edi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp L$avx_00_47
-
-.p2align 4
-L$avx_00_47:
- subq $-128,%rbp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm2,%xmm3,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- vpshufd $250,%xmm3,%xmm7
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm0,%xmm0
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpaddd %xmm6,%xmm0,%xmm0
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- vpshufd $80,%xmm0,%xmm7
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- vpaddd %xmm6,%xmm0,%xmm0
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpaddd 0(%rbp),%xmm0,%xmm6
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- vpshufd $250,%xmm0,%xmm7
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm1,%xmm1
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpaddd %xmm6,%xmm1,%xmm1
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- vpshufd $80,%xmm1,%xmm7
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- vpaddd %xmm6,%xmm1,%xmm1
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpaddd 32(%rbp),%xmm1,%xmm6
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm0,%xmm1,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- vpshufd $250,%xmm1,%xmm7
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm2,%xmm2
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpaddd %xmm6,%xmm2,%xmm2
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- vpshufd $80,%xmm2,%xmm7
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- vpaddd %xmm6,%xmm2,%xmm2
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpaddd 64(%rbp),%xmm2,%xmm6
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- vpshufd $250,%xmm2,%xmm7
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm3,%xmm3
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpaddd %xmm6,%xmm3,%xmm3
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- vpshufd $80,%xmm3,%xmm7
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- vpaddd %xmm6,%xmm3,%xmm3
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpaddd 96(%rbp),%xmm3,%xmm6
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- cmpb $0,131(%rbp)
- jne L$avx_00_47
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%rdi
- movl %r14d,%eax
-
- addl 0(%rdi),%eax
- leaq 64(%rsi),%rsi
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
- jb L$loop_avx
-
- movq 64+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_avx:
- .byte 0xf3,0xc3
-
-
-.p2align 6
-sha256_block_data_order_avx2:
-L$avx2_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $544,%rsp
- shlq $4,%rdx
- andq $-1024,%rsp
- leaq (%rsi,%rdx,4),%rdx
- addq $448,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
-L$prologue_avx2:
-
- vzeroupper
- subq $-64,%rsi
- movl 0(%rdi),%eax
- movq %rsi,%r12
- movl 4(%rdi),%ebx
- cmpq %rdx,%rsi
- movl 8(%rdi),%ecx
- cmoveq %rsp,%r12
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
- vmovdqa K256+512+32(%rip),%ymm8
- vmovdqa K256+512+64(%rip),%ymm9
- jmp L$oop_avx2
-.p2align 4
-L$oop_avx2:
- vmovdqa K256+512(%rip),%ymm7
- vmovdqu -64+0(%rsi),%xmm0
- vmovdqu -64+16(%rsi),%xmm1
- vmovdqu -64+32(%rsi),%xmm2
- vmovdqu -64+48(%rsi),%xmm3
-
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm7,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm7,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
-
- leaq K256(%rip),%rbp
- vpshufb %ymm7,%ymm2,%ymm2
- vpaddd 0(%rbp),%ymm0,%ymm4
- vpshufb %ymm7,%ymm3,%ymm3
- vpaddd 32(%rbp),%ymm1,%ymm5
- vpaddd 64(%rbp),%ymm2,%ymm6
- vpaddd 96(%rbp),%ymm3,%ymm7
- vmovdqa %ymm4,0(%rsp)
- xorl %r14d,%r14d
- vmovdqa %ymm5,32(%rsp)
- leaq -64(%rsp),%rsp
- movl %ebx,%edi
- vmovdqa %ymm6,0(%rsp)
- xorl %ecx,%edi
- vmovdqa %ymm7,32(%rsp)
- movl %r9d,%r12d
- subq $-32*4,%rbp
- jmp L$avx2_00_47
-
-.p2align 4
-L$avx2_00_47:
- leaq -64(%rsp),%rsp
- vpalignr $4,%ymm0,%ymm1,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm2,%ymm3,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm0,%ymm0
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- vpshufd $250,%ymm3,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm0,%ymm0
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpshufd $80,%ymm0,%ymm7
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- vpaddd 0(%rbp),%ymm0,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm1,%ymm2,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm3,%ymm0,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm1,%ymm1
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- vpshufd $250,%ymm0,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm1,%ymm1
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpshufd $80,%ymm1,%ymm7
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- vpaddd 32(%rbp),%ymm1,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq -64(%rsp),%rsp
- vpalignr $4,%ymm2,%ymm3,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm0,%ymm1,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm2,%ymm2
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- vpshufd $250,%ymm1,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm2,%ymm2
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpshufd $80,%ymm2,%ymm7
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- vpaddd 64(%rbp),%ymm2,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm3,%ymm0,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm1,%ymm2,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm3,%ymm3
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- vpshufd $250,%ymm2,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm3,%ymm3
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpshufd $80,%ymm3,%ymm7
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- vpaddd 96(%rbp),%ymm3,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq 128(%rbp),%rbp
- cmpb $0,3(%rbp)
- jne L$avx2_00_47
- addl 0+64(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4+64(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+64(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12+64(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+64(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36+64(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+64(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44+64(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- addl 0(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- movq 512(%rsp),%rdi
- addl %r14d,%eax
-
- leaq 448(%rsp),%rbp
-
- addl 0(%rdi),%eax
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
-
- cmpq 80(%rbp),%rsi
- je L$done_avx2
-
- xorl %r14d,%r14d
- movl %ebx,%edi
- xorl %ecx,%edi
- movl %r9d,%r12d
- jmp L$ower_avx2
-.p2align 4
-L$ower_avx2:
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- leaq -64(%rbp),%rbp
- cmpq %rsp,%rbp
- jae L$ower_avx2
-
- movq 512(%rsp),%rdi
- addl %r14d,%eax
-
- leaq 448(%rsp),%rsp
-
- addl 0(%rdi),%eax
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- leaq 128(%rsi),%rsi
- addl 24(%rdi),%r10d
- movq %rsi,%r12
- addl 28(%rdi),%r11d
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- cmoveq %rsp,%r12
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
-
- jbe L$oop_avx2
- leaq (%rsp),%rbp
-
-L$done_avx2:
- leaq (%rbp),%rsp
- movq 64+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_avx2:
- .byte 0xf3,0xc3
diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s
index 91821da126..1cae4e11fb 100644
--- a/deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s
@@ -5,20 +5,6 @@
.p2align 4
_sha512_block_data_order:
- leaq _OPENSSL_ia32cap_P(%rip),%r11
- movl 0(%r11),%r9d
- movl 4(%r11),%r10d
- movl 8(%r11),%r11d
- testl $2048,%r10d
- jnz L$xop_shortcut
- andl $296,%r11d
- cmpl $296,%r11d
- je L$avx2_shortcut
- andl $1073741824,%r9d
- andl $268435968,%r10d
- orl %r9d,%r10d
- cmpl $1342177792,%r10d
- je L$avx_shortcut
pushq %rbx
pushq %rbp
pushq %r12
@@ -1795,3570 +1781,3 @@ K512:
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-
-.p2align 6
-sha512_block_data_order_xop:
-L$xop_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
-L$prologue_xop:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp L$loop_xop
-.p2align 4
-L$loop_xop:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp L$xop_00_47
-
-.p2align 4
-L$xop_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm0,%xmm0
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,223,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm7,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm0,%xmm0
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm1,%xmm1
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,216,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm0,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm1,%xmm1
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm2,%xmm2
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,217,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm1,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm2,%xmm2
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm3,%xmm3
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,218,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm2,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm3,%xmm3
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm4,%xmm4
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,219,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm3,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm4,%xmm4
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm5,%xmm5
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,220,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm4,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm5,%xmm5
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm6,%xmm6
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,221,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm5,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm6,%xmm6
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm7,%xmm7
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,222,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm6,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm7,%xmm7
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne L$xop_00_47
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb L$loop_xop
-
- movq 128+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_xop:
- .byte 0xf3,0xc3
-
-
-.p2align 6
-sha512_block_data_order_avx:
-L$avx_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
-L$prologue_avx:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp L$loop_avx
-.p2align 4
-L$loop_avx:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp L$avx_00_47
-
-.p2align 4
-L$avx_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r8,%r13
- xorq %r10,%r12
- vpaddq %xmm11,%xmm0,%xmm0
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r8,%r12
- xorq %r8,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 0(%rsp),%r11
- movq %rax,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rbx,%r15
- addq %r12,%r11
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm7,%xmm11
- addq %r11,%rdx
- addq %rdi,%r11
- vpxor %xmm9,%xmm8,%xmm8
- movq %rdx,%r13
- addq %r11,%r14
- vpsllq $3,%xmm7,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %r8,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm7,%xmm9
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rdx,%r12
- xorq %rdx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 8(%rsp),%r10
- movq %r11,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rax,%rdi
- addq %r12,%r10
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm0,%xmm0
- xorq %r11,%r14
- addq %r13,%r10
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rcx,%r13
- xorq %r8,%r12
- vpaddq %xmm11,%xmm1,%xmm1
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rcx,%r12
- xorq %rcx,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 16(%rsp),%r9
- movq %r10,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r11,%r15
- addq %r12,%r9
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm0,%xmm11
- addq %r9,%rbx
- addq %rdi,%r9
- vpxor %xmm9,%xmm8,%xmm8
- movq %rbx,%r13
- addq %r9,%r14
- vpsllq $3,%xmm0,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm0,%xmm9
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rbx,%r12
- xorq %rbx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 24(%rsp),%r8
- movq %r9,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r10,%rdi
- addq %r12,%r8
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm1,%xmm1
- xorq %r9,%r14
- addq %r13,%r8
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rax,%r13
- xorq %rcx,%r12
- vpaddq %xmm11,%xmm2,%xmm2
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rax,%r12
- xorq %rax,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 32(%rsp),%rdx
- movq %r8,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r9,%r15
- addq %r12,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm1,%xmm11
- addq %rdx,%r11
- addq %rdi,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r11,%r13
- addq %rdx,%r14
- vpsllq $3,%xmm1,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %rax,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm1,%xmm9
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r11,%r12
- xorq %r11,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r8,%rdi
- addq %r12,%rcx
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm2,%xmm2
- xorq %rdx,%r14
- addq %r13,%rcx
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r10,%r13
- xorq %rax,%r12
- vpaddq %xmm11,%xmm3,%xmm3
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r10,%r12
- xorq %r10,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rdx,%r15
- addq %r12,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm2,%xmm11
- addq %rbx,%r9
- addq %rdi,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r9,%r13
- addq %rbx,%r14
- vpsllq $3,%xmm2,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r10,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm2,%xmm9
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r9,%r12
- xorq %r9,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rcx,%rdi
- addq %r12,%rax
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm3,%xmm3
- xorq %rbx,%r14
- addq %r13,%rax
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r8,%r13
- xorq %r10,%r12
- vpaddq %xmm11,%xmm4,%xmm4
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r8,%r12
- xorq %r8,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 64(%rsp),%r11
- movq %rax,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rbx,%r15
- addq %r12,%r11
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm3,%xmm11
- addq %r11,%rdx
- addq %rdi,%r11
- vpxor %xmm9,%xmm8,%xmm8
- movq %rdx,%r13
- addq %r11,%r14
- vpsllq $3,%xmm3,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %r8,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm3,%xmm9
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rdx,%r12
- xorq %rdx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 72(%rsp),%r10
- movq %r11,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rax,%rdi
- addq %r12,%r10
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm4,%xmm4
- xorq %r11,%r14
- addq %r13,%r10
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rcx,%r13
- xorq %r8,%r12
- vpaddq %xmm11,%xmm5,%xmm5
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rcx,%r12
- xorq %rcx,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 80(%rsp),%r9
- movq %r10,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r11,%r15
- addq %r12,%r9
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm4,%xmm11
- addq %r9,%rbx
- addq %rdi,%r9
- vpxor %xmm9,%xmm8,%xmm8
- movq %rbx,%r13
- addq %r9,%r14
- vpsllq $3,%xmm4,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm4,%xmm9
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rbx,%r12
- xorq %rbx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 88(%rsp),%r8
- movq %r9,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r10,%rdi
- addq %r12,%r8
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm5,%xmm5
- xorq %r9,%r14
- addq %r13,%r8
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rax,%r13
- xorq %rcx,%r12
- vpaddq %xmm11,%xmm6,%xmm6
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rax,%r12
- xorq %rax,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 96(%rsp),%rdx
- movq %r8,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r9,%r15
- addq %r12,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm5,%xmm11
- addq %rdx,%r11
- addq %rdi,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r11,%r13
- addq %rdx,%r14
- vpsllq $3,%xmm5,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %rax,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm5,%xmm9
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r11,%r12
- xorq %r11,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r8,%rdi
- addq %r12,%rcx
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm6,%xmm6
- xorq %rdx,%r14
- addq %r13,%rcx
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r10,%r13
- xorq %rax,%r12
- vpaddq %xmm11,%xmm7,%xmm7
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r10,%r12
- xorq %r10,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rdx,%r15
- addq %r12,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm6,%xmm11
- addq %rbx,%r9
- addq %rdi,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r9,%r13
- addq %rbx,%r14
- vpsllq $3,%xmm6,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r10,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm6,%xmm9
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r9,%r12
- xorq %r9,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rcx,%rdi
- addq %r12,%rax
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm7,%xmm7
- xorq %rbx,%r14
- addq %r13,%rax
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne L$avx_00_47
- shrdq $23,%r13,%r13
- movq %r14,%rax
- movq %r9,%r12
- shrdq $5,%r14,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r11
- movq %r8,%r12
- shrdq $5,%r14,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r10
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- xorq %r11,%r15
- addq %r12,%r9
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r9
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r8
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- movq %rax,%r12
- shrdq $5,%r14,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- movq %r11,%r12
- shrdq $5,%r14,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- movq %r10,%r12
- shrdq $5,%r14,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rax
- movq %r9,%r12
- shrdq $5,%r14,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r11
- movq %r8,%r12
- shrdq $5,%r14,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r10
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- xorq %r11,%r15
- addq %r12,%r9
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r9
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r8
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- movq %rax,%r12
- shrdq $5,%r14,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- movq %r11,%r12
- shrdq $5,%r14,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- movq %r10,%r12
- shrdq $5,%r14,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb L$loop_avx
-
- movq 128+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_avx:
- .byte 0xf3,0xc3
-
-
-.p2align 6
-sha512_block_data_order_avx2:
-L$avx2_shortcut:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp,%r11
- subq $1312,%rsp
- shlq $4,%rdx
- andq $-2048,%rsp
- leaq (%rsi,%rdx,8),%rdx
- addq $1152,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
-L$prologue_avx2:
-
- vzeroupper
- subq $-128,%rsi
- movq 0(%rdi),%rax
- movq %rsi,%r12
- movq 8(%rdi),%rbx
- cmpq %rdx,%rsi
- movq 16(%rdi),%rcx
- cmoveq %rsp,%r12
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp L$oop_avx2
-.p2align 4
-L$oop_avx2:
- vmovdqu -128(%rsi),%xmm0
- vmovdqu -128+16(%rsi),%xmm1
- vmovdqu -128+32(%rsi),%xmm2
- leaq K512+128(%rip),%rbp
- vmovdqu -128+48(%rsi),%xmm3
- vmovdqu -128+64(%rsi),%xmm4
- vmovdqu -128+80(%rsi),%xmm5
- vmovdqu -128+96(%rsi),%xmm6
- vmovdqu -128+112(%rsi),%xmm7
-
- vmovdqa 1152(%rbp),%ymm10
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm10,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm10,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
- vpshufb %ymm10,%ymm2,%ymm2
- vinserti128 $1,64(%r12),%ymm4,%ymm4
- vpshufb %ymm10,%ymm3,%ymm3
- vinserti128 $1,80(%r12),%ymm5,%ymm5
- vpshufb %ymm10,%ymm4,%ymm4
- vinserti128 $1,96(%r12),%ymm6,%ymm6
- vpshufb %ymm10,%ymm5,%ymm5
- vinserti128 $1,112(%r12),%ymm7,%ymm7
-
- vpaddq -128(%rbp),%ymm0,%ymm8
- vpshufb %ymm10,%ymm6,%ymm6
- vpaddq -96(%rbp),%ymm1,%ymm9
- vpshufb %ymm10,%ymm7,%ymm7
- vpaddq -64(%rbp),%ymm2,%ymm10
- vpaddq -32(%rbp),%ymm3,%ymm11
- vmovdqa %ymm8,0(%rsp)
- vpaddq 0(%rbp),%ymm4,%ymm8
- vmovdqa %ymm9,32(%rsp)
- vpaddq 32(%rbp),%ymm5,%ymm9
- vmovdqa %ymm10,64(%rsp)
- vpaddq 64(%rbp),%ymm6,%ymm10
- vmovdqa %ymm11,96(%rsp)
- leaq -128(%rsp),%rsp
- vpaddq 96(%rbp),%ymm7,%ymm11
- vmovdqa %ymm8,0(%rsp)
- xorq %r14,%r14
- vmovdqa %ymm9,32(%rsp)
- movq %rbx,%rdi
- vmovdqa %ymm10,64(%rsp)
- xorq %rcx,%rdi
- vmovdqa %ymm11,96(%rsp)
- movq %r9,%r12
- addq $32*8,%rbp
- jmp L$avx2_00_47
-
-.p2align 4
-L$avx2_00_47:
- leaq -128(%rsp),%rsp
- vpalignr $8,%ymm0,%ymm1,%ymm8
- addq 0+256(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- vpalignr $8,%ymm4,%ymm5,%ymm11
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- vpsrlq $1,%ymm8,%ymm10
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- vpaddq %ymm11,%ymm0,%ymm0
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- vpsrlq $6,%ymm7,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- vpsllq $3,%ymm7,%ymm10
- vpaddq %ymm8,%ymm0,%ymm0
- addq 8+256(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- vpsrlq $19,%ymm7,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- vpaddq %ymm11,%ymm0,%ymm0
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- vpaddq -128(%rbp),%ymm0,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- vmovdqa %ymm10,0(%rsp)
- vpalignr $8,%ymm1,%ymm2,%ymm8
- addq 32+256(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- vpalignr $8,%ymm5,%ymm6,%ymm11
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- vpsrlq $1,%ymm8,%ymm10
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- vpaddq %ymm11,%ymm1,%ymm1
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- vpsrlq $6,%ymm0,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- vpsllq $3,%ymm0,%ymm10
- vpaddq %ymm8,%ymm1,%ymm1
- addq 40+256(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- vpsrlq $19,%ymm0,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- vpaddq %ymm11,%ymm1,%ymm1
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- vpaddq -96(%rbp),%ymm1,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- vmovdqa %ymm10,32(%rsp)
- vpalignr $8,%ymm2,%ymm3,%ymm8
- addq 64+256(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- vpalignr $8,%ymm6,%ymm7,%ymm11
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- vpaddq %ymm11,%ymm2,%ymm2
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- vpsrlq $6,%ymm1,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- vpsllq $3,%ymm1,%ymm10
- vpaddq %ymm8,%ymm2,%ymm2
- addq 72+256(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- vpsrlq $19,%ymm1,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- vpaddq %ymm11,%ymm2,%ymm2
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- vpaddq -64(%rbp),%ymm2,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- vmovdqa %ymm10,64(%rsp)
- vpalignr $8,%ymm3,%ymm4,%ymm8
- addq 96+256(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- vpalignr $8,%ymm7,%ymm0,%ymm11
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- vpaddq %ymm11,%ymm3,%ymm3
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- vpsrlq $6,%ymm2,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- vpsllq $3,%ymm2,%ymm10
- vpaddq %ymm8,%ymm3,%ymm3
- addq 104+256(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- vpsrlq $19,%ymm2,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- vpaddq %ymm11,%ymm3,%ymm3
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- vpaddq -32(%rbp),%ymm3,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- vmovdqa %ymm10,96(%rsp)
- leaq -128(%rsp),%rsp
- vpalignr $8,%ymm4,%ymm5,%ymm8
- addq 0+256(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- vpalignr $8,%ymm0,%ymm1,%ymm11
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- vpsrlq $1,%ymm8,%ymm10
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- vpaddq %ymm11,%ymm4,%ymm4
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- vpsrlq $6,%ymm3,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- vpsllq $3,%ymm3,%ymm10
- vpaddq %ymm8,%ymm4,%ymm4
- addq 8+256(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- vpsrlq $19,%ymm3,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- vpaddq %ymm11,%ymm4,%ymm4
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- vpaddq 0(%rbp),%ymm4,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- vmovdqa %ymm10,0(%rsp)
- vpalignr $8,%ymm5,%ymm6,%ymm8
- addq 32+256(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- vpalignr $8,%ymm1,%ymm2,%ymm11
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- vpsrlq $1,%ymm8,%ymm10
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- vpaddq %ymm11,%ymm5,%ymm5
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- vpsrlq $6,%ymm4,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- vpsllq $3,%ymm4,%ymm10
- vpaddq %ymm8,%ymm5,%ymm5
- addq 40+256(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- vpsrlq $19,%ymm4,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- vpaddq %ymm11,%ymm5,%ymm5
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- vpaddq 32(%rbp),%ymm5,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- vmovdqa %ymm10,32(%rsp)
- vpalignr $8,%ymm6,%ymm7,%ymm8
- addq 64+256(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- vpalignr $8,%ymm2,%ymm3,%ymm11
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- vpaddq %ymm11,%ymm6,%ymm6
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- vpsrlq $6,%ymm5,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- vpsllq $3,%ymm5,%ymm10
- vpaddq %ymm8,%ymm6,%ymm6
- addq 72+256(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- vpsrlq $19,%ymm5,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- vpaddq %ymm11,%ymm6,%ymm6
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- vpaddq 64(%rbp),%ymm6,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- vmovdqa %ymm10,64(%rsp)
- vpalignr $8,%ymm7,%ymm0,%ymm8
- addq 96+256(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- vpalignr $8,%ymm3,%ymm4,%ymm11
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- vpaddq %ymm11,%ymm7,%ymm7
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- vpsrlq $6,%ymm6,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- vpsllq $3,%ymm6,%ymm10
- vpaddq %ymm8,%ymm7,%ymm7
- addq 104+256(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- vpsrlq $19,%ymm6,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- vpaddq %ymm11,%ymm7,%ymm7
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- vpaddq 96(%rbp),%ymm7,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- vmovdqa %ymm10,96(%rsp)
- leaq 256(%rbp),%rbp
- cmpb $0,-121(%rbp)
- jne L$avx2_00_47
- addq 0+128(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8+128(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32+128(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40+128(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64+128(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72+128(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96+128(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104+128(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- addq 0(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- movq 1280(%rsp),%rdi
- addq %r14,%rax
-
- leaq 1152(%rsp),%rbp
-
- addq 0(%rdi),%rax
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
-
- cmpq 144(%rbp),%rsi
- je L$done_avx2
-
- xorq %r14,%r14
- movq %rbx,%rdi
- xorq %rcx,%rdi
- movq %r9,%r12
- jmp L$ower_avx2
-.p2align 4
-L$ower_avx2:
- addq 0+16(%rbp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8+16(%rbp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32+16(%rbp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40+16(%rbp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64+16(%rbp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72+16(%rbp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96+16(%rbp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104+16(%rbp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- leaq -128(%rbp),%rbp
- cmpq %rsp,%rbp
- jae L$ower_avx2
-
- movq 1280(%rsp),%rdi
- addq %r14,%rax
-
- leaq 1152(%rsp),%rsp
-
- addq 0(%rdi),%rax
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- leaq 256(%rsi),%rsi
- addq 48(%rdi),%r10
- movq %rsi,%r12
- addq 56(%rdi),%r11
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- cmoveq %rsp,%r12
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
-
- jbe L$oop_avx2
- leaq (%rsp),%rbp
-
-L$done_avx2:
- leaq (%rbp),%rsp
- movq 128+24(%rsp),%rsi
- vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
-L$epilogue_avx2:
- .byte 0xf3,0xc3
diff --git a/deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm b/deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm
index 0d3107834e..c24d0c5e6a 100644
--- a/deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm
+++ b/deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm
@@ -1628,8 +1628,9 @@ PUBLIC rsaz_1024_gather5_avx2
ALIGN 32
rsaz_1024_gather5_avx2 PROC PUBLIC
- lea rax,QWORD PTR[((-136))+rsp]
vzeroupper
+ mov r11,rsp
+ lea rax,QWORD PTR[((-136))+rsp]
$L$SEH_begin_rsaz_1024_gather5::
DB 048h,08dh,060h,0e0h
@@ -1643,66 +1644,125 @@ DB 0c5h,078h,029h,060h,040h
DB 0c5h,078h,029h,068h,050h
DB 0c5h,078h,029h,070h,060h
DB 0c5h,078h,029h,078h,070h
- lea r11,QWORD PTR[$L$gather_table]
- mov eax,r8d
- and r8d,3
- shr eax,2
- shl r8d,4
-
- vmovdqu ymm7,YMMWORD PTR[((-32))+r11]
- vpbroadcastb xmm8,BYTE PTR[8+rax*1+r11]
- vpbroadcastb xmm9,BYTE PTR[7+rax*1+r11]
- vpbroadcastb xmm10,BYTE PTR[6+rax*1+r11]
- vpbroadcastb xmm11,BYTE PTR[5+rax*1+r11]
- vpbroadcastb xmm12,BYTE PTR[4+rax*1+r11]
- vpbroadcastb xmm13,BYTE PTR[3+rax*1+r11]
- vpbroadcastb xmm14,BYTE PTR[2+rax*1+r11]
- vpbroadcastb xmm15,BYTE PTR[1+rax*1+r11]
-
- lea rdx,QWORD PTR[64+r8*1+rdx]
- mov r11,64
- mov eax,9
- jmp $L$oop_gather_1024
+ lea rsp,QWORD PTR[((-256))+rsp]
+ and rsp,-32
+ lea r10,QWORD PTR[$L$inc]
+ lea rax,QWORD PTR[((-128))+rsp]
+
+ vmovd xmm4,r8d
+ vmovdqa ymm0,YMMWORD PTR[r10]
+ vmovdqa ymm1,YMMWORD PTR[32+r10]
+ vmovdqa ymm5,YMMWORD PTR[64+r10]
+ vpbroadcastd ymm4,xmm4
+
+ vpaddd ymm2,ymm0,ymm5
+ vpcmpeqd ymm0,ymm0,ymm4
+ vpaddd ymm3,ymm1,ymm5
+ vpcmpeqd ymm1,ymm1,ymm4
+ vmovdqa YMMWORD PTR[(0+128)+rax],ymm0
+ vpaddd ymm0,ymm2,ymm5
+ vpcmpeqd ymm2,ymm2,ymm4
+ vmovdqa YMMWORD PTR[(32+128)+rax],ymm1
+ vpaddd ymm1,ymm3,ymm5
+ vpcmpeqd ymm3,ymm3,ymm4
+ vmovdqa YMMWORD PTR[(64+128)+rax],ymm2
+ vpaddd ymm2,ymm0,ymm5
+ vpcmpeqd ymm0,ymm0,ymm4
+ vmovdqa YMMWORD PTR[(96+128)+rax],ymm3
+ vpaddd ymm3,ymm1,ymm5
+ vpcmpeqd ymm1,ymm1,ymm4
+ vmovdqa YMMWORD PTR[(128+128)+rax],ymm0
+ vpaddd ymm8,ymm2,ymm5
+ vpcmpeqd ymm2,ymm2,ymm4
+ vmovdqa YMMWORD PTR[(160+128)+rax],ymm1
+ vpaddd ymm9,ymm3,ymm5
+ vpcmpeqd ymm3,ymm3,ymm4
+ vmovdqa YMMWORD PTR[(192+128)+rax],ymm2
+ vpaddd ymm10,ymm8,ymm5
+ vpcmpeqd ymm8,ymm8,ymm4
+ vmovdqa YMMWORD PTR[(224+128)+rax],ymm3
+ vpaddd ymm11,ymm9,ymm5
+ vpcmpeqd ymm9,ymm9,ymm4
+ vpaddd ymm12,ymm10,ymm5
+ vpcmpeqd ymm10,ymm10,ymm4
+ vpaddd ymm13,ymm11,ymm5
+ vpcmpeqd ymm11,ymm11,ymm4
+ vpaddd ymm14,ymm12,ymm5
+ vpcmpeqd ymm12,ymm12,ymm4
+ vpaddd ymm15,ymm13,ymm5
+ vpcmpeqd ymm13,ymm13,ymm4
+ vpcmpeqd ymm14,ymm14,ymm4
+ vpcmpeqd ymm15,ymm15,ymm4
+
+ vmovdqa ymm7,YMMWORD PTR[((-32))+r10]
+ lea rdx,QWORD PTR[128+rdx]
+ mov r8d,9
-ALIGN 32
$L$oop_gather_1024::
- vpand xmm0,xmm8,XMMWORD PTR[((-64))+rdx]
- vpand xmm1,xmm9,XMMWORD PTR[rdx]
- vpand xmm2,xmm10,XMMWORD PTR[64+rdx]
- vpand xmm3,xmm11,XMMWORD PTR[r11*2+rdx]
- vpor xmm1,xmm1,xmm0
- vpand xmm4,xmm12,XMMWORD PTR[64+r11*2+rdx]
- vpor xmm3,xmm3,xmm2
- vpand xmm5,xmm13,XMMWORD PTR[r11*4+rdx]
- vpor xmm3,xmm3,xmm1
- vpand xmm6,xmm14,XMMWORD PTR[64+r11*4+rdx]
+ vmovdqa ymm0,YMMWORD PTR[((0-128))+rdx]
+ vmovdqa ymm1,YMMWORD PTR[((32-128))+rdx]
+ vmovdqa ymm2,YMMWORD PTR[((64-128))+rdx]
+ vmovdqa ymm3,YMMWORD PTR[((96-128))+rdx]
+ vpand ymm0,ymm0,YMMWORD PTR[((0+128))+rax]
+ vpand ymm1,ymm1,YMMWORD PTR[((32+128))+rax]
+ vpand ymm2,ymm2,YMMWORD PTR[((64+128))+rax]
+ vpor ymm4,ymm1,ymm0
+ vpand ymm3,ymm3,YMMWORD PTR[((96+128))+rax]
+ vmovdqa ymm0,YMMWORD PTR[((128-128))+rdx]
+ vmovdqa ymm1,YMMWORD PTR[((160-128))+rdx]
+ vpor ymm5,ymm3,ymm2
+ vmovdqa ymm2,YMMWORD PTR[((192-128))+rdx]
+ vmovdqa ymm3,YMMWORD PTR[((224-128))+rdx]
+ vpand ymm0,ymm0,YMMWORD PTR[((128+128))+rax]
+ vpand ymm1,ymm1,YMMWORD PTR[((160+128))+rax]
+ vpand ymm2,ymm2,YMMWORD PTR[((192+128))+rax]
+ vpor ymm4,ymm4,ymm0
+ vpand ymm3,ymm3,YMMWORD PTR[((224+128))+rax]
+ vpand ymm0,ymm8,YMMWORD PTR[((256-128))+rdx]
+ vpor ymm5,ymm5,ymm1
+ vpand ymm1,ymm9,YMMWORD PTR[((288-128))+rdx]
+ vpor ymm4,ymm4,ymm2
+ vpand ymm2,ymm10,YMMWORD PTR[((320-128))+rdx]
+ vpor ymm5,ymm5,ymm3
+ vpand ymm3,ymm11,YMMWORD PTR[((352-128))+rdx]
+ vpor ymm4,ymm4,ymm0
+ vpand ymm0,ymm12,YMMWORD PTR[((384-128))+rdx]
+ vpor ymm5,ymm5,ymm1
+ vpand ymm1,ymm13,YMMWORD PTR[((416-128))+rdx]
+ vpor ymm4,ymm4,ymm2
+ vpand ymm2,ymm14,YMMWORD PTR[((448-128))+rdx]
+ vpor ymm5,ymm5,ymm3
+ vpand ymm3,ymm15,YMMWORD PTR[((480-128))+rdx]
+ lea rdx,QWORD PTR[512+rdx]
+ vpor ymm4,ymm4,ymm0
+ vpor ymm5,ymm5,ymm1
+ vpor ymm4,ymm4,ymm2
+ vpor ymm5,ymm5,ymm3
+
+ vpor ymm4,ymm4,ymm5
+ vextracti128 xmm5,ymm4,1
vpor xmm5,xmm5,xmm4
- vpand xmm2,xmm15,XMMWORD PTR[((-128))+r11*8+rdx]
- lea rdx,QWORD PTR[r11*8+rdx]
- vpor xmm5,xmm5,xmm3
- vpor xmm6,xmm6,xmm2
- vpor xmm6,xmm6,xmm5
- vpermd ymm6,ymm7,ymm6
- vmovdqu YMMWORD PTR[rcx],ymm6
+ vpermd ymm5,ymm7,ymm5
+ vmovdqu YMMWORD PTR[rcx],ymm5
lea rcx,QWORD PTR[32+rcx]
- dec eax
+ dec r8d
jnz $L$oop_gather_1024
vpxor ymm0,ymm0,ymm0
vmovdqu YMMWORD PTR[rcx],ymm0
vzeroupper
- movaps xmm6,XMMWORD PTR[rsp]
- movaps xmm7,XMMWORD PTR[16+rsp]
- movaps xmm8,XMMWORD PTR[32+rsp]
- movaps xmm9,XMMWORD PTR[48+rsp]
- movaps xmm10,XMMWORD PTR[64+rsp]
- movaps xmm11,XMMWORD PTR[80+rsp]
- movaps xmm12,XMMWORD PTR[96+rsp]
- movaps xmm13,XMMWORD PTR[112+rsp]
- movaps xmm14,XMMWORD PTR[128+rsp]
- movaps xmm15,XMMWORD PTR[144+rsp]
- lea rsp,QWORD PTR[168+rsp]
+ movaps xmm6,XMMWORD PTR[((-168))+r11]
+ movaps xmm7,XMMWORD PTR[((-152))+r11]
+ movaps xmm8,XMMWORD PTR[((-136))+r11]
+ movaps xmm9,XMMWORD PTR[((-120))+r11]
+ movaps xmm10,XMMWORD PTR[((-104))+r11]
+ movaps xmm11,XMMWORD PTR[((-88))+r11]
+ movaps xmm12,XMMWORD PTR[((-72))+r11]
+ movaps xmm13,XMMWORD PTR[((-56))+r11]
+ movaps xmm14,XMMWORD PTR[((-40))+r11]
+ movaps xmm15,XMMWORD PTR[((-24))+r11]
$L$SEH_end_rsaz_1024_gather5::
+ lea rsp,QWORD PTR[r11]
DB 0F3h,0C3h ;repret
rsaz_1024_gather5_avx2 ENDP
EXTERN OPENSSL_ia32cap_P:NEAR
@@ -1728,8 +1788,10 @@ $L$scatter_permd::
DD 0,2,4,6,7,7,7,7
$L$gather_permd::
DD 0,7,1,7,2,7,3,7
-$L$gather_table::
-DB 0,0,0,0,0,0,0,0,0ffh,0,0,0,0,0,0,0
+$L$inc::
+ DD 0,0,0,0,1,1,1,1
+ DD 2,2,2,2,3,3,3,3
+ DD 4,4,4,4,4,4,4,4
ALIGN 64
EXTERN __imp_RtlVirtualUnwind:NEAR
@@ -1850,7 +1912,7 @@ DB 9,0,0,0
DD imagerel rsaz_se_handler
DD imagerel $L$mul_1024_body,imagerel $L$mul_1024_epilogue
$L$SEH_info_rsaz_1024_gather5::
-DB 001h,033h,016h,000h
+DB 001h,036h,017h,00bh
DB 036h,0f8h,009h,000h
DB 031h,0e8h,008h,000h
DB 02ch,0d8h,007h,000h
@@ -1862,6 +1924,7 @@ DB 013h,088h,002h,000h
DB 00eh,078h,001h,000h
DB 009h,068h,000h,000h
DB 004h,001h,015h,000h
+DB 000h,0b3h,000h,000h
.xdata ENDS
END
diff --git a/deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm b/deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm
index 1c6440470d..e431b62090 100644
--- a/deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm
+++ b/deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm
@@ -803,52 +803,108 @@ $L$SEH_begin_rsaz_512_mul_gather4::
push r14
push r15
- mov r9d,r9d
- sub rsp,128+24
+ sub rsp,328
+ movaps XMMWORD PTR[160+rsp],xmm6
+ movaps XMMWORD PTR[176+rsp],xmm7
+ movaps XMMWORD PTR[192+rsp],xmm8
+ movaps XMMWORD PTR[208+rsp],xmm9
+ movaps XMMWORD PTR[224+rsp],xmm10
+ movaps XMMWORD PTR[240+rsp],xmm11
+ movaps XMMWORD PTR[256+rsp],xmm12
+ movaps XMMWORD PTR[272+rsp],xmm13
+ movaps XMMWORD PTR[288+rsp],xmm14
+ movaps XMMWORD PTR[304+rsp],xmm15
$L$mul_gather4_body::
+ movd xmm8,r9d
+ movdqa xmm1,XMMWORD PTR[(($L$inc+16))]
+ movdqa xmm0,XMMWORD PTR[$L$inc]
+
+ pshufd xmm8,xmm8,0
+ movdqa xmm7,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm8
+ movdqa xmm3,xmm7
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm8
+ movdqa xmm4,xmm7
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm8
+ movdqa xmm5,xmm7
+ paddd xmm4,xmm3
+ pcmpeqd xmm3,xmm8
+ movdqa xmm6,xmm7
+ paddd xmm5,xmm4
+ pcmpeqd xmm4,xmm8
+ paddd xmm6,xmm5
+ pcmpeqd xmm5,xmm8
+ paddd xmm7,xmm6
+ pcmpeqd xmm6,xmm8
+ pcmpeqd xmm7,xmm8
+
+ movdqa xmm8,XMMWORD PTR[rdx]
+ movdqa xmm9,XMMWORD PTR[16+rdx]
+ movdqa xmm10,XMMWORD PTR[32+rdx]
+ movdqa xmm11,XMMWORD PTR[48+rdx]
+ pand xmm8,xmm0
+ movdqa xmm12,XMMWORD PTR[64+rdx]
+ pand xmm9,xmm1
+ movdqa xmm13,XMMWORD PTR[80+rdx]
+ pand xmm10,xmm2
+ movdqa xmm14,XMMWORD PTR[96+rdx]
+ pand xmm11,xmm3
+ movdqa xmm15,XMMWORD PTR[112+rdx]
+ lea rbp,QWORD PTR[128+rdx]
+ pand xmm12,xmm4
+ pand xmm13,xmm5
+ pand xmm14,xmm6
+ pand xmm15,xmm7
+ por xmm8,xmm10
+ por xmm9,xmm11
+ por xmm8,xmm12
+ por xmm9,xmm13
+ por xmm8,xmm14
+ por xmm9,xmm15
+
+ por xmm8,xmm9
+ pshufd xmm9,xmm8,04eh
+ por xmm8,xmm9
mov r11d,080100h
and r11d,DWORD PTR[((OPENSSL_ia32cap_P+8))]
cmp r11d,080100h
je $L$mulx_gather
- mov eax,DWORD PTR[64+r9*4+rdx]
-DB 102,72,15,110,199
- mov ebx,DWORD PTR[r9*4+rdx]
-DB 102,72,15,110,201
+DB 102,76,15,126,195
+
mov QWORD PTR[128+rsp],r8
+ mov QWORD PTR[((128+8))+rsp],rdi
+ mov QWORD PTR[((128+16))+rsp],rcx
- shl rax,32
- or rbx,rax
mov rax,QWORD PTR[rsi]
mov rcx,QWORD PTR[8+rsi]
- lea rbp,QWORD PTR[128+r9*4+rdx]
mul rbx
mov QWORD PTR[rsp],rax
mov rax,rcx
mov r8,rdx
mul rbx
- movd xmm4,DWORD PTR[rbp]
add r8,rax
mov rax,QWORD PTR[16+rsi]
mov r9,rdx
adc r9,0
mul rbx
- movd xmm5,DWORD PTR[64+rbp]
add r9,rax
mov rax,QWORD PTR[24+rsi]
mov r10,rdx
adc r10,0
mul rbx
- pslldq xmm5,4
add r10,rax
mov rax,QWORD PTR[32+rsi]
mov r11,rdx
adc r11,0
mul rbx
- por xmm4,xmm5
add r11,rax
mov rax,QWORD PTR[40+rsi]
mov r12,rdx
@@ -861,14 +917,12 @@ DB 102,72,15,110,201
adc r13,0
mul rbx
- lea rbp,QWORD PTR[128+rbp]
add r13,rax
mov rax,QWORD PTR[56+rsi]
mov r14,rdx
adc r14,0
mul rbx
-DB 102,72,15,126,227
add r14,rax
mov rax,QWORD PTR[rsi]
mov r15,rdx
@@ -880,6 +934,35 @@ DB 102,72,15,126,227
ALIGN 32
$L$oop_mul_gather::
+ movdqa xmm8,XMMWORD PTR[rbp]
+ movdqa xmm9,XMMWORD PTR[16+rbp]
+ movdqa xmm10,XMMWORD PTR[32+rbp]
+ movdqa xmm11,XMMWORD PTR[48+rbp]
+ pand xmm8,xmm0
+ movdqa xmm12,XMMWORD PTR[64+rbp]
+ pand xmm9,xmm1
+ movdqa xmm13,XMMWORD PTR[80+rbp]
+ pand xmm10,xmm2
+ movdqa xmm14,XMMWORD PTR[96+rbp]
+ pand xmm11,xmm3
+ movdqa xmm15,XMMWORD PTR[112+rbp]
+ lea rbp,QWORD PTR[128+rbp]
+ pand xmm12,xmm4
+ pand xmm13,xmm5
+ pand xmm14,xmm6
+ pand xmm15,xmm7
+ por xmm8,xmm10
+ por xmm9,xmm11
+ por xmm8,xmm12
+ por xmm9,xmm13
+ por xmm8,xmm14
+ por xmm9,xmm15
+
+ por xmm8,xmm9
+ pshufd xmm9,xmm8,04eh
+ por xmm8,xmm9
+DB 102,76,15,126,195
+
mul rbx
add r8,rax
mov rax,QWORD PTR[8+rsi]
@@ -888,7 +971,6 @@ $L$oop_mul_gather::
adc r8,0
mul rbx
- movd xmm4,DWORD PTR[rbp]
add r9,rax
mov rax,QWORD PTR[16+rsi]
adc rdx,0
@@ -897,7 +979,6 @@ $L$oop_mul_gather::
adc r9,0
mul rbx
- movd xmm5,DWORD PTR[64+rbp]
add r10,rax
mov rax,QWORD PTR[24+rsi]
adc rdx,0
@@ -906,7 +987,6 @@ $L$oop_mul_gather::
adc r10,0
mul rbx
- pslldq xmm5,4
add r11,rax
mov rax,QWORD PTR[32+rsi]
adc rdx,0
@@ -915,7 +995,6 @@ $L$oop_mul_gather::
adc r11,0
mul rbx
- por xmm4,xmm5
add r12,rax
mov rax,QWORD PTR[40+rsi]
adc rdx,0
@@ -940,7 +1019,6 @@ $L$oop_mul_gather::
adc r14,0
mul rbx
-DB 102,72,15,126,227
add r15,rax
mov rax,QWORD PTR[rsi]
adc rdx,0
@@ -948,7 +1026,6 @@ DB 102,72,15,126,227
mov r15,rdx
adc r15,0
- lea rbp,QWORD PTR[128+rbp]
lea rdi,QWORD PTR[8+rdi]
dec ecx
@@ -963,8 +1040,8 @@ DB 102,72,15,126,227
mov QWORD PTR[48+rdi],r14
mov QWORD PTR[56+rdi],r15
-DB 102,72,15,126,199
-DB 102,72,15,126,205
+ mov rdi,QWORD PTR[((128+8))+rsp]
+ mov rbp,QWORD PTR[((128+16))+rsp]
mov r8,QWORD PTR[rsp]
mov r9,QWORD PTR[8+rsp]
@@ -980,45 +1057,37 @@ DB 102,72,15,126,205
ALIGN 32
$L$mulx_gather::
- mov eax,DWORD PTR[64+r9*4+rdx]
-DB 102,72,15,110,199
- lea rbp,QWORD PTR[128+r9*4+rdx]
- mov edx,DWORD PTR[r9*4+rdx]
-DB 102,72,15,110,201
+DB 102,76,15,126,194
+
mov QWORD PTR[128+rsp],r8
+ mov QWORD PTR[((128+8))+rsp],rdi
+ mov QWORD PTR[((128+16))+rsp],rcx
- shl rax,32
- or rdx,rax
mulx r8,rbx,QWORD PTR[rsi]
mov QWORD PTR[rsp],rbx
xor edi,edi
mulx r9,rax,QWORD PTR[8+rsi]
- movd xmm4,DWORD PTR[rbp]
mulx r10,rbx,QWORD PTR[16+rsi]
- movd xmm5,DWORD PTR[64+rbp]
adcx r8,rax
mulx r11,rax,QWORD PTR[24+rsi]
- pslldq xmm5,4
adcx r9,rbx
mulx r12,rbx,QWORD PTR[32+rsi]
- por xmm4,xmm5
adcx r10,rax
mulx r13,rax,QWORD PTR[40+rsi]
adcx r11,rbx
mulx r14,rbx,QWORD PTR[48+rsi]
- lea rbp,QWORD PTR[128+rbp]
adcx r12,rax
mulx r15,rax,QWORD PTR[56+rsi]
-DB 102,72,15,126,226
adcx r13,rbx
adcx r14,rax
+DB 067h
mov rbx,r8
adcx r15,rdi
@@ -1027,24 +1096,48 @@ DB 102,72,15,126,226
ALIGN 32
$L$oop_mulx_gather::
- mulx r8,rax,QWORD PTR[rsi]
+ movdqa xmm8,XMMWORD PTR[rbp]
+ movdqa xmm9,XMMWORD PTR[16+rbp]
+ movdqa xmm10,XMMWORD PTR[32+rbp]
+ movdqa xmm11,XMMWORD PTR[48+rbp]
+ pand xmm8,xmm0
+ movdqa xmm12,XMMWORD PTR[64+rbp]
+ pand xmm9,xmm1
+ movdqa xmm13,XMMWORD PTR[80+rbp]
+ pand xmm10,xmm2
+ movdqa xmm14,XMMWORD PTR[96+rbp]
+ pand xmm11,xmm3
+ movdqa xmm15,XMMWORD PTR[112+rbp]
+ lea rbp,QWORD PTR[128+rbp]
+ pand xmm12,xmm4
+ pand xmm13,xmm5
+ pand xmm14,xmm6
+ pand xmm15,xmm7
+ por xmm8,xmm10
+ por xmm9,xmm11
+ por xmm8,xmm12
+ por xmm9,xmm13
+ por xmm8,xmm14
+ por xmm9,xmm15
+
+ por xmm8,xmm9
+ pshufd xmm9,xmm8,04eh
+ por xmm8,xmm9
+DB 102,76,15,126,194
+
+DB 0c4h,062h,0fbh,0f6h,086h,000h,000h,000h,000h
adcx rbx,rax
adox r8,r9
mulx r9,rax,QWORD PTR[8+rsi]
-DB 066h,00fh,06eh,0a5h,000h,000h,000h,000h
adcx r8,rax
adox r9,r10
mulx r10,rax,QWORD PTR[16+rsi]
- movd xmm5,DWORD PTR[64+rbp]
- lea rbp,QWORD PTR[128+rbp]
adcx r9,rax
adox r10,r11
DB 0c4h,062h,0fbh,0f6h,09eh,018h,000h,000h,000h
- pslldq xmm5,4
- por xmm4,xmm5
adcx r10,rax
adox r11,r12
@@ -1058,10 +1151,10 @@ DB 0c4h,062h,0fbh,0f6h,09eh,018h,000h,000h,000h
DB 0c4h,062h,0fbh,0f6h,0b6h,030h,000h,000h,000h
adcx r13,rax
+DB 067h
adox r14,r15
mulx r15,rax,QWORD PTR[56+rsi]
-DB 102,72,15,126,226
mov QWORD PTR[64+rcx*8+rsp],rbx
adcx r14,rax
adox r15,rdi
@@ -1080,10 +1173,10 @@ DB 102,72,15,126,226
mov QWORD PTR[((64+48))+rsp],r14
mov QWORD PTR[((64+56))+rsp],r15
-DB 102,72,15,126,199
-DB 102,72,15,126,205
-
mov rdx,QWORD PTR[128+rsp]
+ mov rdi,QWORD PTR[((128+8))+rsp]
+ mov rbp,QWORD PTR[((128+16))+rsp]
+
mov r8,QWORD PTR[rsp]
mov r9,QWORD PTR[8+rsp]
mov r10,QWORD PTR[16+rsp]
@@ -1109,6 +1202,17 @@ $L$mul_gather_tail::
call __rsaz_512_subtract
lea rax,QWORD PTR[((128+24+48))+rsp]
+ movaps xmm6,XMMWORD PTR[((160-200))+rax]
+ movaps xmm7,XMMWORD PTR[((176-200))+rax]
+ movaps xmm8,XMMWORD PTR[((192-200))+rax]
+ movaps xmm9,XMMWORD PTR[((208-200))+rax]
+ movaps xmm10,XMMWORD PTR[((224-200))+rax]
+ movaps xmm11,XMMWORD PTR[((240-200))+rax]
+ movaps xmm12,XMMWORD PTR[((256-200))+rax]
+ movaps xmm13,XMMWORD PTR[((272-200))+rax]
+ movaps xmm14,XMMWORD PTR[((288-200))+rax]
+ movaps xmm15,XMMWORD PTR[((304-200))+rax]
+ lea rax,QWORD PTR[176+rax]
mov r15,QWORD PTR[((-48))+rax]
mov r14,QWORD PTR[((-40))+rax]
mov r13,QWORD PTR[((-32))+rax]
@@ -1148,7 +1252,7 @@ $L$SEH_begin_rsaz_512_mul_scatter4::
mov r9d,r9d
sub rsp,128+24
$L$mul_scatter4_body::
- lea r8,QWORD PTR[r9*4+r8]
+ lea r8,QWORD PTR[r9*8+r8]
DB 102,72,15,110,199
DB 102,72,15,110,202
DB 102,73,15,110,208
@@ -1211,30 +1315,14 @@ DB 102,72,15,126,214
call __rsaz_512_subtract
- mov DWORD PTR[rsi],r8d
- shr r8,32
- mov DWORD PTR[128+rsi],r9d
- shr r9,32
- mov DWORD PTR[256+rsi],r10d
- shr r10,32
- mov DWORD PTR[384+rsi],r11d
- shr r11,32
- mov DWORD PTR[512+rsi],r12d
- shr r12,32
- mov DWORD PTR[640+rsi],r13d
- shr r13,32
- mov DWORD PTR[768+rsi],r14d
- shr r14,32
- mov DWORD PTR[896+rsi],r15d
- shr r15,32
- mov DWORD PTR[64+rsi],r8d
- mov DWORD PTR[192+rsi],r9d
- mov DWORD PTR[320+rsi],r10d
- mov DWORD PTR[448+rsi],r11d
- mov DWORD PTR[576+rsi],r12d
- mov DWORD PTR[704+rsi],r13d
- mov DWORD PTR[832+rsi],r14d
- mov DWORD PTR[960+rsi],r15d
+ mov QWORD PTR[rsi],r8
+ mov QWORD PTR[128+rsi],r9
+ mov QWORD PTR[256+rsi],r10
+ mov QWORD PTR[384+rsi],r11
+ mov QWORD PTR[512+rsi],r12
+ mov QWORD PTR[640+rsi],r13
+ mov QWORD PTR[768+rsi],r14
+ mov QWORD PTR[896+rsi],r15
lea rax,QWORD PTR[((128+24+48))+rsp]
mov r15,QWORD PTR[((-48))+rax]
@@ -1789,16 +1877,14 @@ PUBLIC rsaz_512_scatter4
ALIGN 16
rsaz_512_scatter4 PROC PUBLIC
- lea rcx,QWORD PTR[r8*4+rcx]
+ lea rcx,QWORD PTR[r8*8+rcx]
mov r9d,8
jmp $L$oop_scatter
ALIGN 16
$L$oop_scatter::
mov rax,QWORD PTR[rdx]
lea rdx,QWORD PTR[8+rdx]
- mov DWORD PTR[rcx],eax
- shr rax,32
- mov DWORD PTR[64+rcx],eax
+ mov QWORD PTR[rcx],rax
lea rcx,QWORD PTR[128+rcx]
dec r9d
jnz $L$oop_scatter
@@ -1809,22 +1895,98 @@ PUBLIC rsaz_512_gather4
ALIGN 16
rsaz_512_gather4 PROC PUBLIC
- lea rdx,QWORD PTR[r8*4+rdx]
+$L$SEH_begin_rsaz_512_gather4::
+DB 048h,081h,0ech,0a8h,000h,000h,000h
+DB 00fh,029h,034h,024h
+DB 00fh,029h,07ch,024h,010h
+DB 044h,00fh,029h,044h,024h,020h
+DB 044h,00fh,029h,04ch,024h,030h
+DB 044h,00fh,029h,054h,024h,040h
+DB 044h,00fh,029h,05ch,024h,050h
+DB 044h,00fh,029h,064h,024h,060h
+DB 044h,00fh,029h,06ch,024h,070h
+DB 044h,00fh,029h,0b4h,024h,080h,0,0,0
+DB 044h,00fh,029h,0bch,024h,090h,0,0,0
+ movd xmm8,r8d
+ movdqa xmm1,XMMWORD PTR[(($L$inc+16))]
+ movdqa xmm0,XMMWORD PTR[$L$inc]
+
+ pshufd xmm8,xmm8,0
+ movdqa xmm7,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm8
+ movdqa xmm3,xmm7
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm8
+ movdqa xmm4,xmm7
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm8
+ movdqa xmm5,xmm7
+ paddd xmm4,xmm3
+ pcmpeqd xmm3,xmm8
+ movdqa xmm6,xmm7
+ paddd xmm5,xmm4
+ pcmpeqd xmm4,xmm8
+ paddd xmm6,xmm5
+ pcmpeqd xmm5,xmm8
+ paddd xmm7,xmm6
+ pcmpeqd xmm6,xmm8
+ pcmpeqd xmm7,xmm8
mov r9d,8
jmp $L$oop_gather
ALIGN 16
$L$oop_gather::
- mov eax,DWORD PTR[rdx]
- mov r8d,DWORD PTR[64+rdx]
+ movdqa xmm8,XMMWORD PTR[rdx]
+ movdqa xmm9,XMMWORD PTR[16+rdx]
+ movdqa xmm10,XMMWORD PTR[32+rdx]
+ movdqa xmm11,XMMWORD PTR[48+rdx]
+ pand xmm8,xmm0
+ movdqa xmm12,XMMWORD PTR[64+rdx]
+ pand xmm9,xmm1
+ movdqa xmm13,XMMWORD PTR[80+rdx]
+ pand xmm10,xmm2
+ movdqa xmm14,XMMWORD PTR[96+rdx]
+ pand xmm11,xmm3
+ movdqa xmm15,XMMWORD PTR[112+rdx]
lea rdx,QWORD PTR[128+rdx]
- shl r8,32
- or rax,r8
- mov QWORD PTR[rcx],rax
+ pand xmm12,xmm4
+ pand xmm13,xmm5
+ pand xmm14,xmm6
+ pand xmm15,xmm7
+ por xmm8,xmm10
+ por xmm9,xmm11
+ por xmm8,xmm12
+ por xmm9,xmm13
+ por xmm8,xmm14
+ por xmm9,xmm15
+
+ por xmm8,xmm9
+ pshufd xmm9,xmm8,04eh
+ por xmm8,xmm9
+ movq QWORD PTR[rcx],xmm8
lea rcx,QWORD PTR[8+rcx]
dec r9d
jnz $L$oop_gather
+ movaps xmm6,XMMWORD PTR[rsp]
+ movaps xmm7,XMMWORD PTR[16+rsp]
+ movaps xmm8,XMMWORD PTR[32+rsp]
+ movaps xmm9,XMMWORD PTR[48+rsp]
+ movaps xmm10,XMMWORD PTR[64+rsp]
+ movaps xmm11,XMMWORD PTR[80+rsp]
+ movaps xmm12,XMMWORD PTR[96+rsp]
+ movaps xmm13,XMMWORD PTR[112+rsp]
+ movaps xmm14,XMMWORD PTR[128+rsp]
+ movaps xmm15,XMMWORD PTR[144+rsp]
+ add rsp,0a8h
DB 0F3h,0C3h ;repret
+$L$SEH_end_rsaz_512_gather4::
rsaz_512_gather4 ENDP
+
+ALIGN 64
+$L$inc::
+ DD 0,0,1,1
+ DD 2,2,2,2
EXTERN __imp_RtlVirtualUnwind:NEAR
ALIGN 16
@@ -1860,6 +2022,18 @@ se_handler PROC PRIVATE
lea rax,QWORD PTR[((128+24+48))+rax]
+ lea rbx,QWORD PTR[$L$mul_gather4_epilogue]
+ cmp rbx,r10
+ jne $L$se_not_in_mul_gather4
+
+ lea rax,QWORD PTR[176+rax]
+
+ lea rsi,QWORD PTR[((-48-168))+rax]
+ lea rdi,QWORD PTR[512+r8]
+ mov ecx,20
+ DD 0a548f3fch
+
+$L$se_not_in_mul_gather4::
mov rbx,QWORD PTR[((-8))+rax]
mov rbp,QWORD PTR[((-16))+rax]
mov r12,QWORD PTR[((-24))+rax]
@@ -1936,6 +2110,10 @@ ALIGN 4
DD imagerel $L$SEH_end_rsaz_512_mul_by_one
DD imagerel $L$SEH_info_rsaz_512_mul_by_one
+ DD imagerel $L$SEH_begin_rsaz_512_gather4
+ DD imagerel $L$SEH_end_rsaz_512_gather4
+ DD imagerel $L$SEH_info_rsaz_512_gather4
+
.pdata ENDS
.xdata SEGMENT READONLY ALIGN(8)
ALIGN 8
@@ -1959,6 +2137,19 @@ $L$SEH_info_rsaz_512_mul_by_one::
DB 9,0,0,0
DD imagerel se_handler
DD imagerel $L$mul_by_one_body,imagerel $L$mul_by_one_epilogue
+$L$SEH_info_rsaz_512_gather4::
+DB 001h,046h,016h,000h
+DB 046h,0f8h,009h,000h
+DB 03dh,0e8h,008h,000h
+DB 034h,0d8h,007h,000h
+DB 02eh,0c8h,006h,000h
+DB 028h,0b8h,005h,000h
+DB 022h,0a8h,004h,000h
+DB 01ch,098h,003h,000h
+DB 016h,088h,002h,000h
+DB 010h,078h,001h,000h
+DB 00bh,068h,000h,000h
+DB 007h,001h,015h,000h
.xdata ENDS
END
diff --git a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm
index f252745060..e70ec9f31a 100644
--- a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm
+++ b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm
@@ -681,20 +681,20 @@ $L$sqr8x_enter::
- lea r11,QWORD PTR[((-64))+r9*4+rsp]
+ lea r11,QWORD PTR[((-64))+r9*2+rsp]
mov r8,QWORD PTR[r8]
sub r11,rsi
and r11,4095
cmp r10,r11
jb $L$sqr8x_sp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*4+rsp]
+ lea rsp,QWORD PTR[((-64))+r9*2+rsp]
jmp $L$sqr8x_sp_done
ALIGN 32
$L$sqr8x_sp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*4]
- lea rsp,QWORD PTR[((-64))+r9*4+rsp]
+ lea r10,QWORD PTR[((4096-64))+r9*2]
+ lea rsp,QWORD PTR[((-64))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -704,73 +704,99 @@ $L$sqr8x_sp_done::
mov r10,r9
neg r9
- lea r11,QWORD PTR[64+r9*2+rsp]
mov QWORD PTR[32+rsp],r8
mov QWORD PTR[40+rsp],rax
$L$sqr8x_body::
- mov rbp,r9
-DB 102,73,15,110,211
- shr rbp,3+2
- mov eax,DWORD PTR[((OPENSSL_ia32cap_P+8))]
- jmp $L$sqr8x_copy_n
-
-ALIGN 32
-$L$sqr8x_copy_n::
- movq xmm0,QWORD PTR[rcx]
- movq xmm1,QWORD PTR[8+rcx]
- movq xmm3,QWORD PTR[16+rcx]
- movq xmm4,QWORD PTR[24+rcx]
- lea rcx,QWORD PTR[32+rcx]
- movdqa XMMWORD PTR[r11],xmm0
- movdqa XMMWORD PTR[16+r11],xmm1
- movdqa XMMWORD PTR[32+r11],xmm3
- movdqa XMMWORD PTR[48+r11],xmm4
- lea r11,QWORD PTR[64+r11]
- dec rbp
- jnz $L$sqr8x_copy_n
-
+DB 102,72,15,110,209
pxor xmm0,xmm0
DB 102,72,15,110,207
DB 102,73,15,110,218
+ mov eax,DWORD PTR[((OPENSSL_ia32cap_P+8))]
and eax,080100h
cmp eax,080100h
jne $L$sqr8x_nox
call bn_sqrx8x_internal
- pxor xmm0,xmm0
- lea rax,QWORD PTR[48+rsp]
- lea rdx,QWORD PTR[64+r9*2+rsp]
- shr r9,3+2
- mov rsi,QWORD PTR[40+rsp]
- jmp $L$sqr8x_zero
+
+
+
+ lea rbx,QWORD PTR[rcx*1+r8]
+ mov r9,rcx
+ mov rdx,rcx
+DB 102,72,15,126,207
+ sar rcx,3+2
+ jmp $L$sqr8x_sub
ALIGN 32
$L$sqr8x_nox::
call bn_sqr8x_internal
+
+
+
+ lea rbx,QWORD PTR[r9*1+rdi]
+ mov rcx,r9
+ mov rdx,r9
+DB 102,72,15,126,207
+ sar rcx,3+2
+ jmp $L$sqr8x_sub
+
+ALIGN 32
+$L$sqr8x_sub::
+ mov r12,QWORD PTR[rbx]
+ mov r13,QWORD PTR[8+rbx]
+ mov r14,QWORD PTR[16+rbx]
+ mov r15,QWORD PTR[24+rbx]
+ lea rbx,QWORD PTR[32+rbx]
+ sbb r12,QWORD PTR[rbp]
+ sbb r13,QWORD PTR[8+rbp]
+ sbb r14,QWORD PTR[16+rbp]
+ sbb r15,QWORD PTR[24+rbp]
+ lea rbp,QWORD PTR[32+rbp]
+ mov QWORD PTR[rdi],r12
+ mov QWORD PTR[8+rdi],r13
+ mov QWORD PTR[16+rdi],r14
+ mov QWORD PTR[24+rdi],r15
+ lea rdi,QWORD PTR[32+rdi]
+ inc rcx
+ jnz $L$sqr8x_sub
+
+ sbb rax,0
+ lea rbx,QWORD PTR[r9*1+rbx]
+ lea rdi,QWORD PTR[r9*1+rdi]
+
+DB 102,72,15,110,200
pxor xmm0,xmm0
- lea rax,QWORD PTR[48+rsp]
- lea rdx,QWORD PTR[64+r9*2+rsp]
- shr r9,3+2
+ pshufd xmm1,xmm1,0
mov rsi,QWORD PTR[40+rsp]
- jmp $L$sqr8x_zero
+ jmp $L$sqr8x_cond_copy
ALIGN 32
-$L$sqr8x_zero::
- movdqa XMMWORD PTR[rax],xmm0
- movdqa XMMWORD PTR[16+rax],xmm0
- movdqa XMMWORD PTR[32+rax],xmm0
- movdqa XMMWORD PTR[48+rax],xmm0
- lea rax,QWORD PTR[64+rax]
- movdqa XMMWORD PTR[rdx],xmm0
- movdqa XMMWORD PTR[16+rdx],xmm0
- movdqa XMMWORD PTR[32+rdx],xmm0
- movdqa XMMWORD PTR[48+rdx],xmm0
- lea rdx,QWORD PTR[64+rdx]
- dec r9
- jnz $L$sqr8x_zero
+$L$sqr8x_cond_copy::
+ movdqa xmm2,XMMWORD PTR[rbx]
+ movdqa xmm3,XMMWORD PTR[16+rbx]
+ lea rbx,QWORD PTR[32+rbx]
+ movdqu xmm4,XMMWORD PTR[rdi]
+ movdqu xmm5,XMMWORD PTR[16+rdi]
+ lea rdi,QWORD PTR[32+rdi]
+ movdqa XMMWORD PTR[(-32)+rbx],xmm0
+ movdqa XMMWORD PTR[(-16)+rbx],xmm0
+ movdqa XMMWORD PTR[(-32)+rdx*1+rbx],xmm0
+ movdqa XMMWORD PTR[(-16)+rdx*1+rbx],xmm0
+ pcmpeqd xmm0,xmm1
+ pand xmm2,xmm1
+ pand xmm3,xmm1
+ pand xmm4,xmm0
+ pand xmm5,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqu XMMWORD PTR[(-32)+rdi],xmm4
+ movdqu XMMWORD PTR[(-16)+rdi],xmm5
+ add r9,32
+ jnz $L$sqr8x_cond_copy
mov rax,1
mov r15,QWORD PTR[((-48))+rsi]
@@ -1040,64 +1066,75 @@ $L$mulx4x_inner::
adc r15,rbp
sub rbp,QWORD PTR[rbx]
adc r14,r15
- mov r8,QWORD PTR[((-8))+rcx]
sbb r15,r15
mov QWORD PTR[((-8))+rbx],r14
cmp rdi,QWORD PTR[16+rsp]
jne $L$mulx4x_outer
- sub r8,r14
- sbb r8,r8
- or r15,r8
-
- neg rax
- xor rdx,rdx
+ lea rbx,QWORD PTR[64+rsp]
+ sub rcx,rax
+ neg r15
+ mov rdx,rax
+ shr rax,3+2
mov rdi,QWORD PTR[32+rsp]
+ jmp $L$mulx4x_sub
+
+ALIGN 32
+$L$mulx4x_sub::
+ mov r11,QWORD PTR[rbx]
+ mov r12,QWORD PTR[8+rbx]
+ mov r13,QWORD PTR[16+rbx]
+ mov r14,QWORD PTR[24+rbx]
+ lea rbx,QWORD PTR[32+rbx]
+ sbb r11,QWORD PTR[rcx]
+ sbb r12,QWORD PTR[8+rcx]
+ sbb r13,QWORD PTR[16+rcx]
+ sbb r14,QWORD PTR[24+rcx]
+ lea rcx,QWORD PTR[32+rcx]
+ mov QWORD PTR[rdi],r11
+ mov QWORD PTR[8+rdi],r12
+ mov QWORD PTR[16+rdi],r13
+ mov QWORD PTR[24+rdi],r14
+ lea rdi,QWORD PTR[32+rdi]
+ dec rax
+ jnz $L$mulx4x_sub
+
+ sbb r15,0
lea rbx,QWORD PTR[64+rsp]
+ sub rdi,rdx
+DB 102,73,15,110,207
pxor xmm0,xmm0
- mov r8,QWORD PTR[rax*1+rcx]
- mov r9,QWORD PTR[8+rax*1+rcx]
- neg r8
- jmp $L$mulx4x_sub_entry
+ pshufd xmm1,xmm1,0
+ mov rsi,QWORD PTR[40+rsp]
+ jmp $L$mulx4x_cond_copy
ALIGN 32
-$L$mulx4x_sub::
- mov r8,QWORD PTR[rax*1+rcx]
- mov r9,QWORD PTR[8+rax*1+rcx]
- not r8
-$L$mulx4x_sub_entry::
- mov r10,QWORD PTR[16+rax*1+rcx]
- not r9
- and r8,r15
- mov r11,QWORD PTR[24+rax*1+rcx]
- not r10
- and r9,r15
- not r11
- and r10,r15
- and r11,r15
-
- neg rdx
- adc r8,QWORD PTR[rbx]
- adc r9,QWORD PTR[8+rbx]
- movdqa XMMWORD PTR[rbx],xmm0
- adc r10,QWORD PTR[16+rbx]
- adc r11,QWORD PTR[24+rbx]
- movdqa XMMWORD PTR[16+rbx],xmm0
+$L$mulx4x_cond_copy::
+ movdqa xmm2,XMMWORD PTR[rbx]
+ movdqa xmm3,XMMWORD PTR[16+rbx]
lea rbx,QWORD PTR[32+rbx]
- sbb rdx,rdx
-
- mov QWORD PTR[rdi],r8
- mov QWORD PTR[8+rdi],r9
- mov QWORD PTR[16+rdi],r10
- mov QWORD PTR[24+rdi],r11
+ movdqu xmm4,XMMWORD PTR[rdi]
+ movdqu xmm5,XMMWORD PTR[16+rdi]
lea rdi,QWORD PTR[32+rdi]
+ movdqa XMMWORD PTR[(-32)+rbx],xmm0
+ movdqa XMMWORD PTR[(-16)+rbx],xmm0
+ pcmpeqd xmm0,xmm1
+ pand xmm2,xmm1
+ pand xmm3,xmm1
+ pand xmm4,xmm0
+ pand xmm5,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqu XMMWORD PTR[(-32)+rdi],xmm4
+ movdqu XMMWORD PTR[(-16)+rdi],xmm5
+ sub rdx,32
+ jnz $L$mulx4x_cond_copy
- add rax,32
- jnz $L$mulx4x_sub
+ mov QWORD PTR[rbx],rdx
- mov rsi,QWORD PTR[40+rsp]
mov rax,1
mov r15,QWORD PTR[((-48))+rsi]
mov r14,QWORD PTR[((-40))+rsi]
diff --git a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm
index eae2339ef1..080fb16784 100644
--- a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm
+++ b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm
@@ -28,49 +28,151 @@ ALIGN 16
$L$mul_enter::
mov r9d,r9d
mov rax,rsp
- mov r10d,DWORD PTR[56+rsp]
+ movd xmm5,DWORD PTR[56+rsp]
+ lea r10,QWORD PTR[$L$inc]
push rbx
push rbp
push r12
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
+
lea r11,QWORD PTR[2+r9]
neg r11
- lea rsp,QWORD PTR[r11*8+rsp]
+ lea rsp,QWORD PTR[((-264))+r11*8+rsp]
and rsp,-1024
mov QWORD PTR[8+r9*8+rsp],rax
$L$mul_body::
- mov r12,rdx
- mov r11,r10
- shr r10,3
- and r11,7
- not r10
- lea rax,QWORD PTR[$L$magic_masks]
- and r10,3
- lea r12,QWORD PTR[96+r11*8+r12]
- movq xmm4,QWORD PTR[r10*8+rax]
- movq xmm5,QWORD PTR[8+r10*8+rax]
- movq xmm6,QWORD PTR[16+r10*8+rax]
- movq xmm7,QWORD PTR[24+r10*8+rax]
-
- movq xmm0,QWORD PTR[((-96))+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[96+r12]
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
+ lea r12,QWORD PTR[128+rdx]
+ movdqa xmm0,XMMWORD PTR[r10]
+ movdqa xmm1,XMMWORD PTR[16+r10]
+ lea r10,QWORD PTR[((24-112))+r9*8+rsp]
+ and r10,-16
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+DB 067h
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[112+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[128+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[144+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[160+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[176+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[192+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[208+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[224+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[240+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[256+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[272+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[288+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[304+r10],xmm0
+
+ paddd xmm3,xmm2
+DB 067h
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[320+r10],xmm1
+
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[336+r10],xmm2
+ pand xmm0,XMMWORD PTR[64+r12]
+
+ pand xmm1,XMMWORD PTR[80+r12]
+ pand xmm2,XMMWORD PTR[96+r12]
+ movdqa XMMWORD PTR[352+r10],xmm3
+ pand xmm3,XMMWORD PTR[112+r12]
por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-128))+r12]
+ movdqa xmm5,XMMWORD PTR[((-112))+r12]
+ movdqa xmm2,XMMWORD PTR[((-96))+r12]
+ pand xmm4,XMMWORD PTR[112+r10]
+ movdqa xmm3,XMMWORD PTR[((-80))+r12]
+ pand xmm5,XMMWORD PTR[128+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[144+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[160+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-64))+r12]
+ movdqa xmm5,XMMWORD PTR[((-48))+r12]
+ movdqa xmm2,XMMWORD PTR[((-32))+r12]
+ pand xmm4,XMMWORD PTR[176+r10]
+ movdqa xmm3,XMMWORD PTR[((-16))+r12]
+ pand xmm5,XMMWORD PTR[192+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[208+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[224+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[r12]
+ movdqa xmm5,XMMWORD PTR[16+r12]
+ movdqa xmm2,XMMWORD PTR[32+r12]
+ pand xmm4,XMMWORD PTR[240+r10]
+ movdqa xmm3,XMMWORD PTR[48+r12]
+ pand xmm5,XMMWORD PTR[256+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[272+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[288+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ por xmm0,xmm1
+ pshufd xmm1,xmm0,04eh
+ por xmm0,xmm1
lea r12,QWORD PTR[256+r12]
- por xmm0,xmm3
-
DB 102,72,15,126,195
mov r8,QWORD PTR[r8]
@@ -79,29 +181,14 @@ DB 102,72,15,126,195
xor r14,r14
xor r15,r15
- movq xmm0,QWORD PTR[((-96))+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
-
mov rbp,r8
mul rbx
mov r10,rax
mov rax,QWORD PTR[rcx]
- movq xmm3,QWORD PTR[96+r12]
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
-
imul rbp,r10
mov r11,rdx
- por xmm0,xmm2
- lea r12,QWORD PTR[256+r12]
- por xmm0,xmm3
-
mul rbp
add r10,rax
mov rax,QWORD PTR[8+rsi]
@@ -134,14 +221,12 @@ $L$1st_enter::
cmp r15,r9
jne $L$1st
-DB 102,72,15,126,195
add r13,rax
- mov rax,QWORD PTR[rsi]
adc rdx,0
add r13,r11
adc rdx,0
- mov QWORD PTR[((-16))+r15*8+rsp],r13
+ mov QWORD PTR[((-16))+r9*8+rsp],r13
mov r13,rdx
mov r11,r10
@@ -155,33 +240,78 @@ DB 102,72,15,126,195
jmp $L$outer
ALIGN 16
$L$outer::
+ lea rdx,QWORD PTR[((24+128))+r9*8+rsp]
+ and rdx,-16
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD PTR[((-128))+r12]
+ movdqa xmm1,XMMWORD PTR[((-112))+r12]
+ movdqa xmm2,XMMWORD PTR[((-96))+r12]
+ movdqa xmm3,XMMWORD PTR[((-80))+r12]
+ pand xmm0,XMMWORD PTR[((-128))+rdx]
+ pand xmm1,XMMWORD PTR[((-112))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-96))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-80))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[((-64))+r12]
+ movdqa xmm1,XMMWORD PTR[((-48))+r12]
+ movdqa xmm2,XMMWORD PTR[((-32))+r12]
+ movdqa xmm3,XMMWORD PTR[((-16))+r12]
+ pand xmm0,XMMWORD PTR[((-64))+rdx]
+ pand xmm1,XMMWORD PTR[((-48))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-32))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-16))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[r12]
+ movdqa xmm1,XMMWORD PTR[16+r12]
+ movdqa xmm2,XMMWORD PTR[32+r12]
+ movdqa xmm3,XMMWORD PTR[48+r12]
+ pand xmm0,XMMWORD PTR[rdx]
+ pand xmm1,XMMWORD PTR[16+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[32+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[48+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[64+r12]
+ movdqa xmm1,XMMWORD PTR[80+r12]
+ movdqa xmm2,XMMWORD PTR[96+r12]
+ movdqa xmm3,XMMWORD PTR[112+r12]
+ pand xmm0,XMMWORD PTR[64+rdx]
+ pand xmm1,XMMWORD PTR[80+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[96+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[112+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+ pshufd xmm0,xmm4,04eh
+ por xmm0,xmm4
+ lea r12,QWORD PTR[256+r12]
+
+ mov rax,QWORD PTR[rsi]
+DB 102,72,15,126,195
+
xor r15,r15
mov rbp,r8
mov r10,QWORD PTR[rsp]
- movq xmm0,QWORD PTR[((-96))+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
-
mul rbx
add r10,rax
mov rax,QWORD PTR[rcx]
adc rdx,0
- movq xmm3,QWORD PTR[96+r12]
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
-
imul rbp,r10
mov r11,rdx
- por xmm0,xmm2
- lea r12,QWORD PTR[256+r12]
- por xmm0,xmm3
-
mul rbp
add r10,rax
mov rax,QWORD PTR[8+rsi]
@@ -217,15 +347,12 @@ $L$inner_enter::
cmp r15,r9
jne $L$inner
-DB 102,72,15,126,195
-
add r13,rax
- mov rax,QWORD PTR[rsi]
adc rdx,0
add r13,r10
- mov r10,QWORD PTR[r15*8+rsp]
+ mov r10,QWORD PTR[r9*8+rsp]
adc rdx,0
- mov QWORD PTR[((-16))+r15*8+rsp],r13
+ mov QWORD PTR[((-16))+r9*8+rsp],r13
mov r13,rdx
xor rdx,rdx
@@ -272,8 +399,7 @@ $L$copy::
mov rsi,QWORD PTR[8+r9*8+rsp]
mov rax,1
- movaps xmm6,XMMWORD PTR[((-88))+rsi]
- movaps xmm7,XMMWORD PTR[((-72))+rsi]
+
mov r15,QWORD PTR[((-48))+rsi]
mov r14,QWORD PTR[((-40))+rsi]
mov r13,QWORD PTR[((-32))+rsi]
@@ -303,8 +429,8 @@ $L$SEH_begin_bn_mul4x_mont_gather5::
$L$mul4x_enter::
- and r11d,080100h
- cmp r11d,080100h
+ and r11d,080108h
+ cmp r11d,080108h
je $L$mulx4x_enter
DB 067h
mov rax,rsp
@@ -314,13 +440,10 @@ DB 067h
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
+
DB 067h
- mov r10d,r9d
shl r9d,3
- shl r10d,3+2
+ lea r10,QWORD PTR[r9*2+r9]
neg r9
@@ -330,19 +453,21 @@ DB 067h
- lea r11,QWORD PTR[((-64))+r9*2+rsp]
- sub r11,rsi
+
+
+ lea r11,QWORD PTR[((-320))+r9*2+rsp]
+ sub r11,rdi
and r11,4095
cmp r10,r11
jb $L$mul4xsp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
jmp $L$mul4xsp_done
ALIGN 32
$L$mul4xsp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*2]
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea r10,QWORD PTR[((4096-320))+r9*2]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -358,8 +483,7 @@ $L$mul4x_body::
mov rsi,QWORD PTR[40+rsp]
mov rax,1
- movaps xmm6,XMMWORD PTR[((-88))+rsi]
- movaps xmm7,XMMWORD PTR[((-72))+rsi]
+
mov r15,QWORD PTR[((-48))+rsi]
mov r14,QWORD PTR[((-40))+rsi]
mov r13,QWORD PTR[((-32))+rsi]
@@ -378,47 +502,141 @@ bn_mul4x_mont_gather5 ENDP
ALIGN 32
mul4x_internal PROC PRIVATE
shl r9,5
- mov r10d,DWORD PTR[56+rax]
- lea r13,QWORD PTR[256+r9*1+rdx]
+ movd xmm5,DWORD PTR[56+rax]
+ lea rax,QWORD PTR[$L$inc]
+ lea r13,QWORD PTR[128+r9*1+rdx]
shr r9,5
- mov r11,r10
- shr r10,3
- and r11,7
- not r10
- lea rax,QWORD PTR[$L$magic_masks]
- and r10,3
- lea r12,QWORD PTR[96+r11*8+rdx]
- movq xmm4,QWORD PTR[r10*8+rax]
- movq xmm5,QWORD PTR[8+r10*8+rax]
- add r11,7
- movq xmm6,QWORD PTR[16+r10*8+rax]
- movq xmm7,QWORD PTR[24+r10*8+rax]
- and r11,7
-
- movq xmm0,QWORD PTR[((-96))+r12]
- lea r14,QWORD PTR[256+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[96+r12]
- pand xmm2,xmm6
-DB 067h
- por xmm0,xmm1
- movq xmm1,QWORD PTR[((-96))+r14]
-DB 067h
- pand xmm3,xmm7
-DB 067h
- por xmm0,xmm2
- movq xmm2,QWORD PTR[((-32))+r14]
+ movdqa xmm0,XMMWORD PTR[rax]
+ movdqa xmm1,XMMWORD PTR[16+rax]
+ lea r10,QWORD PTR[((88-112))+r9*1+rsp]
+ lea r12,QWORD PTR[128+rdx]
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+DB 067h,067h
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
DB 067h
- pand xmm1,xmm4
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[112+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[128+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[144+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[160+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[176+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[192+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[208+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[224+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[240+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[256+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[272+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[288+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[304+r10],xmm0
+
+ paddd xmm3,xmm2
DB 067h
- por xmm0,xmm3
- movq xmm3,QWORD PTR[32+r14]
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[320+r10],xmm1
+
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[336+r10],xmm2
+ pand xmm0,XMMWORD PTR[64+r12]
+ pand xmm1,XMMWORD PTR[80+r12]
+ pand xmm2,XMMWORD PTR[96+r12]
+ movdqa XMMWORD PTR[352+r10],xmm3
+ pand xmm3,XMMWORD PTR[112+r12]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-128))+r12]
+ movdqa xmm5,XMMWORD PTR[((-112))+r12]
+ movdqa xmm2,XMMWORD PTR[((-96))+r12]
+ pand xmm4,XMMWORD PTR[112+r10]
+ movdqa xmm3,XMMWORD PTR[((-80))+r12]
+ pand xmm5,XMMWORD PTR[128+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[144+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[160+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-64))+r12]
+ movdqa xmm5,XMMWORD PTR[((-48))+r12]
+ movdqa xmm2,XMMWORD PTR[((-32))+r12]
+ pand xmm4,XMMWORD PTR[176+r10]
+ movdqa xmm3,XMMWORD PTR[((-16))+r12]
+ pand xmm5,XMMWORD PTR[192+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[208+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[224+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[r12]
+ movdqa xmm5,XMMWORD PTR[16+r12]
+ movdqa xmm2,XMMWORD PTR[32+r12]
+ pand xmm4,XMMWORD PTR[240+r10]
+ movdqa xmm3,XMMWORD PTR[48+r12]
+ pand xmm5,XMMWORD PTR[256+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[272+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[288+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ por xmm0,xmm1
+ pshufd xmm1,xmm0,04eh
+ por xmm0,xmm1
+ lea r12,QWORD PTR[256+r12]
DB 102,72,15,126,195
- movq xmm0,QWORD PTR[96+r14]
+
mov QWORD PTR[((16+8))+rsp],r13
mov QWORD PTR[((56+8))+rsp],rdi
@@ -432,26 +650,10 @@ DB 102,72,15,126,195
mov r10,rax
mov rax,QWORD PTR[rcx]
- pand xmm2,xmm5
- pand xmm3,xmm6
- por xmm1,xmm2
-
imul rbp,r10
-
-
-
-
-
-
-
- lea r14,QWORD PTR[((64+8))+r11*8+rsp]
+ lea r14,QWORD PTR[((64+8))+rsp]
mov r11,rdx
- pand xmm0,xmm7
- por xmm1,xmm3
- lea r12,QWORD PTR[512+r12]
- por xmm0,xmm1
-
mul rbp
add r10,rax
mov rax,QWORD PTR[8+r9*1+rsi]
@@ -460,7 +662,7 @@ DB 102,72,15,126,195
mul rbx
add r11,rax
- mov rax,QWORD PTR[16+rcx]
+ mov rax,QWORD PTR[8+rcx]
adc rdx,0
mov r10,rdx
@@ -470,7 +672,7 @@ DB 102,72,15,126,195
adc rdx,0
add rdi,r11
lea r15,QWORD PTR[32+r9]
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
adc rdx,0
mov QWORD PTR[r14],rdi
mov r13,rdx
@@ -480,7 +682,7 @@ ALIGN 32
$L$1st4x::
mul rbx
add r10,rax
- mov rax,QWORD PTR[((-32))+rcx]
+ mov rax,QWORD PTR[((-16))+rcx]
lea r14,QWORD PTR[32+r14]
adc rdx,0
mov r11,rdx
@@ -496,7 +698,7 @@ $L$1st4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[((-16))+rcx]
+ mov rax,QWORD PTR[((-8))+rcx]
adc rdx,0
mov r10,rdx
@@ -526,7 +728,7 @@ $L$1st4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[16+rcx]
+ mov rax,QWORD PTR[8+rcx]
adc rdx,0
mov r10,rdx
@@ -535,7 +737,7 @@ $L$1st4x::
mov rax,QWORD PTR[16+r15*1+rsi]
adc rdx,0
add rdi,r11
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
adc rdx,0
mov QWORD PTR[r14],rdi
mov r13,rdx
@@ -545,7 +747,7 @@ $L$1st4x::
mul rbx
add r10,rax
- mov rax,QWORD PTR[((-32))+rcx]
+ mov rax,QWORD PTR[((-16))+rcx]
lea r14,QWORD PTR[32+r14]
adc rdx,0
mov r11,rdx
@@ -561,7 +763,7 @@ $L$1st4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[((-16))+rcx]
+ mov rax,QWORD PTR[((-8))+rcx]
adc rdx,0
mov r10,rdx
@@ -574,8 +776,7 @@ $L$1st4x::
mov QWORD PTR[((-16))+r14],rdi
mov r13,rdx
-DB 102,72,15,126,195
- lea rcx,QWORD PTR[r9*2+rcx]
+ lea rcx,QWORD PTR[r9*1+rcx]
xor rdi,rdi
add r13,r10
@@ -586,6 +787,63 @@ DB 102,72,15,126,195
ALIGN 32
$L$outer4x::
+ lea rdx,QWORD PTR[((16+128))+r14]
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD PTR[((-128))+r12]
+ movdqa xmm1,XMMWORD PTR[((-112))+r12]
+ movdqa xmm2,XMMWORD PTR[((-96))+r12]
+ movdqa xmm3,XMMWORD PTR[((-80))+r12]
+ pand xmm0,XMMWORD PTR[((-128))+rdx]
+ pand xmm1,XMMWORD PTR[((-112))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-96))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-80))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[((-64))+r12]
+ movdqa xmm1,XMMWORD PTR[((-48))+r12]
+ movdqa xmm2,XMMWORD PTR[((-32))+r12]
+ movdqa xmm3,XMMWORD PTR[((-16))+r12]
+ pand xmm0,XMMWORD PTR[((-64))+rdx]
+ pand xmm1,XMMWORD PTR[((-48))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-32))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-16))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[r12]
+ movdqa xmm1,XMMWORD PTR[16+r12]
+ movdqa xmm2,XMMWORD PTR[32+r12]
+ movdqa xmm3,XMMWORD PTR[48+r12]
+ pand xmm0,XMMWORD PTR[rdx]
+ pand xmm1,XMMWORD PTR[16+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[32+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[48+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[64+r12]
+ movdqa xmm1,XMMWORD PTR[80+r12]
+ movdqa xmm2,XMMWORD PTR[96+r12]
+ movdqa xmm3,XMMWORD PTR[112+r12]
+ pand xmm0,XMMWORD PTR[64+rdx]
+ pand xmm1,XMMWORD PTR[80+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[96+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[112+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+ pshufd xmm0,xmm4,04eh
+ por xmm0,xmm4
+ lea r12,QWORD PTR[256+r12]
+DB 102,72,15,126,195
+
mov r10,QWORD PTR[r9*1+r14]
mov rbp,r8
mul rbx
@@ -593,25 +851,11 @@ $L$outer4x::
mov rax,QWORD PTR[rcx]
adc rdx,0
- movq xmm0,QWORD PTR[((-96))+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[96+r12]
-
imul rbp,r10
-DB 067h
mov r11,rdx
mov QWORD PTR[r14],rdi
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
- por xmm0,xmm2
lea r14,QWORD PTR[r9*1+r14]
- lea r12,QWORD PTR[256+r12]
- por xmm0,xmm3
mul rbp
add r10,rax
@@ -621,7 +865,7 @@ DB 067h
mul rbx
add r11,rax
- mov rax,QWORD PTR[16+rcx]
+ mov rax,QWORD PTR[8+rcx]
adc rdx,0
add r11,QWORD PTR[8+r14]
adc rdx,0
@@ -633,7 +877,7 @@ DB 067h
adc rdx,0
add rdi,r11
lea r15,QWORD PTR[32+r9]
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
adc rdx,0
mov r13,rdx
jmp $L$inner4x
@@ -642,7 +886,7 @@ ALIGN 32
$L$inner4x::
mul rbx
add r10,rax
- mov rax,QWORD PTR[((-32))+rcx]
+ mov rax,QWORD PTR[((-16))+rcx]
adc rdx,0
add r10,QWORD PTR[16+r14]
lea r14,QWORD PTR[32+r14]
@@ -660,7 +904,7 @@ $L$inner4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[((-16))+rcx]
+ mov rax,QWORD PTR[((-8))+rcx]
adc rdx,0
add r11,QWORD PTR[((-8))+r14]
adc rdx,0
@@ -694,7 +938,7 @@ $L$inner4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[16+rcx]
+ mov rax,QWORD PTR[8+rcx]
adc rdx,0
add r11,QWORD PTR[8+r14]
adc rdx,0
@@ -705,7 +949,7 @@ $L$inner4x::
mov rax,QWORD PTR[16+r15*1+rsi]
adc rdx,0
add rdi,r11
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
adc rdx,0
mov QWORD PTR[((-8))+r14],r13
mov r13,rdx
@@ -715,7 +959,7 @@ $L$inner4x::
mul rbx
add r10,rax
- mov rax,QWORD PTR[((-32))+rcx]
+ mov rax,QWORD PTR[((-16))+rcx]
adc rdx,0
add r10,QWORD PTR[16+r14]
lea r14,QWORD PTR[32+r14]
@@ -734,7 +978,7 @@ $L$inner4x::
mul rbx
add r11,rax
mov rax,rbp
- mov rbp,QWORD PTR[((-16))+rcx]
+ mov rbp,QWORD PTR[((-8))+rcx]
adc rdx,0
add r11,QWORD PTR[((-8))+r14]
adc rdx,0
@@ -749,9 +993,8 @@ $L$inner4x::
mov QWORD PTR[((-24))+r14],r13
mov r13,rdx
-DB 102,72,15,126,195
mov QWORD PTR[((-16))+r14],rdi
- lea rcx,QWORD PTR[r9*2+rcx]
+ lea rcx,QWORD PTR[r9*1+rcx]
xor rdi,rdi
add r13,r10
@@ -762,16 +1005,23 @@ DB 102,72,15,126,195
cmp r12,QWORD PTR[((16+8))+rsp]
jb $L$outer4x
+ xor rax,rax
sub rbp,r13
adc r15,r15
or rdi,r15
- xor rdi,1
+ sub rax,rdi
lea rbx,QWORD PTR[r9*1+r14]
- lea rbp,QWORD PTR[rdi*8+rcx]
+ mov r12,QWORD PTR[rcx]
+ lea rbp,QWORD PTR[rcx]
mov rcx,r9
sar rcx,3+2
mov rdi,QWORD PTR[((56+8))+rsp]
- jmp $L$sqr4x_sub
+ dec r12
+ xor r10,r10
+ mov r13,QWORD PTR[8+rbp]
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+ jmp $L$sqr4x_sub_entry
mul4x_internal ENDP
PUBLIC bn_power5
@@ -790,8 +1040,8 @@ $L$SEH_begin_bn_power5::
mov r11d,DWORD PTR[((OPENSSL_ia32cap_P+8))]
- and r11d,080100h
- cmp r11d,080100h
+ and r11d,080108h
+ cmp r11d,080108h
je $L$powerx5_enter
mov rax,rsp
push rbx
@@ -800,12 +1050,9 @@ $L$SEH_begin_bn_power5::
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
- mov r10d,r9d
+
shl r9d,3
- shl r10d,3+2
+ lea r10d,DWORD PTR[r9*2+r9]
neg r9
mov r8,QWORD PTR[r8]
@@ -815,19 +1062,20 @@ $L$SEH_begin_bn_power5::
- lea r11,QWORD PTR[((-64))+r9*2+rsp]
- sub r11,rsi
+
+ lea r11,QWORD PTR[((-320))+r9*2+rsp]
+ sub r11,rdi
and r11,4095
cmp r10,r11
jb $L$pwr_sp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
jmp $L$pwr_sp_done
ALIGN 32
$L$pwr_sp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*2]
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea r10,QWORD PTR[((4096-320))+r9*2]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -855,10 +1103,15 @@ DB 102,73,15,110,218
DB 102,72,15,110,226
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
DB 102,72,15,126,209
DB 102,72,15,126,226
@@ -1405,9 +1658,9 @@ DB 067h
mov QWORD PTR[((-16))+rdi],rbx
mov QWORD PTR[((-8))+rdi],r8
DB 102,72,15,126,213
-sqr8x_reduction::
+__bn_sqr8x_reduction::
xor rax,rax
- lea rcx,QWORD PTR[r9*2+rbp]
+ lea rcx,QWORD PTR[rbp*1+r9]
lea rdx,QWORD PTR[((48+8))+r9*2+rsp]
mov QWORD PTR[((0+8))+rsp],rcx
lea rdi,QWORD PTR[((48+8))+r9*1+rsp]
@@ -1440,14 +1693,14 @@ DB 067h
ALIGN 32
$L$8x_reduce::
mul rbx
- mov rax,QWORD PTR[16+rbp]
+ mov rax,QWORD PTR[8+rbp]
neg r8
mov r8,rdx
adc r8,0
mul rbx
add r9,rax
- mov rax,QWORD PTR[32+rbp]
+ mov rax,QWORD PTR[16+rbp]
adc rdx,0
add r8,r9
mov QWORD PTR[((48-8+8))+rcx*8+rsp],rbx
@@ -1456,7 +1709,7 @@ $L$8x_reduce::
mul rbx
add r10,rax
- mov rax,QWORD PTR[48+rbp]
+ mov rax,QWORD PTR[24+rbp]
adc rdx,0
add r9,r10
mov rsi,QWORD PTR[((32+8))+rsp]
@@ -1465,7 +1718,7 @@ $L$8x_reduce::
mul rbx
add r11,rax
- mov rax,QWORD PTR[64+rbp]
+ mov rax,QWORD PTR[32+rbp]
adc rdx,0
imul rsi,r8
add r10,r11
@@ -1474,7 +1727,7 @@ $L$8x_reduce::
mul rbx
add r12,rax
- mov rax,QWORD PTR[80+rbp]
+ mov rax,QWORD PTR[40+rbp]
adc rdx,0
add r11,r12
mov r12,rdx
@@ -1482,7 +1735,7 @@ $L$8x_reduce::
mul rbx
add r13,rax
- mov rax,QWORD PTR[96+rbp]
+ mov rax,QWORD PTR[48+rbp]
adc rdx,0
add r12,r13
mov r13,rdx
@@ -1490,7 +1743,7 @@ $L$8x_reduce::
mul rbx
add r14,rax
- mov rax,QWORD PTR[112+rbp]
+ mov rax,QWORD PTR[56+rbp]
adc rdx,0
add r13,r14
mov r14,rdx
@@ -1508,7 +1761,7 @@ $L$8x_reduce::
dec ecx
jnz $L$8x_reduce
- lea rbp,QWORD PTR[128+rbp]
+ lea rbp,QWORD PTR[64+rbp]
xor rax,rax
mov rdx,QWORD PTR[((8+8))+rsp]
cmp rbp,QWORD PTR[((0+8))+rsp]
@@ -1534,14 +1787,14 @@ ALIGN 32
$L$8x_tail::
mul rbx
add r8,rax
- mov rax,QWORD PTR[16+rbp]
+ mov rax,QWORD PTR[8+rbp]
mov QWORD PTR[rdi],r8
mov r8,rdx
adc r8,0
mul rbx
add r9,rax
- mov rax,QWORD PTR[32+rbp]
+ mov rax,QWORD PTR[16+rbp]
adc rdx,0
add r8,r9
lea rdi,QWORD PTR[8+rdi]
@@ -1550,7 +1803,7 @@ $L$8x_tail::
mul rbx
add r10,rax
- mov rax,QWORD PTR[48+rbp]
+ mov rax,QWORD PTR[24+rbp]
adc rdx,0
add r9,r10
mov r10,rdx
@@ -1558,7 +1811,7 @@ $L$8x_tail::
mul rbx
add r11,rax
- mov rax,QWORD PTR[64+rbp]
+ mov rax,QWORD PTR[32+rbp]
adc rdx,0
add r10,r11
mov r11,rdx
@@ -1566,7 +1819,7 @@ $L$8x_tail::
mul rbx
add r12,rax
- mov rax,QWORD PTR[80+rbp]
+ mov rax,QWORD PTR[40+rbp]
adc rdx,0
add r11,r12
mov r12,rdx
@@ -1574,7 +1827,7 @@ $L$8x_tail::
mul rbx
add r13,rax
- mov rax,QWORD PTR[96+rbp]
+ mov rax,QWORD PTR[48+rbp]
adc rdx,0
add r12,r13
mov r13,rdx
@@ -1582,7 +1835,7 @@ $L$8x_tail::
mul rbx
add r14,rax
- mov rax,QWORD PTR[112+rbp]
+ mov rax,QWORD PTR[56+rbp]
adc rdx,0
add r13,r14
mov r14,rdx
@@ -1600,7 +1853,7 @@ $L$8x_tail::
dec ecx
jnz $L$8x_tail
- lea rbp,QWORD PTR[128+rbp]
+ lea rbp,QWORD PTR[64+rbp]
mov rdx,QWORD PTR[((8+8))+rsp]
cmp rbp,QWORD PTR[((0+8))+rsp]
jae $L$8x_tail_done
@@ -1646,7 +1899,7 @@ $L$8x_no_tail::
adc r14,QWORD PTR[48+rdi]
adc r15,QWORD PTR[56+rdi]
adc rax,0
- mov rcx,QWORD PTR[((-16))+rbp]
+ mov rcx,QWORD PTR[((-8))+rbp]
xor rsi,rsi
DB 102,72,15,126,213
@@ -1664,44 +1917,62 @@ DB 102,73,15,126,217
cmp rdi,rdx
jb $L$8x_reduction_loop
+ DB 0F3h,0C3h ;repret
+bn_sqr8x_internal ENDP
- sub rcx,r15
+ALIGN 32
+__bn_post4x_internal PROC PRIVATE
+ mov r12,QWORD PTR[rbp]
lea rbx,QWORD PTR[r9*1+rdi]
- adc rsi,rsi
mov rcx,r9
- or rax,rsi
DB 102,72,15,126,207
- xor rax,1
+ neg rax
DB 102,72,15,126,206
- lea rbp,QWORD PTR[rax*8+rbp]
sar rcx,3+2
- jmp $L$sqr4x_sub
+ dec r12
+ xor r10,r10
+ mov r13,QWORD PTR[8+rbp]
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+ jmp $L$sqr4x_sub_entry
-ALIGN 32
+ALIGN 16
$L$sqr4x_sub::
-DB 066h
- mov r12,QWORD PTR[rbx]
- mov r13,QWORD PTR[8+rbx]
- sbb r12,QWORD PTR[rbp]
- mov r14,QWORD PTR[16+rbx]
- sbb r13,QWORD PTR[16+rbp]
- mov r15,QWORD PTR[24+rbx]
- lea rbx,QWORD PTR[32+rbx]
- sbb r14,QWORD PTR[32+rbp]
+ mov r12,QWORD PTR[rbp]
+ mov r13,QWORD PTR[8+rbp]
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+$L$sqr4x_sub_entry::
+ lea rbp,QWORD PTR[32+rbp]
+ not r12
+ not r13
+ not r14
+ not r15
+ and r12,rax
+ and r13,rax
+ and r14,rax
+ and r15,rax
+
+ neg r10
+ adc r12,QWORD PTR[rbx]
+ adc r13,QWORD PTR[8+rbx]
+ adc r14,QWORD PTR[16+rbx]
+ adc r15,QWORD PTR[24+rbx]
mov QWORD PTR[rdi],r12
- sbb r15,QWORD PTR[48+rbp]
- lea rbp,QWORD PTR[64+rbp]
+ lea rbx,QWORD PTR[32+rbx]
mov QWORD PTR[8+rdi],r13
+ sbb r10,r10
mov QWORD PTR[16+rdi],r14
mov QWORD PTR[24+rdi],r15
lea rdi,QWORD PTR[32+rdi]
inc rcx
jnz $L$sqr4x_sub
+
mov r10,r9
neg r9
DB 0F3h,0C3h ;repret
-bn_sqr8x_internal ENDP
+__bn_post4x_internal ENDP
PUBLIC bn_from_montgomery
ALIGN 32
@@ -1735,13 +2006,9 @@ DB 067h
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
-DB 067h
- mov r10d,r9d
+
shl r9d,3
- shl r10d,3+2
+ lea r10,QWORD PTR[r9*2+r9]
neg r9
mov r8,QWORD PTR[r8]
@@ -1751,19 +2018,20 @@ DB 067h
- lea r11,QWORD PTR[((-64))+r9*2+rsp]
- sub r11,rsi
+
+ lea r11,QWORD PTR[((-320))+r9*2+rsp]
+ sub r11,rdi
and r11,4095
cmp r10,r11
jb $L$from_sp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
jmp $L$from_sp_done
ALIGN 32
$L$from_sp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*2]
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea r10,QWORD PTR[((4096-320))+r9*2]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -1815,12 +2083,13 @@ DB 067h
mov rbp,rcx
DB 102,73,15,110,218
mov r11d,DWORD PTR[((OPENSSL_ia32cap_P+8))]
- and r11d,080100h
- cmp r11d,080100h
+ and r11d,080108h
+ cmp r11d,080108h
jne $L$from_mont_nox
lea rdi,QWORD PTR[r9*1+rax]
- call sqrx8x_reduction
+ call __bn_sqrx8x_reduction
+ call __bn_postx4x_internal
pxor xmm0,xmm0
lea rax,QWORD PTR[48+rsp]
@@ -1829,7 +2098,8 @@ DB 102,73,15,110,218
ALIGN 32
$L$from_mont_nox::
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor xmm0,xmm0
lea rax,QWORD PTR[48+rsp]
@@ -1876,7 +2146,6 @@ $L$SEH_begin_bn_mulx4x_mont_gather5::
$L$mulx4x_enter::
-DB 067h
mov rax,rsp
push rbx
push rbp
@@ -1884,13 +2153,9 @@ DB 067h
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
-DB 067h
- mov r10d,r9d
+
shl r9d,3
- shl r10d,3+2
+ lea r10,QWORD PTR[r9*2+r9]
neg r9
mov r8,QWORD PTR[r8]
@@ -1901,19 +2166,20 @@ DB 067h
- lea r11,QWORD PTR[((-64))+r9*2+rsp]
- sub r11,rsi
+
+
+ lea r11,QWORD PTR[((-320))+r9*2+rsp]
+ sub r11,rdi
and r11,4095
cmp r10,r11
jb $L$mulx4xsp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
jmp $L$mulx4xsp_done
-ALIGN 32
$L$mulx4xsp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*2]
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea r10,QWORD PTR[((4096-320))+r9*2]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -1939,8 +2205,7 @@ $L$mulx4x_body::
mov rsi,QWORD PTR[40+rsp]
mov rax,1
- movaps xmm6,XMMWORD PTR[((-88))+rsi]
- movaps xmm7,XMMWORD PTR[((-72))+rsi]
+
mov r15,QWORD PTR[((-48))+rsi]
mov r14,QWORD PTR[((-40))+rsi]
mov r13,QWORD PTR[((-32))+rsi]
@@ -1958,63 +2223,150 @@ bn_mulx4x_mont_gather5 ENDP
ALIGN 32
mulx4x_internal PROC PRIVATE
-DB 04ch,089h,08ch,024h,008h,000h,000h,000h
-DB 067h
+ mov QWORD PTR[8+rsp],r9
+ mov r10,r9
neg r9
shl r9,5
- lea r13,QWORD PTR[256+r9*1+rdx]
+ neg r10
+ lea r13,QWORD PTR[128+r9*1+rdx]
shr r9,5+5
- mov r10d,DWORD PTR[56+rax]
+ movd xmm5,DWORD PTR[56+rax]
sub r9,1
+ lea rax,QWORD PTR[$L$inc]
mov QWORD PTR[((16+8))+rsp],r13
mov QWORD PTR[((24+8))+rsp],r9
mov QWORD PTR[((56+8))+rsp],rdi
- mov r11,r10
- shr r10,3
- and r11,7
- not r10
- lea rax,QWORD PTR[$L$magic_masks]
- and r10,3
- lea rdi,QWORD PTR[96+r11*8+rdx]
- movq xmm4,QWORD PTR[r10*8+rax]
- movq xmm5,QWORD PTR[8+r10*8+rax]
- add r11,7
- movq xmm6,QWORD PTR[16+r10*8+rax]
- movq xmm7,QWORD PTR[24+r10*8+rax]
- and r11,7
-
- movq xmm0,QWORD PTR[((-96))+rdi]
- lea rbx,QWORD PTR[256+rdi]
- movq xmm1,QWORD PTR[((-32))+rdi]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+rdi]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[96+rdi]
- pand xmm2,xmm6
- por xmm0,xmm1
- movq xmm1,QWORD PTR[((-96))+rbx]
- pand xmm3,xmm7
- por xmm0,xmm2
- movq xmm2,QWORD PTR[((-32))+rbx]
- por xmm0,xmm3
-DB 067h,067h
- pand xmm1,xmm4
- movq xmm3,QWORD PTR[32+rbx]
+ movdqa xmm0,XMMWORD PTR[rax]
+ movdqa xmm1,XMMWORD PTR[16+rax]
+ lea r10,QWORD PTR[((88-112))+r10*1+rsp]
+ lea rdi,QWORD PTR[128+rdx]
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+DB 067h
+ movdqa xmm2,xmm1
+DB 067h
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[112+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[128+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[144+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[160+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[176+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[192+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[208+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[224+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[240+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[256+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[272+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[288+r10],xmm3
+ movdqa xmm3,xmm4
+DB 067h
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[304+r10],xmm0
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[320+r10],xmm1
+
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[336+r10],xmm2
+
+ pand xmm0,XMMWORD PTR[64+rdi]
+ pand xmm1,XMMWORD PTR[80+rdi]
+ pand xmm2,XMMWORD PTR[96+rdi]
+ movdqa XMMWORD PTR[352+r10],xmm3
+ pand xmm3,XMMWORD PTR[112+rdi]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-128))+rdi]
+ movdqa xmm5,XMMWORD PTR[((-112))+rdi]
+ movdqa xmm2,XMMWORD PTR[((-96))+rdi]
+ pand xmm4,XMMWORD PTR[112+r10]
+ movdqa xmm3,XMMWORD PTR[((-80))+rdi]
+ pand xmm5,XMMWORD PTR[128+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[144+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[160+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-64))+rdi]
+ movdqa xmm5,XMMWORD PTR[((-48))+rdi]
+ movdqa xmm2,XMMWORD PTR[((-32))+rdi]
+ pand xmm4,XMMWORD PTR[176+r10]
+ movdqa xmm3,XMMWORD PTR[((-16))+rdi]
+ pand xmm5,XMMWORD PTR[192+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[208+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[224+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[rdi]
+ movdqa xmm5,XMMWORD PTR[16+rdi]
+ movdqa xmm2,XMMWORD PTR[32+rdi]
+ pand xmm4,XMMWORD PTR[240+r10]
+ movdqa xmm3,XMMWORD PTR[48+rdi]
+ pand xmm5,XMMWORD PTR[256+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[272+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[288+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ pxor xmm0,xmm1
+ pshufd xmm1,xmm0,04eh
+ por xmm0,xmm1
+ lea rdi,QWORD PTR[256+rdi]
DB 102,72,15,126,194
- movq xmm0,QWORD PTR[96+rbx]
- lea rdi,QWORD PTR[512+rdi]
- pand xmm2,xmm5
-DB 067h,067h
- pand xmm3,xmm6
-
-
-
-
-
-
-
- lea rbx,QWORD PTR[((64+32+8))+r11*8+rsp]
+ lea rbx,QWORD PTR[((64+32+8))+rsp]
mov r9,rdx
mulx rax,r8,QWORD PTR[rsi]
@@ -2030,37 +2382,31 @@ DB 067h,067h
xor rbp,rbp
mov rdx,r8
- por xmm1,xmm2
- pand xmm0,xmm7
- por xmm1,xmm3
mov QWORD PTR[((8+8))+rsp],rdi
- por xmm0,xmm1
-DB 048h,08dh,0b6h,020h,000h,000h,000h
+ lea rsi,QWORD PTR[32+rsi]
adcx r13,rax
adcx r14,rbp
mulx r10,rax,QWORD PTR[rcx]
adcx r15,rax
adox r10,r11
- mulx r11,rax,QWORD PTR[16+rcx]
+ mulx r11,rax,QWORD PTR[8+rcx]
adcx r10,rax
adox r11,r12
- mulx r12,rax,QWORD PTR[32+rcx]
+ mulx r12,rax,QWORD PTR[16+rcx]
mov rdi,QWORD PTR[((24+8))+rsp]
-DB 066h
mov QWORD PTR[((-32))+rbx],r10
adcx r11,rax
adox r12,r13
- mulx r15,rax,QWORD PTR[48+rcx]
-DB 067h,067h
+ mulx r15,rax,QWORD PTR[24+rcx]
mov rdx,r9
mov QWORD PTR[((-24))+rbx],r11
adcx r12,rax
adox r15,rbp
-DB 048h,08dh,089h,040h,000h,000h,000h
+ lea rcx,QWORD PTR[32+rcx]
mov QWORD PTR[((-16))+rbx],r12
-
+ jmp $L$mulx4x_1st
ALIGN 32
$L$mulx4x_1st::
@@ -2083,27 +2429,26 @@ DB 067h,067h
mulx r15,rax,QWORD PTR[rcx]
adcx r10,rax
adox r11,r15
- mulx r15,rax,QWORD PTR[16+rcx]
+ mulx r15,rax,QWORD PTR[8+rcx]
adcx r11,rax
adox r12,r15
- mulx r15,rax,QWORD PTR[32+rcx]
+ mulx r15,rax,QWORD PTR[16+rcx]
mov QWORD PTR[((-40))+rbx],r10
adcx r12,rax
mov QWORD PTR[((-32))+rbx],r11
adox r13,r15
- mulx r15,rax,QWORD PTR[48+rcx]
+ mulx r15,rax,QWORD PTR[24+rcx]
mov rdx,r9
mov QWORD PTR[((-24))+rbx],r12
adcx r13,rax
adox r15,rbp
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
mov QWORD PTR[((-16))+rbx],r13
dec rdi
jnz $L$mulx4x_1st
mov rax,QWORD PTR[8+rsp]
-DB 102,72,15,126,194
adc r15,rbp
lea rsi,QWORD PTR[rax*1+rsi]
add r14,r15
@@ -2114,6 +2459,64 @@ DB 102,72,15,126,194
ALIGN 32
$L$mulx4x_outer::
+ lea r10,QWORD PTR[((16-256))+rbx]
+ pxor xmm4,xmm4
+DB 067h,067h
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD PTR[((-128))+rdi]
+ movdqa xmm1,XMMWORD PTR[((-112))+rdi]
+ movdqa xmm2,XMMWORD PTR[((-96))+rdi]
+ pand xmm0,XMMWORD PTR[256+r10]
+ movdqa xmm3,XMMWORD PTR[((-80))+rdi]
+ pand xmm1,XMMWORD PTR[272+r10]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[288+r10]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[304+r10]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[((-64))+rdi]
+ movdqa xmm1,XMMWORD PTR[((-48))+rdi]
+ movdqa xmm2,XMMWORD PTR[((-32))+rdi]
+ pand xmm0,XMMWORD PTR[320+r10]
+ movdqa xmm3,XMMWORD PTR[((-16))+rdi]
+ pand xmm1,XMMWORD PTR[336+r10]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[352+r10]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[368+r10]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[rdi]
+ movdqa xmm1,XMMWORD PTR[16+rdi]
+ movdqa xmm2,XMMWORD PTR[32+rdi]
+ pand xmm0,XMMWORD PTR[384+r10]
+ movdqa xmm3,XMMWORD PTR[48+rdi]
+ pand xmm1,XMMWORD PTR[400+r10]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[416+r10]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[432+r10]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[64+rdi]
+ movdqa xmm1,XMMWORD PTR[80+rdi]
+ movdqa xmm2,XMMWORD PTR[96+rdi]
+ pand xmm0,XMMWORD PTR[448+r10]
+ movdqa xmm3,XMMWORD PTR[112+rdi]
+ pand xmm1,XMMWORD PTR[464+r10]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[480+r10]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[496+r10]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+ pshufd xmm0,xmm4,04eh
+ por xmm0,xmm4
+ lea rdi,QWORD PTR[256+rdi]
+DB 102,72,15,126,194
+
mov QWORD PTR[rbx],rbp
lea rbx,QWORD PTR[32+rax*1+rbx]
mulx r11,r8,QWORD PTR[rsi]
@@ -2128,54 +2531,37 @@ $L$mulx4x_outer::
mulx r14,rdx,QWORD PTR[24+rsi]
adox r12,QWORD PTR[((-16))+rbx]
adcx r13,rdx
- lea rcx,QWORD PTR[rax*2+rcx]
+ lea rcx,QWORD PTR[rax*1+rcx]
lea rsi,QWORD PTR[32+rsi]
adox r13,QWORD PTR[((-8))+rbx]
adcx r14,rbp
adox r14,rbp
-DB 067h
mov r15,r8
imul r8,QWORD PTR[((32+8))+rsp]
- movq xmm0,QWORD PTR[((-96))+rdi]
-DB 067h,067h
mov rdx,r8
- movq xmm1,QWORD PTR[((-32))+rdi]
-DB 067h
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+rdi]
-DB 067h
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[96+rdi]
- add rdi,256
-DB 067h
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
xor rbp,rbp
mov QWORD PTR[((8+8))+rsp],rdi
mulx r10,rax,QWORD PTR[rcx]
adcx r15,rax
adox r10,r11
- mulx r11,rax,QWORD PTR[16+rcx]
+ mulx r11,rax,QWORD PTR[8+rcx]
adcx r10,rax
adox r11,r12
- mulx r12,rax,QWORD PTR[32+rcx]
+ mulx r12,rax,QWORD PTR[16+rcx]
adcx r11,rax
adox r12,r13
- mulx r15,rax,QWORD PTR[48+rcx]
+ mulx r15,rax,QWORD PTR[24+rcx]
mov rdx,r9
- por xmm0,xmm2
mov rdi,QWORD PTR[((24+8))+rsp]
mov QWORD PTR[((-32))+rbx],r10
- por xmm0,xmm3
adcx r12,rax
mov QWORD PTR[((-24))+rbx],r11
adox r15,rbp
mov QWORD PTR[((-16))+rbx],r12
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
jmp $L$mulx4x_inner
ALIGN 32
@@ -2203,17 +2589,17 @@ $L$mulx4x_inner::
mulx r15,rax,QWORD PTR[rcx]
adcx r10,rax
adox r11,r15
- mulx r15,rax,QWORD PTR[16+rcx]
+ mulx r15,rax,QWORD PTR[8+rcx]
adcx r11,rax
adox r12,r15
- mulx r15,rax,QWORD PTR[32+rcx]
+ mulx r15,rax,QWORD PTR[16+rcx]
mov QWORD PTR[((-40))+rbx],r10
adcx r12,rax
adox r13,r15
mov QWORD PTR[((-32))+rbx],r11
- mulx r15,rax,QWORD PTR[48+rcx]
+ mulx r15,rax,QWORD PTR[24+rcx]
mov rdx,r9
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
mov QWORD PTR[((-24))+rbx],r12
adcx r13,rax
adox r15,rbp
@@ -2223,7 +2609,6 @@ $L$mulx4x_inner::
jnz $L$mulx4x_inner
mov rax,QWORD PTR[((0+8))+rsp]
-DB 102,72,15,126,194
adc r15,rbp
sub rdi,QWORD PTR[rbx]
mov rdi,QWORD PTR[((8+8))+rsp]
@@ -2236,20 +2621,26 @@ DB 102,72,15,126,194
cmp rdi,r10
jb $L$mulx4x_outer
- mov r10,QWORD PTR[((-16))+rcx]
+ mov r10,QWORD PTR[((-8))+rcx]
+ mov r8,rbp
+ mov r12,QWORD PTR[rax*1+rcx]
+ lea rbp,QWORD PTR[rax*1+rcx]
+ mov rcx,rax
+ lea rdi,QWORD PTR[rax*1+rbx]
+ xor eax,eax
xor r15,r15
sub r10,r14
adc r15,r15
- or rbp,r15
- xor rbp,1
- lea rdi,QWORD PTR[rax*1+rbx]
- lea rcx,QWORD PTR[rax*2+rcx]
-DB 067h,067h
- sar rax,3+2
- lea rbp,QWORD PTR[rbp*8+rcx]
+ or r8,r15
+ sar rcx,3+2
+ sub rax,r8
mov rdx,QWORD PTR[((56+8))+rsp]
- mov rcx,rax
- jmp $L$sqrx4x_sub
+ dec r12
+ mov r13,QWORD PTR[8+rbp]
+ xor r8,r8
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+ jmp $L$sqrx4x_sub_entry
mulx4x_internal ENDP
ALIGN 32
@@ -2267,7 +2658,6 @@ $L$SEH_begin_bn_powerx5::
$L$powerx5_enter::
-DB 067h
mov rax,rsp
push rbx
push rbp
@@ -2275,13 +2665,9 @@ DB 067h
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
-DB 067h
- mov r10d,r9d
+
shl r9d,3
- shl r10d,3+2
+ lea r10,QWORD PTR[r9*2+r9]
neg r9
mov r8,QWORD PTR[r8]
@@ -2291,19 +2677,20 @@ DB 067h
- lea r11,QWORD PTR[((-64))+r9*2+rsp]
- sub r11,rsi
+
+ lea r11,QWORD PTR[((-320))+r9*2+rsp]
+ sub r11,rdi
and r11,4095
cmp r10,r11
jb $L$pwrx_sp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
jmp $L$pwrx_sp_done
ALIGN 32
$L$pwrx_sp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*2]
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea r10,QWORD PTR[((4096-320))+r9*2]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -2334,10 +2721,15 @@ DB 102,72,15,110,226
$L$powerx5_body::
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
mov r9,r10
mov rdi,rsi
@@ -2349,8 +2741,7 @@ DB 102,72,15,126,226
mov rsi,QWORD PTR[40+rsp]
mov rax,1
- movaps xmm6,XMMWORD PTR[((-88))+rsi]
- movaps xmm7,XMMWORD PTR[((-72))+rsi]
+
mov r15,QWORD PTR[((-48))+rsi]
mov r14,QWORD PTR[((-40))+rsi]
mov r13,QWORD PTR[((-32))+rsi]
@@ -2766,11 +3157,11 @@ $L$sqrx4x_shift_n_add_break::
mov QWORD PTR[56+rdi],rbx
lea rdi,QWORD PTR[64+rdi]
DB 102,72,15,126,213
-sqrx8x_reduction::
+__bn_sqrx8x_reduction::
xor eax,eax
mov rbx,QWORD PTR[((32+8))+rsp]
mov rdx,QWORD PTR[((48+8))+rsp]
- lea rcx,QWORD PTR[((-128))+r9*2+rbp]
+ lea rcx,QWORD PTR[((-64))+r9*1+rbp]
mov QWORD PTR[((0+8))+rsp],rcx
mov QWORD PTR[((8+8))+rsp],rdi
@@ -2803,19 +3194,19 @@ $L$sqrx8x_reduce::
adcx rax,rbx
adox r8,r9
- mulx r9,rbx,QWORD PTR[16+rbp]
+ mulx r9,rbx,QWORD PTR[8+rbp]
adcx r8,rbx
adox r9,r10
- mulx r10,rbx,QWORD PTR[32+rbp]
+ mulx r10,rbx,QWORD PTR[16+rbp]
adcx r9,rbx
adox r10,r11
- mulx r11,rbx,QWORD PTR[48+rbp]
+ mulx r11,rbx,QWORD PTR[24+rbp]
adcx r10,rbx
adox r11,r12
-DB 0c4h,062h,0e3h,0f6h,0a5h,040h,000h,000h,000h
+DB 0c4h,062h,0e3h,0f6h,0a5h,020h,000h,000h,000h
mov rax,rdx
mov rdx,r8
adcx r11,rbx
@@ -2825,15 +3216,15 @@ DB 0c4h,062h,0e3h,0f6h,0a5h,040h,000h,000h,000h
mov rdx,rax
mov QWORD PTR[((64+48+8))+rcx*8+rsp],rax
- mulx r13,rax,QWORD PTR[80+rbp]
+ mulx r13,rax,QWORD PTR[40+rbp]
adcx r12,rax
adox r13,r14
- mulx r14,rax,QWORD PTR[96+rbp]
+ mulx r14,rax,QWORD PTR[48+rbp]
adcx r13,rax
adox r14,r15
- mulx r15,rax,QWORD PTR[112+rbp]
+ mulx r15,rax,QWORD PTR[56+rbp]
mov rdx,rbx
adcx r14,rax
adox r15,rsi
@@ -2849,7 +3240,7 @@ DB 067h,067h,067h
mov rdx,QWORD PTR[((48+8))+rsp]
add r8,QWORD PTR[rdi]
- lea rbp,QWORD PTR[128+rbp]
+ lea rbp,QWORD PTR[64+rbp]
mov rcx,-8
adcx r9,QWORD PTR[8+rdi]
adcx r10,QWORD PTR[16+rdi]
@@ -2872,31 +3263,31 @@ $L$sqrx8x_tail::
adcx rbx,rax
adox r8,r9
- mulx r9,rax,QWORD PTR[16+rbp]
+ mulx r9,rax,QWORD PTR[8+rbp]
adcx r8,rax
adox r9,r10
- mulx r10,rax,QWORD PTR[32+rbp]
+ mulx r10,rax,QWORD PTR[16+rbp]
adcx r9,rax
adox r10,r11
- mulx r11,rax,QWORD PTR[48+rbp]
+ mulx r11,rax,QWORD PTR[24+rbp]
adcx r10,rax
adox r11,r12
-DB 0c4h,062h,0fbh,0f6h,0a5h,040h,000h,000h,000h
+DB 0c4h,062h,0fbh,0f6h,0a5h,020h,000h,000h,000h
adcx r11,rax
adox r12,r13
- mulx r13,rax,QWORD PTR[80+rbp]
+ mulx r13,rax,QWORD PTR[40+rbp]
adcx r12,rax
adox r13,r14
- mulx r14,rax,QWORD PTR[96+rbp]
+ mulx r14,rax,QWORD PTR[48+rbp]
adcx r13,rax
adox r14,r15
- mulx r15,rax,QWORD PTR[112+rbp]
+ mulx r15,rax,QWORD PTR[56+rbp]
mov rdx,QWORD PTR[((72+48+8))+rcx*8+rsp]
adcx r14,rax
adox r15,rsi
@@ -2912,7 +3303,7 @@ DB 0c4h,062h,0fbh,0f6h,0a5h,040h,000h,000h,000h
sub rsi,QWORD PTR[((16+8))+rsp]
mov rdx,QWORD PTR[((48+8))+rsp]
- lea rbp,QWORD PTR[128+rbp]
+ lea rbp,QWORD PTR[64+rbp]
adc r8,QWORD PTR[rdi]
adc r9,QWORD PTR[8+rdi]
adc r10,QWORD PTR[16+rdi]
@@ -2948,7 +3339,7 @@ $L$sqrx8x_no_tail::
adc r8,QWORD PTR[rdi]
DB 102,72,15,126,217
adc r9,QWORD PTR[8+rdi]
- mov rsi,QWORD PTR[112+rbp]
+ mov rsi,QWORD PTR[56+rbp]
DB 102,72,15,126,213
adc r10,QWORD PTR[16+rdi]
adc r11,QWORD PTR[24+rdi]
@@ -2974,45 +3365,58 @@ DB 102,72,15,126,213
lea rdi,QWORD PTR[64+rcx*1+rdi]
cmp r8,QWORD PTR[((8+8))+rsp]
jb $L$sqrx8x_reduction_loop
- xor ebx,ebx
- sub rsi,r15
- adc rbx,rbx
+ DB 0F3h,0C3h ;repret
+bn_sqrx8x_internal ENDP
+ALIGN 32
+__bn_postx4x_internal::
+ mov r12,QWORD PTR[rbp]
mov r10,rcx
- or rax,rbx
mov r9,rcx
- xor rax,1
+ neg rax
sar rcx,3+2
- lea rbp,QWORD PTR[rax*8+rbp]
DB 102,72,15,126,202
DB 102,72,15,126,206
- jmp $L$sqrx4x_sub
+ dec r12
+ mov r13,QWORD PTR[8+rbp]
+ xor r8,r8
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+ jmp $L$sqrx4x_sub_entry
-ALIGN 32
+ALIGN 16
$L$sqrx4x_sub::
-DB 066h
- mov r12,QWORD PTR[rdi]
- mov r13,QWORD PTR[8+rdi]
- sbb r12,QWORD PTR[rbp]
- mov r14,QWORD PTR[16+rdi]
- sbb r13,QWORD PTR[16+rbp]
- mov r15,QWORD PTR[24+rdi]
- lea rdi,QWORD PTR[32+rdi]
- sbb r14,QWORD PTR[32+rbp]
+ mov r12,QWORD PTR[rbp]
+ mov r13,QWORD PTR[8+rbp]
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+$L$sqrx4x_sub_entry::
+ andn r12,r12,rax
+ lea rbp,QWORD PTR[32+rbp]
+ andn r13,r13,rax
+ andn r14,r14,rax
+ andn r15,r15,rax
+
+ neg r8
+ adc r12,QWORD PTR[rdi]
+ adc r13,QWORD PTR[8+rdi]
+ adc r14,QWORD PTR[16+rdi]
+ adc r15,QWORD PTR[24+rdi]
mov QWORD PTR[rdx],r12
- sbb r15,QWORD PTR[48+rbp]
- lea rbp,QWORD PTR[64+rbp]
+ lea rdi,QWORD PTR[32+rdi]
mov QWORD PTR[8+rdx],r13
+ sbb r8,r8
mov QWORD PTR[16+rdx],r14
mov QWORD PTR[24+rdx],r15
lea rdx,QWORD PTR[32+rdx]
inc rcx
jnz $L$sqrx4x_sub
+
neg r9
DB 0F3h,0C3h ;repret
-bn_sqrx8x_internal ENDP
+
PUBLIC bn_get_bits5
ALIGN 16
@@ -3052,55 +3456,171 @@ bn_scatter5 ENDP
PUBLIC bn_gather5
-ALIGN 16
+ALIGN 32
bn_gather5 PROC PUBLIC
$L$SEH_begin_bn_gather5::
-DB 048h,083h,0ech,028h
-DB 00fh,029h,034h,024h
-DB 00fh,029h,07ch,024h,010h
- mov r11d,r9d
- shr r9d,3
- and r11,7
- not r9d
- lea rax,QWORD PTR[$L$magic_masks]
- and r9d,3
- lea r8,QWORD PTR[128+r11*8+r8]
- movq xmm4,QWORD PTR[r9*8+rax]
- movq xmm5,QWORD PTR[8+r9*8+rax]
- movq xmm6,QWORD PTR[16+r9*8+rax]
- movq xmm7,QWORD PTR[24+r9*8+rax]
+DB 04ch,08dh,014h,024h
+DB 048h,081h,0ech,008h,001h,000h,000h
+ lea rax,QWORD PTR[$L$inc]
+ and rsp,-16
+
+ movd xmm5,r9d
+ movdqa xmm0,XMMWORD PTR[rax]
+ movdqa xmm1,XMMWORD PTR[16+rax]
+ lea r11,QWORD PTR[128+r8]
+ lea rax,QWORD PTR[128+rsp]
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[(-128)+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[(-112)+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[(-96)+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[(-80)+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[(-64)+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[(-48)+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[(-32)+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[(-16)+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[16+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[32+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[48+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[64+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[80+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[96+rax],xmm2
+ movdqa xmm2,xmm4
+ movdqa XMMWORD PTR[112+rax],xmm3
jmp $L$gather
-ALIGN 16
-$L$gather::
- movq xmm0,QWORD PTR[((-128))+r8]
- movq xmm1,QWORD PTR[((-64))+r8]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[r8]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[64+r8]
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
-DB 067h,067h
- por xmm0,xmm2
- lea r8,QWORD PTR[256+r8]
- por xmm0,xmm3
+ALIGN 32
+$L$gather::
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD PTR[((-128))+r11]
+ movdqa xmm1,XMMWORD PTR[((-112))+r11]
+ movdqa xmm2,XMMWORD PTR[((-96))+r11]
+ pand xmm0,XMMWORD PTR[((-128))+rax]
+ movdqa xmm3,XMMWORD PTR[((-80))+r11]
+ pand xmm1,XMMWORD PTR[((-112))+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-96))+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-80))+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[((-64))+r11]
+ movdqa xmm1,XMMWORD PTR[((-48))+r11]
+ movdqa xmm2,XMMWORD PTR[((-32))+r11]
+ pand xmm0,XMMWORD PTR[((-64))+rax]
+ movdqa xmm3,XMMWORD PTR[((-16))+r11]
+ pand xmm1,XMMWORD PTR[((-48))+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-32))+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-16))+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[r11]
+ movdqa xmm1,XMMWORD PTR[16+r11]
+ movdqa xmm2,XMMWORD PTR[32+r11]
+ pand xmm0,XMMWORD PTR[rax]
+ movdqa xmm3,XMMWORD PTR[48+r11]
+ pand xmm1,XMMWORD PTR[16+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[32+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[48+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[64+r11]
+ movdqa xmm1,XMMWORD PTR[80+r11]
+ movdqa xmm2,XMMWORD PTR[96+r11]
+ pand xmm0,XMMWORD PTR[64+rax]
+ movdqa xmm3,XMMWORD PTR[112+r11]
+ pand xmm1,XMMWORD PTR[80+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[96+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[112+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+ lea r11,QWORD PTR[256+r11]
+ pshufd xmm0,xmm4,04eh
+ por xmm0,xmm4
movq QWORD PTR[rcx],xmm0
lea rcx,QWORD PTR[8+rcx]
sub edx,1
jnz $L$gather
- movaps xmm6,XMMWORD PTR[rsp]
- movaps xmm7,XMMWORD PTR[16+rsp]
- lea rsp,QWORD PTR[40+rsp]
+
+ lea rsp,QWORD PTR[r10]
DB 0F3h,0C3h ;repret
$L$SEH_end_bn_gather5::
bn_gather5 ENDP
ALIGN 64
-$L$magic_masks::
- DD 0,0,0,0,0,0,-1,-1
- DD 0,0,0,0,0,0,0,0
+$L$inc::
+ DD 0,0,1,1
+ DD 2,2,2,2
DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
@@ -3142,19 +3662,16 @@ mul_handler PROC PRIVATE
lea r10,QWORD PTR[$L$mul_epilogue]
cmp rbx,r10
- jb $L$body_40
+ ja $L$body_40
mov r10,QWORD PTR[192+r8]
mov rax,QWORD PTR[8+r10*8+rax]
+
jmp $L$body_proceed
$L$body_40::
mov rax,QWORD PTR[40+rax]
$L$body_proceed::
-
- movaps xmm0,XMMWORD PTR[((-88))+rax]
- movaps xmm1,XMMWORD PTR[((-72))+rax]
-
mov rbx,QWORD PTR[((-8))+rax]
mov rbp,QWORD PTR[((-16))+rax]
mov r12,QWORD PTR[((-24))+rax]
@@ -3167,8 +3684,6 @@ $L$body_proceed::
mov QWORD PTR[224+r8],r13
mov QWORD PTR[232+r8],r14
mov QWORD PTR[240+r8],r15
- movups XMMWORD PTR[512+r8],xmm0
- movups XMMWORD PTR[528+r8],xmm1
$L$common_seh_tail::
mov rdi,QWORD PTR[8+rax]
@@ -3273,10 +3788,9 @@ DB 9,0,0,0
DD imagerel $L$powerx5_body,imagerel $L$powerx5_epilogue
ALIGN 8
$L$SEH_info_bn_gather5::
-DB 001h,00dh,005h,000h
-DB 00dh,078h,001h,000h
-DB 008h,068h,000h,000h
-DB 004h,042h,000h,000h
+DB 001h,00bh,003h,00ah
+DB 00bh,001h,021h,000h
+DB 004h,0a3h,000h,000h
ALIGN 8
.xdata ENDS
diff --git a/deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm b/deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm
index 3fa69816b5..f38d253c16 100644
--- a/deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm
+++ b/deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm
@@ -1813,6 +1813,7 @@ $L$SEH_begin_ecp_nistz256_point_double::
push r15
sub rsp,32*5+8
+$L$point_double_shortcutq::
movdqu xmm0,XMMWORD PTR[rsi]
mov rbx,rsi
movdqu xmm1,XMMWORD PTR[16+rsi]
@@ -2091,6 +2092,7 @@ DB 102,72,15,110,199
mov r14,QWORD PTR[((64+8))+rbx]
mov r15,QWORD PTR[((64+16))+rbx]
mov r8,QWORD PTR[((64+24))+rbx]
+DB 102,72,15,110,203
lea rsi,QWORD PTR[((64-0))+rbx]
lea rdi,QWORD PTR[32+rsp]
@@ -2182,7 +2184,7 @@ DB 102,73,15,126,217
test r8,r8
jnz $L$add_proceedq
test r9,r9
- jz $L$add_proceedq
+ jz $L$add_doubleq
DB 102,72,15,126,199
pxor xmm0,xmm0
@@ -2195,6 +2197,13 @@ DB 102,72,15,126,199
jmp $L$add_doneq
ALIGN 32
+$L$add_doubleq::
+DB 102,72,15,126,206
+DB 102,72,15,126,199
+ add rsp,416
+ jmp $L$point_double_shortcutq
+
+ALIGN 32
$L$add_proceedq::
mov rax,QWORD PTR[((0+64))+rsp]
mov r14,QWORD PTR[((8+64))+rsp]
@@ -2876,6 +2885,7 @@ $L$point_doublex::
push r15
sub rsp,32*5+8
+$L$point_double_shortcutx::
movdqu xmm0,XMMWORD PTR[rsi]
mov rbx,rsi
movdqu xmm1,XMMWORD PTR[16+rsi]
@@ -3150,6 +3160,7 @@ DB 102,72,15,110,199
mov r14,QWORD PTR[((64+8))+rbx]
mov r15,QWORD PTR[((64+16))+rbx]
mov r8,QWORD PTR[((64+24))+rbx]
+DB 102,72,15,110,203
lea rsi,QWORD PTR[((64-128))+rbx]
lea rdi,QWORD PTR[32+rsp]
@@ -3241,7 +3252,7 @@ DB 102,73,15,126,217
test r8,r8
jnz $L$add_proceedx
test r9,r9
- jz $L$add_proceedx
+ jz $L$add_doublex
DB 102,72,15,126,199
pxor xmm0,xmm0
@@ -3254,6 +3265,13 @@ DB 102,72,15,126,199
jmp $L$add_donex
ALIGN 32
+$L$add_doublex::
+DB 102,72,15,126,206
+DB 102,72,15,126,199
+ add rsp,416
+ jmp $L$point_double_shortcutx
+
+ALIGN 32
$L$add_proceedx::
mov rdx,QWORD PTR[((0+64))+rsp]
mov r14,QWORD PTR[((8+64))+rsp]
diff --git a/deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm b/deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm
index 0626d8f782..6552f7d017 100644
--- a/deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm
+++ b/deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm
@@ -412,7 +412,7 @@ $L$dec_no_key_aliasing::
vzeroupper
movaps xmm6,XMMWORD PTR[((-216))+rax]
- movaps xmm7,XMMWORD PTR[((-216))+rax]
+ movaps xmm7,XMMWORD PTR[((-200))+rax]
movaps xmm8,XMMWORD PTR[((-184))+rax]
movaps xmm9,XMMWORD PTR[((-168))+rax]
movaps xmm10,XMMWORD PTR[((-152))+rax]
diff --git a/deps/openssl/asm/x86-elf-gas/sha/sha1-586.s b/deps/openssl/asm/x86-elf-gas/sha/sha1-586.s
index 8a9ef772b7..816b1d55bb 100644
--- a/deps/openssl/asm/x86-elf-gas/sha/sha1-586.s
+++ b/deps/openssl/asm/x86-elf-gas/sha/sha1-586.s
@@ -23,11 +23,6 @@ sha1_block_data_order:
jz .L001x86
testl $536870912,%ecx
jnz .Lshaext_shortcut
- andl $268435456,%edx
- andl $1073741824,%eax
- orl %edx,%eax
- cmpl $1342177280,%eax
- je .Lavx_shortcut
jmp .Lssse3_shortcut
.align 16
.L001x86:
@@ -2785,1176 +2780,6 @@ _sha1_block_data_order_ssse3:
popl %ebp
ret
.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
-.type _sha1_block_data_order_avx,@function
-.align 16
-_sha1_block_data_order_avx:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- call .L008pic_point
-.L008pic_point:
- popl %ebp
- leal .LK_XX_XX-.L008pic_point(%ebp),%ebp
-.Lavx_shortcut:
- vzeroall
- vmovdqa (%ebp),%xmm7
- vmovdqa 16(%ebp),%xmm0
- vmovdqa 32(%ebp),%xmm1
- vmovdqa 48(%ebp),%xmm2
- vmovdqa 64(%ebp),%xmm6
- movl 20(%esp),%edi
- movl 24(%esp),%ebp
- movl 28(%esp),%edx
- movl %esp,%esi
- subl $208,%esp
- andl $-64,%esp
- vmovdqa %xmm0,112(%esp)
- vmovdqa %xmm1,128(%esp)
- vmovdqa %xmm2,144(%esp)
- shll $6,%edx
- vmovdqa %xmm7,160(%esp)
- addl %ebp,%edx
- vmovdqa %xmm6,176(%esp)
- addl $64,%ebp
- movl %edi,192(%esp)
- movl %ebp,196(%esp)
- movl %edx,200(%esp)
- movl %esi,204(%esp)
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- movl 16(%edi),%edi
- movl %ebx,%esi
- vmovdqu -64(%ebp),%xmm0
- vmovdqu -48(%ebp),%xmm1
- vmovdqu -32(%ebp),%xmm2
- vmovdqu -16(%ebp),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vmovdqa %xmm7,96(%esp)
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm7,%xmm0,%xmm4
- vpaddd %xmm7,%xmm1,%xmm5
- vpaddd %xmm7,%xmm2,%xmm6
- vmovdqa %xmm4,(%esp)
- movl %ecx,%ebp
- vmovdqa %xmm5,16(%esp)
- xorl %edx,%ebp
- vmovdqa %xmm6,32(%esp)
- andl %ebp,%esi
- jmp .L009loop
-.align 16
-.L009loop:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%ebp
- addl (%esp),%edi
- vpaddd %xmm3,%xmm7,%xmm7
- vmovdqa %xmm0,64(%esp)
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm6
- addl %esi,%edi
- andl %ebx,%ebp
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%edi
- vpxor %xmm2,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%ebp
- vmovdqa %xmm7,48(%esp)
- movl %edi,%esi
- addl 4(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- addl %ebp,%edx
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm6
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm0
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%ebp
- addl 8(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm0,%xmm7
- vpor %xmm6,%xmm4,%xmm4
- addl %esi,%ecx
- andl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- vpslld $2,%xmm0,%xmm0
- shrdl $7,%edx,%edx
- xorl %eax,%ebp
- vpxor %xmm7,%xmm4,%xmm4
- movl %ecx,%esi
- addl 12(%esp),%ebx
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vpxor %xmm0,%xmm4,%xmm4
- addl %ebp,%ebx
- andl %edx,%esi
- vmovdqa 96(%esp),%xmm0
- xorl %edi,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%ebp
- addl 16(%esp),%eax
- vpaddd %xmm4,%xmm0,%xmm0
- vmovdqa %xmm1,80(%esp)
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm7
- addl %esi,%eax
- andl %ecx,%ebp
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- xorl %edx,%ebp
- vmovdqa %xmm0,(%esp)
- movl %eax,%esi
- addl 20(%esp),%edi
- vpxor %xmm7,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %ebp,%edi
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm7
- xorl %ecx,%ebx
- addl %eax,%edi
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm1
- vpaddd %xmm5,%xmm5,%xmm5
- movl %edi,%ebp
- addl 24(%esp),%edx
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vpsrld $30,%xmm1,%xmm0
- vpor %xmm7,%xmm5,%xmm5
- addl %esi,%edx
- andl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- vpslld $2,%xmm1,%xmm1
- shrdl $7,%edi,%edi
- xorl %ebx,%ebp
- vpxor %xmm0,%xmm5,%xmm5
- movl %edx,%esi
- addl 28(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpxor %xmm1,%xmm5,%xmm5
- addl %ebp,%ecx
- andl %edi,%esi
- vmovdqa 112(%esp),%xmm1
- xorl %eax,%edi
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%ebp
- addl 32(%esp),%ebx
- vpaddd %xmm5,%xmm1,%xmm1
- vmovdqa %xmm2,96(%esp)
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm0
- addl %esi,%ebx
- andl %edx,%ebp
- vpxor %xmm2,%xmm6,%xmm6
- xorl %edi,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%ecx,%ecx
- xorl %edi,%ebp
- vmovdqa %xmm1,16(%esp)
- movl %ebx,%esi
- addl 36(%esp),%eax
- vpxor %xmm0,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm0
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm2
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%ebp
- addl 40(%esp),%edi
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm1
- vpor %xmm0,%xmm6,%xmm6
- addl %esi,%edi
- andl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- vpslld $2,%xmm2,%xmm2
- vmovdqa 64(%esp),%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%ebp
- vpxor %xmm1,%xmm6,%xmm6
- movl %edi,%esi
- addl 44(%esp),%edx
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vpxor %xmm2,%xmm6,%xmm6
- addl %ebp,%edx
- andl %eax,%esi
- vmovdqa 112(%esp),%xmm2
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%ebp
- addl 48(%esp),%ecx
- vpaddd %xmm6,%xmm2,%xmm2
- vmovdqa %xmm3,64(%esp)
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm1
- addl %esi,%ecx
- andl %edi,%ebp
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%edi
- addl %edx,%ecx
- vpxor %xmm5,%xmm1,%xmm1
- shrdl $7,%edx,%edx
- xorl %eax,%ebp
- vmovdqa %xmm2,32(%esp)
- movl %ecx,%esi
- addl 52(%esp),%ebx
- vpxor %xmm1,%xmm7,%xmm7
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm1
- xorl %edi,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- vpslldq $12,%xmm7,%xmm3
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%ebp
- addl 56(%esp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm2
- vpor %xmm1,%xmm7,%xmm7
- addl %esi,%eax
- andl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- vmovdqa 80(%esp),%xmm1
- shrdl $7,%ebx,%ebx
- xorl %edx,%ebp
- vpxor %xmm2,%xmm7,%xmm7
- movl %eax,%esi
- addl 60(%esp),%edi
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpxor %xmm3,%xmm7,%xmm7
- addl %ebp,%edi
- andl %ebx,%esi
- vmovdqa 112(%esp),%xmm3
- xorl %ecx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm6,%xmm7,%xmm2
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %edi,%ebp
- addl (%esp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- vmovdqa %xmm4,80(%esp)
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vmovdqa %xmm3,%xmm4
- vpaddd %xmm7,%xmm3,%xmm3
- addl %esi,%edx
- andl %eax,%ebp
- vpxor %xmm2,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%ebp
- vpsrld $30,%xmm0,%xmm2
- vmovdqa %xmm3,48(%esp)
- movl %edx,%esi
- addl 4(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %ebp,%ecx
- andl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%ebp
- addl 8(%esp),%ebx
- vpor %xmm2,%xmm0,%xmm0
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vmovdqa 96(%esp),%xmm2
- addl %esi,%ebx
- andl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 12(%esp),%eax
- xorl %edi,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm3
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm5,96(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- vmovdqa %xmm4,%xmm5
- vpaddd %xmm0,%xmm4,%xmm4
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpxor %xmm3,%xmm1,%xmm1
- addl 20(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- vpsrld $30,%xmm1,%xmm3
- vmovdqa %xmm4,(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpor %xmm3,%xmm1,%xmm1
- addl 28(%esp),%ebx
- xorl %edi,%ebp
- vmovdqa 64(%esp),%xmm3
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm4
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- vmovdqa %xmm6,64(%esp)
- addl %esi,%eax
- xorl %edx,%ebp
- vmovdqa 128(%esp),%xmm6
- vpaddd %xmm1,%xmm5,%xmm5
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm4,%xmm2,%xmm2
- addl 36(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm4
- vmovdqa %xmm5,16(%esp)
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpslld $2,%xmm2,%xmm2
- addl 40(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpor %xmm4,%xmm2,%xmm2
- addl 44(%esp),%ecx
- xorl %eax,%ebp
- vmovdqa 80(%esp),%xmm4
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm5
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- vmovdqa %xmm7,80(%esp)
- addl %esi,%ebx
- xorl %edi,%ebp
- vmovdqa %xmm6,%xmm7
- vpaddd %xmm2,%xmm6,%xmm6
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm5,%xmm3,%xmm3
- addl 52(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm5
- vmovdqa %xmm6,32(%esp)
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpor %xmm5,%xmm3,%xmm3
- addl 60(%esp),%edx
- xorl %ebx,%ebp
- vmovdqa 96(%esp),%xmm5
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm6
- vpxor %xmm0,%xmm4,%xmm4
- addl (%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- vmovdqa %xmm0,96(%esp)
- addl %esi,%ecx
- xorl %eax,%ebp
- vmovdqa %xmm7,%xmm0
- vpaddd %xmm3,%xmm7,%xmm7
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpxor %xmm6,%xmm4,%xmm4
- addl 4(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm6
- vmovdqa %xmm7,48(%esp)
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm6,%xmm4,%xmm4
- addl 12(%esp),%edi
- xorl %ecx,%ebp
- vmovdqa 64(%esp),%xmm6
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- vpxor %xmm6,%xmm5,%xmm5
- vmovdqa %xmm1,64(%esp)
- addl %esi,%edx
- xorl %ebx,%ebp
- vmovdqa %xmm0,%xmm1
- vpaddd %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpxor %xmm7,%xmm5,%xmm5
- addl 20(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm7
- vmovdqa %xmm0,(%esp)
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm7,%xmm5,%xmm5
- addl 28(%esp),%eax
- vmovdqa 80(%esp),%xmm7
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm0
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%esp),%edi
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- vmovdqa %xmm2,80(%esp)
- movl %eax,%ebp
- xorl %ecx,%esi
- vmovdqa %xmm1,%xmm2
- vpaddd %xmm5,%xmm1,%xmm1
- shldl $5,%eax,%eax
- addl %esi,%edi
- vpxor %xmm0,%xmm6,%xmm6
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 36(%esp),%edx
- vpsrld $30,%xmm6,%xmm0
- vmovdqa %xmm1,16(%esp)
- andl %ebx,%ebp
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %edi,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%ebp
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- addl 40(%esp),%ecx
- andl %eax,%esi
- vpor %xmm0,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- vmovdqa 96(%esp),%xmm0
- movl %edx,%ebp
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- addl 44(%esp),%ebx
- andl %edi,%ebp
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- xorl %edi,%ebp
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm1
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%esp),%eax
- andl %edx,%esi
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- vmovdqa %xmm3,96(%esp)
- movl %ebx,%ebp
- xorl %edx,%esi
- vmovdqa 144(%esp),%xmm3
- vpaddd %xmm6,%xmm2,%xmm2
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm1,%xmm7,%xmm7
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%esp),%edi
- vpsrld $30,%xmm7,%xmm1
- vmovdqa %xmm2,32(%esp)
- andl %ecx,%ebp
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%ebp
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 56(%esp),%edx
- andl %ebx,%esi
- vpor %xmm1,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vmovdqa 64(%esp),%xmm1
- movl %edi,%ebp
- xorl %ebx,%esi
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- addl 60(%esp),%ecx
- andl %eax,%ebp
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- movl %edx,%esi
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm2
- vpxor %xmm4,%xmm0,%xmm0
- addl (%esp),%ebx
- andl %edi,%esi
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- vpxor %xmm1,%xmm0,%xmm0
- vmovdqa %xmm4,64(%esp)
- movl %ecx,%ebp
- xorl %edi,%esi
- vmovdqa %xmm3,%xmm4
- vpaddd %xmm7,%xmm3,%xmm3
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm2,%xmm0,%xmm0
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 4(%esp),%eax
- vpsrld $30,%xmm0,%xmm2
- vmovdqa %xmm3,48(%esp)
- andl %edx,%ebp
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%esp),%edi
- andl %ecx,%esi
- vpor %xmm2,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vmovdqa 80(%esp),%xmm2
- movl %eax,%ebp
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 12(%esp),%edx
- andl %ebx,%ebp
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %edi,%esi
- xorl %ebx,%ebp
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm3
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%esp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm5,80(%esp)
- movl %edx,%ebp
- xorl %eax,%esi
- vmovdqa %xmm4,%xmm5
- vpaddd %xmm0,%xmm4,%xmm4
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm3,%xmm1,%xmm1
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- addl 20(%esp),%ebx
- vpsrld $30,%xmm1,%xmm3
- vmovdqa %xmm4,(%esp)
- andl %edi,%ebp
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %edi,%ebp
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 24(%esp),%eax
- andl %edx,%esi
- vpor %xmm3,%xmm1,%xmm1
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- vmovdqa 96(%esp),%xmm3
- movl %ebx,%ebp
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%esp),%edi
- andl %ecx,%ebp
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%ebp
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%esp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- vmovdqa %xmm6,96(%esp)
- movl %edi,%ebp
- xorl %ebx,%esi
- vmovdqa %xmm5,%xmm6
- vpaddd %xmm1,%xmm5,%xmm5
- shldl $5,%edi,%edi
- addl %esi,%edx
- vpxor %xmm4,%xmm2,%xmm2
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- addl 36(%esp),%ecx
- vpsrld $30,%xmm2,%xmm4
- vmovdqa %xmm5,16(%esp)
- andl %eax,%ebp
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- addl 40(%esp),%ebx
- andl %edi,%esi
- vpor %xmm4,%xmm2,%xmm2
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- vmovdqa 64(%esp),%xmm4
- movl %ecx,%ebp
- xorl %edi,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 44(%esp),%eax
- andl %edx,%ebp
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm5
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- vmovdqa %xmm7,64(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- vmovdqa %xmm6,%xmm7
- vpaddd %xmm2,%xmm6,%xmm6
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpxor %xmm5,%xmm3,%xmm3
- addl 52(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- vpsrld $30,%xmm3,%xmm5
- vmovdqa %xmm6,32(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpor %xmm5,%xmm3,%xmm3
- addl 60(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl (%esp),%eax
- vpaddd %xmm3,%xmm7,%xmm7
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm7,48(%esp)
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 8(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 12(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- movl 196(%esp),%ebp
- cmpl 200(%esp),%ebp
- je .L010done
- vmovdqa 160(%esp),%xmm7
- vmovdqa 176(%esp),%xmm6
- vmovdqu (%ebp),%xmm0
- vmovdqu 16(%ebp),%xmm1
- vmovdqu 32(%ebp),%xmm2
- vmovdqu 48(%ebp),%xmm3
- addl $64,%ebp
- vpshufb %xmm6,%xmm0,%xmm0
- movl %ebp,196(%esp)
- vmovdqa %xmm7,96(%esp)
- addl 16(%esp),%ebx
- xorl %edi,%esi
- vpshufb %xmm6,%xmm1,%xmm1
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- vpaddd %xmm7,%xmm0,%xmm4
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm4,(%esp)
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- vpshufb %xmm6,%xmm2,%xmm2
- movl %edx,%ebp
- shldl $5,%edx,%edx
- vpaddd %xmm7,%xmm1,%xmm5
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vmovdqa %xmm5,16(%esp)
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- vpshufb %xmm6,%xmm3,%xmm3
- movl %edi,%ebp
- shldl $5,%edi,%edi
- vpaddd %xmm7,%xmm2,%xmm6
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vmovdqa %xmm6,32(%esp)
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,%ebx
- movl %ecx,8(%ebp)
- xorl %edx,%ebx
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- movl %esi,%ebp
- andl %ebx,%esi
- movl %ebp,%ebx
- jmp .L009loop
-.align 16
-.L010done:
- addl 16(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vzeroall
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- movl 204(%esp),%esp
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,8(%ebp)
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
.align 64
.LK_XX_XX:
.long 1518500249,1518500249,1518500249,1518500249
diff --git a/deps/openssl/asm/x86-elf-gas/sha/sha256-586.s b/deps/openssl/asm/x86-elf-gas/sha/sha256-586.s
index b434e42bab..836d91886b 100644
--- a/deps/openssl/asm/x86-elf-gas/sha/sha256-586.s
+++ b/deps/openssl/asm/x86-elf-gas/sha/sha256-586.s
@@ -40,13 +40,12 @@ sha256_block_data_order:
orl %ebx,%ecx
andl $1342177280,%ecx
cmpl $1342177280,%ecx
- je .L005AVX
testl $512,%ebx
- jnz .L006SSSE3
+ jnz .L005SSSE3
.L003no_xmm:
subl %edi,%eax
cmpl $256,%eax
- jae .L007unrolled
+ jae .L006unrolled
jmp .L002loop
.align 16
.L002loop:
@@ -118,7 +117,7 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00800_15:
+.L00700_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -156,11 +155,11 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00800_15
+ jne .L00700_15
movl 156(%esp),%ecx
- jmp .L00916_63
+ jmp .L00816_63
.align 16
-.L00916_63:
+.L00816_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -215,7 +214,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00916_63
+ jne .L00816_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -259,7 +258,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L007unrolled:
+.L006unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -276,9 +275,9 @@ sha256_block_data_order:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp .L010grand_loop
+ jmp .L009grand_loop
.align 16
-.L010grand_loop:
+.L009grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -3158,7 +3157,7 @@ sha256_block_data_order:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb .L010grand_loop
+ jb .L009grand_loop
movl 108(%esp),%esp
popl %edi
popl %esi
@@ -3177,9 +3176,9 @@ sha256_block_data_order:
pshufd $27,%xmm2,%xmm2
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
- jmp .L011loop_shaext
+ jmp .L010loop_shaext
.align 16
-.L011loop_shaext:
+.L010loop_shaext:
movdqu (%edi),%xmm3
movdqu 16(%edi),%xmm4
movdqu 32(%edi),%xmm5
@@ -3349,7 +3348,7 @@ sha256_block_data_order:
.byte 15,56,203,202
paddd 16(%esp),%xmm2
paddd (%esp),%xmm1
- jnz .L011loop_shaext
+ jnz .L010loop_shaext
pshufd $177,%xmm2,%xmm2
pshufd $27,%xmm1,%xmm7
pshufd $177,%xmm1,%xmm1
@@ -3364,7 +3363,7 @@ sha256_block_data_order:
popl %ebp
ret
.align 32
-.L006SSSE3:
+.L005SSSE3:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -3383,9 +3382,9 @@ sha256_block_data_order:
movl %ecx,24(%esp)
movl %esi,28(%esp)
movdqa 256(%ebp),%xmm7
- jmp .L012grand_ssse3
+ jmp .L011grand_ssse3
.align 16
-.L012grand_ssse3:
+.L011grand_ssse3:
movdqu (%edi),%xmm0
movdqu 16(%edi),%xmm1
movdqu 32(%edi),%xmm2
@@ -3408,9 +3407,9 @@ sha256_block_data_order:
paddd %xmm3,%xmm7
movdqa %xmm6,64(%esp)
movdqa %xmm7,80(%esp)
- jmp .L013ssse3_00_47
+ jmp .L012ssse3_00_47
.align 16
-.L013ssse3_00_47:
+.L012ssse3_00_47:
addl $64,%ebp
movl %edx,%ecx
movdqa %xmm1,%xmm4
@@ -4053,7 +4052,7 @@ sha256_block_data_order:
addl %ecx,%eax
movdqa %xmm6,80(%esp)
cmpl $66051,64(%ebp)
- jne .L013ssse3_00_47
+ jne .L012ssse3_00_47
movl %edx,%ecx
rorl $14,%edx
movl 20(%esp),%esi
@@ -4567,2217 +4566,12 @@ sha256_block_data_order:
movdqa 64(%ebp),%xmm7
subl $192,%ebp
cmpl 104(%esp),%edi
- jb .L012grand_ssse3
+ jb .L011grand_ssse3
movl 108(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
-.align 32
-.L005AVX:
- andl $264,%edx
- cmpl $264,%edx
- je .L014AVX_BMI
- leal -96(%esp),%esp
- vzeroall
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,4(%esp)
- xorl %ecx,%ebx
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%edi
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- vmovdqa 256(%ebp),%xmm7
- jmp .L015grand_avx
-.align 32
-.L015grand_avx:
- vmovdqu (%edi),%xmm0
- vmovdqu 16(%edi),%xmm1
- vmovdqu 32(%edi),%xmm2
- vmovdqu 48(%edi),%xmm3
- addl $64,%edi
- vpshufb %xmm7,%xmm0,%xmm0
- movl %edi,100(%esp)
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd (%ebp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 16(%ebp),%xmm1,%xmm5
- vpaddd 32(%ebp),%xmm2,%xmm6
- vpaddd 48(%ebp),%xmm3,%xmm7
- vmovdqa %xmm4,32(%esp)
- vmovdqa %xmm5,48(%esp)
- vmovdqa %xmm6,64(%esp)
- vmovdqa %xmm7,80(%esp)
- jmp .L016avx_00_47
-.align 16
-.L016avx_00_47:
- addl $64,%ebp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- vpalignr $4,%xmm2,%xmm3,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- vpshufd $250,%xmm3,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 32(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- vpaddd %xmm4,%xmm0,%xmm0
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 36(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm0,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 40(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm0,%xmm0
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- vpaddd (%ebp),%xmm0,%xmm6
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,32(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- vpalignr $4,%xmm3,%xmm0,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- vpshufd $250,%xmm0,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 48(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- vpaddd %xmm4,%xmm1,%xmm1
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 52(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm1,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 56(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm1,%xmm1
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- vpaddd 16(%ebp),%xmm1,%xmm6
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,48(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- vpalignr $4,%xmm0,%xmm1,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- vpshufd $250,%xmm1,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 64(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- vpaddd %xmm4,%xmm2,%xmm2
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 68(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm2,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 72(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm2,%xmm2
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- vpaddd 32(%ebp),%xmm2,%xmm6
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,64(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- vpalignr $4,%xmm1,%xmm2,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- vpshufd $250,%xmm2,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 80(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- vpaddd %xmm4,%xmm3,%xmm3
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 84(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm3,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 88(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm3,%xmm3
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- vpaddd 48(%ebp),%xmm3,%xmm6
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,80(%esp)
- cmpl $66051,64(%ebp)
- jne .L016avx_00_47
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 32(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 36(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 48(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 52(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 64(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 68(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 80(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 84(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl 96(%esp),%esi
- xorl %edi,%ebx
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebx,4(%esp)
- xorl %edi,%ebx
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %edi,20(%esp)
- movl 28(%esp),%edi
- movl %ecx,24(%esi)
- addl 28(%esi),%edi
- movl %ecx,24(%esp)
- movl %edi,28(%esi)
- movl %edi,28(%esp)
- movl 100(%esp),%edi
- vmovdqa 64(%ebp),%xmm7
- subl $192,%ebp
- cmpl 104(%esp),%edi
- jb .L015grand_avx
- movl 108(%esp),%esp
- vzeroall
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 32
-.L014AVX_BMI:
- leal -96(%esp),%esp
- vzeroall
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,4(%esp)
- xorl %ecx,%ebx
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%edi
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- vmovdqa 256(%ebp),%xmm7
- jmp .L017grand_avx_bmi
-.align 32
-.L017grand_avx_bmi:
- vmovdqu (%edi),%xmm0
- vmovdqu 16(%edi),%xmm1
- vmovdqu 32(%edi),%xmm2
- vmovdqu 48(%edi),%xmm3
- addl $64,%edi
- vpshufb %xmm7,%xmm0,%xmm0
- movl %edi,100(%esp)
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd (%ebp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 16(%ebp),%xmm1,%xmm5
- vpaddd 32(%ebp),%xmm2,%xmm6
- vpaddd 48(%ebp),%xmm3,%xmm7
- vmovdqa %xmm4,32(%esp)
- vmovdqa %xmm5,48(%esp)
- vmovdqa %xmm6,64(%esp)
- vmovdqa %xmm7,80(%esp)
- jmp .L018avx_bmi_00_47
-.align 16
-.L018avx_bmi_00_47:
- addl $64,%ebp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- vpaddd %xmm7,%xmm0,%xmm0
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 32(%esp),%edx
- vpshufd $250,%xmm3,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 36(%esp),%edx
- vpaddd %xmm4,%xmm0,%xmm0
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm0,%xmm0
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm0,%xmm7
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 40(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm0,%xmm0
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 44(%esp),%edx
- vpaddd (%ebp),%xmm0,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,32(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- vpaddd %xmm7,%xmm1,%xmm1
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 48(%esp),%edx
- vpshufd $250,%xmm0,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 52(%esp),%edx
- vpaddd %xmm4,%xmm1,%xmm1
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm1,%xmm1
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm1,%xmm7
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 56(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm1,%xmm1
- addl (%esp),%edx
- andl %ebx,%eax
- addl 60(%esp),%edx
- vpaddd 16(%ebp),%xmm1,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,48(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- vpalignr $4,%xmm0,%xmm1,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- vpaddd %xmm7,%xmm2,%xmm2
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 64(%esp),%edx
- vpshufd $250,%xmm1,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 68(%esp),%edx
- vpaddd %xmm4,%xmm2,%xmm2
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm2,%xmm2
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm2,%xmm7
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 72(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm2,%xmm2
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 76(%esp),%edx
- vpaddd 32(%ebp),%xmm2,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,64(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- vpaddd %xmm7,%xmm3,%xmm3
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 80(%esp),%edx
- vpshufd $250,%xmm2,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 84(%esp),%edx
- vpaddd %xmm4,%xmm3,%xmm3
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm3,%xmm3
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm3,%xmm7
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 88(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm3,%xmm3
- addl (%esp),%edx
- andl %ebx,%eax
- addl 92(%esp),%edx
- vpaddd 48(%ebp),%xmm3,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,80(%esp)
- cmpl $66051,64(%ebp)
- jne .L018avx_bmi_00_47
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 32(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 36(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 44(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 48(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 52(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- andl %ebx,%eax
- addl 60(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 64(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 68(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 76(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 80(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 84(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- andl %ebx,%eax
- addl 92(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- movl 96(%esp),%esi
- xorl %edi,%ebx
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebx,4(%esp)
- xorl %edi,%ebx
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %edi,20(%esp)
- movl 28(%esp),%edi
- movl %ecx,24(%esi)
- addl 28(%esi),%edi
- movl %ecx,24(%esp)
- movl %edi,28(%esi)
- movl %edi,28(%esp)
- movl 100(%esp),%edi
- vmovdqa 64(%ebp),%xmm7
- subl $192,%ebp
- cmpl 104(%esp),%edi
- jb .L017grand_avx_bmi
- movl 108(%esp),%esp
- vzeroall
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
.comm OPENSSL_ia32cap_P,16,4
diff --git a/deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s b/deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s
index d75e61693d..a0fe22eb2e 100644
--- a/deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s
+++ b/deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s
@@ -22,11 +22,6 @@ L000pic_point:
jz L001x86
testl $536870912,%ecx
jnz Lshaext_shortcut
- andl $268435456,%edx
- andl $1073741824,%eax
- orl %edx,%eax
- cmpl $1342177280,%eax
- je Lavx_shortcut
jmp Lssse3_shortcut
.align 4,0x90
L001x86:
@@ -2779,1174 +2774,6 @@ L007done:
popl %ebx
popl %ebp
ret
-.align 4
-__sha1_block_data_order_avx:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- call L008pic_point
-L008pic_point:
- popl %ebp
- leal LK_XX_XX-L008pic_point(%ebp),%ebp
-Lavx_shortcut:
- vzeroall
- vmovdqa (%ebp),%xmm7
- vmovdqa 16(%ebp),%xmm0
- vmovdqa 32(%ebp),%xmm1
- vmovdqa 48(%ebp),%xmm2
- vmovdqa 64(%ebp),%xmm6
- movl 20(%esp),%edi
- movl 24(%esp),%ebp
- movl 28(%esp),%edx
- movl %esp,%esi
- subl $208,%esp
- andl $-64,%esp
- vmovdqa %xmm0,112(%esp)
- vmovdqa %xmm1,128(%esp)
- vmovdqa %xmm2,144(%esp)
- shll $6,%edx
- vmovdqa %xmm7,160(%esp)
- addl %ebp,%edx
- vmovdqa %xmm6,176(%esp)
- addl $64,%ebp
- movl %edi,192(%esp)
- movl %ebp,196(%esp)
- movl %edx,200(%esp)
- movl %esi,204(%esp)
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- movl 16(%edi),%edi
- movl %ebx,%esi
- vmovdqu -64(%ebp),%xmm0
- vmovdqu -48(%ebp),%xmm1
- vmovdqu -32(%ebp),%xmm2
- vmovdqu -16(%ebp),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vmovdqa %xmm7,96(%esp)
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm7,%xmm0,%xmm4
- vpaddd %xmm7,%xmm1,%xmm5
- vpaddd %xmm7,%xmm2,%xmm6
- vmovdqa %xmm4,(%esp)
- movl %ecx,%ebp
- vmovdqa %xmm5,16(%esp)
- xorl %edx,%ebp
- vmovdqa %xmm6,32(%esp)
- andl %ebp,%esi
- jmp L009loop
-.align 4,0x90
-L009loop:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%ebp
- addl (%esp),%edi
- vpaddd %xmm3,%xmm7,%xmm7
- vmovdqa %xmm0,64(%esp)
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm6
- addl %esi,%edi
- andl %ebx,%ebp
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%edi
- vpxor %xmm2,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%ebp
- vmovdqa %xmm7,48(%esp)
- movl %edi,%esi
- addl 4(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- addl %ebp,%edx
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm6
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm0
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%ebp
- addl 8(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm0,%xmm7
- vpor %xmm6,%xmm4,%xmm4
- addl %esi,%ecx
- andl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- vpslld $2,%xmm0,%xmm0
- shrdl $7,%edx,%edx
- xorl %eax,%ebp
- vpxor %xmm7,%xmm4,%xmm4
- movl %ecx,%esi
- addl 12(%esp),%ebx
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vpxor %xmm0,%xmm4,%xmm4
- addl %ebp,%ebx
- andl %edx,%esi
- vmovdqa 96(%esp),%xmm0
- xorl %edi,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%ebp
- addl 16(%esp),%eax
- vpaddd %xmm4,%xmm0,%xmm0
- vmovdqa %xmm1,80(%esp)
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm7
- addl %esi,%eax
- andl %ecx,%ebp
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- xorl %edx,%ebp
- vmovdqa %xmm0,(%esp)
- movl %eax,%esi
- addl 20(%esp),%edi
- vpxor %xmm7,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %ebp,%edi
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm7
- xorl %ecx,%ebx
- addl %eax,%edi
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm1
- vpaddd %xmm5,%xmm5,%xmm5
- movl %edi,%ebp
- addl 24(%esp),%edx
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vpsrld $30,%xmm1,%xmm0
- vpor %xmm7,%xmm5,%xmm5
- addl %esi,%edx
- andl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- vpslld $2,%xmm1,%xmm1
- shrdl $7,%edi,%edi
- xorl %ebx,%ebp
- vpxor %xmm0,%xmm5,%xmm5
- movl %edx,%esi
- addl 28(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpxor %xmm1,%xmm5,%xmm5
- addl %ebp,%ecx
- andl %edi,%esi
- vmovdqa 112(%esp),%xmm1
- xorl %eax,%edi
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%ebp
- addl 32(%esp),%ebx
- vpaddd %xmm5,%xmm1,%xmm1
- vmovdqa %xmm2,96(%esp)
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm0
- addl %esi,%ebx
- andl %edx,%ebp
- vpxor %xmm2,%xmm6,%xmm6
- xorl %edi,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%ecx,%ecx
- xorl %edi,%ebp
- vmovdqa %xmm1,16(%esp)
- movl %ebx,%esi
- addl 36(%esp),%eax
- vpxor %xmm0,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm0
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm2
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%ebp
- addl 40(%esp),%edi
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm1
- vpor %xmm0,%xmm6,%xmm6
- addl %esi,%edi
- andl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- vpslld $2,%xmm2,%xmm2
- vmovdqa 64(%esp),%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%ebp
- vpxor %xmm1,%xmm6,%xmm6
- movl %edi,%esi
- addl 44(%esp),%edx
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vpxor %xmm2,%xmm6,%xmm6
- addl %ebp,%edx
- andl %eax,%esi
- vmovdqa 112(%esp),%xmm2
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%ebp
- addl 48(%esp),%ecx
- vpaddd %xmm6,%xmm2,%xmm2
- vmovdqa %xmm3,64(%esp)
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm1
- addl %esi,%ecx
- andl %edi,%ebp
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%edi
- addl %edx,%ecx
- vpxor %xmm5,%xmm1,%xmm1
- shrdl $7,%edx,%edx
- xorl %eax,%ebp
- vmovdqa %xmm2,32(%esp)
- movl %ecx,%esi
- addl 52(%esp),%ebx
- vpxor %xmm1,%xmm7,%xmm7
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm1
- xorl %edi,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- vpslldq $12,%xmm7,%xmm3
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%ebp
- addl 56(%esp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm2
- vpor %xmm1,%xmm7,%xmm7
- addl %esi,%eax
- andl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- vmovdqa 80(%esp),%xmm1
- shrdl $7,%ebx,%ebx
- xorl %edx,%ebp
- vpxor %xmm2,%xmm7,%xmm7
- movl %eax,%esi
- addl 60(%esp),%edi
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpxor %xmm3,%xmm7,%xmm7
- addl %ebp,%edi
- andl %ebx,%esi
- vmovdqa 112(%esp),%xmm3
- xorl %ecx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm6,%xmm7,%xmm2
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %edi,%ebp
- addl (%esp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- vmovdqa %xmm4,80(%esp)
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vmovdqa %xmm3,%xmm4
- vpaddd %xmm7,%xmm3,%xmm3
- addl %esi,%edx
- andl %eax,%ebp
- vpxor %xmm2,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%ebp
- vpsrld $30,%xmm0,%xmm2
- vmovdqa %xmm3,48(%esp)
- movl %edx,%esi
- addl 4(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %ebp,%ecx
- andl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%ebp
- addl 8(%esp),%ebx
- vpor %xmm2,%xmm0,%xmm0
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vmovdqa 96(%esp),%xmm2
- addl %esi,%ebx
- andl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 12(%esp),%eax
- xorl %edi,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm3
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm5,96(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- vmovdqa %xmm4,%xmm5
- vpaddd %xmm0,%xmm4,%xmm4
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpxor %xmm3,%xmm1,%xmm1
- addl 20(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- vpsrld $30,%xmm1,%xmm3
- vmovdqa %xmm4,(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpor %xmm3,%xmm1,%xmm1
- addl 28(%esp),%ebx
- xorl %edi,%ebp
- vmovdqa 64(%esp),%xmm3
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm4
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- vmovdqa %xmm6,64(%esp)
- addl %esi,%eax
- xorl %edx,%ebp
- vmovdqa 128(%esp),%xmm6
- vpaddd %xmm1,%xmm5,%xmm5
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm4,%xmm2,%xmm2
- addl 36(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm4
- vmovdqa %xmm5,16(%esp)
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpslld $2,%xmm2,%xmm2
- addl 40(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpor %xmm4,%xmm2,%xmm2
- addl 44(%esp),%ecx
- xorl %eax,%ebp
- vmovdqa 80(%esp),%xmm4
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm5
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- vmovdqa %xmm7,80(%esp)
- addl %esi,%ebx
- xorl %edi,%ebp
- vmovdqa %xmm6,%xmm7
- vpaddd %xmm2,%xmm6,%xmm6
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm5,%xmm3,%xmm3
- addl 52(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm5
- vmovdqa %xmm6,32(%esp)
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpor %xmm5,%xmm3,%xmm3
- addl 60(%esp),%edx
- xorl %ebx,%ebp
- vmovdqa 96(%esp),%xmm5
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm6
- vpxor %xmm0,%xmm4,%xmm4
- addl (%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- vmovdqa %xmm0,96(%esp)
- addl %esi,%ecx
- xorl %eax,%ebp
- vmovdqa %xmm7,%xmm0
- vpaddd %xmm3,%xmm7,%xmm7
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpxor %xmm6,%xmm4,%xmm4
- addl 4(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm6
- vmovdqa %xmm7,48(%esp)
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm6,%xmm4,%xmm4
- addl 12(%esp),%edi
- xorl %ecx,%ebp
- vmovdqa 64(%esp),%xmm6
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- vpxor %xmm6,%xmm5,%xmm5
- vmovdqa %xmm1,64(%esp)
- addl %esi,%edx
- xorl %ebx,%ebp
- vmovdqa %xmm0,%xmm1
- vpaddd %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpxor %xmm7,%xmm5,%xmm5
- addl 20(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm7
- vmovdqa %xmm0,(%esp)
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm7,%xmm5,%xmm5
- addl 28(%esp),%eax
- vmovdqa 80(%esp),%xmm7
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm0
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%esp),%edi
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- vmovdqa %xmm2,80(%esp)
- movl %eax,%ebp
- xorl %ecx,%esi
- vmovdqa %xmm1,%xmm2
- vpaddd %xmm5,%xmm1,%xmm1
- shldl $5,%eax,%eax
- addl %esi,%edi
- vpxor %xmm0,%xmm6,%xmm6
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 36(%esp),%edx
- vpsrld $30,%xmm6,%xmm0
- vmovdqa %xmm1,16(%esp)
- andl %ebx,%ebp
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %edi,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%ebp
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- addl 40(%esp),%ecx
- andl %eax,%esi
- vpor %xmm0,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- vmovdqa 96(%esp),%xmm0
- movl %edx,%ebp
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- addl 44(%esp),%ebx
- andl %edi,%ebp
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- xorl %edi,%ebp
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm1
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%esp),%eax
- andl %edx,%esi
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- vmovdqa %xmm3,96(%esp)
- movl %ebx,%ebp
- xorl %edx,%esi
- vmovdqa 144(%esp),%xmm3
- vpaddd %xmm6,%xmm2,%xmm2
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm1,%xmm7,%xmm7
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%esp),%edi
- vpsrld $30,%xmm7,%xmm1
- vmovdqa %xmm2,32(%esp)
- andl %ecx,%ebp
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%ebp
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 56(%esp),%edx
- andl %ebx,%esi
- vpor %xmm1,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vmovdqa 64(%esp),%xmm1
- movl %edi,%ebp
- xorl %ebx,%esi
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- addl 60(%esp),%ecx
- andl %eax,%ebp
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- movl %edx,%esi
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm2
- vpxor %xmm4,%xmm0,%xmm0
- addl (%esp),%ebx
- andl %edi,%esi
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- vpxor %xmm1,%xmm0,%xmm0
- vmovdqa %xmm4,64(%esp)
- movl %ecx,%ebp
- xorl %edi,%esi
- vmovdqa %xmm3,%xmm4
- vpaddd %xmm7,%xmm3,%xmm3
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm2,%xmm0,%xmm0
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 4(%esp),%eax
- vpsrld $30,%xmm0,%xmm2
- vmovdqa %xmm3,48(%esp)
- andl %edx,%ebp
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%esp),%edi
- andl %ecx,%esi
- vpor %xmm2,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vmovdqa 80(%esp),%xmm2
- movl %eax,%ebp
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 12(%esp),%edx
- andl %ebx,%ebp
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %edi,%esi
- xorl %ebx,%ebp
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm3
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%esp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm5,80(%esp)
- movl %edx,%ebp
- xorl %eax,%esi
- vmovdqa %xmm4,%xmm5
- vpaddd %xmm0,%xmm4,%xmm4
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm3,%xmm1,%xmm1
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- addl 20(%esp),%ebx
- vpsrld $30,%xmm1,%xmm3
- vmovdqa %xmm4,(%esp)
- andl %edi,%ebp
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %edi,%ebp
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 24(%esp),%eax
- andl %edx,%esi
- vpor %xmm3,%xmm1,%xmm1
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- vmovdqa 96(%esp),%xmm3
- movl %ebx,%ebp
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%esp),%edi
- andl %ecx,%ebp
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%ebp
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%esp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- vmovdqa %xmm6,96(%esp)
- movl %edi,%ebp
- xorl %ebx,%esi
- vmovdqa %xmm5,%xmm6
- vpaddd %xmm1,%xmm5,%xmm5
- shldl $5,%edi,%edi
- addl %esi,%edx
- vpxor %xmm4,%xmm2,%xmm2
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- addl 36(%esp),%ecx
- vpsrld $30,%xmm2,%xmm4
- vmovdqa %xmm5,16(%esp)
- andl %eax,%ebp
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- addl 40(%esp),%ebx
- andl %edi,%esi
- vpor %xmm4,%xmm2,%xmm2
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- vmovdqa 64(%esp),%xmm4
- movl %ecx,%ebp
- xorl %edi,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 44(%esp),%eax
- andl %edx,%ebp
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm5
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- vmovdqa %xmm7,64(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- vmovdqa %xmm6,%xmm7
- vpaddd %xmm2,%xmm6,%xmm6
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpxor %xmm5,%xmm3,%xmm3
- addl 52(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- vpsrld $30,%xmm3,%xmm5
- vmovdqa %xmm6,32(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpor %xmm5,%xmm3,%xmm3
- addl 60(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl (%esp),%eax
- vpaddd %xmm3,%xmm7,%xmm7
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm7,48(%esp)
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 8(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 12(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- movl 196(%esp),%ebp
- cmpl 200(%esp),%ebp
- je L010done
- vmovdqa 160(%esp),%xmm7
- vmovdqa 176(%esp),%xmm6
- vmovdqu (%ebp),%xmm0
- vmovdqu 16(%ebp),%xmm1
- vmovdqu 32(%ebp),%xmm2
- vmovdqu 48(%ebp),%xmm3
- addl $64,%ebp
- vpshufb %xmm6,%xmm0,%xmm0
- movl %ebp,196(%esp)
- vmovdqa %xmm7,96(%esp)
- addl 16(%esp),%ebx
- xorl %edi,%esi
- vpshufb %xmm6,%xmm1,%xmm1
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- vpaddd %xmm7,%xmm0,%xmm4
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm4,(%esp)
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- vpshufb %xmm6,%xmm2,%xmm2
- movl %edx,%ebp
- shldl $5,%edx,%edx
- vpaddd %xmm7,%xmm1,%xmm5
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vmovdqa %xmm5,16(%esp)
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- vpshufb %xmm6,%xmm3,%xmm3
- movl %edi,%ebp
- shldl $5,%edi,%edi
- vpaddd %xmm7,%xmm2,%xmm6
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vmovdqa %xmm6,32(%esp)
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,%ebx
- movl %ecx,8(%ebp)
- xorl %edx,%ebx
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- movl %esi,%ebp
- andl %ebx,%esi
- movl %ebp,%ebx
- jmp L009loop
-.align 4,0x90
-L010done:
- addl 16(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vzeroall
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- movl 204(%esp),%esp
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,8(%ebp)
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.align 6,0x90
LK_XX_XX:
.long 1518500249,1518500249,1518500249,1518500249
diff --git a/deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s b/deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s
index d30c582726..37f532aa5f 100644
--- a/deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s
+++ b/deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s
@@ -39,13 +39,12 @@ L000pic_point:
orl %ebx,%ecx
andl $1342177280,%ecx
cmpl $1342177280,%ecx
- je L005AVX
testl $512,%ebx
- jnz L006SSSE3
+ jnz L005SSSE3
L003no_xmm:
subl %edi,%eax
cmpl $256,%eax
- jae L007unrolled
+ jae L006unrolled
jmp L002loop
.align 4,0x90
L002loop:
@@ -117,7 +116,7 @@ L002loop:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 4,0x90
-L00800_15:
+L00700_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -155,11 +154,11 @@ L00800_15:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne L00800_15
+ jne L00700_15
movl 156(%esp),%ecx
- jmp L00916_63
+ jmp L00816_63
.align 4,0x90
-L00916_63:
+L00816_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -214,7 +213,7 @@ L00916_63:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne L00916_63
+ jne L00816_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -258,7 +257,7 @@ L001K256:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 4,0x90
-L007unrolled:
+L006unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -275,9 +274,9 @@ L007unrolled:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp L010grand_loop
+ jmp L009grand_loop
.align 4,0x90
-L010grand_loop:
+L009grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -3157,7 +3156,7 @@ L010grand_loop:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb L010grand_loop
+ jb L009grand_loop
movl 108(%esp),%esp
popl %edi
popl %esi
@@ -3176,9 +3175,9 @@ L004shaext:
pshufd $27,%xmm2,%xmm2
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
- jmp L011loop_shaext
+ jmp L010loop_shaext
.align 4,0x90
-L011loop_shaext:
+L010loop_shaext:
movdqu (%edi),%xmm3
movdqu 16(%edi),%xmm4
movdqu 32(%edi),%xmm5
@@ -3348,7 +3347,7 @@ L011loop_shaext:
.byte 15,56,203,202
paddd 16(%esp),%xmm2
paddd (%esp),%xmm1
- jnz L011loop_shaext
+ jnz L010loop_shaext
pshufd $177,%xmm2,%xmm2
pshufd $27,%xmm1,%xmm7
pshufd $177,%xmm1,%xmm1
@@ -3363,7 +3362,7 @@ L011loop_shaext:
popl %ebp
ret
.align 5,0x90
-L006SSSE3:
+L005SSSE3:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -3382,9 +3381,9 @@ L006SSSE3:
movl %ecx,24(%esp)
movl %esi,28(%esp)
movdqa 256(%ebp),%xmm7
- jmp L012grand_ssse3
+ jmp L011grand_ssse3
.align 4,0x90
-L012grand_ssse3:
+L011grand_ssse3:
movdqu (%edi),%xmm0
movdqu 16(%edi),%xmm1
movdqu 32(%edi),%xmm2
@@ -3407,9 +3406,9 @@ L012grand_ssse3:
paddd %xmm3,%xmm7
movdqa %xmm6,64(%esp)
movdqa %xmm7,80(%esp)
- jmp L013ssse3_00_47
+ jmp L012ssse3_00_47
.align 4,0x90
-L013ssse3_00_47:
+L012ssse3_00_47:
addl $64,%ebp
movl %edx,%ecx
movdqa %xmm1,%xmm4
@@ -4052,7 +4051,7 @@ L013ssse3_00_47:
addl %ecx,%eax
movdqa %xmm6,80(%esp)
cmpl $66051,64(%ebp)
- jne L013ssse3_00_47
+ jne L012ssse3_00_47
movl %edx,%ecx
rorl $14,%edx
movl 20(%esp),%esi
@@ -4566,2218 +4565,13 @@ L013ssse3_00_47:
movdqa 64(%ebp),%xmm7
subl $192,%ebp
cmpl 104(%esp),%edi
- jb L012grand_ssse3
+ jb L011grand_ssse3
movl 108(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
-.align 5,0x90
-L005AVX:
- andl $264,%edx
- cmpl $264,%edx
- je L014AVX_BMI
- leal -96(%esp),%esp
- vzeroall
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,4(%esp)
- xorl %ecx,%ebx
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%edi
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- vmovdqa 256(%ebp),%xmm7
- jmp L015grand_avx
-.align 5,0x90
-L015grand_avx:
- vmovdqu (%edi),%xmm0
- vmovdqu 16(%edi),%xmm1
- vmovdqu 32(%edi),%xmm2
- vmovdqu 48(%edi),%xmm3
- addl $64,%edi
- vpshufb %xmm7,%xmm0,%xmm0
- movl %edi,100(%esp)
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd (%ebp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 16(%ebp),%xmm1,%xmm5
- vpaddd 32(%ebp),%xmm2,%xmm6
- vpaddd 48(%ebp),%xmm3,%xmm7
- vmovdqa %xmm4,32(%esp)
- vmovdqa %xmm5,48(%esp)
- vmovdqa %xmm6,64(%esp)
- vmovdqa %xmm7,80(%esp)
- jmp L016avx_00_47
-.align 4,0x90
-L016avx_00_47:
- addl $64,%ebp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- vpalignr $4,%xmm2,%xmm3,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- vpshufd $250,%xmm3,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 32(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- vpaddd %xmm4,%xmm0,%xmm0
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 36(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm0,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 40(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm0,%xmm0
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- vpaddd (%ebp),%xmm0,%xmm6
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,32(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- vpalignr $4,%xmm3,%xmm0,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- vpshufd $250,%xmm0,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 48(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- vpaddd %xmm4,%xmm1,%xmm1
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 52(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm1,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 56(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm1,%xmm1
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- vpaddd 16(%ebp),%xmm1,%xmm6
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,48(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- vpalignr $4,%xmm0,%xmm1,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- vpshufd $250,%xmm1,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 64(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- vpaddd %xmm4,%xmm2,%xmm2
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 68(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm2,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 72(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm2,%xmm2
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- vpaddd 32(%ebp),%xmm2,%xmm6
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,64(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- vpalignr $4,%xmm1,%xmm2,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- vpshufd $250,%xmm2,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 80(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- vpaddd %xmm4,%xmm3,%xmm3
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 84(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm3,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 88(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm3,%xmm3
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- vpaddd 48(%ebp),%xmm3,%xmm6
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,80(%esp)
- cmpl $66051,64(%ebp)
- jne L016avx_00_47
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 32(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 36(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 48(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 52(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 64(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 68(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 80(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 84(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl 96(%esp),%esi
- xorl %edi,%ebx
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebx,4(%esp)
- xorl %edi,%ebx
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %edi,20(%esp)
- movl 28(%esp),%edi
- movl %ecx,24(%esi)
- addl 28(%esi),%edi
- movl %ecx,24(%esp)
- movl %edi,28(%esi)
- movl %edi,28(%esp)
- movl 100(%esp),%edi
- vmovdqa 64(%ebp),%xmm7
- subl $192,%ebp
- cmpl 104(%esp),%edi
- jb L015grand_avx
- movl 108(%esp),%esp
- vzeroall
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 5,0x90
-L014AVX_BMI:
- leal -96(%esp),%esp
- vzeroall
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,4(%esp)
- xorl %ecx,%ebx
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%edi
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- vmovdqa 256(%ebp),%xmm7
- jmp L017grand_avx_bmi
-.align 5,0x90
-L017grand_avx_bmi:
- vmovdqu (%edi),%xmm0
- vmovdqu 16(%edi),%xmm1
- vmovdqu 32(%edi),%xmm2
- vmovdqu 48(%edi),%xmm3
- addl $64,%edi
- vpshufb %xmm7,%xmm0,%xmm0
- movl %edi,100(%esp)
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd (%ebp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 16(%ebp),%xmm1,%xmm5
- vpaddd 32(%ebp),%xmm2,%xmm6
- vpaddd 48(%ebp),%xmm3,%xmm7
- vmovdqa %xmm4,32(%esp)
- vmovdqa %xmm5,48(%esp)
- vmovdqa %xmm6,64(%esp)
- vmovdqa %xmm7,80(%esp)
- jmp L018avx_bmi_00_47
-.align 4,0x90
-L018avx_bmi_00_47:
- addl $64,%ebp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- vpaddd %xmm7,%xmm0,%xmm0
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 32(%esp),%edx
- vpshufd $250,%xmm3,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 36(%esp),%edx
- vpaddd %xmm4,%xmm0,%xmm0
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm0,%xmm0
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm0,%xmm7
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 40(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm0,%xmm0
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 44(%esp),%edx
- vpaddd (%ebp),%xmm0,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,32(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- vpaddd %xmm7,%xmm1,%xmm1
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 48(%esp),%edx
- vpshufd $250,%xmm0,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 52(%esp),%edx
- vpaddd %xmm4,%xmm1,%xmm1
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm1,%xmm1
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm1,%xmm7
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 56(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm1,%xmm1
- addl (%esp),%edx
- andl %ebx,%eax
- addl 60(%esp),%edx
- vpaddd 16(%ebp),%xmm1,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,48(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- vpalignr $4,%xmm0,%xmm1,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- vpaddd %xmm7,%xmm2,%xmm2
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 64(%esp),%edx
- vpshufd $250,%xmm1,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 68(%esp),%edx
- vpaddd %xmm4,%xmm2,%xmm2
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm2,%xmm2
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm2,%xmm7
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 72(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm2,%xmm2
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 76(%esp),%edx
- vpaddd 32(%ebp),%xmm2,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,64(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- vpaddd %xmm7,%xmm3,%xmm3
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 80(%esp),%edx
- vpshufd $250,%xmm2,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 84(%esp),%edx
- vpaddd %xmm4,%xmm3,%xmm3
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm3,%xmm3
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm3,%xmm7
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 88(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm3,%xmm3
- addl (%esp),%edx
- andl %ebx,%eax
- addl 92(%esp),%edx
- vpaddd 48(%ebp),%xmm3,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,80(%esp)
- cmpl $66051,64(%ebp)
- jne L018avx_bmi_00_47
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 32(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 36(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 44(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 48(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 52(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- andl %ebx,%eax
- addl 60(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 64(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 68(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 76(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 80(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 84(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- andl %ebx,%eax
- addl 92(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- movl 96(%esp),%esi
- xorl %edi,%ebx
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebx,4(%esp)
- xorl %edi,%ebx
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %edi,20(%esp)
- movl 28(%esp),%edi
- movl %ecx,24(%esi)
- addl 28(%esi),%edi
- movl %ecx,24(%esp)
- movl %edi,28(%esi)
- movl %edi,28(%esp)
- movl 100(%esp),%edi
- vmovdqa 64(%ebp),%xmm7
- subl $192,%ebp
- cmpl 104(%esp),%edi
- jb L017grand_avx_bmi
- movl 108(%esp),%esp
- vzeroall
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.section __IMPORT,__pointers,non_lazy_symbol_pointers
L_OPENSSL_ia32cap_P$non_lazy_ptr:
.indirect_symbol _OPENSSL_ia32cap_P
diff --git a/deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm b/deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm
index 4607eda762..38aaf17445 100644
--- a/deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm
+++ b/deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm
@@ -39,11 +39,6 @@ $L000pic_point:
jz $L001x86
test ecx,536870912
jnz $Lshaext_shortcut
- and edx,268435456
- and eax,1073741824
- or eax,edx
- cmp eax,1342177280
- je $Lavx_shortcut
jmp $Lssse3_shortcut
ALIGN 16
$L001x86:
@@ -2799,1175 +2794,6 @@ $L007done:
pop ebp
ret
__sha1_block_data_order_ssse3 ENDP
-ALIGN 16
-__sha1_block_data_order_avx PROC PRIVATE
- push ebp
- push ebx
- push esi
- push edi
- call $L008pic_point
-$L008pic_point:
- pop ebp
- lea ebp,DWORD PTR ($LK_XX_XX-$L008pic_point)[ebp]
-$Lavx_shortcut::
- vzeroall
- vmovdqa xmm7,XMMWORD PTR [ebp]
- vmovdqa xmm0,XMMWORD PTR 16[ebp]
- vmovdqa xmm1,XMMWORD PTR 32[ebp]
- vmovdqa xmm2,XMMWORD PTR 48[ebp]
- vmovdqa xmm6,XMMWORD PTR 64[ebp]
- mov edi,DWORD PTR 20[esp]
- mov ebp,DWORD PTR 24[esp]
- mov edx,DWORD PTR 28[esp]
- mov esi,esp
- sub esp,208
- and esp,-64
- vmovdqa XMMWORD PTR 112[esp],xmm0
- vmovdqa XMMWORD PTR 128[esp],xmm1
- vmovdqa XMMWORD PTR 144[esp],xmm2
- shl edx,6
- vmovdqa XMMWORD PTR 160[esp],xmm7
- add edx,ebp
- vmovdqa XMMWORD PTR 176[esp],xmm6
- add ebp,64
- mov DWORD PTR 192[esp],edi
- mov DWORD PTR 196[esp],ebp
- mov DWORD PTR 200[esp],edx
- mov DWORD PTR 204[esp],esi
- mov eax,DWORD PTR [edi]
- mov ebx,DWORD PTR 4[edi]
- mov ecx,DWORD PTR 8[edi]
- mov edx,DWORD PTR 12[edi]
- mov edi,DWORD PTR 16[edi]
- mov esi,ebx
- vmovdqu xmm0,XMMWORD PTR [ebp-64]
- vmovdqu xmm1,XMMWORD PTR [ebp-48]
- vmovdqu xmm2,XMMWORD PTR [ebp-32]
- vmovdqu xmm3,XMMWORD PTR [ebp-16]
- vpshufb xmm0,xmm0,xmm6
- vpshufb xmm1,xmm1,xmm6
- vpshufb xmm2,xmm2,xmm6
- vmovdqa XMMWORD PTR 96[esp],xmm7
- vpshufb xmm3,xmm3,xmm6
- vpaddd xmm4,xmm0,xmm7
- vpaddd xmm5,xmm1,xmm7
- vpaddd xmm6,xmm2,xmm7
- vmovdqa XMMWORD PTR [esp],xmm4
- mov ebp,ecx
- vmovdqa XMMWORD PTR 16[esp],xmm5
- xor ebp,edx
- vmovdqa XMMWORD PTR 32[esp],xmm6
- and esi,ebp
- jmp $L009loop
-ALIGN 16
-$L009loop:
- shrd ebx,ebx,2
- xor esi,edx
- vpalignr xmm4,xmm1,xmm0,8
- mov ebp,eax
- add edi,DWORD PTR [esp]
- vpaddd xmm7,xmm7,xmm3
- vmovdqa XMMWORD PTR 64[esp],xmm0
- xor ebx,ecx
- shld eax,eax,5
- vpsrldq xmm6,xmm3,4
- add edi,esi
- and ebp,ebx
- vpxor xmm4,xmm4,xmm0
- xor ebx,ecx
- add edi,eax
- vpxor xmm6,xmm6,xmm2
- shrd eax,eax,7
- xor ebp,ecx
- vmovdqa XMMWORD PTR 48[esp],xmm7
- mov esi,edi
- add edx,DWORD PTR 4[esp]
- vpxor xmm4,xmm4,xmm6
- xor eax,ebx
- shld edi,edi,5
- add edx,ebp
- and esi,eax
- vpsrld xmm6,xmm4,31
- xor eax,ebx
- add edx,edi
- shrd edi,edi,7
- xor esi,ebx
- vpslldq xmm0,xmm4,12
- vpaddd xmm4,xmm4,xmm4
- mov ebp,edx
- add ecx,DWORD PTR 8[esp]
- xor edi,eax
- shld edx,edx,5
- vpsrld xmm7,xmm0,30
- vpor xmm4,xmm4,xmm6
- add ecx,esi
- and ebp,edi
- xor edi,eax
- add ecx,edx
- vpslld xmm0,xmm0,2
- shrd edx,edx,7
- xor ebp,eax
- vpxor xmm4,xmm4,xmm7
- mov esi,ecx
- add ebx,DWORD PTR 12[esp]
- xor edx,edi
- shld ecx,ecx,5
- vpxor xmm4,xmm4,xmm0
- add ebx,ebp
- and esi,edx
- vmovdqa xmm0,XMMWORD PTR 96[esp]
- xor edx,edi
- add ebx,ecx
- shrd ecx,ecx,7
- xor esi,edi
- vpalignr xmm5,xmm2,xmm1,8
- mov ebp,ebx
- add eax,DWORD PTR 16[esp]
- vpaddd xmm0,xmm0,xmm4
- vmovdqa XMMWORD PTR 80[esp],xmm1
- xor ecx,edx
- shld ebx,ebx,5
- vpsrldq xmm7,xmm4,4
- add eax,esi
- and ebp,ecx
- vpxor xmm5,xmm5,xmm1
- xor ecx,edx
- add eax,ebx
- vpxor xmm7,xmm7,xmm3
- shrd ebx,ebx,7
- xor ebp,edx
- vmovdqa XMMWORD PTR [esp],xmm0
- mov esi,eax
- add edi,DWORD PTR 20[esp]
- vpxor xmm5,xmm5,xmm7
- xor ebx,ecx
- shld eax,eax,5
- add edi,ebp
- and esi,ebx
- vpsrld xmm7,xmm5,31
- xor ebx,ecx
- add edi,eax
- shrd eax,eax,7
- xor esi,ecx
- vpslldq xmm1,xmm5,12
- vpaddd xmm5,xmm5,xmm5
- mov ebp,edi
- add edx,DWORD PTR 24[esp]
- xor eax,ebx
- shld edi,edi,5
- vpsrld xmm0,xmm1,30
- vpor xmm5,xmm5,xmm7
- add edx,esi
- and ebp,eax
- xor eax,ebx
- add edx,edi
- vpslld xmm1,xmm1,2
- shrd edi,edi,7
- xor ebp,ebx
- vpxor xmm5,xmm5,xmm0
- mov esi,edx
- add ecx,DWORD PTR 28[esp]
- xor edi,eax
- shld edx,edx,5
- vpxor xmm5,xmm5,xmm1
- add ecx,ebp
- and esi,edi
- vmovdqa xmm1,XMMWORD PTR 112[esp]
- xor edi,eax
- add ecx,edx
- shrd edx,edx,7
- xor esi,eax
- vpalignr xmm6,xmm3,xmm2,8
- mov ebp,ecx
- add ebx,DWORD PTR 32[esp]
- vpaddd xmm1,xmm1,xmm5
- vmovdqa XMMWORD PTR 96[esp],xmm2
- xor edx,edi
- shld ecx,ecx,5
- vpsrldq xmm0,xmm5,4
- add ebx,esi
- and ebp,edx
- vpxor xmm6,xmm6,xmm2
- xor edx,edi
- add ebx,ecx
- vpxor xmm0,xmm0,xmm4
- shrd ecx,ecx,7
- xor ebp,edi
- vmovdqa XMMWORD PTR 16[esp],xmm1
- mov esi,ebx
- add eax,DWORD PTR 36[esp]
- vpxor xmm6,xmm6,xmm0
- xor ecx,edx
- shld ebx,ebx,5
- add eax,ebp
- and esi,ecx
- vpsrld xmm0,xmm6,31
- xor ecx,edx
- add eax,ebx
- shrd ebx,ebx,7
- xor esi,edx
- vpslldq xmm2,xmm6,12
- vpaddd xmm6,xmm6,xmm6
- mov ebp,eax
- add edi,DWORD PTR 40[esp]
- xor ebx,ecx
- shld eax,eax,5
- vpsrld xmm1,xmm2,30
- vpor xmm6,xmm6,xmm0
- add edi,esi
- and ebp,ebx
- xor ebx,ecx
- add edi,eax
- vpslld xmm2,xmm2,2
- vmovdqa xmm0,XMMWORD PTR 64[esp]
- shrd eax,eax,7
- xor ebp,ecx
- vpxor xmm6,xmm6,xmm1
- mov esi,edi
- add edx,DWORD PTR 44[esp]
- xor eax,ebx
- shld edi,edi,5
- vpxor xmm6,xmm6,xmm2
- add edx,ebp
- and esi,eax
- vmovdqa xmm2,XMMWORD PTR 112[esp]
- xor eax,ebx
- add edx,edi
- shrd edi,edi,7
- xor esi,ebx
- vpalignr xmm7,xmm4,xmm3,8
- mov ebp,edx
- add ecx,DWORD PTR 48[esp]
- vpaddd xmm2,xmm2,xmm6
- vmovdqa XMMWORD PTR 64[esp],xmm3
- xor edi,eax
- shld edx,edx,5
- vpsrldq xmm1,xmm6,4
- add ecx,esi
- and ebp,edi
- vpxor xmm7,xmm7,xmm3
- xor edi,eax
- add ecx,edx
- vpxor xmm1,xmm1,xmm5
- shrd edx,edx,7
- xor ebp,eax
- vmovdqa XMMWORD PTR 32[esp],xmm2
- mov esi,ecx
- add ebx,DWORD PTR 52[esp]
- vpxor xmm7,xmm7,xmm1
- xor edx,edi
- shld ecx,ecx,5
- add ebx,ebp
- and esi,edx
- vpsrld xmm1,xmm7,31
- xor edx,edi
- add ebx,ecx
- shrd ecx,ecx,7
- xor esi,edi
- vpslldq xmm3,xmm7,12
- vpaddd xmm7,xmm7,xmm7
- mov ebp,ebx
- add eax,DWORD PTR 56[esp]
- xor ecx,edx
- shld ebx,ebx,5
- vpsrld xmm2,xmm3,30
- vpor xmm7,xmm7,xmm1
- add eax,esi
- and ebp,ecx
- xor ecx,edx
- add eax,ebx
- vpslld xmm3,xmm3,2
- vmovdqa xmm1,XMMWORD PTR 80[esp]
- shrd ebx,ebx,7
- xor ebp,edx
- vpxor xmm7,xmm7,xmm2
- mov esi,eax
- add edi,DWORD PTR 60[esp]
- xor ebx,ecx
- shld eax,eax,5
- vpxor xmm7,xmm7,xmm3
- add edi,ebp
- and esi,ebx
- vmovdqa xmm3,XMMWORD PTR 112[esp]
- xor ebx,ecx
- add edi,eax
- vpalignr xmm2,xmm7,xmm6,8
- vpxor xmm0,xmm0,xmm4
- shrd eax,eax,7
- xor esi,ecx
- mov ebp,edi
- add edx,DWORD PTR [esp]
- vpxor xmm0,xmm0,xmm1
- vmovdqa XMMWORD PTR 80[esp],xmm4
- xor eax,ebx
- shld edi,edi,5
- vmovdqa xmm4,xmm3
- vpaddd xmm3,xmm3,xmm7
- add edx,esi
- and ebp,eax
- vpxor xmm0,xmm0,xmm2
- xor eax,ebx
- add edx,edi
- shrd edi,edi,7
- xor ebp,ebx
- vpsrld xmm2,xmm0,30
- vmovdqa XMMWORD PTR 48[esp],xmm3
- mov esi,edx
- add ecx,DWORD PTR 4[esp]
- xor edi,eax
- shld edx,edx,5
- vpslld xmm0,xmm0,2
- add ecx,ebp
- and esi,edi
- xor edi,eax
- add ecx,edx
- shrd edx,edx,7
- xor esi,eax
- mov ebp,ecx
- add ebx,DWORD PTR 8[esp]
- vpor xmm0,xmm0,xmm2
- xor edx,edi
- shld ecx,ecx,5
- vmovdqa xmm2,XMMWORD PTR 96[esp]
- add ebx,esi
- and ebp,edx
- xor edx,edi
- add ebx,ecx
- add eax,DWORD PTR 12[esp]
- xor ebp,edi
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpalignr xmm3,xmm0,xmm7,8
- vpxor xmm1,xmm1,xmm5
- add edi,DWORD PTR 16[esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- vpxor xmm1,xmm1,xmm2
- vmovdqa XMMWORD PTR 96[esp],xmm5
- add edi,esi
- xor ebp,ecx
- vmovdqa xmm5,xmm4
- vpaddd xmm4,xmm4,xmm0
- shrd ebx,ebx,7
- add edi,eax
- vpxor xmm1,xmm1,xmm3
- add edx,DWORD PTR 20[esp]
- xor ebp,ebx
- mov esi,edi
- shld edi,edi,5
- vpsrld xmm3,xmm1,30
- vmovdqa XMMWORD PTR [esp],xmm4
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- vpslld xmm1,xmm1,2
- add ecx,DWORD PTR 24[esp]
- xor esi,eax
- mov ebp,edx
- shld edx,edx,5
- add ecx,esi
- xor ebp,eax
- shrd edi,edi,7
- add ecx,edx
- vpor xmm1,xmm1,xmm3
- add ebx,DWORD PTR 28[esp]
- xor ebp,edi
- vmovdqa xmm3,XMMWORD PTR 64[esp]
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- vpalignr xmm4,xmm1,xmm0,8
- vpxor xmm2,xmm2,xmm6
- add eax,DWORD PTR 32[esp]
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- vpxor xmm2,xmm2,xmm3
- vmovdqa XMMWORD PTR 64[esp],xmm6
- add eax,esi
- xor ebp,edx
- vmovdqa xmm6,XMMWORD PTR 128[esp]
- vpaddd xmm5,xmm5,xmm1
- shrd ecx,ecx,7
- add eax,ebx
- vpxor xmm2,xmm2,xmm4
- add edi,DWORD PTR 36[esp]
- xor ebp,ecx
- mov esi,eax
- shld eax,eax,5
- vpsrld xmm4,xmm2,30
- vmovdqa XMMWORD PTR 16[esp],xmm5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- vpslld xmm2,xmm2,2
- add edx,DWORD PTR 40[esp]
- xor esi,ebx
- mov ebp,edi
- shld edi,edi,5
- add edx,esi
- xor ebp,ebx
- shrd eax,eax,7
- add edx,edi
- vpor xmm2,xmm2,xmm4
- add ecx,DWORD PTR 44[esp]
- xor ebp,eax
- vmovdqa xmm4,XMMWORD PTR 80[esp]
- mov esi,edx
- shld edx,edx,5
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- vpalignr xmm5,xmm2,xmm1,8
- vpxor xmm3,xmm3,xmm7
- add ebx,DWORD PTR 48[esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- vpxor xmm3,xmm3,xmm4
- vmovdqa XMMWORD PTR 80[esp],xmm7
- add ebx,esi
- xor ebp,edi
- vmovdqa xmm7,xmm6
- vpaddd xmm6,xmm6,xmm2
- shrd edx,edx,7
- add ebx,ecx
- vpxor xmm3,xmm3,xmm5
- add eax,DWORD PTR 52[esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- vpsrld xmm5,xmm3,30
- vmovdqa XMMWORD PTR 32[esp],xmm6
- add eax,ebp
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpslld xmm3,xmm3,2
- add edi,DWORD PTR 56[esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- add edi,esi
- xor ebp,ecx
- shrd ebx,ebx,7
- add edi,eax
- vpor xmm3,xmm3,xmm5
- add edx,DWORD PTR 60[esp]
- xor ebp,ebx
- vmovdqa xmm5,XMMWORD PTR 96[esp]
- mov esi,edi
- shld edi,edi,5
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- vpalignr xmm6,xmm3,xmm2,8
- vpxor xmm4,xmm4,xmm0
- add ecx,DWORD PTR [esp]
- xor esi,eax
- mov ebp,edx
- shld edx,edx,5
- vpxor xmm4,xmm4,xmm5
- vmovdqa XMMWORD PTR 96[esp],xmm0
- add ecx,esi
- xor ebp,eax
- vmovdqa xmm0,xmm7
- vpaddd xmm7,xmm7,xmm3
- shrd edi,edi,7
- add ecx,edx
- vpxor xmm4,xmm4,xmm6
- add ebx,DWORD PTR 4[esp]
- xor ebp,edi
- mov esi,ecx
- shld ecx,ecx,5
- vpsrld xmm6,xmm4,30
- vmovdqa XMMWORD PTR 48[esp],xmm7
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- vpslld xmm4,xmm4,2
- add eax,DWORD PTR 8[esp]
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- add eax,esi
- xor ebp,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpor xmm4,xmm4,xmm6
- add edi,DWORD PTR 12[esp]
- xor ebp,ecx
- vmovdqa xmm6,XMMWORD PTR 64[esp]
- mov esi,eax
- shld eax,eax,5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- vpalignr xmm7,xmm4,xmm3,8
- vpxor xmm5,xmm5,xmm1
- add edx,DWORD PTR 16[esp]
- xor esi,ebx
- mov ebp,edi
- shld edi,edi,5
- vpxor xmm5,xmm5,xmm6
- vmovdqa XMMWORD PTR 64[esp],xmm1
- add edx,esi
- xor ebp,ebx
- vmovdqa xmm1,xmm0
- vpaddd xmm0,xmm0,xmm4
- shrd eax,eax,7
- add edx,edi
- vpxor xmm5,xmm5,xmm7
- add ecx,DWORD PTR 20[esp]
- xor ebp,eax
- mov esi,edx
- shld edx,edx,5
- vpsrld xmm7,xmm5,30
- vmovdqa XMMWORD PTR [esp],xmm0
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- vpslld xmm5,xmm5,2
- add ebx,DWORD PTR 24[esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- vpor xmm5,xmm5,xmm7
- add eax,DWORD PTR 28[esp]
- vmovdqa xmm7,XMMWORD PTR 80[esp]
- shrd ecx,ecx,7
- mov esi,ebx
- xor ebp,edx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,ecx
- xor ecx,edx
- add eax,ebx
- vpalignr xmm0,xmm5,xmm4,8
- vpxor xmm6,xmm6,xmm2
- add edi,DWORD PTR 32[esp]
- and esi,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- vpxor xmm6,xmm6,xmm7
- vmovdqa XMMWORD PTR 80[esp],xmm2
- mov ebp,eax
- xor esi,ecx
- vmovdqa xmm2,xmm1
- vpaddd xmm1,xmm1,xmm5
- shld eax,eax,5
- add edi,esi
- vpxor xmm6,xmm6,xmm0
- xor ebp,ebx
- xor ebx,ecx
- add edi,eax
- add edx,DWORD PTR 36[esp]
- vpsrld xmm0,xmm6,30
- vmovdqa XMMWORD PTR 16[esp],xmm1
- and ebp,ebx
- xor ebx,ecx
- shrd eax,eax,7
- mov esi,edi
- vpslld xmm6,xmm6,2
- xor ebp,ebx
- shld edi,edi,5
- add edx,ebp
- xor esi,eax
- xor eax,ebx
- add edx,edi
- add ecx,DWORD PTR 40[esp]
- and esi,eax
- vpor xmm6,xmm6,xmm0
- xor eax,ebx
- shrd edi,edi,7
- vmovdqa xmm0,XMMWORD PTR 96[esp]
- mov ebp,edx
- xor esi,eax
- shld edx,edx,5
- add ecx,esi
- xor ebp,edi
- xor edi,eax
- add ecx,edx
- add ebx,DWORD PTR 44[esp]
- and ebp,edi
- xor edi,eax
- shrd edx,edx,7
- mov esi,ecx
- xor ebp,edi
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edx
- xor edx,edi
- add ebx,ecx
- vpalignr xmm1,xmm6,xmm5,8
- vpxor xmm7,xmm7,xmm3
- add eax,DWORD PTR 48[esp]
- and esi,edx
- xor edx,edi
- shrd ecx,ecx,7
- vpxor xmm7,xmm7,xmm0
- vmovdqa XMMWORD PTR 96[esp],xmm3
- mov ebp,ebx
- xor esi,edx
- vmovdqa xmm3,XMMWORD PTR 144[esp]
- vpaddd xmm2,xmm2,xmm6
- shld ebx,ebx,5
- add eax,esi
- vpxor xmm7,xmm7,xmm1
- xor ebp,ecx
- xor ecx,edx
- add eax,ebx
- add edi,DWORD PTR 52[esp]
- vpsrld xmm1,xmm7,30
- vmovdqa XMMWORD PTR 32[esp],xmm2
- and ebp,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- mov esi,eax
- vpslld xmm7,xmm7,2
- xor ebp,ecx
- shld eax,eax,5
- add edi,ebp
- xor esi,ebx
- xor ebx,ecx
- add edi,eax
- add edx,DWORD PTR 56[esp]
- and esi,ebx
- vpor xmm7,xmm7,xmm1
- xor ebx,ecx
- shrd eax,eax,7
- vmovdqa xmm1,XMMWORD PTR 64[esp]
- mov ebp,edi
- xor esi,ebx
- shld edi,edi,5
- add edx,esi
- xor ebp,eax
- xor eax,ebx
- add edx,edi
- add ecx,DWORD PTR 60[esp]
- and ebp,eax
- xor eax,ebx
- shrd edi,edi,7
- mov esi,edx
- xor ebp,eax
- shld edx,edx,5
- add ecx,ebp
- xor esi,edi
- xor edi,eax
- add ecx,edx
- vpalignr xmm2,xmm7,xmm6,8
- vpxor xmm0,xmm0,xmm4
- add ebx,DWORD PTR [esp]
- and esi,edi
- xor edi,eax
- shrd edx,edx,7
- vpxor xmm0,xmm0,xmm1
- vmovdqa XMMWORD PTR 64[esp],xmm4
- mov ebp,ecx
- xor esi,edi
- vmovdqa xmm4,xmm3
- vpaddd xmm3,xmm3,xmm7
- shld ecx,ecx,5
- add ebx,esi
- vpxor xmm0,xmm0,xmm2
- xor ebp,edx
- xor edx,edi
- add ebx,ecx
- add eax,DWORD PTR 4[esp]
- vpsrld xmm2,xmm0,30
- vmovdqa XMMWORD PTR 48[esp],xmm3
- and ebp,edx
- xor edx,edi
- shrd ecx,ecx,7
- mov esi,ebx
- vpslld xmm0,xmm0,2
- xor ebp,edx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,ecx
- xor ecx,edx
- add eax,ebx
- add edi,DWORD PTR 8[esp]
- and esi,ecx
- vpor xmm0,xmm0,xmm2
- xor ecx,edx
- shrd ebx,ebx,7
- vmovdqa xmm2,XMMWORD PTR 80[esp]
- mov ebp,eax
- xor esi,ecx
- shld eax,eax,5
- add edi,esi
- xor ebp,ebx
- xor ebx,ecx
- add edi,eax
- add edx,DWORD PTR 12[esp]
- and ebp,ebx
- xor ebx,ecx
- shrd eax,eax,7
- mov esi,edi
- xor ebp,ebx
- shld edi,edi,5
- add edx,ebp
- xor esi,eax
- xor eax,ebx
- add edx,edi
- vpalignr xmm3,xmm0,xmm7,8
- vpxor xmm1,xmm1,xmm5
- add ecx,DWORD PTR 16[esp]
- and esi,eax
- xor eax,ebx
- shrd edi,edi,7
- vpxor xmm1,xmm1,xmm2
- vmovdqa XMMWORD PTR 80[esp],xmm5
- mov ebp,edx
- xor esi,eax
- vmovdqa xmm5,xmm4
- vpaddd xmm4,xmm4,xmm0
- shld edx,edx,5
- add ecx,esi
- vpxor xmm1,xmm1,xmm3
- xor ebp,edi
- xor edi,eax
- add ecx,edx
- add ebx,DWORD PTR 20[esp]
- vpsrld xmm3,xmm1,30
- vmovdqa XMMWORD PTR [esp],xmm4
- and ebp,edi
- xor edi,eax
- shrd edx,edx,7
- mov esi,ecx
- vpslld xmm1,xmm1,2
- xor ebp,edi
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edx
- xor edx,edi
- add ebx,ecx
- add eax,DWORD PTR 24[esp]
- and esi,edx
- vpor xmm1,xmm1,xmm3
- xor edx,edi
- shrd ecx,ecx,7
- vmovdqa xmm3,XMMWORD PTR 96[esp]
- mov ebp,ebx
- xor esi,edx
- shld ebx,ebx,5
- add eax,esi
- xor ebp,ecx
- xor ecx,edx
- add eax,ebx
- add edi,DWORD PTR 28[esp]
- and ebp,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- mov esi,eax
- xor ebp,ecx
- shld eax,eax,5
- add edi,ebp
- xor esi,ebx
- xor ebx,ecx
- add edi,eax
- vpalignr xmm4,xmm1,xmm0,8
- vpxor xmm2,xmm2,xmm6
- add edx,DWORD PTR 32[esp]
- and esi,ebx
- xor ebx,ecx
- shrd eax,eax,7
- vpxor xmm2,xmm2,xmm3
- vmovdqa XMMWORD PTR 96[esp],xmm6
- mov ebp,edi
- xor esi,ebx
- vmovdqa xmm6,xmm5
- vpaddd xmm5,xmm5,xmm1
- shld edi,edi,5
- add edx,esi
- vpxor xmm2,xmm2,xmm4
- xor ebp,eax
- xor eax,ebx
- add edx,edi
- add ecx,DWORD PTR 36[esp]
- vpsrld xmm4,xmm2,30
- vmovdqa XMMWORD PTR 16[esp],xmm5
- and ebp,eax
- xor eax,ebx
- shrd edi,edi,7
- mov esi,edx
- vpslld xmm2,xmm2,2
- xor ebp,eax
- shld edx,edx,5
- add ecx,ebp
- xor esi,edi
- xor edi,eax
- add ecx,edx
- add ebx,DWORD PTR 40[esp]
- and esi,edi
- vpor xmm2,xmm2,xmm4
- xor edi,eax
- shrd edx,edx,7
- vmovdqa xmm4,XMMWORD PTR 64[esp]
- mov ebp,ecx
- xor esi,edi
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edx
- xor edx,edi
- add ebx,ecx
- add eax,DWORD PTR 44[esp]
- and ebp,edx
- xor edx,edi
- shrd ecx,ecx,7
- mov esi,ebx
- xor ebp,edx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,edx
- add eax,ebx
- vpalignr xmm5,xmm2,xmm1,8
- vpxor xmm3,xmm3,xmm7
- add edi,DWORD PTR 48[esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- vpxor xmm3,xmm3,xmm4
- vmovdqa XMMWORD PTR 64[esp],xmm7
- add edi,esi
- xor ebp,ecx
- vmovdqa xmm7,xmm6
- vpaddd xmm6,xmm6,xmm2
- shrd ebx,ebx,7
- add edi,eax
- vpxor xmm3,xmm3,xmm5
- add edx,DWORD PTR 52[esp]
- xor ebp,ebx
- mov esi,edi
- shld edi,edi,5
- vpsrld xmm5,xmm3,30
- vmovdqa XMMWORD PTR 32[esp],xmm6
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- vpslld xmm3,xmm3,2
- add ecx,DWORD PTR 56[esp]
- xor esi,eax
- mov ebp,edx
- shld edx,edx,5
- add ecx,esi
- xor ebp,eax
- shrd edi,edi,7
- add ecx,edx
- vpor xmm3,xmm3,xmm5
- add ebx,DWORD PTR 60[esp]
- xor ebp,edi
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD PTR [esp]
- vpaddd xmm7,xmm7,xmm3
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- add eax,esi
- vmovdqa XMMWORD PTR 48[esp],xmm7
- xor ebp,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD PTR 4[esp]
- xor ebp,ecx
- mov esi,eax
- shld eax,eax,5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD PTR 8[esp]
- xor esi,ebx
- mov ebp,edi
- shld edi,edi,5
- add edx,esi
- xor ebp,ebx
- shrd eax,eax,7
- add edx,edi
- add ecx,DWORD PTR 12[esp]
- xor ebp,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- mov ebp,DWORD PTR 196[esp]
- cmp ebp,DWORD PTR 200[esp]
- je $L010done
- vmovdqa xmm7,XMMWORD PTR 160[esp]
- vmovdqa xmm6,XMMWORD PTR 176[esp]
- vmovdqu xmm0,XMMWORD PTR [ebp]
- vmovdqu xmm1,XMMWORD PTR 16[ebp]
- vmovdqu xmm2,XMMWORD PTR 32[ebp]
- vmovdqu xmm3,XMMWORD PTR 48[ebp]
- add ebp,64
- vpshufb xmm0,xmm0,xmm6
- mov DWORD PTR 196[esp],ebp
- vmovdqa XMMWORD PTR 96[esp],xmm7
- add ebx,DWORD PTR 16[esp]
- xor esi,edi
- vpshufb xmm1,xmm1,xmm6
- mov ebp,ecx
- shld ecx,ecx,5
- vpaddd xmm4,xmm0,xmm7
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- vmovdqa XMMWORD PTR [esp],xmm4
- add eax,DWORD PTR 20[esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD PTR 24[esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- add edi,esi
- xor ebp,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD PTR 28[esp]
- xor ebp,ebx
- mov esi,edi
- shld edi,edi,5
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- add ecx,DWORD PTR 32[esp]
- xor esi,eax
- vpshufb xmm2,xmm2,xmm6
- mov ebp,edx
- shld edx,edx,5
- vpaddd xmm5,xmm1,xmm7
- add ecx,esi
- xor ebp,eax
- shrd edi,edi,7
- add ecx,edx
- vmovdqa XMMWORD PTR 16[esp],xmm5
- add ebx,DWORD PTR 36[esp]
- xor ebp,edi
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD PTR 40[esp]
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- add eax,esi
- xor ebp,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD PTR 44[esp]
- xor ebp,ecx
- mov esi,eax
- shld eax,eax,5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD PTR 48[esp]
- xor esi,ebx
- vpshufb xmm3,xmm3,xmm6
- mov ebp,edi
- shld edi,edi,5
- vpaddd xmm6,xmm2,xmm7
- add edx,esi
- xor ebp,ebx
- shrd eax,eax,7
- add edx,edi
- vmovdqa XMMWORD PTR 32[esp],xmm6
- add ecx,DWORD PTR 52[esp]
- xor ebp,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- add ebx,DWORD PTR 56[esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD PTR 60[esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- shrd ecx,ecx,7
- add eax,ebx
- mov ebp,DWORD PTR 192[esp]
- add eax,DWORD PTR [ebp]
- add esi,DWORD PTR 4[ebp]
- add ecx,DWORD PTR 8[ebp]
- mov DWORD PTR [ebp],eax
- add edx,DWORD PTR 12[ebp]
- mov DWORD PTR 4[ebp],esi
- add edi,DWORD PTR 16[ebp]
- mov ebx,ecx
- mov DWORD PTR 8[ebp],ecx
- xor ebx,edx
- mov DWORD PTR 12[ebp],edx
- mov DWORD PTR 16[ebp],edi
- mov ebp,esi
- and esi,ebx
- mov ebx,ebp
- jmp $L009loop
-ALIGN 16
-$L010done:
- add ebx,DWORD PTR 16[esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD PTR 20[esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD PTR 24[esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- add edi,esi
- xor ebp,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD PTR 28[esp]
- xor ebp,ebx
- mov esi,edi
- shld edi,edi,5
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- add ecx,DWORD PTR 32[esp]
- xor esi,eax
- mov ebp,edx
- shld edx,edx,5
- add ecx,esi
- xor ebp,eax
- shrd edi,edi,7
- add ecx,edx
- add ebx,DWORD PTR 36[esp]
- xor ebp,edi
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD PTR 40[esp]
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- add eax,esi
- xor ebp,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD PTR 44[esp]
- xor ebp,ecx
- mov esi,eax
- shld eax,eax,5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD PTR 48[esp]
- xor esi,ebx
- mov ebp,edi
- shld edi,edi,5
- add edx,esi
- xor ebp,ebx
- shrd eax,eax,7
- add edx,edi
- add ecx,DWORD PTR 52[esp]
- xor ebp,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- add ebx,DWORD PTR 56[esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD PTR 60[esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- shrd ecx,ecx,7
- add eax,ebx
- vzeroall
- mov ebp,DWORD PTR 192[esp]
- add eax,DWORD PTR [ebp]
- mov esp,DWORD PTR 204[esp]
- add esi,DWORD PTR 4[ebp]
- add ecx,DWORD PTR 8[ebp]
- mov DWORD PTR [ebp],eax
- add edx,DWORD PTR 12[ebp]
- mov DWORD PTR 4[ebp],esi
- add edi,DWORD PTR 16[ebp]
- mov DWORD PTR 8[ebp],ecx
- mov DWORD PTR 12[ebp],edx
- mov DWORD PTR 16[ebp],edi
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-__sha1_block_data_order_avx ENDP
ALIGN 64
$LK_XX_XX::
DD 1518500249,1518500249,1518500249,1518500249
diff --git a/deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm b/deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm
index d184877bb7..b6af4ab064 100644
--- a/deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm
+++ b/deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm
@@ -56,13 +56,12 @@ $L000pic_point:
or ecx,ebx
and ecx,1342177280
cmp ecx,1342177280
- je $L005AVX
test ebx,512
- jnz $L006SSSE3
+ jnz $L005SSSE3
$L003no_xmm:
sub eax,edi
cmp eax,256
- jae $L007unrolled
+ jae $L006unrolled
jmp $L002loop
ALIGN 16
$L002loop:
@@ -134,7 +133,7 @@ $L002loop:
mov DWORD PTR 28[esp],ecx
mov DWORD PTR 32[esp],edi
ALIGN 16
-$L00800_15:
+$L00700_15:
mov ecx,edx
mov esi,DWORD PTR 24[esp]
ror ecx,14
@@ -172,11 +171,11 @@ $L00800_15:
add ebp,4
add eax,ebx
cmp esi,3248222580
- jne $L00800_15
+ jne $L00700_15
mov ecx,DWORD PTR 156[esp]
- jmp $L00916_63
+ jmp $L00816_63
ALIGN 16
-$L00916_63:
+$L00816_63:
mov ebx,ecx
mov esi,DWORD PTR 104[esp]
ror ecx,11
@@ -231,7 +230,7 @@ $L00916_63:
add ebp,4
add eax,ebx
cmp esi,3329325298
- jne $L00916_63
+ jne $L00816_63
mov esi,DWORD PTR 356[esp]
mov ebx,DWORD PTR 8[esp]
mov ecx,DWORD PTR 16[esp]
@@ -290,7 +289,7 @@ DB 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
DB 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
DB 62,0
ALIGN 16
-$L007unrolled:
+$L006unrolled:
lea esp,DWORD PTR [esp-96]
mov eax,DWORD PTR [esi]
mov ebp,DWORD PTR 4[esi]
@@ -307,9 +306,9 @@ $L007unrolled:
mov DWORD PTR 20[esp],ebx
mov DWORD PTR 24[esp],ecx
mov DWORD PTR 28[esp],esi
- jmp $L010grand_loop
+ jmp $L009grand_loop
ALIGN 16
-$L010grand_loop:
+$L009grand_loop:
mov ebx,DWORD PTR [edi]
mov ecx,DWORD PTR 4[edi]
bswap ebx
@@ -3189,7 +3188,7 @@ $L010grand_loop:
mov DWORD PTR 24[esp],ebx
mov DWORD PTR 28[esp],ecx
cmp edi,DWORD PTR 104[esp]
- jb $L010grand_loop
+ jb $L009grand_loop
mov esp,DWORD PTR 108[esp]
pop edi
pop esi
@@ -3208,9 +3207,9 @@ $L004shaext:
pshufd xmm2,xmm2,27
DB 102,15,58,15,202,8
punpcklqdq xmm2,xmm0
- jmp $L011loop_shaext
+ jmp $L010loop_shaext
ALIGN 16
-$L011loop_shaext:
+$L010loop_shaext:
movdqu xmm3,XMMWORD PTR [edi]
movdqu xmm4,XMMWORD PTR 16[edi]
movdqu xmm5,XMMWORD PTR 32[edi]
@@ -3380,7 +3379,7 @@ DB 15,56,203,209
DB 15,56,203,202
paddd xmm2,XMMWORD PTR 16[esp]
paddd xmm1,XMMWORD PTR [esp]
- jnz $L011loop_shaext
+ jnz $L010loop_shaext
pshufd xmm2,xmm2,177
pshufd xmm7,xmm1,27
pshufd xmm1,xmm1,177
@@ -3395,7 +3394,7 @@ DB 102,15,58,15,215,8
pop ebp
ret
ALIGN 32
-$L006SSSE3:
+$L005SSSE3:
lea esp,DWORD PTR [esp-96]
mov eax,DWORD PTR [esi]
mov ebx,DWORD PTR 4[esi]
@@ -3414,9 +3413,9 @@ $L006SSSE3:
mov DWORD PTR 24[esp],ecx
mov DWORD PTR 28[esp],esi
movdqa xmm7,XMMWORD PTR 256[ebp]
- jmp $L012grand_ssse3
+ jmp $L011grand_ssse3
ALIGN 16
-$L012grand_ssse3:
+$L011grand_ssse3:
movdqu xmm0,XMMWORD PTR [edi]
movdqu xmm1,XMMWORD PTR 16[edi]
movdqu xmm2,XMMWORD PTR 32[edi]
@@ -3439,9 +3438,9 @@ DB 102,15,56,0,223
paddd xmm7,xmm3
movdqa XMMWORD PTR 64[esp],xmm6
movdqa XMMWORD PTR 80[esp],xmm7
- jmp $L013ssse3_00_47
+ jmp $L012ssse3_00_47
ALIGN 16
-$L013ssse3_00_47:
+$L012ssse3_00_47:
add ebp,64
mov ecx,edx
movdqa xmm4,xmm1
@@ -4084,7 +4083,7 @@ DB 102,15,58,15,249,4
add eax,ecx
movdqa XMMWORD PTR 80[esp],xmm6
cmp DWORD PTR 64[ebp],66051
- jne $L013ssse3_00_47
+ jne $L012ssse3_00_47
mov ecx,edx
ror edx,14
mov esi,DWORD PTR 20[esp]
@@ -4598,2218 +4597,13 @@ DB 102,15,58,15,249,4
movdqa xmm7,XMMWORD PTR 64[ebp]
sub ebp,192
cmp edi,DWORD PTR 104[esp]
- jb $L012grand_ssse3
+ jb $L011grand_ssse3
mov esp,DWORD PTR 108[esp]
pop edi
pop esi
pop ebx
pop ebp
ret
-ALIGN 32
-$L005AVX:
- and edx,264
- cmp edx,264
- je $L014AVX_BMI
- lea esp,DWORD PTR [esp-96]
- vzeroall
- mov eax,DWORD PTR [esi]
- mov ebx,DWORD PTR 4[esi]
- mov ecx,DWORD PTR 8[esi]
- mov edi,DWORD PTR 12[esi]
- mov DWORD PTR 4[esp],ebx
- xor ebx,ecx
- mov DWORD PTR 8[esp],ecx
- mov DWORD PTR 12[esp],edi
- mov edx,DWORD PTR 16[esi]
- mov edi,DWORD PTR 20[esi]
- mov ecx,DWORD PTR 24[esi]
- mov esi,DWORD PTR 28[esi]
- mov DWORD PTR 20[esp],edi
- mov edi,DWORD PTR 100[esp]
- mov DWORD PTR 24[esp],ecx
- mov DWORD PTR 28[esp],esi
- vmovdqa xmm7,XMMWORD PTR 256[ebp]
- jmp $L015grand_avx
-ALIGN 32
-$L015grand_avx:
- vmovdqu xmm0,XMMWORD PTR [edi]
- vmovdqu xmm1,XMMWORD PTR 16[edi]
- vmovdqu xmm2,XMMWORD PTR 32[edi]
- vmovdqu xmm3,XMMWORD PTR 48[edi]
- add edi,64
- vpshufb xmm0,xmm0,xmm7
- mov DWORD PTR 100[esp],edi
- vpshufb xmm1,xmm1,xmm7
- vpshufb xmm2,xmm2,xmm7
- vpaddd xmm4,xmm0,XMMWORD PTR [ebp]
- vpshufb xmm3,xmm3,xmm7
- vpaddd xmm5,xmm1,XMMWORD PTR 16[ebp]
- vpaddd xmm6,xmm2,XMMWORD PTR 32[ebp]
- vpaddd xmm7,xmm3,XMMWORD PTR 48[ebp]
- vmovdqa XMMWORD PTR 32[esp],xmm4
- vmovdqa XMMWORD PTR 48[esp],xmm5
- vmovdqa XMMWORD PTR 64[esp],xmm6
- vmovdqa XMMWORD PTR 80[esp],xmm7
- jmp $L016avx_00_47
-ALIGN 16
-$L016avx_00_47:
- add ebp,64
- vpalignr xmm4,xmm1,xmm0,4
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 20[esp]
- vpalignr xmm7,xmm3,xmm2,4
- xor edx,ecx
- mov edi,DWORD PTR 24[esp]
- xor esi,edi
- vpsrld xmm6,xmm4,7
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 16[esp],ecx
- vpaddd xmm0,xmm0,xmm7
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrld xmm7,xmm4,3
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 4[esp]
- vpslld xmm5,xmm4,14
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR [esp],eax
- vpxor xmm4,xmm7,xmm6
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 28[esp]
- vpshufd xmm7,xmm3,250
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpsrld xmm6,xmm6,11
- add edx,DWORD PTR 32[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpxor xmm4,xmm4,xmm5
- add ebx,edx
- add edx,DWORD PTR 12[esp]
- add ebx,ecx
- vpslld xmm5,xmm5,11
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 16[esp]
- vpxor xmm4,xmm4,xmm6
- xor edx,ecx
- mov edi,DWORD PTR 20[esp]
- xor esi,edi
- vpsrld xmm6,xmm7,10
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 12[esp],ecx
- vpxor xmm4,xmm4,xmm5
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR [esp]
- vpaddd xmm0,xmm0,xmm4
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 28[esp],ebx
- vpxor xmm6,xmm6,xmm5
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 24[esp]
- vpsrlq xmm7,xmm7,19
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- add edx,DWORD PTR 36[esp]
- xor eax,edi
- shrd ecx,ecx,2
- vpshufd xmm7,xmm6,132
- add eax,edx
- add edx,DWORD PTR 8[esp]
- add eax,ecx
- vpsrldq xmm7,xmm7,8
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 12[esp]
- vpaddd xmm0,xmm0,xmm7
- xor edx,ecx
- mov edi,DWORD PTR 16[esp]
- xor esi,edi
- vpshufd xmm7,xmm0,80
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 8[esp],ecx
- vpsrld xmm6,xmm7,10
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 28[esp]
- vpxor xmm6,xmm6,xmm5
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 24[esp],eax
- vpsrlq xmm7,xmm7,19
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 20[esp]
- vpxor xmm6,xmm6,xmm7
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpshufd xmm7,xmm6,232
- add edx,DWORD PTR 40[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpslldq xmm7,xmm7,8
- add ebx,edx
- add edx,DWORD PTR 4[esp]
- add ebx,ecx
- vpaddd xmm0,xmm0,xmm7
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 8[esp]
- vpaddd xmm6,xmm0,XMMWORD PTR [ebp]
- xor edx,ecx
- mov edi,DWORD PTR 12[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 4[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 24[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 20[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 16[esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 44[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR [esp]
- add eax,ecx
- vmovdqa XMMWORD PTR 32[esp],xmm6
- vpalignr xmm4,xmm2,xmm1,4
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 4[esp]
- vpalignr xmm7,xmm0,xmm3,4
- xor edx,ecx
- mov edi,DWORD PTR 8[esp]
- xor esi,edi
- vpsrld xmm6,xmm4,7
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR [esp],ecx
- vpaddd xmm1,xmm1,xmm7
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrld xmm7,xmm4,3
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 20[esp]
- vpslld xmm5,xmm4,14
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 16[esp],eax
- vpxor xmm4,xmm7,xmm6
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 12[esp]
- vpshufd xmm7,xmm0,250
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpsrld xmm6,xmm6,11
- add edx,DWORD PTR 48[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpxor xmm4,xmm4,xmm5
- add ebx,edx
- add edx,DWORD PTR 28[esp]
- add ebx,ecx
- vpslld xmm5,xmm5,11
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR [esp]
- vpxor xmm4,xmm4,xmm6
- xor edx,ecx
- mov edi,DWORD PTR 4[esp]
- xor esi,edi
- vpsrld xmm6,xmm7,10
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 28[esp],ecx
- vpxor xmm4,xmm4,xmm5
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 16[esp]
- vpaddd xmm1,xmm1,xmm4
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 12[esp],ebx
- vpxor xmm6,xmm6,xmm5
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 8[esp]
- vpsrlq xmm7,xmm7,19
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- add edx,DWORD PTR 52[esp]
- xor eax,edi
- shrd ecx,ecx,2
- vpshufd xmm7,xmm6,132
- add eax,edx
- add edx,DWORD PTR 24[esp]
- add eax,ecx
- vpsrldq xmm7,xmm7,8
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 28[esp]
- vpaddd xmm1,xmm1,xmm7
- xor edx,ecx
- mov edi,DWORD PTR [esp]
- xor esi,edi
- vpshufd xmm7,xmm1,80
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 24[esp],ecx
- vpsrld xmm6,xmm7,10
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 12[esp]
- vpxor xmm6,xmm6,xmm5
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 8[esp],eax
- vpsrlq xmm7,xmm7,19
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 4[esp]
- vpxor xmm6,xmm6,xmm7
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpshufd xmm7,xmm6,232
- add edx,DWORD PTR 56[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpslldq xmm7,xmm7,8
- add ebx,edx
- add edx,DWORD PTR 20[esp]
- add ebx,ecx
- vpaddd xmm1,xmm1,xmm7
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 24[esp]
- vpaddd xmm6,xmm1,XMMWORD PTR 16[ebp]
- xor edx,ecx
- mov edi,DWORD PTR 28[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 20[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 8[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 4[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR [esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 60[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR 16[esp]
- add eax,ecx
- vmovdqa XMMWORD PTR 48[esp],xmm6
- vpalignr xmm4,xmm3,xmm2,4
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 20[esp]
- vpalignr xmm7,xmm1,xmm0,4
- xor edx,ecx
- mov edi,DWORD PTR 24[esp]
- xor esi,edi
- vpsrld xmm6,xmm4,7
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 16[esp],ecx
- vpaddd xmm2,xmm2,xmm7
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrld xmm7,xmm4,3
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 4[esp]
- vpslld xmm5,xmm4,14
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR [esp],eax
- vpxor xmm4,xmm7,xmm6
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 28[esp]
- vpshufd xmm7,xmm1,250
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpsrld xmm6,xmm6,11
- add edx,DWORD PTR 64[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpxor xmm4,xmm4,xmm5
- add ebx,edx
- add edx,DWORD PTR 12[esp]
- add ebx,ecx
- vpslld xmm5,xmm5,11
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 16[esp]
- vpxor xmm4,xmm4,xmm6
- xor edx,ecx
- mov edi,DWORD PTR 20[esp]
- xor esi,edi
- vpsrld xmm6,xmm7,10
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 12[esp],ecx
- vpxor xmm4,xmm4,xmm5
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR [esp]
- vpaddd xmm2,xmm2,xmm4
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 28[esp],ebx
- vpxor xmm6,xmm6,xmm5
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 24[esp]
- vpsrlq xmm7,xmm7,19
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- add edx,DWORD PTR 68[esp]
- xor eax,edi
- shrd ecx,ecx,2
- vpshufd xmm7,xmm6,132
- add eax,edx
- add edx,DWORD PTR 8[esp]
- add eax,ecx
- vpsrldq xmm7,xmm7,8
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 12[esp]
- vpaddd xmm2,xmm2,xmm7
- xor edx,ecx
- mov edi,DWORD PTR 16[esp]
- xor esi,edi
- vpshufd xmm7,xmm2,80
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 8[esp],ecx
- vpsrld xmm6,xmm7,10
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 28[esp]
- vpxor xmm6,xmm6,xmm5
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 24[esp],eax
- vpsrlq xmm7,xmm7,19
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 20[esp]
- vpxor xmm6,xmm6,xmm7
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpshufd xmm7,xmm6,232
- add edx,DWORD PTR 72[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpslldq xmm7,xmm7,8
- add ebx,edx
- add edx,DWORD PTR 4[esp]
- add ebx,ecx
- vpaddd xmm2,xmm2,xmm7
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 8[esp]
- vpaddd xmm6,xmm2,XMMWORD PTR 32[ebp]
- xor edx,ecx
- mov edi,DWORD PTR 12[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 4[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 24[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 20[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 16[esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 76[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR [esp]
- add eax,ecx
- vmovdqa XMMWORD PTR 64[esp],xmm6
- vpalignr xmm4,xmm0,xmm3,4
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 4[esp]
- vpalignr xmm7,xmm2,xmm1,4
- xor edx,ecx
- mov edi,DWORD PTR 8[esp]
- xor esi,edi
- vpsrld xmm6,xmm4,7
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR [esp],ecx
- vpaddd xmm3,xmm3,xmm7
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrld xmm7,xmm4,3
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 20[esp]
- vpslld xmm5,xmm4,14
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 16[esp],eax
- vpxor xmm4,xmm7,xmm6
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 12[esp]
- vpshufd xmm7,xmm2,250
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpsrld xmm6,xmm6,11
- add edx,DWORD PTR 80[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpxor xmm4,xmm4,xmm5
- add ebx,edx
- add edx,DWORD PTR 28[esp]
- add ebx,ecx
- vpslld xmm5,xmm5,11
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR [esp]
- vpxor xmm4,xmm4,xmm6
- xor edx,ecx
- mov edi,DWORD PTR 4[esp]
- xor esi,edi
- vpsrld xmm6,xmm7,10
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 28[esp],ecx
- vpxor xmm4,xmm4,xmm5
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 16[esp]
- vpaddd xmm3,xmm3,xmm4
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 12[esp],ebx
- vpxor xmm6,xmm6,xmm5
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 8[esp]
- vpsrlq xmm7,xmm7,19
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- add edx,DWORD PTR 84[esp]
- xor eax,edi
- shrd ecx,ecx,2
- vpshufd xmm7,xmm6,132
- add eax,edx
- add edx,DWORD PTR 24[esp]
- add eax,ecx
- vpsrldq xmm7,xmm7,8
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 28[esp]
- vpaddd xmm3,xmm3,xmm7
- xor edx,ecx
- mov edi,DWORD PTR [esp]
- xor esi,edi
- vpshufd xmm7,xmm3,80
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 24[esp],ecx
- vpsrld xmm6,xmm7,10
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 12[esp]
- vpxor xmm6,xmm6,xmm5
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 8[esp],eax
- vpsrlq xmm7,xmm7,19
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 4[esp]
- vpxor xmm6,xmm6,xmm7
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpshufd xmm7,xmm6,232
- add edx,DWORD PTR 88[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpslldq xmm7,xmm7,8
- add ebx,edx
- add edx,DWORD PTR 20[esp]
- add ebx,ecx
- vpaddd xmm3,xmm3,xmm7
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 24[esp]
- vpaddd xmm6,xmm3,XMMWORD PTR 48[ebp]
- xor edx,ecx
- mov edi,DWORD PTR 28[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 20[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 8[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 4[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR [esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 92[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR 16[esp]
- add eax,ecx
- vmovdqa XMMWORD PTR 80[esp],xmm6
- cmp DWORD PTR 64[ebp],66051
- jne $L016avx_00_47
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 20[esp]
- xor edx,ecx
- mov edi,DWORD PTR 24[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 16[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 4[esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR [esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 28[esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD PTR 32[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD PTR 12[esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 16[esp]
- xor edx,ecx
- mov edi,DWORD PTR 20[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 12[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR [esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 28[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 24[esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 36[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR 8[esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 12[esp]
- xor edx,ecx
- mov edi,DWORD PTR 16[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 8[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 28[esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 24[esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 20[esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD PTR 40[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD PTR 4[esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 8[esp]
- xor edx,ecx
- mov edi,DWORD PTR 12[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 4[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 24[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 20[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 16[esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 44[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR [esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 4[esp]
- xor edx,ecx
- mov edi,DWORD PTR 8[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR [esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 20[esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 16[esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 12[esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD PTR 48[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD PTR 28[esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR [esp]
- xor edx,ecx
- mov edi,DWORD PTR 4[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 28[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 16[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 12[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 8[esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 52[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR 24[esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 28[esp]
- xor edx,ecx
- mov edi,DWORD PTR [esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 24[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 12[esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 8[esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 4[esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD PTR 56[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD PTR 20[esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 24[esp]
- xor edx,ecx
- mov edi,DWORD PTR 28[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 20[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 8[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 4[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR [esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 60[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR 16[esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 20[esp]
- xor edx,ecx
- mov edi,DWORD PTR 24[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 16[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 4[esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR [esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 28[esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD PTR 64[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD PTR 12[esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 16[esp]
- xor edx,ecx
- mov edi,DWORD PTR 20[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 12[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR [esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 28[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 24[esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 68[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR 8[esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 12[esp]
- xor edx,ecx
- mov edi,DWORD PTR 16[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 8[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 28[esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 24[esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 20[esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD PTR 72[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD PTR 4[esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 8[esp]
- xor edx,ecx
- mov edi,DWORD PTR 12[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 4[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 24[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 20[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 16[esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 76[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR [esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 4[esp]
- xor edx,ecx
- mov edi,DWORD PTR 8[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR [esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 20[esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 16[esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 12[esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD PTR 80[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD PTR 28[esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR [esp]
- xor edx,ecx
- mov edi,DWORD PTR 4[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 28[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 16[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 12[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR 8[esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 84[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR 24[esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 28[esp]
- xor edx,ecx
- mov edi,DWORD PTR [esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 24[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD PTR 12[esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD PTR 8[esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD PTR 4[esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD PTR 88[esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD PTR 20[esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD PTR 24[esp]
- xor edx,ecx
- mov edi,DWORD PTR 28[esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD PTR 20[esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD PTR 8[esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD PTR 4[esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD PTR [esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD PTR 92[esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD PTR 16[esp]
- add eax,ecx
- mov esi,DWORD PTR 96[esp]
- xor ebx,edi
- mov ecx,DWORD PTR 12[esp]
- add eax,DWORD PTR [esi]
- add ebx,DWORD PTR 4[esi]
- add edi,DWORD PTR 8[esi]
- add ecx,DWORD PTR 12[esi]
- mov DWORD PTR [esi],eax
- mov DWORD PTR 4[esi],ebx
- mov DWORD PTR 8[esi],edi
- mov DWORD PTR 12[esi],ecx
- mov DWORD PTR 4[esp],ebx
- xor ebx,edi
- mov DWORD PTR 8[esp],edi
- mov DWORD PTR 12[esp],ecx
- mov edi,DWORD PTR 20[esp]
- mov ecx,DWORD PTR 24[esp]
- add edx,DWORD PTR 16[esi]
- add edi,DWORD PTR 20[esi]
- add ecx,DWORD PTR 24[esi]
- mov DWORD PTR 16[esi],edx
- mov DWORD PTR 20[esi],edi
- mov DWORD PTR 20[esp],edi
- mov edi,DWORD PTR 28[esp]
- mov DWORD PTR 24[esi],ecx
- add edi,DWORD PTR 28[esi]
- mov DWORD PTR 24[esp],ecx
- mov DWORD PTR 28[esi],edi
- mov DWORD PTR 28[esp],edi
- mov edi,DWORD PTR 100[esp]
- vmovdqa xmm7,XMMWORD PTR 64[ebp]
- sub ebp,192
- cmp edi,DWORD PTR 104[esp]
- jb $L015grand_avx
- mov esp,DWORD PTR 108[esp]
- vzeroall
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-ALIGN 32
-$L014AVX_BMI:
- lea esp,DWORD PTR [esp-96]
- vzeroall
- mov eax,DWORD PTR [esi]
- mov ebx,DWORD PTR 4[esi]
- mov ecx,DWORD PTR 8[esi]
- mov edi,DWORD PTR 12[esi]
- mov DWORD PTR 4[esp],ebx
- xor ebx,ecx
- mov DWORD PTR 8[esp],ecx
- mov DWORD PTR 12[esp],edi
- mov edx,DWORD PTR 16[esi]
- mov edi,DWORD PTR 20[esi]
- mov ecx,DWORD PTR 24[esi]
- mov esi,DWORD PTR 28[esi]
- mov DWORD PTR 20[esp],edi
- mov edi,DWORD PTR 100[esp]
- mov DWORD PTR 24[esp],ecx
- mov DWORD PTR 28[esp],esi
- vmovdqa xmm7,XMMWORD PTR 256[ebp]
- jmp $L017grand_avx_bmi
-ALIGN 32
-$L017grand_avx_bmi:
- vmovdqu xmm0,XMMWORD PTR [edi]
- vmovdqu xmm1,XMMWORD PTR 16[edi]
- vmovdqu xmm2,XMMWORD PTR 32[edi]
- vmovdqu xmm3,XMMWORD PTR 48[edi]
- add edi,64
- vpshufb xmm0,xmm0,xmm7
- mov DWORD PTR 100[esp],edi
- vpshufb xmm1,xmm1,xmm7
- vpshufb xmm2,xmm2,xmm7
- vpaddd xmm4,xmm0,XMMWORD PTR [ebp]
- vpshufb xmm3,xmm3,xmm7
- vpaddd xmm5,xmm1,XMMWORD PTR 16[ebp]
- vpaddd xmm6,xmm2,XMMWORD PTR 32[ebp]
- vpaddd xmm7,xmm3,XMMWORD PTR 48[ebp]
- vmovdqa XMMWORD PTR 32[esp],xmm4
- vmovdqa XMMWORD PTR 48[esp],xmm5
- vmovdqa XMMWORD PTR 64[esp],xmm6
- vmovdqa XMMWORD PTR 80[esp],xmm7
- jmp $L018avx_bmi_00_47
-ALIGN 16
-$L018avx_bmi_00_47:
- add ebp,64
- vpalignr xmm4,xmm1,xmm0,4
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 16[esp],edx
- vpalignr xmm7,xmm3,xmm2,4
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 24[esp]
- vpsrld xmm6,xmm4,7
- xor ecx,edi
- and edx,DWORD PTR 20[esp]
- mov DWORD PTR [esp],eax
- vpaddd xmm0,xmm0,xmm7
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrld xmm7,xmm4,3
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpslld xmm5,xmm4,14
- mov edi,DWORD PTR 4[esp]
- xor ecx,esi
- xor eax,edi
- vpxor xmm4,xmm7,xmm6
- add edx,DWORD PTR 28[esp]
- and ebx,eax
- add edx,DWORD PTR 32[esp]
- vpshufd xmm7,xmm3,250
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 12[esp]
- vpsrld xmm6,xmm6,11
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm4,xmm4,xmm5
- mov DWORD PTR 12[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpslld xmm5,xmm5,11
- andn esi,edx,DWORD PTR 20[esp]
- xor ecx,edi
- and edx,DWORD PTR 16[esp]
- vpxor xmm4,xmm4,xmm6
- mov DWORD PTR 28[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpsrld xmm6,xmm7,10
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpxor xmm4,xmm4,xmm5
- mov edi,DWORD PTR [esp]
- xor ecx,esi
- xor ebx,edi
- vpsrlq xmm5,xmm7,17
- add edx,DWORD PTR 24[esp]
- and eax,ebx
- add edx,DWORD PTR 36[esp]
- vpaddd xmm0,xmm0,xmm4
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 8[esp]
- vpxor xmm6,xmm6,xmm5
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpsrlq xmm7,xmm7,19
- mov DWORD PTR 8[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- andn esi,edx,DWORD PTR 16[esp]
- xor ecx,edi
- and edx,DWORD PTR 12[esp]
- vpshufd xmm7,xmm6,132
- mov DWORD PTR 24[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrldq xmm7,xmm7,8
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpaddd xmm0,xmm0,xmm7
- mov edi,DWORD PTR 28[esp]
- xor ecx,esi
- xor eax,edi
- vpshufd xmm7,xmm0,80
- add edx,DWORD PTR 20[esp]
- and ebx,eax
- add edx,DWORD PTR 40[esp]
- vpsrld xmm6,xmm7,10
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 4[esp]
- vpsrlq xmm5,xmm7,17
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm6,xmm6,xmm5
- mov DWORD PTR 4[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpsrlq xmm7,xmm7,19
- andn esi,edx,DWORD PTR 12[esp]
- xor ecx,edi
- and edx,DWORD PTR 8[esp]
- vpxor xmm6,xmm6,xmm7
- mov DWORD PTR 20[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpshufd xmm7,xmm6,232
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpslldq xmm7,xmm7,8
- mov edi,DWORD PTR 24[esp]
- xor ecx,esi
- xor ebx,edi
- vpaddd xmm0,xmm0,xmm7
- add edx,DWORD PTR 16[esp]
- and eax,ebx
- add edx,DWORD PTR 44[esp]
- vpaddd xmm6,xmm0,XMMWORD PTR [ebp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR [esp]
- lea eax,DWORD PTR [ecx*1+eax]
- vmovdqa XMMWORD PTR 32[esp],xmm6
- vpalignr xmm4,xmm2,xmm1,4
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR [esp],edx
- vpalignr xmm7,xmm0,xmm3,4
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 8[esp]
- vpsrld xmm6,xmm4,7
- xor ecx,edi
- and edx,DWORD PTR 4[esp]
- mov DWORD PTR 16[esp],eax
- vpaddd xmm1,xmm1,xmm7
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrld xmm7,xmm4,3
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpslld xmm5,xmm4,14
- mov edi,DWORD PTR 20[esp]
- xor ecx,esi
- xor eax,edi
- vpxor xmm4,xmm7,xmm6
- add edx,DWORD PTR 12[esp]
- and ebx,eax
- add edx,DWORD PTR 48[esp]
- vpshufd xmm7,xmm0,250
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 28[esp]
- vpsrld xmm6,xmm6,11
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm4,xmm4,xmm5
- mov DWORD PTR 28[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpslld xmm5,xmm5,11
- andn esi,edx,DWORD PTR 4[esp]
- xor ecx,edi
- and edx,DWORD PTR [esp]
- vpxor xmm4,xmm4,xmm6
- mov DWORD PTR 12[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpsrld xmm6,xmm7,10
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpxor xmm4,xmm4,xmm5
- mov edi,DWORD PTR 16[esp]
- xor ecx,esi
- xor ebx,edi
- vpsrlq xmm5,xmm7,17
- add edx,DWORD PTR 8[esp]
- and eax,ebx
- add edx,DWORD PTR 52[esp]
- vpaddd xmm1,xmm1,xmm4
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 24[esp]
- vpxor xmm6,xmm6,xmm5
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpsrlq xmm7,xmm7,19
- mov DWORD PTR 24[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- andn esi,edx,DWORD PTR [esp]
- xor ecx,edi
- and edx,DWORD PTR 28[esp]
- vpshufd xmm7,xmm6,132
- mov DWORD PTR 8[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrldq xmm7,xmm7,8
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpaddd xmm1,xmm1,xmm7
- mov edi,DWORD PTR 12[esp]
- xor ecx,esi
- xor eax,edi
- vpshufd xmm7,xmm1,80
- add edx,DWORD PTR 4[esp]
- and ebx,eax
- add edx,DWORD PTR 56[esp]
- vpsrld xmm6,xmm7,10
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 20[esp]
- vpsrlq xmm5,xmm7,17
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm6,xmm6,xmm5
- mov DWORD PTR 20[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpsrlq xmm7,xmm7,19
- andn esi,edx,DWORD PTR 28[esp]
- xor ecx,edi
- and edx,DWORD PTR 24[esp]
- vpxor xmm6,xmm6,xmm7
- mov DWORD PTR 4[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpshufd xmm7,xmm6,232
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpslldq xmm7,xmm7,8
- mov edi,DWORD PTR 8[esp]
- xor ecx,esi
- xor ebx,edi
- vpaddd xmm1,xmm1,xmm7
- add edx,DWORD PTR [esp]
- and eax,ebx
- add edx,DWORD PTR 60[esp]
- vpaddd xmm6,xmm1,XMMWORD PTR 16[ebp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 16[esp]
- lea eax,DWORD PTR [ecx*1+eax]
- vmovdqa XMMWORD PTR 48[esp],xmm6
- vpalignr xmm4,xmm3,xmm2,4
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 16[esp],edx
- vpalignr xmm7,xmm1,xmm0,4
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 24[esp]
- vpsrld xmm6,xmm4,7
- xor ecx,edi
- and edx,DWORD PTR 20[esp]
- mov DWORD PTR [esp],eax
- vpaddd xmm2,xmm2,xmm7
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrld xmm7,xmm4,3
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpslld xmm5,xmm4,14
- mov edi,DWORD PTR 4[esp]
- xor ecx,esi
- xor eax,edi
- vpxor xmm4,xmm7,xmm6
- add edx,DWORD PTR 28[esp]
- and ebx,eax
- add edx,DWORD PTR 64[esp]
- vpshufd xmm7,xmm1,250
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 12[esp]
- vpsrld xmm6,xmm6,11
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm4,xmm4,xmm5
- mov DWORD PTR 12[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpslld xmm5,xmm5,11
- andn esi,edx,DWORD PTR 20[esp]
- xor ecx,edi
- and edx,DWORD PTR 16[esp]
- vpxor xmm4,xmm4,xmm6
- mov DWORD PTR 28[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpsrld xmm6,xmm7,10
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpxor xmm4,xmm4,xmm5
- mov edi,DWORD PTR [esp]
- xor ecx,esi
- xor ebx,edi
- vpsrlq xmm5,xmm7,17
- add edx,DWORD PTR 24[esp]
- and eax,ebx
- add edx,DWORD PTR 68[esp]
- vpaddd xmm2,xmm2,xmm4
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 8[esp]
- vpxor xmm6,xmm6,xmm5
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpsrlq xmm7,xmm7,19
- mov DWORD PTR 8[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- andn esi,edx,DWORD PTR 16[esp]
- xor ecx,edi
- and edx,DWORD PTR 12[esp]
- vpshufd xmm7,xmm6,132
- mov DWORD PTR 24[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrldq xmm7,xmm7,8
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpaddd xmm2,xmm2,xmm7
- mov edi,DWORD PTR 28[esp]
- xor ecx,esi
- xor eax,edi
- vpshufd xmm7,xmm2,80
- add edx,DWORD PTR 20[esp]
- and ebx,eax
- add edx,DWORD PTR 72[esp]
- vpsrld xmm6,xmm7,10
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 4[esp]
- vpsrlq xmm5,xmm7,17
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm6,xmm6,xmm5
- mov DWORD PTR 4[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpsrlq xmm7,xmm7,19
- andn esi,edx,DWORD PTR 12[esp]
- xor ecx,edi
- and edx,DWORD PTR 8[esp]
- vpxor xmm6,xmm6,xmm7
- mov DWORD PTR 20[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpshufd xmm7,xmm6,232
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpslldq xmm7,xmm7,8
- mov edi,DWORD PTR 24[esp]
- xor ecx,esi
- xor ebx,edi
- vpaddd xmm2,xmm2,xmm7
- add edx,DWORD PTR 16[esp]
- and eax,ebx
- add edx,DWORD PTR 76[esp]
- vpaddd xmm6,xmm2,XMMWORD PTR 32[ebp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR [esp]
- lea eax,DWORD PTR [ecx*1+eax]
- vmovdqa XMMWORD PTR 64[esp],xmm6
- vpalignr xmm4,xmm0,xmm3,4
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR [esp],edx
- vpalignr xmm7,xmm2,xmm1,4
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 8[esp]
- vpsrld xmm6,xmm4,7
- xor ecx,edi
- and edx,DWORD PTR 4[esp]
- mov DWORD PTR 16[esp],eax
- vpaddd xmm3,xmm3,xmm7
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrld xmm7,xmm4,3
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpslld xmm5,xmm4,14
- mov edi,DWORD PTR 20[esp]
- xor ecx,esi
- xor eax,edi
- vpxor xmm4,xmm7,xmm6
- add edx,DWORD PTR 12[esp]
- and ebx,eax
- add edx,DWORD PTR 80[esp]
- vpshufd xmm7,xmm2,250
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 28[esp]
- vpsrld xmm6,xmm6,11
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm4,xmm4,xmm5
- mov DWORD PTR 28[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpslld xmm5,xmm5,11
- andn esi,edx,DWORD PTR 4[esp]
- xor ecx,edi
- and edx,DWORD PTR [esp]
- vpxor xmm4,xmm4,xmm6
- mov DWORD PTR 12[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpsrld xmm6,xmm7,10
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpxor xmm4,xmm4,xmm5
- mov edi,DWORD PTR 16[esp]
- xor ecx,esi
- xor ebx,edi
- vpsrlq xmm5,xmm7,17
- add edx,DWORD PTR 8[esp]
- and eax,ebx
- add edx,DWORD PTR 84[esp]
- vpaddd xmm3,xmm3,xmm4
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 24[esp]
- vpxor xmm6,xmm6,xmm5
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpsrlq xmm7,xmm7,19
- mov DWORD PTR 24[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- andn esi,edx,DWORD PTR [esp]
- xor ecx,edi
- and edx,DWORD PTR 28[esp]
- vpshufd xmm7,xmm6,132
- mov DWORD PTR 8[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrldq xmm7,xmm7,8
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpaddd xmm3,xmm3,xmm7
- mov edi,DWORD PTR 12[esp]
- xor ecx,esi
- xor eax,edi
- vpshufd xmm7,xmm3,80
- add edx,DWORD PTR 4[esp]
- and ebx,eax
- add edx,DWORD PTR 88[esp]
- vpsrld xmm6,xmm7,10
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 20[esp]
- vpsrlq xmm5,xmm7,17
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm6,xmm6,xmm5
- mov DWORD PTR 20[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpsrlq xmm7,xmm7,19
- andn esi,edx,DWORD PTR 28[esp]
- xor ecx,edi
- and edx,DWORD PTR 24[esp]
- vpxor xmm6,xmm6,xmm7
- mov DWORD PTR 4[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpshufd xmm7,xmm6,232
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpslldq xmm7,xmm7,8
- mov edi,DWORD PTR 8[esp]
- xor ecx,esi
- xor ebx,edi
- vpaddd xmm3,xmm3,xmm7
- add edx,DWORD PTR [esp]
- and eax,ebx
- add edx,DWORD PTR 92[esp]
- vpaddd xmm6,xmm3,XMMWORD PTR 48[ebp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 16[esp]
- lea eax,DWORD PTR [ecx*1+eax]
- vmovdqa XMMWORD PTR 80[esp],xmm6
- cmp DWORD PTR 64[ebp],66051
- jne $L018avx_bmi_00_47
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 16[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 24[esp]
- xor ecx,edi
- and edx,DWORD PTR 20[esp]
- mov DWORD PTR [esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD PTR 4[esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD PTR 28[esp]
- and ebx,eax
- add edx,DWORD PTR 32[esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 12[esp]
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 12[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 20[esp]
- xor ecx,edi
- and edx,DWORD PTR 16[esp]
- mov DWORD PTR 28[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD PTR [esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD PTR 24[esp]
- and eax,ebx
- add edx,DWORD PTR 36[esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 8[esp]
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 8[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 16[esp]
- xor ecx,edi
- and edx,DWORD PTR 12[esp]
- mov DWORD PTR 24[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD PTR 28[esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD PTR 20[esp]
- and ebx,eax
- add edx,DWORD PTR 40[esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 4[esp]
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 4[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 12[esp]
- xor ecx,edi
- and edx,DWORD PTR 8[esp]
- mov DWORD PTR 20[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD PTR 24[esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD PTR 16[esp]
- and eax,ebx
- add edx,DWORD PTR 44[esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR [esp]
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR [esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 8[esp]
- xor ecx,edi
- and edx,DWORD PTR 4[esp]
- mov DWORD PTR 16[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD PTR 20[esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD PTR 12[esp]
- and ebx,eax
- add edx,DWORD PTR 48[esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 28[esp]
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 28[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 4[esp]
- xor ecx,edi
- and edx,DWORD PTR [esp]
- mov DWORD PTR 12[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD PTR 16[esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD PTR 8[esp]
- and eax,ebx
- add edx,DWORD PTR 52[esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 24[esp]
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 24[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR [esp]
- xor ecx,edi
- and edx,DWORD PTR 28[esp]
- mov DWORD PTR 8[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD PTR 12[esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD PTR 4[esp]
- and ebx,eax
- add edx,DWORD PTR 56[esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 20[esp]
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 20[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 28[esp]
- xor ecx,edi
- and edx,DWORD PTR 24[esp]
- mov DWORD PTR 4[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD PTR 8[esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD PTR [esp]
- and eax,ebx
- add edx,DWORD PTR 60[esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 16[esp]
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 16[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 24[esp]
- xor ecx,edi
- and edx,DWORD PTR 20[esp]
- mov DWORD PTR [esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD PTR 4[esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD PTR 28[esp]
- and ebx,eax
- add edx,DWORD PTR 64[esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 12[esp]
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 12[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 20[esp]
- xor ecx,edi
- and edx,DWORD PTR 16[esp]
- mov DWORD PTR 28[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD PTR [esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD PTR 24[esp]
- and eax,ebx
- add edx,DWORD PTR 68[esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 8[esp]
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 8[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 16[esp]
- xor ecx,edi
- and edx,DWORD PTR 12[esp]
- mov DWORD PTR 24[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD PTR 28[esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD PTR 20[esp]
- and ebx,eax
- add edx,DWORD PTR 72[esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 4[esp]
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 4[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 12[esp]
- xor ecx,edi
- and edx,DWORD PTR 8[esp]
- mov DWORD PTR 20[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD PTR 24[esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD PTR 16[esp]
- and eax,ebx
- add edx,DWORD PTR 76[esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR [esp]
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR [esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 8[esp]
- xor ecx,edi
- and edx,DWORD PTR 4[esp]
- mov DWORD PTR 16[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD PTR 20[esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD PTR 12[esp]
- and ebx,eax
- add edx,DWORD PTR 80[esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 28[esp]
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 28[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 4[esp]
- xor ecx,edi
- and edx,DWORD PTR [esp]
- mov DWORD PTR 12[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD PTR 16[esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD PTR 8[esp]
- and eax,ebx
- add edx,DWORD PTR 84[esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 24[esp]
- lea eax,DWORD PTR [ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 24[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR [esp]
- xor ecx,edi
- and edx,DWORD PTR 28[esp]
- mov DWORD PTR 8[esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD PTR 12[esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD PTR 4[esp]
- and ebx,eax
- add edx,DWORD PTR 88[esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD PTR 20[esp]
- lea ebx,DWORD PTR [ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD PTR 20[esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD PTR 28[esp]
- xor ecx,edi
- and edx,DWORD PTR 24[esp]
- mov DWORD PTR 4[esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,DWORD PTR [ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD PTR 8[esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD PTR [esp]
- and eax,ebx
- add edx,DWORD PTR 92[esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD PTR 16[esp]
- lea eax,DWORD PTR [ecx*1+eax]
- mov esi,DWORD PTR 96[esp]
- xor ebx,edi
- mov ecx,DWORD PTR 12[esp]
- add eax,DWORD PTR [esi]
- add ebx,DWORD PTR 4[esi]
- add edi,DWORD PTR 8[esi]
- add ecx,DWORD PTR 12[esi]
- mov DWORD PTR [esi],eax
- mov DWORD PTR 4[esi],ebx
- mov DWORD PTR 8[esi],edi
- mov DWORD PTR 12[esi],ecx
- mov DWORD PTR 4[esp],ebx
- xor ebx,edi
- mov DWORD PTR 8[esp],edi
- mov DWORD PTR 12[esp],ecx
- mov edi,DWORD PTR 20[esp]
- mov ecx,DWORD PTR 24[esp]
- add edx,DWORD PTR 16[esi]
- add edi,DWORD PTR 20[esi]
- add ecx,DWORD PTR 24[esi]
- mov DWORD PTR 16[esi],edx
- mov DWORD PTR 20[esi],edi
- mov DWORD PTR 20[esp],edi
- mov edi,DWORD PTR 28[esp]
- mov DWORD PTR 24[esi],ecx
- add edi,DWORD PTR 28[esi]
- mov DWORD PTR 24[esp],ecx
- mov DWORD PTR 28[esi],edi
- mov DWORD PTR 28[esp],edi
- mov edi,DWORD PTR 100[esp]
- vmovdqa xmm7,XMMWORD PTR 64[ebp]
- sub ebp,192
- cmp edi,DWORD PTR 104[esp]
- jb $L017grand_avx_bmi
- mov esp,DWORD PTR 108[esp]
- vzeroall
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
_sha256_block_data_order ENDP
.text$ ENDS
.bss SEGMENT 'BSS'
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s
index 0bdfe91fc5..c21cce10f5 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s
@@ -81,8 +81,8 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rdi,8),%edi
movl 0(%r14,%rbp,8),%ebp
- andl $65280,%edi
- andl $65280,%ebp
+ andl $0x0000ff00,%edi
+ andl $0x0000ff00,%ebp
xorl %edi,%r10d
xorl %ebp,%r11d
@@ -94,8 +94,8 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rsi,8),%esi
movl 0(%r14,%rdi,8),%edi
- andl $65280,%esi
- andl $65280,%edi
+ andl $0x0000ff00,%esi
+ andl $0x0000ff00,%edi
shrl $16,%ebx
xorl %esi,%r12d
xorl %edi,%r8d
@@ -108,9 +108,9 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rdi,8),%edi
movl 0(%r14,%rbp,8),%ebp
- andl $16711680,%esi
- andl $16711680,%edi
- andl $16711680,%ebp
+ andl $0x00ff0000,%esi
+ andl $0x00ff0000,%edi
+ andl $0x00ff0000,%ebp
xorl %esi,%r10d
xorl %edi,%r11d
@@ -123,9 +123,9 @@ _x86_64_AES_encrypt:
movl 2(%r14,%rdi,8),%edi
movl 2(%r14,%rbp,8),%ebp
- andl $16711680,%esi
- andl $4278190080,%edi
- andl $4278190080,%ebp
+ andl $0x00ff0000,%esi
+ andl $0xff000000,%edi
+ andl $0xff000000,%ebp
xorl %esi,%r8d
xorl %edi,%r10d
@@ -138,8 +138,8 @@ _x86_64_AES_encrypt:
movl 2(%r14,%rdi,8),%edi
movl 16+0(%r15),%eax
- andl $4278190080,%esi
- andl $4278190080,%edi
+ andl $0xff000000,%esi
+ andl $0xff000000,%edi
xorl %esi,%r12d
xorl %edi,%r8d
@@ -241,8 +241,8 @@ _x86_64_AES_encrypt_compact:
xorl %r8d,%edx
cmpq 16(%rsp),%r15
je .Lenc_compact_done
- movl $2155905152,%r10d
- movl $2155905152,%r11d
+ movl $0x80808080,%r10d
+ movl $0x80808080,%r11d
andl %eax,%r10d
andl %ebx,%r11d
movl %r10d,%esi
@@ -253,10 +253,10 @@ _x86_64_AES_encrypt_compact:
leal (%rbx,%rbx,1),%r9d
subl %r10d,%esi
subl %r11d,%edi
- andl $4278124286,%r8d
- andl $4278124286,%r9d
- andl $454761243,%esi
- andl $454761243,%edi
+ andl $0xfefefefe,%r8d
+ andl $0xfefefefe,%r9d
+ andl $0x1b1b1b1b,%esi
+ andl $0x1b1b1b1b,%edi
movl %eax,%r10d
movl %ebx,%r11d
xorl %esi,%r8d
@@ -264,9 +264,9 @@ _x86_64_AES_encrypt_compact:
xorl %r8d,%eax
xorl %r9d,%ebx
- movl $2155905152,%r12d
+ movl $0x80808080,%r12d
roll $24,%eax
- movl $2155905152,%ebp
+ movl $0x80808080,%ebp
roll $24,%ebx
andl %ecx,%r12d
andl %edx,%ebp
@@ -289,10 +289,10 @@ _x86_64_AES_encrypt_compact:
xorl %r10d,%eax
xorl %r11d,%ebx
- andl $4278124286,%r8d
- andl $4278124286,%r9d
- andl $454761243,%esi
- andl $454761243,%edi
+ andl $0xfefefefe,%r8d
+ andl $0xfefefefe,%r9d
+ andl $0x1b1b1b1b,%esi
+ andl $0x1b1b1b1b,%edi
movl %ecx,%r12d
movl %edx,%ebp
xorl %esi,%r8d
@@ -345,7 +345,7 @@ AES_encrypt:
andq $-64,%rsp
subq %rsp,%rcx
negq %rcx
- andq $960,%rcx
+ andq $0x3c0,%rcx
subq %rcx,%rsp
subq $32,%rsp
@@ -370,7 +370,7 @@ AES_encrypt:
leaq .LAES_Te+2048(%rip),%r14
leaq 768(%rsp),%rbp
subq %r14,%rbp
- andq $768,%rbp
+ andq $0x300,%rbp
leaq (%r14,%rbp,1),%r14
call _x86_64_AES_encrypt_compact
@@ -792,7 +792,7 @@ AES_decrypt:
andq $-64,%rsp
subq %rsp,%rcx
negq %rcx
- andq $960,%rcx
+ andq $0x3c0,%rcx
subq %rcx,%rsp
subq $32,%rsp
@@ -817,7 +817,7 @@ AES_decrypt:
leaq .LAES_Td+2048(%rip),%r14
leaq 768(%rsp),%rbp
subq %r14,%rbp
- andq $768,%rbp
+ andq $0x300,%rbp
leaq (%r14,%rbp,1),%r14
shrq $3,%rbp
addq %rbp,%r14
@@ -1333,9 +1333,9 @@ AES_cbc_encrypt:
movq %r14,%r10
leaq 2304(%r14),%r11
movq %r15,%r12
- andq $4095,%r10
- andq $4095,%r11
- andq $4095,%r12
+ andq $0xFFF,%r10
+ andq $0xFFF,%r11
+ andq $0xFFF,%r12
cmpq %r11,%r12
jb .Lcbc_te_break_out
@@ -1344,7 +1344,7 @@ AES_cbc_encrypt:
jmp .Lcbc_te_ok
.Lcbc_te_break_out:
subq %r10,%r12
- andq $4095,%r12
+ andq $0xFFF,%r12
addq $320,%r12
subq %r12,%r15
.align 4
@@ -1370,7 +1370,7 @@ AES_cbc_encrypt:
movq %r15,%r10
subq %r14,%r10
- andq $4095,%r10
+ andq $0xfff,%r10
cmpq $2304,%r10
jb .Lcbc_do_ecopy
cmpq $4096-248,%r10
@@ -1557,7 +1557,7 @@ AES_cbc_encrypt:
leaq -88-63(%rcx),%r10
subq %rbp,%r10
negq %r10
- andq $960,%r10
+ andq $0x3c0,%r10
subq %r10,%rbp
xchgq %rsp,%rbp
@@ -1586,7 +1586,7 @@ AES_cbc_encrypt:
leaq 2048(%r14),%r14
leaq 768-8(%rsp),%rax
subq %r14,%rax
- andq $768,%rax
+ andq $0x300,%rax
leaq (%r14,%rax,1),%r14
cmpq $0,%rbx
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s
index d4ed2047c6..edbd5cb343 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s
@@ -1392,8 +1392,8 @@ aesni_cbc_sha1_enc_shaext:
movups 16(%rcx),%xmm0
leaq 112(%rcx),%rcx
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
jmp .Loop_shaext
.align 16
@@ -1672,8 +1672,8 @@ aesni_cbc_sha1_enc_shaext:
leaq 64(%rdi),%rdi
jnz .Loop_shaext
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
movups %xmm2,(%r8)
movdqu %xmm8,(%r9)
movd %xmm9,16(%r9)
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s
index 6573fe4be3..fcf42adbb4 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s
@@ -503,7 +503,7 @@ aesni_ecb_encrypt:
testl %r8d,%r8d
jz .Lecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_enc_tail
movdqu (%rdi),%xmm2
@@ -515,7 +515,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
@@ -543,7 +543,7 @@ aesni_ecb_encrypt:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_enc_loop8
movups %xmm2,(%rsi)
@@ -557,22 +557,22 @@ aesni_ecb_encrypt:
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_enc_one
movups 16(%rdi),%xmm3
je .Lecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_enc_three
movups 48(%rdi),%xmm5
je .Lecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_enc_five
movups 80(%rdi),%xmm7
je .Lecb_enc_six
@@ -646,7 +646,7 @@ aesni_ecb_encrypt:
.align 16
.Lecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_dec_tail
movdqu (%rdi),%xmm2
@@ -658,7 +658,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
@@ -687,7 +687,7 @@ aesni_ecb_encrypt:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
@@ -709,22 +709,22 @@ aesni_ecb_encrypt:
movups %xmm9,112(%rsi)
pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_dec_one
movups 16(%rdi),%xmm3
je .Lecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_dec_three
movups 48(%rdi),%xmm5
je .Lecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_dec_five
movups 80(%rdi),%xmm7
je .Lecb_dec_six
@@ -1598,7 +1598,7 @@ aesni_xts_encrypt:
movdqa .Lxts_magic(%rip),%xmm8
movdqa %xmm2,%xmm15
- pshufd $95,%xmm2,%xmm9
+ pshufd $0x5f,%xmm2,%xmm9
pxor %xmm0,%xmm1
movdqa %xmm9,%xmm14
paddd %xmm9,%xmm9
@@ -1697,7 +1697,7 @@ aesni_xts_encrypt:
.byte 102,15,56,220,248
movups 64(%r11),%xmm0
movdqa %xmm8,80(%rsp)
- pshufd $95,%xmm15,%xmm9
+ pshufd $0x5f,%xmm15,%xmm9
jmp .Lxts_enc_loop6
.align 32
.Lxts_enc_loop6:
@@ -1836,13 +1836,13 @@ aesni_xts_encrypt:
jz .Lxts_enc_done
pxor %xmm0,%xmm11
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lxts_enc_one
pxor %xmm0,%xmm12
je .Lxts_enc_two
pxor %xmm0,%xmm13
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lxts_enc_three
pxor %xmm0,%xmm14
je .Lxts_enc_four
@@ -2069,7 +2069,7 @@ aesni_xts_decrypt:
movdqa .Lxts_magic(%rip),%xmm8
movdqa %xmm2,%xmm15
- pshufd $95,%xmm2,%xmm9
+ pshufd $0x5f,%xmm2,%xmm9
pxor %xmm0,%xmm1
movdqa %xmm9,%xmm14
paddd %xmm9,%xmm9
@@ -2168,7 +2168,7 @@ aesni_xts_decrypt:
.byte 102,15,56,222,248
movups 64(%r11),%xmm0
movdqa %xmm8,80(%rsp)
- pshufd $95,%xmm15,%xmm9
+ pshufd $0x5f,%xmm15,%xmm9
jmp .Lxts_dec_loop6
.align 32
.Lxts_dec_loop6:
@@ -2308,13 +2308,13 @@ aesni_xts_decrypt:
jz .Lxts_dec_done
pxor %xmm0,%xmm12
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lxts_dec_one
pxor %xmm0,%xmm13
je .Lxts_dec_two
pxor %xmm0,%xmm14
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
@@ -2345,7 +2345,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz .Lxts_dec_ret
@@ -2634,7 +2634,7 @@ aesni_cbc_encrypt:
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movups (%rcx),%xmm0
@@ -2650,14 +2650,14 @@ aesni_cbc_encrypt:
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
movl OPENSSL_ia32cap_P+4(%rip),%r9d
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
jbe .Lcbc_dec_six_or_seven
andl $71303168,%r9d
- subq $80,%rdx
+ subq $0x50,%rdx
cmpl $4194304,%r9d
je .Lcbc_dec_loop6_enter
- subq $32,%rdx
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.align 16
@@ -2672,7 +2672,7 @@ aesni_cbc_encrypt:
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
@@ -2857,21 +2857,21 @@ aesni_cbc_encrypt:
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
+ addq $0x70,%rdx
jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movaps %xmm11,%xmm2
.Lcbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja .Lcbc_dec_seven
movaps %xmm7,%xmm8
@@ -2964,33 +2964,33 @@ aesni_cbc_encrypt:
movl %r10d,%eax
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- subq $96,%rdx
+ subq $0x60,%rdx
ja .Lcbc_dec_loop6
movdqa %xmm7,%xmm2
- addq $80,%rdx
+ addq $0x50,%rdx
jle .Lcbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
@@ -3015,7 +3015,7 @@ aesni_cbc_encrypt:
movdqa %xmm6,%xmm2
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
- subq $16,%rdx
+ subq $0x10,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
@@ -3332,7 +3332,7 @@ __aesni_set_encrypt_key:
pslldq $4,%xmm0
pxor %xmm3,%xmm0
- pshufd $255,%xmm0,%xmm3
+ pshufd $0xff,%xmm0,%xmm3
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
@@ -3419,7 +3419,7 @@ __aesni_set_encrypt_key:
decl %r10d
jz .Ldone_key256
- pshufd $255,%xmm0,%xmm2
+ pshufd $0xff,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
@@ -3462,11 +3462,11 @@ __aesni_set_encrypt_key:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
.Lkey_expansion_128_cold:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
+ shufps $0b11111111,%xmm1,%xmm1
xorps %xmm1,%xmm0
.byte 0xf3,0xc3
@@ -3477,25 +3477,25 @@ __aesni_set_encrypt_key:
.Lkey_expansion_192a_cold:
movaps %xmm2,%xmm5
.Lkey_expansion_192b_warm:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
pslldq $4,%xmm3
xorps %xmm4,%xmm0
- pshufd $85,%xmm1,%xmm1
+ pshufd $0b01010101,%xmm1,%xmm1
pxor %xmm3,%xmm2
pxor %xmm1,%xmm0
- pshufd $255,%xmm0,%xmm3
+ pshufd $0b11111111,%xmm0,%xmm3
pxor %xmm3,%xmm2
.byte 0xf3,0xc3
.align 16
.Lkey_expansion_192b:
movaps %xmm0,%xmm3
- shufps $68,%xmm0,%xmm5
+ shufps $0b01000100,%xmm0,%xmm5
movups %xmm5,(%rax)
- shufps $78,%xmm2,%xmm3
+ shufps $0b01001110,%xmm2,%xmm3
movups %xmm3,16(%rax)
leaq 32(%rax),%rax
jmp .Lkey_expansion_192b_warm
@@ -3505,11 +3505,11 @@ __aesni_set_encrypt_key:
movups %xmm2,(%rax)
leaq 16(%rax),%rax
.Lkey_expansion_256a_cold:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
+ shufps $0b11111111,%xmm1,%xmm1
xorps %xmm1,%xmm0
.byte 0xf3,0xc3
@@ -3518,11 +3518,11 @@ __aesni_set_encrypt_key:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
- shufps $16,%xmm2,%xmm4
+ shufps $0b00010000,%xmm2,%xmm4
xorps %xmm4,%xmm2
- shufps $140,%xmm2,%xmm4
+ shufps $0b10001100,%xmm2,%xmm4
xorps %xmm4,%xmm2
- shufps $170,%xmm1,%xmm1
+ shufps $0b10101010,%xmm1,%xmm1
xorps %xmm1,%xmm2
.byte 0xf3,0xc3
.size aesni_set_encrypt_key,.-aesni_set_encrypt_key
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s
index 5b363a5eef..0fd201167f 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s
@@ -324,45 +324,45 @@ _bsaes_encrypt8_bitslice:
pxor %xmm2,%xmm5
decl %r10d
jl .Lenc_done
- pshufd $147,%xmm15,%xmm7
- pshufd $147,%xmm0,%xmm8
+ pshufd $0x93,%xmm15,%xmm7
+ pshufd $0x93,%xmm0,%xmm8
pxor %xmm7,%xmm15
- pshufd $147,%xmm3,%xmm9
+ pshufd $0x93,%xmm3,%xmm9
pxor %xmm8,%xmm0
- pshufd $147,%xmm5,%xmm10
+ pshufd $0x93,%xmm5,%xmm10
pxor %xmm9,%xmm3
- pshufd $147,%xmm2,%xmm11
+ pshufd $0x93,%xmm2,%xmm11
pxor %xmm10,%xmm5
- pshufd $147,%xmm6,%xmm12
+ pshufd $0x93,%xmm6,%xmm12
pxor %xmm11,%xmm2
- pshufd $147,%xmm1,%xmm13
+ pshufd $0x93,%xmm1,%xmm13
pxor %xmm12,%xmm6
- pshufd $147,%xmm4,%xmm14
+ pshufd $0x93,%xmm4,%xmm14
pxor %xmm13,%xmm1
pxor %xmm14,%xmm4
pxor %xmm15,%xmm8
pxor %xmm4,%xmm7
pxor %xmm4,%xmm8
- pshufd $78,%xmm15,%xmm15
+ pshufd $0x4E,%xmm15,%xmm15
pxor %xmm0,%xmm9
- pshufd $78,%xmm0,%xmm0
+ pshufd $0x4E,%xmm0,%xmm0
pxor %xmm2,%xmm12
pxor %xmm7,%xmm15
pxor %xmm6,%xmm13
pxor %xmm8,%xmm0
pxor %xmm5,%xmm11
- pshufd $78,%xmm2,%xmm7
+ pshufd $0x4E,%xmm2,%xmm7
pxor %xmm1,%xmm14
- pshufd $78,%xmm6,%xmm8
+ pshufd $0x4E,%xmm6,%xmm8
pxor %xmm3,%xmm10
- pshufd $78,%xmm5,%xmm2
+ pshufd $0x4E,%xmm5,%xmm2
pxor %xmm4,%xmm10
- pshufd $78,%xmm4,%xmm6
+ pshufd $0x4E,%xmm4,%xmm6
pxor %xmm4,%xmm11
- pshufd $78,%xmm1,%xmm5
+ pshufd $0x4E,%xmm1,%xmm5
pxor %xmm11,%xmm7
- pshufd $78,%xmm3,%xmm1
+ pshufd $0x4E,%xmm3,%xmm1
pxor %xmm12,%xmm8
pxor %xmm10,%xmm2
pxor %xmm14,%xmm6
@@ -796,24 +796,24 @@ _bsaes_decrypt8:
decl %r10d
jl .Ldec_done
- pshufd $78,%xmm15,%xmm7
- pshufd $78,%xmm2,%xmm13
+ pshufd $0x4E,%xmm15,%xmm7
+ pshufd $0x4E,%xmm2,%xmm13
pxor %xmm15,%xmm7
- pshufd $78,%xmm4,%xmm14
+ pshufd $0x4E,%xmm4,%xmm14
pxor %xmm2,%xmm13
- pshufd $78,%xmm0,%xmm8
+ pshufd $0x4E,%xmm0,%xmm8
pxor %xmm4,%xmm14
- pshufd $78,%xmm5,%xmm9
+ pshufd $0x4E,%xmm5,%xmm9
pxor %xmm0,%xmm8
- pshufd $78,%xmm3,%xmm10
+ pshufd $0x4E,%xmm3,%xmm10
pxor %xmm5,%xmm9
pxor %xmm13,%xmm15
pxor %xmm13,%xmm0
- pshufd $78,%xmm1,%xmm11
+ pshufd $0x4E,%xmm1,%xmm11
pxor %xmm3,%xmm10
pxor %xmm7,%xmm5
pxor %xmm8,%xmm3
- pshufd $78,%xmm6,%xmm12
+ pshufd $0x4E,%xmm6,%xmm12
pxor %xmm1,%xmm11
pxor %xmm14,%xmm0
pxor %xmm9,%xmm1
@@ -827,45 +827,45 @@ _bsaes_decrypt8:
pxor %xmm14,%xmm1
pxor %xmm14,%xmm6
pxor %xmm12,%xmm4
- pshufd $147,%xmm15,%xmm7
- pshufd $147,%xmm0,%xmm8
+ pshufd $0x93,%xmm15,%xmm7
+ pshufd $0x93,%xmm0,%xmm8
pxor %xmm7,%xmm15
- pshufd $147,%xmm5,%xmm9
+ pshufd $0x93,%xmm5,%xmm9
pxor %xmm8,%xmm0
- pshufd $147,%xmm3,%xmm10
+ pshufd $0x93,%xmm3,%xmm10
pxor %xmm9,%xmm5
- pshufd $147,%xmm1,%xmm11
+ pshufd $0x93,%xmm1,%xmm11
pxor %xmm10,%xmm3
- pshufd $147,%xmm6,%xmm12
+ pshufd $0x93,%xmm6,%xmm12
pxor %xmm11,%xmm1
- pshufd $147,%xmm2,%xmm13
+ pshufd $0x93,%xmm2,%xmm13
pxor %xmm12,%xmm6
- pshufd $147,%xmm4,%xmm14
+ pshufd $0x93,%xmm4,%xmm14
pxor %xmm13,%xmm2
pxor %xmm14,%xmm4
pxor %xmm15,%xmm8
pxor %xmm4,%xmm7
pxor %xmm4,%xmm8
- pshufd $78,%xmm15,%xmm15
+ pshufd $0x4E,%xmm15,%xmm15
pxor %xmm0,%xmm9
- pshufd $78,%xmm0,%xmm0
+ pshufd $0x4E,%xmm0,%xmm0
pxor %xmm1,%xmm12
pxor %xmm7,%xmm15
pxor %xmm6,%xmm13
pxor %xmm8,%xmm0
pxor %xmm3,%xmm11
- pshufd $78,%xmm1,%xmm7
+ pshufd $0x4E,%xmm1,%xmm7
pxor %xmm2,%xmm14
- pshufd $78,%xmm6,%xmm8
+ pshufd $0x4E,%xmm6,%xmm8
pxor %xmm5,%xmm10
- pshufd $78,%xmm3,%xmm1
+ pshufd $0x4E,%xmm3,%xmm1
pxor %xmm4,%xmm10
- pshufd $78,%xmm4,%xmm6
+ pshufd $0x4E,%xmm4,%xmm6
pxor %xmm4,%xmm11
- pshufd $78,%xmm2,%xmm3
+ pshufd $0x4E,%xmm2,%xmm3
pxor %xmm11,%xmm7
- pshufd $78,%xmm5,%xmm2
+ pshufd $0x4E,%xmm5,%xmm2
pxor %xmm12,%xmm8
pxor %xmm1,%xmm10
pxor %xmm14,%xmm6
@@ -1552,20 +1552,20 @@ bsaes_xts_encrypt:
movdqa %xmm7,(%rax)
andq $-16,%r14
- subq $128,%rsp
+ subq $0x80,%rsp
movdqa 32(%rbp),%xmm6
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- subq $128,%r14
+ subq $0x80,%r14
jc .Lxts_enc_short
jmp .Lxts_enc_loop
.align 16
.Lxts_enc_loop:
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -1573,7 +1573,7 @@ bsaes_xts_encrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -1582,7 +1582,7 @@ bsaes_xts_encrypt:
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
movdqu 0(%r12),%xmm7
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -1592,7 +1592,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 16(%r12),%xmm8
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -1602,7 +1602,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 32(%r12),%xmm9
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -1612,7 +1612,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 48(%r12),%xmm10
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -1622,7 +1622,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 64(%r12),%xmm11
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -1666,20 +1666,20 @@ bsaes_xts_encrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- subq $128,%r14
+ subq $0x80,%r14
jnc .Lxts_enc_loop
.Lxts_enc_short:
- addq $128,%r14
+ addq $0x80,%r14
jz .Lxts_enc_done
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -1687,7 +1687,7 @@ bsaes_xts_encrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -1698,7 +1698,7 @@ bsaes_xts_encrypt:
movdqu 0(%r12),%xmm7
cmpq $16,%r14
je .Lxts_enc_1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -1710,7 +1710,7 @@ bsaes_xts_encrypt:
cmpq $32,%r14
je .Lxts_enc_2
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -1722,7 +1722,7 @@ bsaes_xts_encrypt:
cmpq $48,%r14
je .Lxts_enc_3
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -1734,7 +1734,7 @@ bsaes_xts_encrypt:
cmpq $64,%r14
je .Lxts_enc_4
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -1746,7 +1746,7 @@ bsaes_xts_encrypt:
cmpq $80,%r14
je .Lxts_enc_5
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2011,20 +2011,20 @@ bsaes_xts_decrypt:
shlq $4,%rax
subq %rax,%r14
- subq $128,%rsp
+ subq $0x80,%rsp
movdqa 32(%rbp),%xmm6
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- subq $128,%r14
+ subq $0x80,%r14
jc .Lxts_dec_short
jmp .Lxts_dec_loop
.align 16
.Lxts_dec_loop:
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -2032,7 +2032,7 @@ bsaes_xts_decrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -2041,7 +2041,7 @@ bsaes_xts_decrypt:
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
movdqu 0(%r12),%xmm7
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -2051,7 +2051,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 16(%r12),%xmm8
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -2061,7 +2061,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 32(%r12),%xmm9
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -2071,7 +2071,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 48(%r12),%xmm10
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -2081,7 +2081,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 64(%r12),%xmm11
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2125,20 +2125,20 @@ bsaes_xts_decrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- subq $128,%r14
+ subq $0x80,%r14
jnc .Lxts_dec_loop
.Lxts_dec_short:
- addq $128,%r14
+ addq $0x80,%r14
jz .Lxts_dec_done
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -2146,7 +2146,7 @@ bsaes_xts_decrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -2157,7 +2157,7 @@ bsaes_xts_decrypt:
movdqu 0(%r12),%xmm7
cmpq $16,%r14
je .Lxts_dec_1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -2169,7 +2169,7 @@ bsaes_xts_decrypt:
cmpq $32,%r14
je .Lxts_dec_2
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -2181,7 +2181,7 @@ bsaes_xts_decrypt:
cmpq $48,%r14
je .Lxts_dec_3
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -2193,7 +2193,7 @@ bsaes_xts_decrypt:
cmpq $64,%r14
je .Lxts_dec_4
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -2205,7 +2205,7 @@ bsaes_xts_decrypt:
cmpq $80,%r14
je .Lxts_dec_5
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2382,7 +2382,7 @@ bsaes_xts_decrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
movdqa %xmm6,%xmm5
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s
index b9d6df5134..bf7c2b0b6f 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s
@@ -60,7 +60,7 @@ _vpaes_encrypt_core:
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -120,10 +120,10 @@ _vpaes_decrypt_core:
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq .Lk_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
@@ -242,7 +242,7 @@ _vpaes_schedule_core:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
.Lschedule_go:
cmpl $192,%esi
@@ -332,7 +332,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
@@ -399,8 +399,8 @@ _vpaes_schedule_core:
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
@@ -437,7 +437,7 @@ _vpaes_schedule_round:
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
@@ -596,7 +596,7 @@ _vpaes_schedule_mangle:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
@@ -614,7 +614,7 @@ vpaes_set_encrypt_key:
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s
index b43eb278e2..4a1211329c 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s
@@ -461,48 +461,94 @@ rsaz_512_mul_gather4:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- subq $128+24,%rsp
+ subq $152,%rsp
.Lmul_gather4_body:
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- movl (%rdx,%r9,4),%ebx
-.byte 102,72,15,110,201
+ movd %r9d,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 0(%rdx),%xmm8
+ movdqa 16(%rdx),%xmm9
+ movdqa 32(%rdx),%xmm10
+ movdqa 48(%rdx),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rdx),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rdx),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rdx),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rdx),%xmm15
+ leaq 128(%rdx),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
- shlq $32,%rax
- orq %rax,%rbx
movq (%rsi),%rax
movq 8(%rsi),%rcx
- leaq 128(%rdx,%r9,4),%rbp
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
@@ -515,14 +561,12 @@ rsaz_512_mul_gather4:
adcq $0,%r13
mulq %rbx
- leaq 128(%rbp),%rbp
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
@@ -534,6 +578,35 @@ rsaz_512_mul_gather4:
.align 32
.Loop_mul_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
@@ -542,7 +615,6 @@ rsaz_512_mul_gather4:
adcq $0,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
@@ -551,7 +623,6 @@ rsaz_512_mul_gather4:
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
@@ -560,7 +631,6 @@ rsaz_512_mul_gather4:
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
@@ -569,7 +639,6 @@ rsaz_512_mul_gather4:
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -594,7 +663,6 @@ rsaz_512_mul_gather4:
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
@@ -602,7 +670,6 @@ rsaz_512_mul_gather4:
movq %rdx,%r15
adcq $0,%r15
- leaq 128(%rbp),%rbp
leaq 8(%rdi),%rdi
decl %ecx
@@ -617,8 +684,8 @@ rsaz_512_mul_gather4:
movq %r14,48(%rdi)
movq %r15,56(%rdi)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
@@ -667,7 +734,7 @@ rsaz_512_mul_scatter4:
movl %r9d,%r9d
subq $128+24,%rsp
.Lmul_scatter4_body:
- leaq (%r8,%r9,4),%r8
+ leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
@@ -703,30 +770,14 @@ rsaz_512_mul_scatter4:
call __rsaz_512_subtract
- movl %r8d,0(%rsi)
- shrq $32,%r8
- movl %r9d,128(%rsi)
- shrq $32,%r9
- movl %r10d,256(%rsi)
- shrq $32,%r10
- movl %r11d,384(%rsi)
- shrq $32,%r11
- movl %r12d,512(%rsi)
- shrq $32,%r12
- movl %r13d,640(%rsi)
- shrq $32,%r13
- movl %r14d,768(%rsi)
- shrq $32,%r14
- movl %r15d,896(%rsi)
- shrq $32,%r15
- movl %r8d,64(%rsi)
- movl %r9d,192(%rsi)
- movl %r10d,320(%rsi)
- movl %r11d,448(%rsi)
- movl %r12d,576(%rsi)
- movl %r13d,704(%rsi)
- movl %r14d,832(%rsi)
- movl %r15d,960(%rsi)
+ movq %r8,0(%rsi)
+ movq %r9,128(%rsi)
+ movq %r10,256(%rsi)
+ movq %r11,384(%rsi)
+ movq %r12,512(%rsi)
+ movq %r13,640(%rsi)
+ movq %r14,768(%rsi)
+ movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -1079,16 +1130,14 @@ __rsaz_512_mul:
.type rsaz_512_scatter4,@function
.align 16
rsaz_512_scatter4:
- leaq (%rdi,%rdx,4),%rdi
+ leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
- movl %eax,(%rdi)
- shrq $32,%rax
- movl %eax,64(%rdi)
+ movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz .Loop_scatter
@@ -1099,19 +1148,72 @@ rsaz_512_scatter4:
.type rsaz_512_gather4,@function
.align 16
rsaz_512_gather4:
- leaq (%rsi,%rdx,4),%rsi
+ movd %edx,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp .Loop_gather
.align 16
.Loop_gather:
- movl (%rsi),%eax
- movl 64(%rsi),%r8d
+ movdqa 0(%rsi),%xmm8
+ movdqa 16(%rsi),%xmm9
+ movdqa 32(%rsi),%xmm10
+ movdqa 48(%rsi),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rsi),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rsi),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rsi),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
- shlq $32,%r8
- orq %r8,%rax
- movq %rax,(%rdi)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz .Loop_gather
.byte 0xf3,0xc3
+.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s
index eed057ad6a..f4e5337565 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s
@@ -242,7 +242,7 @@ bn_GF2m_mul_2x2:
movq %rcx,56(%rsp)
movq %r8,64(%rsp)
- movq $15,%r8
+ movq $0xf,%r8
movq %rsi,%rax
movq %rcx,%rbp
call _mul_1x1
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s
index b098169213..9e0019c163 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s
@@ -633,20 +633,20 @@ bn_sqr8x_mont:
- leaq -64(%rsp,%r9,4),%r11
+ leaq -64(%rsp,%r9,2),%r11
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lsqr8x_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,4),%rsp
+ leaq -64(%rsp,%r9,2),%rsp
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
- leaq 4096-64(,%r9,4),%r10
- leaq -64(%rsp,%r9,4),%rsp
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -656,58 +656,80 @@ bn_sqr8x_mont:
movq %r9,%r10
negq %r9
- leaq 64(%rsp,%r9,2),%r11
movq %r8,32(%rsp)
movq %rax,40(%rsp)
.Lsqr8x_body:
- movq %r9,%rbp
-.byte 102,73,15,110,211
- shrq $3+2,%rbp
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- jmp .Lsqr8x_copy_n
-
-.align 32
-.Lsqr8x_copy_n:
- movq 0(%rcx),%xmm0
- movq 8(%rcx),%xmm1
- movq 16(%rcx),%xmm3
- movq 24(%rcx),%xmm4
- leaq 32(%rcx),%rcx
- movdqa %xmm0,0(%r11)
- movdqa %xmm1,16(%r11)
- movdqa %xmm3,32(%r11)
- movdqa %xmm4,48(%r11)
- leaq 64(%r11),%r11
- decq %rbp
- jnz .Lsqr8x_copy_n
-
+.byte 102,72,15,110,209
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
call bn_sqr8x_internal
+
+
+
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+ movq %r9,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz .Lsqr8x_sub
+
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+
+.byte 102,72,15,110,200
pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- leaq 64(%rsp,%r9,2),%rdx
- shrq $3+2,%r9
+ pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
- jmp .Lsqr8x_zero
+ jmp .Lsqr8x_cond_copy
.align 32
-.Lsqr8x_zero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- movdqa %xmm0,32(%rax)
- movdqa %xmm0,48(%rax)
- leaq 64(%rax),%rax
- movdqa %xmm0,0(%rdx)
- movdqa %xmm0,16(%rdx)
- movdqa %xmm0,32(%rdx)
- movdqa %xmm0,48(%rdx)
- leaq 64(%rdx),%rdx
- decq %r9
- jnz .Lsqr8x_zero
+.Lsqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz .Lsqr8x_cond_copy
movq $1,%rax
movq -48(%rsi),%r15
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s
index d2d1fbf4e7..8afe249695 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s
@@ -14,46 +14,151 @@ bn_mul_mont_gather5:
.Lmul_enter:
movl %r9d,%r9d
movq %rsp,%rax
- movl 8(%rsp),%r10d
+ movd 8(%rsp),%xmm5
+ leaq .Linc(%rip),%r10
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+
leaq 2(%r9),%r11
negq %r11
- leaq (%rsp,%r11,8),%rsp
+ leaq -264(%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul_body:
- movq %rdx,%r12
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%r12,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
-
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
.byte 102,72,15,126,195
movq (%r8),%r8
@@ -62,29 +167,14 @@ bn_mul_mont_gather5:
xorq %r14,%r14
xorq %r15,%r15
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -117,14 +207,12 @@ bn_mul_mont_gather5:
cmpq %r9,%r15
jne .L1st
-.byte 102,72,15,126,195
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
movq %r10,%r11
@@ -138,33 +226,78 @@ bn_mul_mont_gather5:
jmp .Louter
.align 16
.Louter:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -200,15 +333,12 @@ bn_mul_mont_gather5:
cmpq %r9,%r15
jne .Linner
-.byte 102,72,15,126,195
-
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
- movq (%rsp,%r15,8),%r10
+ movq (%rsp,%r9,8),%r10
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
xorq %rdx,%rdx
@@ -255,6 +385,7 @@ bn_mul_mont_gather5:
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -277,10 +408,10 @@ bn_mul4x_mont_gather5:
pushq %r13
pushq %r14
pushq %r15
+
.byte 0x67
- movl %r9d,%r10d
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
@@ -290,19 +421,21 @@ bn_mul4x_mont_gather5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lmul4xsp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -318,6 +451,7 @@ bn_mul4x_mont_gather5:
movq 40(%rsp),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -333,47 +467,141 @@ bn_mul4x_mont_gather5:
.align 32
mul4x_internal:
shlq $5,%r9
- movl 8(%rax),%r10d
- leaq 256(%rdx,%r9,1),%r13
+ movd 8(%rax),%xmm5
+ leaq .Linc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
shrq $5,%r9
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%rdx,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- addq $7,%r11
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
- andq $7,%r11
-
- movq -96(%r12),%xmm0
- leaq 256(%r12),%r14
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
-.byte 0x67
- por %xmm1,%xmm0
- movq -96(%r14),%xmm1
-.byte 0x67
- pand %xmm7,%xmm3
-.byte 0x67
- por %xmm2,%xmm0
- movq -32(%r14),%xmm2
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq 32(%r14),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
.byte 102,72,15,126,195
- movq 96(%r14),%xmm0
+
movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)
@@ -387,26 +615,10 @@ mul4x_internal:
movq %rax,%r10
movq (%rcx),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq %r10,%rbp
-
-
-
-
-
-
-
- leaq 64+8(%rsp,%r11,8),%r14
+ leaq 64+8(%rsp),%r14
movq %rdx,%r11
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- leaq 512(%r12),%r12
- por %xmm1,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi,%r9,1),%rax
@@ -415,7 +627,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -425,7 +637,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -435,7 +647,7 @@ mul4x_internal:
.L1st4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -451,7 +663,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -481,7 +693,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -490,7 +702,7 @@ mul4x_internal:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -500,7 +712,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -516,7 +728,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -529,8 +741,7 @@ mul4x_internal:
movq %rdi,-16(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -541,6 +752,63 @@ mul4x_internal:
.align 32
.Louter4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
movq (%r14,%r9,1),%r10
movq %r8,%rbp
mulq %rbx
@@ -548,25 +816,11 @@ mul4x_internal:
movq (%rcx),%rax
adcq $0,%rdx
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
-
imulq %r10,%rbp
-.byte 0x67
movq %rdx,%r11
movq %rdi,(%r14)
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
leaq (%r14,%r9,1),%r14
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
@@ -576,7 +830,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -588,7 +842,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdx,%r13
jmp .Linner4x
@@ -597,7 +851,7 @@ mul4x_internal:
.Linner4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -615,7 +869,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -649,7 +903,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -660,7 +914,7 @@ mul4x_internal:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %r13,-8(%r14)
movq %rdx,%r13
@@ -670,7 +924,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -689,7 +943,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
movq %rbp,%rax
- movq -16(%rcx),%rbp
+ movq -8(%rcx),%rbp
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -704,9 +958,8 @@ mul4x_internal:
movq %r13,-24(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
movq %rdi,-16(%r14)
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -717,16 +970,23 @@ mul4x_internal:
cmpq 16+8(%rsp),%r12
jb .Louter4x
+ xorq %rax,%rax
subq %r13,%rbp
adcq %r15,%r15
orq %r15,%rdi
- xorq $1,%rdi
+ subq %rdi,%rax
leaq (%r14,%r9,1),%rbx
- leaq (%rcx,%rdi,8),%rbp
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
movq %r9,%rcx
sarq $3+2,%rcx
movq 56+8(%rsp),%rdi
- jmp .Lsqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
.size mul4x_internal,.-mul4x_internal
.globl bn_power5
.type bn_power5,@function
@@ -739,9 +999,9 @@ bn_power5:
pushq %r13
pushq %r14
pushq %r15
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leal (%r9,%r9,2),%r10d
negq %r9
movq (%r8),%r8
@@ -751,19 +1011,20 @@ bn_power5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lpwr_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -791,10 +1052,15 @@ bn_power5:
.byte 102,72,15,110,226
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
.byte 102,72,15,126,209
.byte 102,72,15,126,226
@@ -1338,9 +1604,9 @@ __bn_sqr8x_internal:
movq %rbx,-16(%rdi)
movq %r8,-8(%rdi)
.byte 102,72,15,126,213
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xorq %rax,%rax
- leaq (%rbp,%r9,2),%rcx
+ leaq (%r9,%rbp,1),%rcx
leaq 48+8(%rsp,%r9,2),%rdx
movq %rcx,0+8(%rsp)
leaq 48+8(%rsp,%r9,1),%rdi
@@ -1373,14 +1639,14 @@ sqr8x_reduction:
.align 32
.L8x_reduce:
mulq %rbx
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
negq %r8
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rbx,48-8+8(%rsp,%rcx,8)
@@ -1389,7 +1655,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq 32+8(%rsp),%rsi
@@ -1398,7 +1664,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi
addq %r11,%r10
@@ -1407,7 +1673,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1415,7 +1681,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1423,7 +1689,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1441,7 +1707,7 @@ sqr8x_reduction:
decl %ecx
jnz .L8x_reduce
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
xorq %rax,%rax
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
@@ -1467,14 +1733,14 @@ sqr8x_reduction:
.L8x_tail:
mulq %rbx
addq %rax,%r8
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
leaq 8(%rdi),%rdi
@@ -1483,7 +1749,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
@@ -1491,7 +1757,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
@@ -1499,7 +1765,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1507,7 +1773,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1515,7 +1781,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1533,7 +1799,7 @@ sqr8x_reduction:
decl %ecx
jnz .L8x_tail
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
jae .L8x_tail_done
@@ -1579,7 +1845,7 @@ sqr8x_reduction:
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax
- movq -16(%rbp),%rcx
+ movq -8(%rbp),%rcx
xorq %rsi,%rsi
.byte 102,72,15,126,213
@@ -1597,44 +1863,62 @@ sqr8x_reduction:
cmpq %rdx,%rdi
jb .L8x_reduction_loop
-
- subq %r15,%rcx
+ .byte 0xf3,0xc3
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.type __bn_post4x_internal,@function
+.align 32
+__bn_post4x_internal:
+ movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx
- adcq %rsi,%rsi
movq %r9,%rcx
- orq %rsi,%rax
.byte 102,72,15,126,207
- xorq $1,%rax
+ negq %rax
.byte 102,72,15,126,206
- leaq (%rbp,%rax,8),%rbp
sarq $3+2,%rcx
- jmp .Lsqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
-.align 32
+.align 16
.Lsqr4x_sub:
-.byte 0x66
- movq 0(%rbx),%r12
- movq 8(%rbx),%r13
- sbbq 0(%rbp),%r12
- movq 16(%rbx),%r14
- sbbq 16(%rbp),%r13
- movq 24(%rbx),%r15
- leaq 32(%rbx),%rbx
- sbbq 32(%rbp),%r14
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
movq %r12,0(%rdi)
- sbbq 48(%rbp),%r15
- leaq 64(%rbp),%rbp
+ leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
+ sbbq %r10,%r10
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi
incq %rcx
jnz .Lsqr4x_sub
+
movq %r9,%r10
negq %r9
.byte 0xf3,0xc3
-.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.size __bn_post4x_internal,.-__bn_post4x_internal
.globl bn_from_montgomery
.type bn_from_montgomery,@function
.align 32
@@ -1656,10 +1940,9 @@ bn_from_mont8x:
pushq %r13
pushq %r14
pushq %r15
-.byte 0x67
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
movq (%r8),%r8
@@ -1669,19 +1952,20 @@ bn_from_mont8x:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lfrom_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -1732,7 +2016,8 @@ bn_from_mont8x:
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
@@ -1799,45 +2084,169 @@ bn_scatter5:
.globl bn_gather5
.type bn_gather5,@function
-.align 16
+.align 32
bn_gather5:
- movl %ecx,%r11d
- shrl $3,%ecx
- andq $7,%r11
- notl %ecx
- leaq .Lmagic_masks(%rip),%rax
- andl $3,%ecx
- leaq 128(%rdx,%r11,8),%rdx
- movq 0(%rax,%rcx,8),%xmm4
- movq 8(%rax,%rcx,8),%xmm5
- movq 16(%rax,%rcx,8),%xmm6
- movq 24(%rax,%rcx,8),%xmm7
+.LSEH_begin_bn_gather5:
+
+.byte 0x4c,0x8d,0x14,0x24
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq .Linc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
jmp .Lgather
-.align 16
-.Lgather:
- movq -128(%rdx),%xmm0
- movq -64(%rdx),%xmm1
- pand %xmm4,%xmm0
- movq 0(%rdx),%xmm2
- pand %xmm5,%xmm1
- movq 64(%rdx),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-.byte 0x67,0x67
- por %xmm2,%xmm0
- leaq 256(%rdx),%rdx
- por %xmm3,%xmm0
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subl $1,%esi
jnz .Lgather
+
+ leaq (%r10),%rsp
.byte 0xf3,0xc3
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
.align 64
-.Lmagic_masks:
-.long 0,0, 0,0, 0,0, -1,-1
-.long 0,0, 0,0, 0,0, 0,0
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s
index ac7da4dfc2..1117381f31 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s
@@ -1624,7 +1624,7 @@ Camellia_cbc_encrypt:
leaq -64-63(%rcx),%r10
subq %rsp,%r10
negq %r10
- andq $960,%r10
+ andq $0x3C0,%r10
subq %r10,%rsp
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s
index 393782329e..7876e38299 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s
@@ -1121,6 +1121,7 @@ ecp_nistz256_point_double:
pushq %r15
subq $160+8,%rsp
+.Lpoint_double_shortcutq:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
movdqu 16(%rsi),%xmm1
@@ -1341,7 +1342,7 @@ ecp_nistz256_point_add:
por %xmm1,%xmm3
movdqu 0(%rsi),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
por %xmm3,%xmm5
@@ -1351,7 +1352,7 @@ ecp_nistz256_point_add:
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
movdqa %xmm0,480(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,480+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1371,10 +1372,10 @@ ecp_nistz256_point_add:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
por %xmm3,%xmm4
pxor %xmm3,%xmm3
pcmpeqd %xmm3,%xmm4
@@ -1383,6 +1384,7 @@ ecp_nistz256_point_add:
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -1474,7 +1476,7 @@ ecp_nistz256_point_add:
testq %r8,%r8
jnz .Ladd_proceedq
testq %r9,%r9
- jz .Ladd_proceedq
+ jz .Ladd_doubleq
.byte 102,72,15,126,199
pxor %xmm0,%xmm0
@@ -1487,6 +1489,13 @@ ecp_nistz256_point_add:
jmp .Ladd_doneq
.align 32
+.Ladd_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+ jmp .Lpoint_double_shortcutq
+
+.align 32
.Ladd_proceedq:
movq 0+64(%rsp),%rax
movq 8+64(%rsp),%r14
@@ -1733,13 +1742,13 @@ ecp_nistz256_point_add_affine:
por %xmm1,%xmm3
movdqu 0(%rbx),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rbx),%xmm1
movdqu 32(%rbx),%xmm2
por %xmm3,%xmm5
movdqu 48(%rbx),%xmm3
movdqa %xmm0,416(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1755,13 +1764,13 @@ ecp_nistz256_point_add_affine:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
movq 0(%rbx),%rax
movq %r12,%r9
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
movq %r13,%r10
por %xmm3,%xmm4
pxor %xmm3,%xmm3
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s
index 462ef7fe73..e9ffdc2de2 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s
@@ -20,14 +20,14 @@ gcm_gmult_4bit:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.align 16
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -43,13 +43,13 @@ gcm_gmult_4bit:
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -58,19 +58,19 @@ gcm_gmult_4bit:
.align 16
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -661,10 +661,10 @@ gcm_ghash_4bit:
gcm_init_clmul:
.L_init_clmul:
movdqu (%rsi),%xmm2
- pshufd $78,%xmm2,%xmm2
+ pshufd $0b01001110,%xmm2,%xmm2
- pshufd $255,%xmm2,%xmm4
+ pshufd $0b11111111,%xmm2,%xmm4
movdqa %xmm2,%xmm3
psllq $1,%xmm2
pxor %xmm5,%xmm5
@@ -678,11 +678,11 @@ gcm_init_clmul:
pxor %xmm5,%xmm2
- pshufd $78,%xmm2,%xmm6
+ pshufd $0b01001110,%xmm2,%xmm6
movdqa %xmm2,%xmm0
pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -718,8 +718,8 @@ gcm_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm2,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm2,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm2,%xmm3
movdqu %xmm2,0(%rdi)
pxor %xmm0,%xmm4
@@ -727,7 +727,7 @@ gcm_init_clmul:
.byte 102,15,58,15,227,8
movdqu %xmm4,32(%rdi)
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -765,7 +765,7 @@ gcm_init_clmul:
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm5
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -801,8 +801,8 @@ gcm_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm5,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm5,%xmm3
movdqu %xmm5,48(%rdi)
pxor %xmm0,%xmm4
@@ -822,7 +822,7 @@ gcm_gmult_clmul:
movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -874,20 +874,20 @@ gcm_ghash_clmul:
movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
movdqu 16(%rsi),%xmm6
movl OPENSSL_ia32cap_P+4(%rip),%eax
- cmpq $48,%rcx
+ cmpq $0x30,%rcx
jb .Lskip4x
andl $71303168,%eax
cmpl $4194304,%eax
je .Lskip4x
- subq $48,%rcx
- movq $11547335547999543296,%rax
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
@@ -899,14 +899,14 @@ gcm_ghash_clmul:
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
@@ -921,12 +921,12 @@ gcm_ghash_clmul:
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
@@ -934,7 +934,7 @@ gcm_ghash_clmul:
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc .Ltail4x
jmp .Lmod4_loop
@@ -949,14 +949,14 @@ gcm_ghash_clmul:
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rsi),%xmm7
xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm0,%xmm8
movdqa %xmm3,%xmm5
@@ -1000,7 +1000,7 @@ gcm_ghash_clmul:
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
@@ -1010,14 +1010,14 @@ gcm_ghash_clmul:
movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc .Lmod4_loop
.Ltail4x:
@@ -1061,10 +1061,10 @@ gcm_ghash_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz .Ldone
movdqu 32(%rsi),%xmm7
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
.Lskip4x:
@@ -1079,7 +1079,7 @@ gcm_ghash_clmul:
pxor %xmm8,%xmm0
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
@@ -1087,7 +1087,7 @@ gcm_ghash_clmul:
leaq 32(%rdx),%rdx
nop
- subq $32,%rcx
+ subq $0x20,%rcx
jbe .Leven_tail
nop
jmp .Lmod_loop
@@ -1096,7 +1096,7 @@ gcm_ghash_clmul:
.Lmod_loop:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1134,7 +1134,7 @@ gcm_ghash_clmul:
pslldq $8,%xmm0
psrldq $8,%xmm8
pxor %xmm9,%xmm0
- pshufd $78,%xmm5,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm4
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
@@ -1150,13 +1150,13 @@ gcm_ghash_clmul:
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- subq $32,%rcx
+ subq $0x20,%rcx
ja .Lmod_loop
.Leven_tail:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1204,7 +1204,7 @@ gcm_ghash_clmul:
.byte 102,69,15,56,0,194
pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s
index 8a1e5e7b59..4d25c99cf6 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s
@@ -2599,10 +2599,10 @@ _shaext_shortcut:
punpcklqdq %xmm5,%xmm0
punpckhqdq %xmm5,%xmm8
- pshufd $63,%xmm7,%xmm1
- pshufd $127,%xmm7,%xmm9
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00111111,%xmm7,%xmm1
+ pshufd $0b01111111,%xmm7,%xmm9
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
jmp .Loop_shaext
.align 32
@@ -2857,8 +2857,8 @@ _shaext_shortcut:
.byte 69,15,58,204,193,3
.byte 69,15,56,200,214
- pshufd $0,%xmm6,%xmm11
- pshufd $85,%xmm6,%xmm12
+ pshufd $0x00,%xmm6,%xmm11
+ pshufd $0x55,%xmm6,%xmm12
movdqa %xmm6,%xmm7
pcmpgtd %xmm4,%xmm11
pcmpgtd %xmm4,%xmm12
@@ -2888,8 +2888,8 @@ _shaext_shortcut:
movl 280(%rsp),%edx
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
movdqa %xmm0,%xmm6
punpckldq %xmm8,%xmm0
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s
index 38b7df1970..38e9956cb6 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s
@@ -1240,9 +1240,9 @@ _shaext_shortcut:
movdqa K_XX_XX+160(%rip),%xmm3
movdqu (%rsi),%xmm4
- pshufd $27,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm0,%xmm0
movdqu 16(%rsi),%xmm5
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu 32(%rsi),%xmm6
.byte 102,15,56,0,227
movdqu 48(%rsi),%xmm7
@@ -1392,8 +1392,8 @@ _shaext_shortcut:
jnz .Loop_shaext
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu %xmm0,(%rdi)
movd %xmm1,16(%rdi)
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s
index 7f8e35a92e..7655283b98 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s
@@ -2677,10 +2677,10 @@ _shaext_shortcut:
punpckhqdq %xmm8,%xmm14
punpckhqdq %xmm10,%xmm15
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
jmp .Loop_shaext
.align 32
@@ -2712,11 +2712,11 @@ _shaext_shortcut:
movdqa %xmm2,%xmm0
movdqa %xmm15,112(%rsp)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pxor %xmm12,%xmm4
movdqa %xmm12,64(%rsp)
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pxor %xmm14,%xmm8
movdqa %xmm14,96(%rsp)
movdqa 16-128(%rbp),%xmm1
@@ -2734,11 +2734,11 @@ _shaext_shortcut:
.byte 102,68,15,56,0,211
prefetcht0 127(%r9)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,68,15,56,0,219
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 32-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2751,14 +2751,14 @@ _shaext_shortcut:
movdqa %xmm2,%xmm0
movdqa %xmm7,%xmm3
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,15,58,15,222,4
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 48-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2775,13 +2775,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 64-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2797,13 +2797,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 80-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2819,13 +2819,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 96-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2841,13 +2841,13 @@ _shaext_shortcut:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 112-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2863,13 +2863,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 128-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2885,13 +2885,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 144-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2907,13 +2907,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 160-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2929,13 +2929,13 @@ _shaext_shortcut:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 176-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2951,13 +2951,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 192-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2973,13 +2973,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 208-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2995,13 +2995,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 224-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -3018,13 +3018,13 @@ _shaext_shortcut:
pxor %xmm6,%xmm6
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
movdqa 240-128(%rbp),%xmm1
paddd %xmm7,%xmm1
movq (%rbx),%xmm7
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 240-128(%rbp),%xmm2
paddd %xmm11,%xmm2
.byte 69,15,56,203,247
@@ -3034,17 +3034,17 @@ _shaext_shortcut:
cmovgeq %rsp,%r8
cmpl 4(%rbx),%ecx
cmovgeq %rsp,%r9
- pshufd $0,%xmm7,%xmm9
+ pshufd $0x00,%xmm7,%xmm9
.byte 69,15,56,203,236
movdqa %xmm2,%xmm0
- pshufd $85,%xmm7,%xmm10
+ pshufd $0x55,%xmm7,%xmm10
movdqa %xmm7,%xmm11
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pcmpgtd %xmm6,%xmm9
pcmpgtd %xmm6,%xmm10
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pcmpgtd %xmm6,%xmm11
movdqa K256_shaext-16(%rip),%xmm3
.byte 69,15,56,203,247
@@ -3066,10 +3066,10 @@ _shaext_shortcut:
movl 280(%rsp),%edx
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
movdqa %xmm12,%xmm5
movdqa %xmm13,%xmm6
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s
index d2951d8ea3..ab16a7b618 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s
@@ -1754,9 +1754,9 @@ _shaext_shortcut:
movdqu 16(%rdi),%xmm2
movdqa 512-128(%rcx),%xmm7
- pshufd $27,%xmm1,%xmm0
- pshufd $177,%xmm1,%xmm1
- pshufd $27,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
movdqa %xmm7,%xmm8
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
@@ -1775,7 +1775,7 @@ _shaext_shortcut:
.byte 102,15,56,0,231
movdqa %xmm2,%xmm10
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
nop
movdqa %xmm1,%xmm9
.byte 15,56,203,202
@@ -1784,7 +1784,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 102,15,56,0,239
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
leaq 64(%rsi),%rsi
.byte 15,56,204,220
.byte 15,56,203,202
@@ -1793,7 +1793,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 102,15,56,0,247
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1805,7 +1805,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1816,7 +1816,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1827,7 +1827,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1838,7 +1838,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1849,7 +1849,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1860,7 +1860,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1871,7 +1871,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1882,7 +1882,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1893,7 +1893,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1904,7 +1904,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1915,7 +1915,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
.byte 15,56,203,202
@@ -1924,7 +1924,7 @@ _shaext_shortcut:
movdqa 448-128(%rcx),%xmm0
paddd %xmm5,%xmm0
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
.byte 15,56,205,245
movdqa %xmm8,%xmm7
.byte 15,56,203,202
@@ -1933,7 +1933,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
nop
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
decq %rdx
nop
.byte 15,56,203,202
@@ -1942,9 +1942,9 @@ _shaext_shortcut:
paddd %xmm9,%xmm1
jnz .Loop_shaext
- pshufd $177,%xmm2,%xmm2
- pshufd $27,%xmm1,%xmm7
- pshufd $177,%xmm1,%xmm1
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
punpckhqdq %xmm2,%xmm1
.byte 102,15,58,15,215,8
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s b/deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s
index 656a5ce855..0e81a290e3 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s
@@ -44,43 +44,43 @@ OPENSSL_ia32_cpuid:
movl %eax,%r11d
xorl %eax,%eax
- cmpl $1970169159,%ebx
+ cmpl $0x756e6547,%ebx
setne %al
movl %eax,%r9d
- cmpl $1231384169,%edx
+ cmpl $0x49656e69,%edx
setne %al
orl %eax,%r9d
- cmpl $1818588270,%ecx
+ cmpl $0x6c65746e,%ecx
setne %al
orl %eax,%r9d
jz .Lintel
- cmpl $1752462657,%ebx
+ cmpl $0x68747541,%ebx
setne %al
movl %eax,%r10d
- cmpl $1769238117,%edx
+ cmpl $0x69746E65,%edx
setne %al
orl %eax,%r10d
- cmpl $1145913699,%ecx
+ cmpl $0x444D4163,%ecx
setne %al
orl %eax,%r10d
jnz .Lintel
- movl $2147483648,%eax
+ movl $0x80000000,%eax
cpuid
- cmpl $2147483649,%eax
+ cmpl $0x80000001,%eax
jb .Lintel
movl %eax,%r10d
- movl $2147483649,%eax
+ movl $0x80000001,%eax
cpuid
orl %ecx,%r9d
- andl $2049,%r9d
+ andl $0x00000801,%r9d
- cmpl $2147483656,%r10d
+ cmpl $0x80000008,%r10d
jb .Lintel
- movl $2147483656,%eax
+ movl $0x80000008,%eax
cpuid
movzbq %cl,%r10
incq %r10
@@ -92,7 +92,7 @@ OPENSSL_ia32_cpuid:
shrl $16,%ebx
cmpb %r10b,%bl
ja .Lgeneric
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
jmp .Lgeneric
.Lintel:
@@ -105,7 +105,7 @@ OPENSSL_ia32_cpuid:
cpuid
movl %eax,%r10d
shrl $14,%r10d
- andl $4095,%r10d
+ andl $0xfff,%r10d
cmpl $7,%r11d
jb .Lnocacheinfo
@@ -118,29 +118,29 @@ OPENSSL_ia32_cpuid:
.Lnocacheinfo:
movl $1,%eax
cpuid
- andl $3220176895,%edx
+ andl $0xbfefffff,%edx
cmpl $0,%r9d
jne .Lnotintel
- orl $1073741824,%edx
+ orl $0x40000000,%edx
andb $15,%ah
cmpb $15,%ah
jne .Lnotintel
- orl $1048576,%edx
+ orl $0x00100000,%edx
.Lnotintel:
btl $28,%edx
jnc .Lgeneric
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
cmpl $0,%r10d
je .Lgeneric
- orl $268435456,%edx
+ orl $0x10000000,%edx
shrl $16,%ebx
cmpb $1,%bl
ja .Lgeneric
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
.Lgeneric:
- andl $2048,%r9d
- andl $4294965247,%ecx
+ andl $0x00000800,%r9d
+ andl $0xfffff7ff,%ecx
orl %ecx,%r9d
movl %edx,%r10d
@@ -152,9 +152,9 @@ OPENSSL_ia32_cpuid:
cmpl $6,%eax
je .Ldone
.Lclear_avx:
- movl $4026525695,%eax
+ movl $0xefffe7ff,%eax
andl %eax,%r9d
- andl $4294967263,8(%rdi)
+ andl $0xffffffdf,8(%rdi)
.Ldone:
shlq $32,%r9
movl %r10d,%eax
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s
index a50170a9a2..cb2db3584a 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s
@@ -81,8 +81,8 @@ L$enc_loop:
movl 0(%r14,%rdi,8),%edi
movl 0(%r14,%rbp,8),%ebp
- andl $65280,%edi
- andl $65280,%ebp
+ andl $0x0000ff00,%edi
+ andl $0x0000ff00,%ebp
xorl %edi,%r10d
xorl %ebp,%r11d
@@ -94,8 +94,8 @@ L$enc_loop:
movl 0(%r14,%rsi,8),%esi
movl 0(%r14,%rdi,8),%edi
- andl $65280,%esi
- andl $65280,%edi
+ andl $0x0000ff00,%esi
+ andl $0x0000ff00,%edi
shrl $16,%ebx
xorl %esi,%r12d
xorl %edi,%r8d
@@ -108,9 +108,9 @@ L$enc_loop:
movl 0(%r14,%rdi,8),%edi
movl 0(%r14,%rbp,8),%ebp
- andl $16711680,%esi
- andl $16711680,%edi
- andl $16711680,%ebp
+ andl $0x00ff0000,%esi
+ andl $0x00ff0000,%edi
+ andl $0x00ff0000,%ebp
xorl %esi,%r10d
xorl %edi,%r11d
@@ -123,9 +123,9 @@ L$enc_loop:
movl 2(%r14,%rdi,8),%edi
movl 2(%r14,%rbp,8),%ebp
- andl $16711680,%esi
- andl $4278190080,%edi
- andl $4278190080,%ebp
+ andl $0x00ff0000,%esi
+ andl $0xff000000,%edi
+ andl $0xff000000,%ebp
xorl %esi,%r8d
xorl %edi,%r10d
@@ -138,8 +138,8 @@ L$enc_loop:
movl 2(%r14,%rdi,8),%edi
movl 16+0(%r15),%eax
- andl $4278190080,%esi
- andl $4278190080,%edi
+ andl $0xff000000,%esi
+ andl $0xff000000,%edi
xorl %esi,%r12d
xorl %edi,%r8d
@@ -241,8 +241,8 @@ L$enc_loop_compact:
xorl %r8d,%edx
cmpq 16(%rsp),%r15
je L$enc_compact_done
- movl $2155905152,%r10d
- movl $2155905152,%r11d
+ movl $0x80808080,%r10d
+ movl $0x80808080,%r11d
andl %eax,%r10d
andl %ebx,%r11d
movl %r10d,%esi
@@ -253,10 +253,10 @@ L$enc_loop_compact:
leal (%rbx,%rbx,1),%r9d
subl %r10d,%esi
subl %r11d,%edi
- andl $4278124286,%r8d
- andl $4278124286,%r9d
- andl $454761243,%esi
- andl $454761243,%edi
+ andl $0xfefefefe,%r8d
+ andl $0xfefefefe,%r9d
+ andl $0x1b1b1b1b,%esi
+ andl $0x1b1b1b1b,%edi
movl %eax,%r10d
movl %ebx,%r11d
xorl %esi,%r8d
@@ -264,9 +264,9 @@ L$enc_loop_compact:
xorl %r8d,%eax
xorl %r9d,%ebx
- movl $2155905152,%r12d
+ movl $0x80808080,%r12d
roll $24,%eax
- movl $2155905152,%ebp
+ movl $0x80808080,%ebp
roll $24,%ebx
andl %ecx,%r12d
andl %edx,%ebp
@@ -289,10 +289,10 @@ L$enc_loop_compact:
xorl %r10d,%eax
xorl %r11d,%ebx
- andl $4278124286,%r8d
- andl $4278124286,%r9d
- andl $454761243,%esi
- andl $454761243,%edi
+ andl $0xfefefefe,%r8d
+ andl $0xfefefefe,%r9d
+ andl $0x1b1b1b1b,%esi
+ andl $0x1b1b1b1b,%edi
movl %ecx,%r12d
movl %edx,%ebp
xorl %esi,%r8d
@@ -345,7 +345,7 @@ _AES_encrypt:
andq $-64,%rsp
subq %rsp,%rcx
negq %rcx
- andq $960,%rcx
+ andq $0x3c0,%rcx
subq %rcx,%rsp
subq $32,%rsp
@@ -370,7 +370,7 @@ L$enc_prologue:
leaq L$AES_Te+2048(%rip),%r14
leaq 768(%rsp),%rbp
subq %r14,%rbp
- andq $768,%rbp
+ andq $0x300,%rbp
leaq (%r14,%rbp,1),%r14
call _x86_64_AES_encrypt_compact
@@ -792,7 +792,7 @@ _AES_decrypt:
andq $-64,%rsp
subq %rsp,%rcx
negq %rcx
- andq $960,%rcx
+ andq $0x3c0,%rcx
subq %rcx,%rsp
subq $32,%rsp
@@ -817,7 +817,7 @@ L$dec_prologue:
leaq L$AES_Td+2048(%rip),%r14
leaq 768(%rsp),%rbp
subq %r14,%rbp
- andq $768,%rbp
+ andq $0x300,%rbp
leaq (%r14,%rbp,1),%r14
shrq $3,%rbp
addq %rbp,%r14
@@ -1333,9 +1333,9 @@ L$cbc_picked_te:
movq %r14,%r10
leaq 2304(%r14),%r11
movq %r15,%r12
- andq $4095,%r10
- andq $4095,%r11
- andq $4095,%r12
+ andq $0xFFF,%r10
+ andq $0xFFF,%r11
+ andq $0xFFF,%r12
cmpq %r11,%r12
jb L$cbc_te_break_out
@@ -1344,7 +1344,7 @@ L$cbc_picked_te:
jmp L$cbc_te_ok
L$cbc_te_break_out:
subq %r10,%r12
- andq $4095,%r12
+ andq $0xFFF,%r12
addq $320,%r12
subq %r12,%r15
.p2align 2
@@ -1370,7 +1370,7 @@ L$cbc_fast_body:
movq %r15,%r10
subq %r14,%r10
- andq $4095,%r10
+ andq $0xfff,%r10
cmpq $2304,%r10
jb L$cbc_do_ecopy
cmpq $4096-248,%r10
@@ -1557,7 +1557,7 @@ L$cbc_slow_prologue:
leaq -88-63(%rcx),%r10
subq %rbp,%r10
negq %r10
- andq $960,%r10
+ andq $0x3c0,%r10
subq %r10,%rbp
xchgq %rsp,%rbp
@@ -1586,7 +1586,7 @@ L$cbc_slow_body:
leaq 2048(%r14),%r14
leaq 768-8(%rsp),%rax
subq %r14,%rax
- andq $768,%rax
+ andq $0x300,%rax
leaq (%r14,%rax,1),%r14
cmpq $0,%rbx
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s
index 015db5faa7..970a12149b 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s
@@ -1392,8 +1392,8 @@ aesni_cbc_sha1_enc_shaext:
movups 16(%rcx),%xmm0
leaq 112(%rcx),%rcx
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
jmp L$oop_shaext
.p2align 4
@@ -1672,8 +1672,8 @@ L$aesenclast9:
leaq 64(%rdi),%rdi
jnz L$oop_shaext
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
movups %xmm2,(%r8)
movdqu %xmm8,(%r9)
movd %xmm9,16(%r9)
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s
index 41ad80eebd..6aa1441150 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s
@@ -503,7 +503,7 @@ _aesni_ecb_encrypt:
testl %r8d,%r8d
jz L$ecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb L$ecb_enc_tail
movdqu (%rdi),%xmm2
@@ -515,7 +515,7 @@ _aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp L$ecb_enc_loop8_enter
.p2align 4
L$ecb_enc_loop8:
@@ -543,7 +543,7 @@ L$ecb_enc_loop8_enter:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc L$ecb_enc_loop8
movups %xmm2,(%rsi)
@@ -557,22 +557,22 @@ L$ecb_enc_loop8_enter:
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz L$ecb_ret
L$ecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb L$ecb_enc_one
movups 16(%rdi),%xmm3
je L$ecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb L$ecb_enc_three
movups 48(%rdi),%xmm5
je L$ecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb L$ecb_enc_five
movups 80(%rdi),%xmm7
je L$ecb_enc_six
@@ -646,7 +646,7 @@ L$ecb_enc_six:
.p2align 4
L$ecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb L$ecb_dec_tail
movdqu (%rdi),%xmm2
@@ -658,7 +658,7 @@ L$ecb_decrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp L$ecb_dec_loop8_enter
.p2align 4
L$ecb_dec_loop8:
@@ -687,7 +687,7 @@ L$ecb_dec_loop8_enter:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc L$ecb_dec_loop8
movups %xmm2,(%rsi)
@@ -709,22 +709,22 @@ L$ecb_dec_loop8_enter:
movups %xmm9,112(%rsi)
pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz L$ecb_ret
L$ecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb L$ecb_dec_one
movups 16(%rdi),%xmm3
je L$ecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb L$ecb_dec_three
movups 48(%rdi),%xmm5
je L$ecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb L$ecb_dec_five
movups 80(%rdi),%xmm7
je L$ecb_dec_six
@@ -1598,7 +1598,7 @@ L$oop_enc1_8:
movdqa L$xts_magic(%rip),%xmm8
movdqa %xmm2,%xmm15
- pshufd $95,%xmm2,%xmm9
+ pshufd $0x5f,%xmm2,%xmm9
pxor %xmm0,%xmm1
movdqa %xmm9,%xmm14
paddd %xmm9,%xmm9
@@ -1697,7 +1697,7 @@ L$xts_enc_grandloop:
.byte 102,15,56,220,248
movups 64(%r11),%xmm0
movdqa %xmm8,80(%rsp)
- pshufd $95,%xmm15,%xmm9
+ pshufd $0x5f,%xmm15,%xmm9
jmp L$xts_enc_loop6
.p2align 5
L$xts_enc_loop6:
@@ -1836,13 +1836,13 @@ L$xts_enc_short:
jz L$xts_enc_done
pxor %xmm0,%xmm11
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb L$xts_enc_one
pxor %xmm0,%xmm12
je L$xts_enc_two
pxor %xmm0,%xmm13
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb L$xts_enc_three
pxor %xmm0,%xmm14
je L$xts_enc_four
@@ -2069,7 +2069,7 @@ L$oop_enc1_11:
movdqa L$xts_magic(%rip),%xmm8
movdqa %xmm2,%xmm15
- pshufd $95,%xmm2,%xmm9
+ pshufd $0x5f,%xmm2,%xmm9
pxor %xmm0,%xmm1
movdqa %xmm9,%xmm14
paddd %xmm9,%xmm9
@@ -2168,7 +2168,7 @@ L$xts_dec_grandloop:
.byte 102,15,56,222,248
movups 64(%r11),%xmm0
movdqa %xmm8,80(%rsp)
- pshufd $95,%xmm15,%xmm9
+ pshufd $0x5f,%xmm15,%xmm9
jmp L$xts_dec_loop6
.p2align 5
L$xts_dec_loop6:
@@ -2308,13 +2308,13 @@ L$xts_dec_short:
jz L$xts_dec_done
pxor %xmm0,%xmm12
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb L$xts_dec_one
pxor %xmm0,%xmm13
je L$xts_dec_two
pxor %xmm0,%xmm14
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb L$xts_dec_three
je L$xts_dec_four
@@ -2345,7 +2345,7 @@ L$xts_dec_short:
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz L$xts_dec_ret
@@ -2634,7 +2634,7 @@ L$cbc_decrypt_bulk:
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe L$cbc_dec_tail
movups (%rcx),%xmm0
@@ -2650,14 +2650,14 @@ L$cbc_decrypt_bulk:
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
movl _OPENSSL_ia32cap_P+4(%rip),%r9d
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
jbe L$cbc_dec_six_or_seven
andl $71303168,%r9d
- subq $80,%rdx
+ subq $0x50,%rdx
cmpl $4194304,%r9d
je L$cbc_dec_loop6_enter
- subq $32,%rdx
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp L$cbc_dec_loop8_enter
.p2align 4
@@ -2672,7 +2672,7 @@ L$cbc_dec_loop8_enter:
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
@@ -2857,21 +2857,21 @@ L$cbc_dec_done:
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja L$cbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
+ addq $0x70,%rdx
jle L$cbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe L$cbc_dec_tail
movaps %xmm11,%xmm2
L$cbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja L$cbc_dec_seven
movaps %xmm7,%xmm8
@@ -2964,33 +2964,33 @@ L$cbc_dec_loop6_enter:
movl %r10d,%eax
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- subq $96,%rdx
+ subq $0x60,%rdx
ja L$cbc_dec_loop6
movdqa %xmm7,%xmm2
- addq $80,%rdx
+ addq $0x50,%rdx
jle L$cbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
L$cbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_four
movups 64(%rdi),%xmm6
@@ -3015,7 +3015,7 @@ L$cbc_dec_tail:
movdqa %xmm6,%xmm2
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
- subq $16,%rdx
+ subq $0x10,%rdx
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -3332,7 +3332,7 @@ L$oop_key192:
pslldq $4,%xmm0
pxor %xmm3,%xmm0
- pshufd $255,%xmm0,%xmm3
+ pshufd $0xff,%xmm0,%xmm3
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
@@ -3419,7 +3419,7 @@ L$oop_key256:
decl %r10d
jz L$done_key256
- pshufd $255,%xmm0,%xmm2
+ pshufd $0xff,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
@@ -3462,11 +3462,11 @@ L$key_expansion_128:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
L$key_expansion_128_cold:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
+ shufps $0b11111111,%xmm1,%xmm1
xorps %xmm1,%xmm0
.byte 0xf3,0xc3
@@ -3477,25 +3477,25 @@ L$key_expansion_192a:
L$key_expansion_192a_cold:
movaps %xmm2,%xmm5
L$key_expansion_192b_warm:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
pslldq $4,%xmm3
xorps %xmm4,%xmm0
- pshufd $85,%xmm1,%xmm1
+ pshufd $0b01010101,%xmm1,%xmm1
pxor %xmm3,%xmm2
pxor %xmm1,%xmm0
- pshufd $255,%xmm0,%xmm3
+ pshufd $0b11111111,%xmm0,%xmm3
pxor %xmm3,%xmm2
.byte 0xf3,0xc3
.p2align 4
L$key_expansion_192b:
movaps %xmm0,%xmm3
- shufps $68,%xmm0,%xmm5
+ shufps $0b01000100,%xmm0,%xmm5
movups %xmm5,(%rax)
- shufps $78,%xmm2,%xmm3
+ shufps $0b01001110,%xmm2,%xmm3
movups %xmm3,16(%rax)
leaq 32(%rax),%rax
jmp L$key_expansion_192b_warm
@@ -3505,11 +3505,11 @@ L$key_expansion_256a:
movups %xmm2,(%rax)
leaq 16(%rax),%rax
L$key_expansion_256a_cold:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
+ shufps $0b11111111,%xmm1,%xmm1
xorps %xmm1,%xmm0
.byte 0xf3,0xc3
@@ -3518,11 +3518,11 @@ L$key_expansion_256b:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
- shufps $16,%xmm2,%xmm4
+ shufps $0b00010000,%xmm2,%xmm4
xorps %xmm4,%xmm2
- shufps $140,%xmm2,%xmm4
+ shufps $0b10001100,%xmm2,%xmm4
xorps %xmm4,%xmm2
- shufps $170,%xmm1,%xmm1
+ shufps $0b10101010,%xmm1,%xmm1
xorps %xmm1,%xmm2
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s
index 2af36a90b0..52ae782e9a 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s
@@ -324,45 +324,45 @@ L$enc_sbox:
pxor %xmm2,%xmm5
decl %r10d
jl L$enc_done
- pshufd $147,%xmm15,%xmm7
- pshufd $147,%xmm0,%xmm8
+ pshufd $0x93,%xmm15,%xmm7
+ pshufd $0x93,%xmm0,%xmm8
pxor %xmm7,%xmm15
- pshufd $147,%xmm3,%xmm9
+ pshufd $0x93,%xmm3,%xmm9
pxor %xmm8,%xmm0
- pshufd $147,%xmm5,%xmm10
+ pshufd $0x93,%xmm5,%xmm10
pxor %xmm9,%xmm3
- pshufd $147,%xmm2,%xmm11
+ pshufd $0x93,%xmm2,%xmm11
pxor %xmm10,%xmm5
- pshufd $147,%xmm6,%xmm12
+ pshufd $0x93,%xmm6,%xmm12
pxor %xmm11,%xmm2
- pshufd $147,%xmm1,%xmm13
+ pshufd $0x93,%xmm1,%xmm13
pxor %xmm12,%xmm6
- pshufd $147,%xmm4,%xmm14
+ pshufd $0x93,%xmm4,%xmm14
pxor %xmm13,%xmm1
pxor %xmm14,%xmm4
pxor %xmm15,%xmm8
pxor %xmm4,%xmm7
pxor %xmm4,%xmm8
- pshufd $78,%xmm15,%xmm15
+ pshufd $0x4E,%xmm15,%xmm15
pxor %xmm0,%xmm9
- pshufd $78,%xmm0,%xmm0
+ pshufd $0x4E,%xmm0,%xmm0
pxor %xmm2,%xmm12
pxor %xmm7,%xmm15
pxor %xmm6,%xmm13
pxor %xmm8,%xmm0
pxor %xmm5,%xmm11
- pshufd $78,%xmm2,%xmm7
+ pshufd $0x4E,%xmm2,%xmm7
pxor %xmm1,%xmm14
- pshufd $78,%xmm6,%xmm8
+ pshufd $0x4E,%xmm6,%xmm8
pxor %xmm3,%xmm10
- pshufd $78,%xmm5,%xmm2
+ pshufd $0x4E,%xmm5,%xmm2
pxor %xmm4,%xmm10
- pshufd $78,%xmm4,%xmm6
+ pshufd $0x4E,%xmm4,%xmm6
pxor %xmm4,%xmm11
- pshufd $78,%xmm1,%xmm5
+ pshufd $0x4E,%xmm1,%xmm5
pxor %xmm11,%xmm7
- pshufd $78,%xmm3,%xmm1
+ pshufd $0x4E,%xmm3,%xmm1
pxor %xmm12,%xmm8
pxor %xmm10,%xmm2
pxor %xmm14,%xmm6
@@ -796,24 +796,24 @@ L$dec_sbox:
decl %r10d
jl L$dec_done
- pshufd $78,%xmm15,%xmm7
- pshufd $78,%xmm2,%xmm13
+ pshufd $0x4E,%xmm15,%xmm7
+ pshufd $0x4E,%xmm2,%xmm13
pxor %xmm15,%xmm7
- pshufd $78,%xmm4,%xmm14
+ pshufd $0x4E,%xmm4,%xmm14
pxor %xmm2,%xmm13
- pshufd $78,%xmm0,%xmm8
+ pshufd $0x4E,%xmm0,%xmm8
pxor %xmm4,%xmm14
- pshufd $78,%xmm5,%xmm9
+ pshufd $0x4E,%xmm5,%xmm9
pxor %xmm0,%xmm8
- pshufd $78,%xmm3,%xmm10
+ pshufd $0x4E,%xmm3,%xmm10
pxor %xmm5,%xmm9
pxor %xmm13,%xmm15
pxor %xmm13,%xmm0
- pshufd $78,%xmm1,%xmm11
+ pshufd $0x4E,%xmm1,%xmm11
pxor %xmm3,%xmm10
pxor %xmm7,%xmm5
pxor %xmm8,%xmm3
- pshufd $78,%xmm6,%xmm12
+ pshufd $0x4E,%xmm6,%xmm12
pxor %xmm1,%xmm11
pxor %xmm14,%xmm0
pxor %xmm9,%xmm1
@@ -827,45 +827,45 @@ L$dec_sbox:
pxor %xmm14,%xmm1
pxor %xmm14,%xmm6
pxor %xmm12,%xmm4
- pshufd $147,%xmm15,%xmm7
- pshufd $147,%xmm0,%xmm8
+ pshufd $0x93,%xmm15,%xmm7
+ pshufd $0x93,%xmm0,%xmm8
pxor %xmm7,%xmm15
- pshufd $147,%xmm5,%xmm9
+ pshufd $0x93,%xmm5,%xmm9
pxor %xmm8,%xmm0
- pshufd $147,%xmm3,%xmm10
+ pshufd $0x93,%xmm3,%xmm10
pxor %xmm9,%xmm5
- pshufd $147,%xmm1,%xmm11
+ pshufd $0x93,%xmm1,%xmm11
pxor %xmm10,%xmm3
- pshufd $147,%xmm6,%xmm12
+ pshufd $0x93,%xmm6,%xmm12
pxor %xmm11,%xmm1
- pshufd $147,%xmm2,%xmm13
+ pshufd $0x93,%xmm2,%xmm13
pxor %xmm12,%xmm6
- pshufd $147,%xmm4,%xmm14
+ pshufd $0x93,%xmm4,%xmm14
pxor %xmm13,%xmm2
pxor %xmm14,%xmm4
pxor %xmm15,%xmm8
pxor %xmm4,%xmm7
pxor %xmm4,%xmm8
- pshufd $78,%xmm15,%xmm15
+ pshufd $0x4E,%xmm15,%xmm15
pxor %xmm0,%xmm9
- pshufd $78,%xmm0,%xmm0
+ pshufd $0x4E,%xmm0,%xmm0
pxor %xmm1,%xmm12
pxor %xmm7,%xmm15
pxor %xmm6,%xmm13
pxor %xmm8,%xmm0
pxor %xmm3,%xmm11
- pshufd $78,%xmm1,%xmm7
+ pshufd $0x4E,%xmm1,%xmm7
pxor %xmm2,%xmm14
- pshufd $78,%xmm6,%xmm8
+ pshufd $0x4E,%xmm6,%xmm8
pxor %xmm5,%xmm10
- pshufd $78,%xmm3,%xmm1
+ pshufd $0x4E,%xmm3,%xmm1
pxor %xmm4,%xmm10
- pshufd $78,%xmm4,%xmm6
+ pshufd $0x4E,%xmm4,%xmm6
pxor %xmm4,%xmm11
- pshufd $78,%xmm2,%xmm3
+ pshufd $0x4E,%xmm2,%xmm3
pxor %xmm11,%xmm7
- pshufd $78,%xmm5,%xmm2
+ pshufd $0x4E,%xmm5,%xmm2
pxor %xmm12,%xmm8
pxor %xmm1,%xmm10
pxor %xmm14,%xmm6
@@ -1552,20 +1552,20 @@ L$xts_enc_prologue:
movdqa %xmm7,(%rax)
andq $-16,%r14
- subq $128,%rsp
+ subq $0x80,%rsp
movdqa 32(%rbp),%xmm6
pxor %xmm14,%xmm14
movdqa L$xts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- subq $128,%r14
+ subq $0x80,%r14
jc L$xts_enc_short
jmp L$xts_enc_loop
.p2align 4
L$xts_enc_loop:
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -1573,7 +1573,7 @@ L$xts_enc_loop:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -1582,7 +1582,7 @@ L$xts_enc_loop:
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
movdqu 0(%r12),%xmm7
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -1592,7 +1592,7 @@ L$xts_enc_loop:
pxor %xmm13,%xmm6
movdqu 16(%r12),%xmm8
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -1602,7 +1602,7 @@ L$xts_enc_loop:
pxor %xmm13,%xmm6
movdqu 32(%r12),%xmm9
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -1612,7 +1612,7 @@ L$xts_enc_loop:
pxor %xmm13,%xmm6
movdqu 48(%r12),%xmm10
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -1622,7 +1622,7 @@ L$xts_enc_loop:
pxor %xmm13,%xmm6
movdqu 64(%r12),%xmm11
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -1666,20 +1666,20 @@ L$xts_enc_loop:
pxor %xmm14,%xmm14
movdqa L$xts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- subq $128,%r14
+ subq $0x80,%r14
jnc L$xts_enc_loop
L$xts_enc_short:
- addq $128,%r14
+ addq $0x80,%r14
jz L$xts_enc_done
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -1687,7 +1687,7 @@ L$xts_enc_short:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -1698,7 +1698,7 @@ L$xts_enc_short:
movdqu 0(%r12),%xmm7
cmpq $16,%r14
je L$xts_enc_1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -1710,7 +1710,7 @@ L$xts_enc_short:
cmpq $32,%r14
je L$xts_enc_2
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -1722,7 +1722,7 @@ L$xts_enc_short:
cmpq $48,%r14
je L$xts_enc_3
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -1734,7 +1734,7 @@ L$xts_enc_short:
cmpq $64,%r14
je L$xts_enc_4
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -1746,7 +1746,7 @@ L$xts_enc_short:
cmpq $80,%r14
je L$xts_enc_5
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2011,20 +2011,20 @@ L$xts_dec_prologue:
shlq $4,%rax
subq %rax,%r14
- subq $128,%rsp
+ subq $0x80,%rsp
movdqa 32(%rbp),%xmm6
pxor %xmm14,%xmm14
movdqa L$xts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- subq $128,%r14
+ subq $0x80,%r14
jc L$xts_dec_short
jmp L$xts_dec_loop
.p2align 4
L$xts_dec_loop:
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -2032,7 +2032,7 @@ L$xts_dec_loop:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -2041,7 +2041,7 @@ L$xts_dec_loop:
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
movdqu 0(%r12),%xmm7
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -2051,7 +2051,7 @@ L$xts_dec_loop:
pxor %xmm13,%xmm6
movdqu 16(%r12),%xmm8
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -2061,7 +2061,7 @@ L$xts_dec_loop:
pxor %xmm13,%xmm6
movdqu 32(%r12),%xmm9
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -2071,7 +2071,7 @@ L$xts_dec_loop:
pxor %xmm13,%xmm6
movdqu 48(%r12),%xmm10
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -2081,7 +2081,7 @@ L$xts_dec_loop:
pxor %xmm13,%xmm6
movdqu 64(%r12),%xmm11
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2125,20 +2125,20 @@ L$xts_dec_loop:
pxor %xmm14,%xmm14
movdqa L$xts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- subq $128,%r14
+ subq $0x80,%r14
jnc L$xts_dec_loop
L$xts_dec_short:
- addq $128,%r14
+ addq $0x80,%r14
jz L$xts_dec_done
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -2146,7 +2146,7 @@ L$xts_dec_short:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -2157,7 +2157,7 @@ L$xts_dec_short:
movdqu 0(%r12),%xmm7
cmpq $16,%r14
je L$xts_dec_1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -2169,7 +2169,7 @@ L$xts_dec_short:
cmpq $32,%r14
je L$xts_dec_2
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -2181,7 +2181,7 @@ L$xts_dec_short:
cmpq $48,%r14
je L$xts_dec_3
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -2193,7 +2193,7 @@ L$xts_dec_short:
cmpq $64,%r14
je L$xts_dec_4
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -2205,7 +2205,7 @@ L$xts_dec_short:
cmpq $80,%r14
je L$xts_dec_5
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2382,7 +2382,7 @@ L$xts_dec_done:
pxor %xmm14,%xmm14
movdqa L$xts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
movdqa %xmm6,%xmm5
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s
index c724170ce9..2ffd0bc100 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s
@@ -60,7 +60,7 @@ L$enc_loop:
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -120,10 +120,10 @@ _vpaes_decrypt_core:
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa L$k_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq L$k_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa L$k_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
@@ -242,7 +242,7 @@ L$schedule_am_decrypting:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
L$schedule_go:
cmpl $192,%esi
@@ -332,7 +332,7 @@ L$oop_schedule_256:
call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
@@ -399,8 +399,8 @@ L$schedule_mangle_last_dec:
.p2align 4
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
@@ -437,7 +437,7 @@ _vpaes_schedule_round:
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
@@ -596,7 +596,7 @@ L$schedule_mangle_both:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
@@ -614,7 +614,7 @@ _vpaes_set_encrypt_key:
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s
index 4e70deabbd..b92f098e73 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s
@@ -461,48 +461,94 @@ _rsaz_512_mul_gather4:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- subq $128+24,%rsp
+ subq $152,%rsp
L$mul_gather4_body:
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- movl (%rdx,%r9,4),%ebx
-.byte 102,72,15,110,201
+ movd %r9d,%xmm8
+ movdqa L$inc+16(%rip),%xmm1
+ movdqa L$inc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 0(%rdx),%xmm8
+ movdqa 16(%rdx),%xmm9
+ movdqa 32(%rdx),%xmm10
+ movdqa 48(%rdx),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rdx),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rdx),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rdx),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rdx),%xmm15
+ leaq 128(%rdx),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
- shlq $32,%rax
- orq %rax,%rbx
movq (%rsi),%rax
movq 8(%rsi),%rcx
- leaq 128(%rdx,%r9,4),%rbp
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
@@ -515,14 +561,12 @@ L$mul_gather4_body:
adcq $0,%r13
mulq %rbx
- leaq 128(%rbp),%rbp
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
@@ -534,6 +578,35 @@ L$mul_gather4_body:
.p2align 5
L$oop_mul_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
@@ -542,7 +615,6 @@ L$oop_mul_gather:
adcq $0,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
@@ -551,7 +623,6 @@ L$oop_mul_gather:
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
@@ -560,7 +631,6 @@ L$oop_mul_gather:
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
@@ -569,7 +639,6 @@ L$oop_mul_gather:
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -594,7 +663,6 @@ L$oop_mul_gather:
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
@@ -602,7 +670,6 @@ L$oop_mul_gather:
movq %rdx,%r15
adcq $0,%r15
- leaq 128(%rbp),%rbp
leaq 8(%rdi),%rdi
decl %ecx
@@ -617,8 +684,8 @@ L$oop_mul_gather:
movq %r14,48(%rdi)
movq %r15,56(%rdi)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
@@ -667,7 +734,7 @@ _rsaz_512_mul_scatter4:
movl %r9d,%r9d
subq $128+24,%rsp
L$mul_scatter4_body:
- leaq (%r8,%r9,4),%r8
+ leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
@@ -703,30 +770,14 @@ L$mul_scatter4_body:
call __rsaz_512_subtract
- movl %r8d,0(%rsi)
- shrq $32,%r8
- movl %r9d,128(%rsi)
- shrq $32,%r9
- movl %r10d,256(%rsi)
- shrq $32,%r10
- movl %r11d,384(%rsi)
- shrq $32,%r11
- movl %r12d,512(%rsi)
- shrq $32,%r12
- movl %r13d,640(%rsi)
- shrq $32,%r13
- movl %r14d,768(%rsi)
- shrq $32,%r14
- movl %r15d,896(%rsi)
- shrq $32,%r15
- movl %r8d,64(%rsi)
- movl %r9d,192(%rsi)
- movl %r10d,320(%rsi)
- movl %r11d,448(%rsi)
- movl %r12d,576(%rsi)
- movl %r13d,704(%rsi)
- movl %r14d,832(%rsi)
- movl %r15d,960(%rsi)
+ movq %r8,0(%rsi)
+ movq %r9,128(%rsi)
+ movq %r10,256(%rsi)
+ movq %r11,384(%rsi)
+ movq %r12,512(%rsi)
+ movq %r13,640(%rsi)
+ movq %r14,768(%rsi)
+ movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -1079,16 +1130,14 @@ L$oop_mul:
.p2align 4
_rsaz_512_scatter4:
- leaq (%rdi,%rdx,4),%rdi
+ leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp L$oop_scatter
.p2align 4
L$oop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
- movl %eax,(%rdi)
- shrq $32,%rax
- movl %eax,64(%rdi)
+ movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz L$oop_scatter
@@ -1099,18 +1148,72 @@ L$oop_scatter:
.p2align 4
_rsaz_512_gather4:
- leaq (%rsi,%rdx,4),%rsi
+ movd %edx,%xmm8
+ movdqa L$inc+16(%rip),%xmm1
+ movdqa L$inc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp L$oop_gather
.p2align 4
L$oop_gather:
- movl (%rsi),%eax
- movl 64(%rsi),%r8d
+ movdqa 0(%rsi),%xmm8
+ movdqa 16(%rsi),%xmm9
+ movdqa 32(%rsi),%xmm10
+ movdqa 48(%rsi),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rsi),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rsi),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rsi),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
- shlq $32,%r8
- orq %r8,%rax
- movq %rax,(%rdi)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz L$oop_gather
.byte 0xf3,0xc3
+L$SEH_end_rsaz_512_gather4:
+
+
+.p2align 6
+L$inc:
+.long 0,0, 1,1
+.long 2,2, 2,2
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s
index 040c324c49..c0f0b4bd68 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s
@@ -242,7 +242,7 @@ L$body_mul_2x2:
movq %rcx,56(%rsp)
movq %r8,64(%rsp)
- movq $15,%r8
+ movq $0xf,%r8
movq %rsi,%rax
movq %rcx,%rbp
call _mul_1x1
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s
index 2ed1c0ff42..9b49555a4d 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s
@@ -633,20 +633,20 @@ L$sqr8x_enter:
- leaq -64(%rsp,%r9,4),%r11
+ leaq -64(%rsp,%r9,2),%r11
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$sqr8x_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,4),%rsp
+ leaq -64(%rsp,%r9,2),%rsp
jmp L$sqr8x_sp_done
.p2align 5
L$sqr8x_sp_alt:
- leaq 4096-64(,%r9,4),%r10
- leaq -64(%rsp,%r9,4),%rsp
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -656,58 +656,80 @@ L$sqr8x_sp_done:
movq %r9,%r10
negq %r9
- leaq 64(%rsp,%r9,2),%r11
movq %r8,32(%rsp)
movq %rax,40(%rsp)
L$sqr8x_body:
- movq %r9,%rbp
-.byte 102,73,15,110,211
- shrq $3+2,%rbp
- movl _OPENSSL_ia32cap_P+8(%rip),%eax
- jmp L$sqr8x_copy_n
-
-.p2align 5
-L$sqr8x_copy_n:
- movq 0(%rcx),%xmm0
- movq 8(%rcx),%xmm1
- movq 16(%rcx),%xmm3
- movq 24(%rcx),%xmm4
- leaq 32(%rcx),%rcx
- movdqa %xmm0,0(%r11)
- movdqa %xmm1,16(%r11)
- movdqa %xmm3,32(%r11)
- movdqa %xmm4,48(%r11)
- leaq 64(%r11),%r11
- decq %rbp
- jnz L$sqr8x_copy_n
-
+.byte 102,72,15,110,209
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
call _bn_sqr8x_internal
+
+
+
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+ movq %r9,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp L$sqr8x_sub
+
+.p2align 5
+L$sqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz L$sqr8x_sub
+
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+
+.byte 102,72,15,110,200
pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- leaq 64(%rsp,%r9,2),%rdx
- shrq $3+2,%r9
+ pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
- jmp L$sqr8x_zero
+ jmp L$sqr8x_cond_copy
.p2align 5
-L$sqr8x_zero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- movdqa %xmm0,32(%rax)
- movdqa %xmm0,48(%rax)
- leaq 64(%rax),%rax
- movdqa %xmm0,0(%rdx)
- movdqa %xmm0,16(%rdx)
- movdqa %xmm0,32(%rdx)
- movdqa %xmm0,48(%rdx)
- leaq 64(%rdx),%rdx
- decq %r9
- jnz L$sqr8x_zero
+L$sqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz L$sqr8x_cond_copy
movq $1,%rax
movq -48(%rsi),%r15
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s
index bcd5140eba..c9731e162d 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s
@@ -14,46 +14,151 @@ _bn_mul_mont_gather5:
L$mul_enter:
movl %r9d,%r9d
movq %rsp,%rax
- movl 8(%rsp),%r10d
+ movd 8(%rsp),%xmm5
+ leaq L$inc(%rip),%r10
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+
leaq 2(%r9),%r11
negq %r11
- leaq (%rsp,%r11,8),%rsp
+ leaq -264(%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
L$mul_body:
- movq %rdx,%r12
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq L$magic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%r12,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
-
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
.byte 102,72,15,126,195
movq (%r8),%r8
@@ -62,29 +167,14 @@ L$mul_body:
xorq %r14,%r14
xorq %r15,%r15
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -117,14 +207,12 @@ L$1st_enter:
cmpq %r9,%r15
jne L$1st
-.byte 102,72,15,126,195
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
movq %r10,%r11
@@ -138,33 +226,78 @@ L$1st_enter:
jmp L$outer
.p2align 4
L$outer:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -200,15 +333,12 @@ L$inner_enter:
cmpq %r9,%r15
jne L$inner
-.byte 102,72,15,126,195
-
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
- movq (%rsp,%r15,8),%r10
+ movq (%rsp,%r9,8),%r10
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
xorq %rdx,%rdx
@@ -255,6 +385,7 @@ L$copy:
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -277,10 +408,10 @@ L$mul4x_enter:
pushq %r13
pushq %r14
pushq %r15
+
.byte 0x67
- movl %r9d,%r10d
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
@@ -290,19 +421,21 @@ L$mul4x_enter:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$mul4xsp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp L$mul4xsp_done
.p2align 5
L$mul4xsp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -318,6 +451,7 @@ L$mul4x_body:
movq 40(%rsp),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -333,47 +467,141 @@ L$mul4x_epilogue:
.p2align 5
mul4x_internal:
shlq $5,%r9
- movl 8(%rax),%r10d
- leaq 256(%rdx,%r9,1),%r13
+ movd 8(%rax),%xmm5
+ leaq L$inc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
shrq $5,%r9
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq L$magic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%rdx,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- addq $7,%r11
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
- andq $7,%r11
-
- movq -96(%r12),%xmm0
- leaq 256(%r12),%r14
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
-.byte 0x67
- por %xmm1,%xmm0
- movq -96(%r14),%xmm1
-.byte 0x67
- pand %xmm7,%xmm3
-.byte 0x67
- por %xmm2,%xmm0
- movq -32(%r14),%xmm2
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq 32(%r14),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
.byte 102,72,15,126,195
- movq 96(%r14),%xmm0
+
movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)
@@ -387,26 +615,10 @@ mul4x_internal:
movq %rax,%r10
movq (%rcx),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq %r10,%rbp
-
-
-
-
-
-
-
- leaq 64+8(%rsp,%r11,8),%r14
+ leaq 64+8(%rsp),%r14
movq %rdx,%r11
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- leaq 512(%r12),%r12
- por %xmm1,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi,%r9,1),%rax
@@ -415,7 +627,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -425,7 +637,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -435,7 +647,7 @@ mul4x_internal:
L$1st4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -451,7 +663,7 @@ L$1st4x:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -481,7 +693,7 @@ L$1st4x:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -490,7 +702,7 @@ L$1st4x:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -500,7 +712,7 @@ L$1st4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -516,7 +728,7 @@ L$1st4x:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -529,8 +741,7 @@ L$1st4x:
movq %rdi,-16(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -541,6 +752,63 @@ L$1st4x:
.p2align 5
L$outer4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
movq (%r14,%r9,1),%r10
movq %r8,%rbp
mulq %rbx
@@ -548,25 +816,11 @@ L$outer4x:
movq (%rcx),%rax
adcq $0,%rdx
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
-
imulq %r10,%rbp
-.byte 0x67
movq %rdx,%r11
movq %rdi,(%r14)
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
leaq (%r14,%r9,1),%r14
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
@@ -576,7 +830,7 @@ L$outer4x:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -588,7 +842,7 @@ L$outer4x:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdx,%r13
jmp L$inner4x
@@ -597,7 +851,7 @@ L$outer4x:
L$inner4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -615,7 +869,7 @@ L$inner4x:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -649,7 +903,7 @@ L$inner4x:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -660,7 +914,7 @@ L$inner4x:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %r13,-8(%r14)
movq %rdx,%r13
@@ -670,7 +924,7 @@ L$inner4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -689,7 +943,7 @@ L$inner4x:
mulq %rbx
addq %rax,%r11
movq %rbp,%rax
- movq -16(%rcx),%rbp
+ movq -8(%rcx),%rbp
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -704,9 +958,8 @@ L$inner4x:
movq %r13,-24(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
movq %rdi,-16(%r14)
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -717,16 +970,23 @@ L$inner4x:
cmpq 16+8(%rsp),%r12
jb L$outer4x
+ xorq %rax,%rax
subq %r13,%rbp
adcq %r15,%r15
orq %r15,%rdi
- xorq $1,%rdi
+ subq %rdi,%rax
leaq (%r14,%r9,1),%rbx
- leaq (%rcx,%rdi,8),%rbp
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
movq %r9,%rcx
sarq $3+2,%rcx
movq 56+8(%rsp),%rdi
- jmp L$sqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp L$sqr4x_sub_entry
.globl _bn_power5
@@ -739,9 +999,9 @@ _bn_power5:
pushq %r13
pushq %r14
pushq %r15
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leal (%r9,%r9,2),%r10d
negq %r9
movq (%r8),%r8
@@ -751,19 +1011,20 @@ _bn_power5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$pwr_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp L$pwr_sp_done
.p2align 5
L$pwr_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -791,10 +1052,15 @@ L$power5_body:
.byte 102,72,15,110,226
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
.byte 102,72,15,126,209
.byte 102,72,15,126,226
@@ -1338,9 +1604,9 @@ L$sqr4x_shift_n_add:
movq %rbx,-16(%rdi)
movq %r8,-8(%rdi)
.byte 102,72,15,126,213
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xorq %rax,%rax
- leaq (%rbp,%r9,2),%rcx
+ leaq (%r9,%rbp,1),%rcx
leaq 48+8(%rsp,%r9,2),%rdx
movq %rcx,0+8(%rsp)
leaq 48+8(%rsp,%r9,1),%rdi
@@ -1373,14 +1639,14 @@ L$8x_reduction_loop:
.p2align 5
L$8x_reduce:
mulq %rbx
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
negq %r8
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rbx,48-8+8(%rsp,%rcx,8)
@@ -1389,7 +1655,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq 32+8(%rsp),%rsi
@@ -1398,7 +1664,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi
addq %r11,%r10
@@ -1407,7 +1673,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1415,7 +1681,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1423,7 +1689,7 @@ L$8x_reduce:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1441,7 +1707,7 @@ L$8x_reduce:
decl %ecx
jnz L$8x_reduce
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
xorq %rax,%rax
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
@@ -1467,14 +1733,14 @@ L$8x_reduce:
L$8x_tail:
mulq %rbx
addq %rax,%r8
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
leaq 8(%rdi),%rdi
@@ -1483,7 +1749,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
@@ -1491,7 +1757,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
@@ -1499,7 +1765,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1507,7 +1773,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1515,7 +1781,7 @@ L$8x_tail:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1533,7 +1799,7 @@ L$8x_tail:
decl %ecx
jnz L$8x_tail
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
jae L$8x_tail_done
@@ -1579,7 +1845,7 @@ L$8x_no_tail:
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax
- movq -16(%rbp),%rcx
+ movq -8(%rbp),%rcx
xorq %rsi,%rsi
.byte 102,72,15,126,213
@@ -1597,40 +1863,58 @@ L$8x_no_tail:
cmpq %rdx,%rdi
jb L$8x_reduction_loop
+ .byte 0xf3,0xc3
+
- subq %r15,%rcx
+.p2align 5
+__bn_post4x_internal:
+ movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx
- adcq %rsi,%rsi
movq %r9,%rcx
- orq %rsi,%rax
.byte 102,72,15,126,207
- xorq $1,%rax
+ negq %rax
.byte 102,72,15,126,206
- leaq (%rbp,%rax,8),%rbp
sarq $3+2,%rcx
- jmp L$sqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp L$sqr4x_sub_entry
-.p2align 5
+.p2align 4
L$sqr4x_sub:
-.byte 0x66
- movq 0(%rbx),%r12
- movq 8(%rbx),%r13
- sbbq 0(%rbp),%r12
- movq 16(%rbx),%r14
- sbbq 16(%rbp),%r13
- movq 24(%rbx),%r15
- leaq 32(%rbx),%rbx
- sbbq 32(%rbp),%r14
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+L$sqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
movq %r12,0(%rdi)
- sbbq 48(%rbp),%r15
- leaq 64(%rbp),%rbp
+ leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
+ sbbq %r10,%r10
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi
incq %rcx
jnz L$sqr4x_sub
+
movq %r9,%r10
negq %r9
.byte 0xf3,0xc3
@@ -1656,10 +1940,9 @@ bn_from_mont8x:
pushq %r13
pushq %r14
pushq %r15
-.byte 0x67
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
movq (%r8),%r8
@@ -1669,19 +1952,20 @@ bn_from_mont8x:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$from_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp L$from_sp_done
.p2align 5
L$from_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -1732,7 +2016,8 @@ L$mul_by_1:
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
@@ -1799,45 +2084,169 @@ L$scatter_epilogue:
.globl _bn_gather5
-.p2align 4
+.p2align 5
_bn_gather5:
- movl %ecx,%r11d
- shrl $3,%ecx
- andq $7,%r11
- notl %ecx
- leaq L$magic_masks(%rip),%rax
- andl $3,%ecx
- leaq 128(%rdx,%r11,8),%rdx
- movq 0(%rax,%rcx,8),%xmm4
- movq 8(%rax,%rcx,8),%xmm5
- movq 16(%rax,%rcx,8),%xmm6
- movq 24(%rax,%rcx,8),%xmm7
+L$SEH_begin_bn_gather5:
+
+.byte 0x4c,0x8d,0x14,0x24
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq L$inc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
jmp L$gather
-.p2align 4
-L$gather:
- movq -128(%rdx),%xmm0
- movq -64(%rdx),%xmm1
- pand %xmm4,%xmm0
- movq 0(%rdx),%xmm2
- pand %xmm5,%xmm1
- movq 64(%rdx),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-.byte 0x67,0x67
- por %xmm2,%xmm0
- leaq 256(%rdx),%rdx
- por %xmm3,%xmm0
+.p2align 5
+L$gather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subl $1,%esi
jnz L$gather
+
+ leaq (%r10),%rsp
.byte 0xf3,0xc3
L$SEH_end_bn_gather5:
.p2align 6
-L$magic_masks:
-.long 0,0, 0,0, 0,0, -1,-1
-.long 0,0, 0,0, 0,0, 0,0
+L$inc:
+.long 0,0, 1,1
+.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s
index 0a3145ad4b..8025d088fd 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s
@@ -1624,7 +1624,7 @@ L$cbc_prologue:
leaq -64-63(%rcx),%r10
subq %rsp,%r10
negq %r10
- andq $960,%r10
+ andq $0x3C0,%r10
subq %r10,%rsp
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s
index a63b602b9b..30456b900f 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s
@@ -1121,6 +1121,7 @@ _ecp_nistz256_point_double:
pushq %r15
subq $160+8,%rsp
+L$point_double_shortcutq:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
movdqu 16(%rsi),%xmm1
@@ -1341,7 +1342,7 @@ _ecp_nistz256_point_add:
por %xmm1,%xmm3
movdqu 0(%rsi),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
por %xmm3,%xmm5
@@ -1351,7 +1352,7 @@ _ecp_nistz256_point_add:
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
movdqa %xmm0,480(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,480+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1371,10 +1372,10 @@ _ecp_nistz256_point_add:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
por %xmm3,%xmm4
pxor %xmm3,%xmm3
pcmpeqd %xmm3,%xmm4
@@ -1383,6 +1384,7 @@ _ecp_nistz256_point_add:
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -1474,7 +1476,7 @@ _ecp_nistz256_point_add:
testq %r8,%r8
jnz L$add_proceedq
testq %r9,%r9
- jz L$add_proceedq
+ jz L$add_doubleq
.byte 102,72,15,126,199
pxor %xmm0,%xmm0
@@ -1487,6 +1489,13 @@ _ecp_nistz256_point_add:
jmp L$add_doneq
.p2align 5
+L$add_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+ jmp L$point_double_shortcutq
+
+.p2align 5
L$add_proceedq:
movq 0+64(%rsp),%rax
movq 8+64(%rsp),%r14
@@ -1733,13 +1742,13 @@ _ecp_nistz256_point_add_affine:
por %xmm1,%xmm3
movdqu 0(%rbx),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rbx),%xmm1
movdqu 32(%rbx),%xmm2
por %xmm3,%xmm5
movdqu 48(%rbx),%xmm3
movdqa %xmm0,416(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1755,13 +1764,13 @@ _ecp_nistz256_point_add_affine:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
movq 0(%rbx),%rax
movq %r12,%r9
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
movq %r13,%r10
por %xmm3,%xmm4
pxor %xmm3,%xmm3
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s
index f21b3013c5..77fddf934a 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s
@@ -20,14 +20,14 @@ L$gmult_prologue:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp L$oop1
.p2align 4
L$oop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -43,13 +43,13 @@ L$oop1:
js L$break1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -58,19 +58,19 @@ L$oop1:
.p2align 4
L$break1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -661,10 +661,10 @@ L$ghash_epilogue:
_gcm_init_clmul:
L$_init_clmul:
movdqu (%rsi),%xmm2
- pshufd $78,%xmm2,%xmm2
+ pshufd $0b01001110,%xmm2,%xmm2
- pshufd $255,%xmm2,%xmm4
+ pshufd $0b11111111,%xmm2,%xmm4
movdqa %xmm2,%xmm3
psllq $1,%xmm2
pxor %xmm5,%xmm5
@@ -678,11 +678,11 @@ L$_init_clmul:
pxor %xmm5,%xmm2
- pshufd $78,%xmm2,%xmm6
+ pshufd $0b01001110,%xmm2,%xmm6
movdqa %xmm2,%xmm0
pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -718,8 +718,8 @@ L$_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm2,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm2,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm2,%xmm3
movdqu %xmm2,0(%rdi)
pxor %xmm0,%xmm4
@@ -727,7 +727,7 @@ L$_init_clmul:
.byte 102,15,58,15,227,8
movdqu %xmm4,32(%rdi)
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -765,7 +765,7 @@ L$_init_clmul:
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm5
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -801,8 +801,8 @@ L$_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm5,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm5,%xmm3
movdqu %xmm5,48(%rdi)
pxor %xmm0,%xmm4
@@ -822,7 +822,7 @@ L$_gmult_clmul:
movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -874,20 +874,20 @@ L$_ghash_clmul:
movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz L$odd_tail
movdqu 16(%rsi),%xmm6
movl _OPENSSL_ia32cap_P+4(%rip),%eax
- cmpq $48,%rcx
+ cmpq $0x30,%rcx
jb L$skip4x
andl $71303168,%eax
cmpl $4194304,%eax
je L$skip4x
- subq $48,%rcx
- movq $11547335547999543296,%rax
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
@@ -899,14 +899,14 @@ L$_ghash_clmul:
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
@@ -921,12 +921,12 @@ L$_ghash_clmul:
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
@@ -934,7 +934,7 @@ L$_ghash_clmul:
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc L$tail4x
jmp L$mod4_loop
@@ -949,14 +949,14 @@ L$mod4_loop:
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rsi),%xmm7
xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm0,%xmm8
movdqa %xmm3,%xmm5
@@ -1000,7 +1000,7 @@ L$mod4_loop:
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
@@ -1010,14 +1010,14 @@ L$mod4_loop:
movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc L$mod4_loop
L$tail4x:
@@ -1061,10 +1061,10 @@ L$tail4x:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz L$done
movdqu 32(%rsi),%xmm7
- subq $16,%rcx
+ subq $0x10,%rcx
jz L$odd_tail
L$skip4x:
@@ -1079,7 +1079,7 @@ L$skip4x:
pxor %xmm8,%xmm0
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
@@ -1087,7 +1087,7 @@ L$skip4x:
leaq 32(%rdx),%rdx
nop
- subq $32,%rcx
+ subq $0x20,%rcx
jbe L$even_tail
nop
jmp L$mod_loop
@@ -1096,7 +1096,7 @@ L$skip4x:
L$mod_loop:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1134,7 +1134,7 @@ L$mod_loop:
pslldq $8,%xmm0
psrldq $8,%xmm8
pxor %xmm9,%xmm0
- pshufd $78,%xmm5,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm4
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
@@ -1150,13 +1150,13 @@ L$mod_loop:
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- subq $32,%rcx
+ subq $0x20,%rcx
ja L$mod_loop
L$even_tail:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1204,7 +1204,7 @@ L$odd_tail:
.byte 102,69,15,56,0,194
pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s
index 010924530b..a0de51655d 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s
@@ -2599,10 +2599,10 @@ L$oop_grande_shaext:
punpcklqdq %xmm5,%xmm0
punpckhqdq %xmm5,%xmm8
- pshufd $63,%xmm7,%xmm1
- pshufd $127,%xmm7,%xmm9
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00111111,%xmm7,%xmm1
+ pshufd $0b01111111,%xmm7,%xmm9
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
jmp L$oop_shaext
.p2align 5
@@ -2857,8 +2857,8 @@ L$oop_shaext:
.byte 69,15,58,204,193,3
.byte 69,15,56,200,214
- pshufd $0,%xmm6,%xmm11
- pshufd $85,%xmm6,%xmm12
+ pshufd $0x00,%xmm6,%xmm11
+ pshufd $0x55,%xmm6,%xmm12
movdqa %xmm6,%xmm7
pcmpgtd %xmm4,%xmm11
pcmpgtd %xmm4,%xmm12
@@ -2888,8 +2888,8 @@ L$oop_shaext:
movl 280(%rsp),%edx
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
movdqa %xmm0,%xmm6
punpckldq %xmm8,%xmm0
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s
index 671034cdaf..798ca0dc4d 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s
@@ -1240,9 +1240,9 @@ _shaext_shortcut:
movdqa K_XX_XX+160(%rip),%xmm3
movdqu (%rsi),%xmm4
- pshufd $27,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm0,%xmm0
movdqu 16(%rsi),%xmm5
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu 32(%rsi),%xmm6
.byte 102,15,56,0,227
movdqu 48(%rsi),%xmm7
@@ -1392,8 +1392,8 @@ L$oop_shaext:
jnz L$oop_shaext
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu %xmm0,(%rdi)
movd %xmm1,16(%rdi)
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s
index 5ad4c7bb10..276322bec2 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s
@@ -2677,10 +2677,10 @@ L$oop_grande_shaext:
punpckhqdq %xmm8,%xmm14
punpckhqdq %xmm10,%xmm15
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
jmp L$oop_shaext
.p2align 5
@@ -2712,11 +2712,11 @@ L$oop_shaext:
movdqa %xmm2,%xmm0
movdqa %xmm15,112(%rsp)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pxor %xmm12,%xmm4
movdqa %xmm12,64(%rsp)
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pxor %xmm14,%xmm8
movdqa %xmm14,96(%rsp)
movdqa 16-128(%rbp),%xmm1
@@ -2734,11 +2734,11 @@ L$oop_shaext:
.byte 102,68,15,56,0,211
prefetcht0 127(%r9)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,68,15,56,0,219
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 32-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2751,14 +2751,14 @@ L$oop_shaext:
movdqa %xmm2,%xmm0
movdqa %xmm7,%xmm3
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,15,58,15,222,4
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 48-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2775,13 +2775,13 @@ L$oop_shaext:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 64-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2797,13 +2797,13 @@ L$oop_shaext:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 80-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2819,13 +2819,13 @@ L$oop_shaext:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 96-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2841,13 +2841,13 @@ L$oop_shaext:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 112-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2863,13 +2863,13 @@ L$oop_shaext:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 128-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2885,13 +2885,13 @@ L$oop_shaext:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 144-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2907,13 +2907,13 @@ L$oop_shaext:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 160-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2929,13 +2929,13 @@ L$oop_shaext:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 176-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2951,13 +2951,13 @@ L$oop_shaext:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 192-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2973,13 +2973,13 @@ L$oop_shaext:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 208-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2995,13 +2995,13 @@ L$oop_shaext:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 224-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -3018,13 +3018,13 @@ L$oop_shaext:
pxor %xmm6,%xmm6
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
movdqa 240-128(%rbp),%xmm1
paddd %xmm7,%xmm1
movq (%rbx),%xmm7
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 240-128(%rbp),%xmm2
paddd %xmm11,%xmm2
.byte 69,15,56,203,247
@@ -3034,17 +3034,17 @@ L$oop_shaext:
cmovgeq %rsp,%r8
cmpl 4(%rbx),%ecx
cmovgeq %rsp,%r9
- pshufd $0,%xmm7,%xmm9
+ pshufd $0x00,%xmm7,%xmm9
.byte 69,15,56,203,236
movdqa %xmm2,%xmm0
- pshufd $85,%xmm7,%xmm10
+ pshufd $0x55,%xmm7,%xmm10
movdqa %xmm7,%xmm11
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pcmpgtd %xmm6,%xmm9
pcmpgtd %xmm6,%xmm10
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pcmpgtd %xmm6,%xmm11
movdqa K256_shaext-16(%rip),%xmm3
.byte 69,15,56,203,247
@@ -3066,10 +3066,10 @@ L$oop_shaext:
movl 280(%rsp),%edx
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
movdqa %xmm12,%xmm5
movdqa %xmm13,%xmm6
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s
index aa507cada6..5566d58761 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s
@@ -1754,9 +1754,9 @@ _shaext_shortcut:
movdqu 16(%rdi),%xmm2
movdqa 512-128(%rcx),%xmm7
- pshufd $27,%xmm1,%xmm0
- pshufd $177,%xmm1,%xmm1
- pshufd $27,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
movdqa %xmm7,%xmm8
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
@@ -1775,7 +1775,7 @@ L$oop_shaext:
.byte 102,15,56,0,231
movdqa %xmm2,%xmm10
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
nop
movdqa %xmm1,%xmm9
.byte 15,56,203,202
@@ -1784,7 +1784,7 @@ L$oop_shaext:
paddd %xmm4,%xmm0
.byte 102,15,56,0,239
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
leaq 64(%rsi),%rsi
.byte 15,56,204,220
.byte 15,56,203,202
@@ -1793,7 +1793,7 @@ L$oop_shaext:
paddd %xmm5,%xmm0
.byte 102,15,56,0,247
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1805,7 +1805,7 @@ L$oop_shaext:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1816,7 +1816,7 @@ L$oop_shaext:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1827,7 +1827,7 @@ L$oop_shaext:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1838,7 +1838,7 @@ L$oop_shaext:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1849,7 +1849,7 @@ L$oop_shaext:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1860,7 +1860,7 @@ L$oop_shaext:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1871,7 +1871,7 @@ L$oop_shaext:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1882,7 +1882,7 @@ L$oop_shaext:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1893,7 +1893,7 @@ L$oop_shaext:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1904,7 +1904,7 @@ L$oop_shaext:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1915,7 +1915,7 @@ L$oop_shaext:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
.byte 15,56,203,202
@@ -1924,7 +1924,7 @@ L$oop_shaext:
movdqa 448-128(%rcx),%xmm0
paddd %xmm5,%xmm0
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
.byte 15,56,205,245
movdqa %xmm8,%xmm7
.byte 15,56,203,202
@@ -1933,7 +1933,7 @@ L$oop_shaext:
paddd %xmm6,%xmm0
nop
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
decq %rdx
nop
.byte 15,56,203,202
@@ -1942,9 +1942,9 @@ L$oop_shaext:
paddd %xmm9,%xmm1
jnz L$oop_shaext
- pshufd $177,%xmm2,%xmm2
- pshufd $27,%xmm1,%xmm7
- pshufd $177,%xmm1,%xmm1
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
punpckhqdq %xmm2,%xmm1
.byte 102,15,58,15,215,8
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s b/deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s
index 5d69baad8f..ef623d5967 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s
@@ -45,43 +45,43 @@ _OPENSSL_ia32_cpuid:
movl %eax,%r11d
xorl %eax,%eax
- cmpl $1970169159,%ebx
+ cmpl $0x756e6547,%ebx
setne %al
movl %eax,%r9d
- cmpl $1231384169,%edx
+ cmpl $0x49656e69,%edx
setne %al
orl %eax,%r9d
- cmpl $1818588270,%ecx
+ cmpl $0x6c65746e,%ecx
setne %al
orl %eax,%r9d
jz L$intel
- cmpl $1752462657,%ebx
+ cmpl $0x68747541,%ebx
setne %al
movl %eax,%r10d
- cmpl $1769238117,%edx
+ cmpl $0x69746E65,%edx
setne %al
orl %eax,%r10d
- cmpl $1145913699,%ecx
+ cmpl $0x444D4163,%ecx
setne %al
orl %eax,%r10d
jnz L$intel
- movl $2147483648,%eax
+ movl $0x80000000,%eax
cpuid
- cmpl $2147483649,%eax
+ cmpl $0x80000001,%eax
jb L$intel
movl %eax,%r10d
- movl $2147483649,%eax
+ movl $0x80000001,%eax
cpuid
orl %ecx,%r9d
- andl $2049,%r9d
+ andl $0x00000801,%r9d
- cmpl $2147483656,%r10d
+ cmpl $0x80000008,%r10d
jb L$intel
- movl $2147483656,%eax
+ movl $0x80000008,%eax
cpuid
movzbq %cl,%r10
incq %r10
@@ -93,7 +93,7 @@ _OPENSSL_ia32_cpuid:
shrl $16,%ebx
cmpb %r10b,%bl
ja L$generic
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
jmp L$generic
L$intel:
@@ -106,7 +106,7 @@ L$intel:
cpuid
movl %eax,%r10d
shrl $14,%r10d
- andl $4095,%r10d
+ andl $0xfff,%r10d
cmpl $7,%r11d
jb L$nocacheinfo
@@ -119,29 +119,29 @@ L$intel:
L$nocacheinfo:
movl $1,%eax
cpuid
- andl $3220176895,%edx
+ andl $0xbfefffff,%edx
cmpl $0,%r9d
jne L$notintel
- orl $1073741824,%edx
+ orl $0x40000000,%edx
andb $15,%ah
cmpb $15,%ah
jne L$notintel
- orl $1048576,%edx
+ orl $0x00100000,%edx
L$notintel:
btl $28,%edx
jnc L$generic
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
cmpl $0,%r10d
je L$generic
- orl $268435456,%edx
+ orl $0x10000000,%edx
shrl $16,%ebx
cmpb $1,%bl
ja L$generic
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
L$generic:
- andl $2048,%r9d
- andl $4294965247,%ecx
+ andl $0x00000800,%r9d
+ andl $0xfffff7ff,%ecx
orl %ecx,%r9d
movl %edx,%r10d
@@ -153,9 +153,9 @@ L$generic:
cmpl $6,%eax
je L$done
L$clear_avx:
- movl $4026525695,%eax
+ movl $0xefffe7ff,%eax
andl %eax,%r9d
- andl $4294967263,8(%rdi)
+ andl $0xffffffdf,8(%rdi)
L$done:
shlq $32,%r9
movl %r10d,%eax
diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm b/deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm
index 86e828d3dc..89114311a5 100644
--- a/deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm
+++ b/deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm
@@ -502,48 +502,104 @@ $L$SEH_begin_rsaz_512_mul_gather4::
push r14
push r15
- mov r9d,r9d
- sub rsp,128+24
+ sub rsp,328
+ movaps XMMWORD PTR[160+rsp],xmm6
+ movaps XMMWORD PTR[176+rsp],xmm7
+ movaps XMMWORD PTR[192+rsp],xmm8
+ movaps XMMWORD PTR[208+rsp],xmm9
+ movaps XMMWORD PTR[224+rsp],xmm10
+ movaps XMMWORD PTR[240+rsp],xmm11
+ movaps XMMWORD PTR[256+rsp],xmm12
+ movaps XMMWORD PTR[272+rsp],xmm13
+ movaps XMMWORD PTR[288+rsp],xmm14
+ movaps XMMWORD PTR[304+rsp],xmm15
$L$mul_gather4_body::
- mov eax,DWORD PTR[64+r9*4+rdx]
-DB 102,72,15,110,199
- mov ebx,DWORD PTR[r9*4+rdx]
-DB 102,72,15,110,201
+ movd xmm8,r9d
+ movdqa xmm1,XMMWORD PTR[(($L$inc+16))]
+ movdqa xmm0,XMMWORD PTR[$L$inc]
+
+ pshufd xmm8,xmm8,0
+ movdqa xmm7,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm8
+ movdqa xmm3,xmm7
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm8
+ movdqa xmm4,xmm7
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm8
+ movdqa xmm5,xmm7
+ paddd xmm4,xmm3
+ pcmpeqd xmm3,xmm8
+ movdqa xmm6,xmm7
+ paddd xmm5,xmm4
+ pcmpeqd xmm4,xmm8
+ paddd xmm6,xmm5
+ pcmpeqd xmm5,xmm8
+ paddd xmm7,xmm6
+ pcmpeqd xmm6,xmm8
+ pcmpeqd xmm7,xmm8
+
+ movdqa xmm8,XMMWORD PTR[rdx]
+ movdqa xmm9,XMMWORD PTR[16+rdx]
+ movdqa xmm10,XMMWORD PTR[32+rdx]
+ movdqa xmm11,XMMWORD PTR[48+rdx]
+ pand xmm8,xmm0
+ movdqa xmm12,XMMWORD PTR[64+rdx]
+ pand xmm9,xmm1
+ movdqa xmm13,XMMWORD PTR[80+rdx]
+ pand xmm10,xmm2
+ movdqa xmm14,XMMWORD PTR[96+rdx]
+ pand xmm11,xmm3
+ movdqa xmm15,XMMWORD PTR[112+rdx]
+ lea rbp,QWORD PTR[128+rdx]
+ pand xmm12,xmm4
+ pand xmm13,xmm5
+ pand xmm14,xmm6
+ pand xmm15,xmm7
+ por xmm8,xmm10
+ por xmm9,xmm11
+ por xmm8,xmm12
+ por xmm9,xmm13
+ por xmm8,xmm14
+ por xmm9,xmm15
+
+ por xmm8,xmm9
+ pshufd xmm9,xmm8,04eh
+ por xmm8,xmm9
+DB 102,76,15,126,195
+
mov QWORD PTR[128+rsp],r8
+ mov QWORD PTR[((128+8))+rsp],rdi
+ mov QWORD PTR[((128+16))+rsp],rcx
- shl rax,32
- or rbx,rax
mov rax,QWORD PTR[rsi]
mov rcx,QWORD PTR[8+rsi]
- lea rbp,QWORD PTR[128+r9*4+rdx]
mul rbx
mov QWORD PTR[rsp],rax
mov rax,rcx
mov r8,rdx
mul rbx
- movd xmm4,DWORD PTR[rbp]
add r8,rax
mov rax,QWORD PTR[16+rsi]
mov r9,rdx
adc r9,0
mul rbx
- movd xmm5,DWORD PTR[64+rbp]
add r9,rax
mov rax,QWORD PTR[24+rsi]
mov r10,rdx
adc r10,0
mul rbx
- pslldq xmm5,4
add r10,rax
mov rax,QWORD PTR[32+rsi]
mov r11,rdx
adc r11,0
mul rbx
- por xmm4,xmm5
add r11,rax
mov rax,QWORD PTR[40+rsi]
mov r12,rdx
@@ -556,14 +612,12 @@ DB 102,72,15,110,201
adc r13,0
mul rbx
- lea rbp,QWORD PTR[128+rbp]
add r13,rax
mov rax,QWORD PTR[56+rsi]
mov r14,rdx
adc r14,0
mul rbx
-DB 102,72,15,126,227
add r14,rax
mov rax,QWORD PTR[rsi]
mov r15,rdx
@@ -575,6 +629,35 @@ DB 102,72,15,126,227
ALIGN 32
$L$oop_mul_gather::
+ movdqa xmm8,XMMWORD PTR[rbp]
+ movdqa xmm9,XMMWORD PTR[16+rbp]
+ movdqa xmm10,XMMWORD PTR[32+rbp]
+ movdqa xmm11,XMMWORD PTR[48+rbp]
+ pand xmm8,xmm0
+ movdqa xmm12,XMMWORD PTR[64+rbp]
+ pand xmm9,xmm1
+ movdqa xmm13,XMMWORD PTR[80+rbp]
+ pand xmm10,xmm2
+ movdqa xmm14,XMMWORD PTR[96+rbp]
+ pand xmm11,xmm3
+ movdqa xmm15,XMMWORD PTR[112+rbp]
+ lea rbp,QWORD PTR[128+rbp]
+ pand xmm12,xmm4
+ pand xmm13,xmm5
+ pand xmm14,xmm6
+ pand xmm15,xmm7
+ por xmm8,xmm10
+ por xmm9,xmm11
+ por xmm8,xmm12
+ por xmm9,xmm13
+ por xmm8,xmm14
+ por xmm9,xmm15
+
+ por xmm8,xmm9
+ pshufd xmm9,xmm8,04eh
+ por xmm8,xmm9
+DB 102,76,15,126,195
+
mul rbx
add r8,rax
mov rax,QWORD PTR[8+rsi]
@@ -583,7 +666,6 @@ $L$oop_mul_gather::
adc r8,0
mul rbx
- movd xmm4,DWORD PTR[rbp]
add r9,rax
mov rax,QWORD PTR[16+rsi]
adc rdx,0
@@ -592,7 +674,6 @@ $L$oop_mul_gather::
adc r9,0
mul rbx
- movd xmm5,DWORD PTR[64+rbp]
add r10,rax
mov rax,QWORD PTR[24+rsi]
adc rdx,0
@@ -601,7 +682,6 @@ $L$oop_mul_gather::
adc r10,0
mul rbx
- pslldq xmm5,4
add r11,rax
mov rax,QWORD PTR[32+rsi]
adc rdx,0
@@ -610,7 +690,6 @@ $L$oop_mul_gather::
adc r11,0
mul rbx
- por xmm4,xmm5
add r12,rax
mov rax,QWORD PTR[40+rsi]
adc rdx,0
@@ -635,7 +714,6 @@ $L$oop_mul_gather::
adc r14,0
mul rbx
-DB 102,72,15,126,227
add r15,rax
mov rax,QWORD PTR[rsi]
adc rdx,0
@@ -643,7 +721,6 @@ DB 102,72,15,126,227
mov r15,rdx
adc r15,0
- lea rbp,QWORD PTR[128+rbp]
lea rdi,QWORD PTR[8+rdi]
dec ecx
@@ -658,8 +735,8 @@ DB 102,72,15,126,227
mov QWORD PTR[48+rdi],r14
mov QWORD PTR[56+rdi],r15
-DB 102,72,15,126,199
-DB 102,72,15,126,205
+ mov rdi,QWORD PTR[((128+8))+rsp]
+ mov rbp,QWORD PTR[((128+16))+rsp]
mov r8,QWORD PTR[rsp]
mov r9,QWORD PTR[8+rsp]
@@ -684,6 +761,17 @@ DB 102,72,15,126,205
call __rsaz_512_subtract
lea rax,QWORD PTR[((128+24+48))+rsp]
+ movaps xmm6,XMMWORD PTR[((160-200))+rax]
+ movaps xmm7,XMMWORD PTR[((176-200))+rax]
+ movaps xmm8,XMMWORD PTR[((192-200))+rax]
+ movaps xmm9,XMMWORD PTR[((208-200))+rax]
+ movaps xmm10,XMMWORD PTR[((224-200))+rax]
+ movaps xmm11,XMMWORD PTR[((240-200))+rax]
+ movaps xmm12,XMMWORD PTR[((256-200))+rax]
+ movaps xmm13,XMMWORD PTR[((272-200))+rax]
+ movaps xmm14,XMMWORD PTR[((288-200))+rax]
+ movaps xmm15,XMMWORD PTR[((304-200))+rax]
+ lea rax,QWORD PTR[176+rax]
mov r15,QWORD PTR[((-48))+rax]
mov r14,QWORD PTR[((-40))+rax]
mov r13,QWORD PTR[((-32))+rax]
@@ -723,7 +811,7 @@ $L$SEH_begin_rsaz_512_mul_scatter4::
mov r9d,r9d
sub rsp,128+24
$L$mul_scatter4_body::
- lea r8,QWORD PTR[r9*4+r8]
+ lea r8,QWORD PTR[r9*8+r8]
DB 102,72,15,110,199
DB 102,72,15,110,202
DB 102,73,15,110,208
@@ -759,30 +847,14 @@ DB 102,72,15,126,214
call __rsaz_512_subtract
- mov DWORD PTR[rsi],r8d
- shr r8,32
- mov DWORD PTR[128+rsi],r9d
- shr r9,32
- mov DWORD PTR[256+rsi],r10d
- shr r10,32
- mov DWORD PTR[384+rsi],r11d
- shr r11,32
- mov DWORD PTR[512+rsi],r12d
- shr r12,32
- mov DWORD PTR[640+rsi],r13d
- shr r13,32
- mov DWORD PTR[768+rsi],r14d
- shr r14,32
- mov DWORD PTR[896+rsi],r15d
- shr r15,32
- mov DWORD PTR[64+rsi],r8d
- mov DWORD PTR[192+rsi],r9d
- mov DWORD PTR[320+rsi],r10d
- mov DWORD PTR[448+rsi],r11d
- mov DWORD PTR[576+rsi],r12d
- mov DWORD PTR[704+rsi],r13d
- mov DWORD PTR[832+rsi],r14d
- mov DWORD PTR[960+rsi],r15d
+ mov QWORD PTR[rsi],r8
+ mov QWORD PTR[128+rsi],r9
+ mov QWORD PTR[256+rsi],r10
+ mov QWORD PTR[384+rsi],r11
+ mov QWORD PTR[512+rsi],r12
+ mov QWORD PTR[640+rsi],r13
+ mov QWORD PTR[768+rsi],r14
+ mov QWORD PTR[896+rsi],r15
lea rax,QWORD PTR[((128+24+48))+rsp]
mov r15,QWORD PTR[((-48))+rax]
@@ -1151,16 +1223,14 @@ PUBLIC rsaz_512_scatter4
ALIGN 16
rsaz_512_scatter4 PROC PUBLIC
- lea rcx,QWORD PTR[r8*4+rcx]
+ lea rcx,QWORD PTR[r8*8+rcx]
mov r9d,8
jmp $L$oop_scatter
ALIGN 16
$L$oop_scatter::
mov rax,QWORD PTR[rdx]
lea rdx,QWORD PTR[8+rdx]
- mov DWORD PTR[rcx],eax
- shr rax,32
- mov DWORD PTR[64+rcx],eax
+ mov QWORD PTR[rcx],rax
lea rcx,QWORD PTR[128+rcx]
dec r9d
jnz $L$oop_scatter
@@ -1171,22 +1241,98 @@ PUBLIC rsaz_512_gather4
ALIGN 16
rsaz_512_gather4 PROC PUBLIC
- lea rdx,QWORD PTR[r8*4+rdx]
+$L$SEH_begin_rsaz_512_gather4::
+DB 048h,081h,0ech,0a8h,000h,000h,000h
+DB 00fh,029h,034h,024h
+DB 00fh,029h,07ch,024h,010h
+DB 044h,00fh,029h,044h,024h,020h
+DB 044h,00fh,029h,04ch,024h,030h
+DB 044h,00fh,029h,054h,024h,040h
+DB 044h,00fh,029h,05ch,024h,050h
+DB 044h,00fh,029h,064h,024h,060h
+DB 044h,00fh,029h,06ch,024h,070h
+DB 044h,00fh,029h,0b4h,024h,080h,0,0,0
+DB 044h,00fh,029h,0bch,024h,090h,0,0,0
+ movd xmm8,r8d
+ movdqa xmm1,XMMWORD PTR[(($L$inc+16))]
+ movdqa xmm0,XMMWORD PTR[$L$inc]
+
+ pshufd xmm8,xmm8,0
+ movdqa xmm7,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm8
+ movdqa xmm3,xmm7
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm8
+ movdqa xmm4,xmm7
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm8
+ movdqa xmm5,xmm7
+ paddd xmm4,xmm3
+ pcmpeqd xmm3,xmm8
+ movdqa xmm6,xmm7
+ paddd xmm5,xmm4
+ pcmpeqd xmm4,xmm8
+ paddd xmm6,xmm5
+ pcmpeqd xmm5,xmm8
+ paddd xmm7,xmm6
+ pcmpeqd xmm6,xmm8
+ pcmpeqd xmm7,xmm8
mov r9d,8
jmp $L$oop_gather
ALIGN 16
$L$oop_gather::
- mov eax,DWORD PTR[rdx]
- mov r8d,DWORD PTR[64+rdx]
+ movdqa xmm8,XMMWORD PTR[rdx]
+ movdqa xmm9,XMMWORD PTR[16+rdx]
+ movdqa xmm10,XMMWORD PTR[32+rdx]
+ movdqa xmm11,XMMWORD PTR[48+rdx]
+ pand xmm8,xmm0
+ movdqa xmm12,XMMWORD PTR[64+rdx]
+ pand xmm9,xmm1
+ movdqa xmm13,XMMWORD PTR[80+rdx]
+ pand xmm10,xmm2
+ movdqa xmm14,XMMWORD PTR[96+rdx]
+ pand xmm11,xmm3
+ movdqa xmm15,XMMWORD PTR[112+rdx]
lea rdx,QWORD PTR[128+rdx]
- shl r8,32
- or rax,r8
- mov QWORD PTR[rcx],rax
+ pand xmm12,xmm4
+ pand xmm13,xmm5
+ pand xmm14,xmm6
+ pand xmm15,xmm7
+ por xmm8,xmm10
+ por xmm9,xmm11
+ por xmm8,xmm12
+ por xmm9,xmm13
+ por xmm8,xmm14
+ por xmm9,xmm15
+
+ por xmm8,xmm9
+ pshufd xmm9,xmm8,04eh
+ por xmm8,xmm9
+ movq QWORD PTR[rcx],xmm8
lea rcx,QWORD PTR[8+rcx]
dec r9d
jnz $L$oop_gather
+ movaps xmm6,XMMWORD PTR[rsp]
+ movaps xmm7,XMMWORD PTR[16+rsp]
+ movaps xmm8,XMMWORD PTR[32+rsp]
+ movaps xmm9,XMMWORD PTR[48+rsp]
+ movaps xmm10,XMMWORD PTR[64+rsp]
+ movaps xmm11,XMMWORD PTR[80+rsp]
+ movaps xmm12,XMMWORD PTR[96+rsp]
+ movaps xmm13,XMMWORD PTR[112+rsp]
+ movaps xmm14,XMMWORD PTR[128+rsp]
+ movaps xmm15,XMMWORD PTR[144+rsp]
+ add rsp,0a8h
DB 0F3h,0C3h ;repret
+$L$SEH_end_rsaz_512_gather4::
rsaz_512_gather4 ENDP
+
+ALIGN 64
+$L$inc::
+ DD 0,0,1,1
+ DD 2,2,2,2
EXTERN __imp_RtlVirtualUnwind:NEAR
ALIGN 16
@@ -1222,6 +1368,18 @@ se_handler PROC PRIVATE
lea rax,QWORD PTR[((128+24+48))+rax]
+ lea rbx,QWORD PTR[$L$mul_gather4_epilogue]
+ cmp rbx,r10
+ jne $L$se_not_in_mul_gather4
+
+ lea rax,QWORD PTR[176+rax]
+
+ lea rsi,QWORD PTR[((-48-168))+rax]
+ lea rdi,QWORD PTR[512+r8]
+ mov ecx,20
+ DD 0a548f3fch
+
+$L$se_not_in_mul_gather4::
mov rbx,QWORD PTR[((-8))+rax]
mov rbp,QWORD PTR[((-16))+rax]
mov r12,QWORD PTR[((-24))+rax]
@@ -1298,6 +1456,10 @@ ALIGN 4
DD imagerel $L$SEH_end_rsaz_512_mul_by_one
DD imagerel $L$SEH_info_rsaz_512_mul_by_one
+ DD imagerel $L$SEH_begin_rsaz_512_gather4
+ DD imagerel $L$SEH_end_rsaz_512_gather4
+ DD imagerel $L$SEH_info_rsaz_512_gather4
+
.pdata ENDS
.xdata SEGMENT READONLY ALIGN(8)
ALIGN 8
@@ -1321,6 +1483,19 @@ $L$SEH_info_rsaz_512_mul_by_one::
DB 9,0,0,0
DD imagerel se_handler
DD imagerel $L$mul_by_one_body,imagerel $L$mul_by_one_epilogue
+$L$SEH_info_rsaz_512_gather4::
+DB 001h,046h,016h,000h
+DB 046h,0f8h,009h,000h
+DB 03dh,0e8h,008h,000h
+DB 034h,0d8h,007h,000h
+DB 02eh,0c8h,006h,000h
+DB 028h,0b8h,005h,000h
+DB 022h,0a8h,004h,000h
+DB 01ch,098h,003h,000h
+DB 016h,088h,002h,000h
+DB 010h,078h,001h,000h
+DB 00bh,068h,000h,000h
+DB 007h,001h,015h,000h
.xdata ENDS
END
diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm
index afec83bd17..e24eb89aee 100644
--- a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm
+++ b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm
@@ -676,20 +676,20 @@ $L$sqr8x_enter::
- lea r11,QWORD PTR[((-64))+r9*4+rsp]
+ lea r11,QWORD PTR[((-64))+r9*2+rsp]
mov r8,QWORD PTR[r8]
sub r11,rsi
and r11,4095
cmp r10,r11
jb $L$sqr8x_sp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*4+rsp]
+ lea rsp,QWORD PTR[((-64))+r9*2+rsp]
jmp $L$sqr8x_sp_done
ALIGN 32
$L$sqr8x_sp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*4]
- lea rsp,QWORD PTR[((-64))+r9*4+rsp]
+ lea r10,QWORD PTR[((4096-64))+r9*2]
+ lea rsp,QWORD PTR[((-64))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -699,58 +699,80 @@ $L$sqr8x_sp_done::
mov r10,r9
neg r9
- lea r11,QWORD PTR[64+r9*2+rsp]
mov QWORD PTR[32+rsp],r8
mov QWORD PTR[40+rsp],rax
$L$sqr8x_body::
- mov rbp,r9
-DB 102,73,15,110,211
- shr rbp,3+2
- mov eax,DWORD PTR[((OPENSSL_ia32cap_P+8))]
- jmp $L$sqr8x_copy_n
-
-ALIGN 32
-$L$sqr8x_copy_n::
- movq xmm0,QWORD PTR[rcx]
- movq xmm1,QWORD PTR[8+rcx]
- movq xmm3,QWORD PTR[16+rcx]
- movq xmm4,QWORD PTR[24+rcx]
- lea rcx,QWORD PTR[32+rcx]
- movdqa XMMWORD PTR[r11],xmm0
- movdqa XMMWORD PTR[16+r11],xmm1
- movdqa XMMWORD PTR[32+r11],xmm3
- movdqa XMMWORD PTR[48+r11],xmm4
- lea r11,QWORD PTR[64+r11]
- dec rbp
- jnz $L$sqr8x_copy_n
-
+DB 102,72,15,110,209
pxor xmm0,xmm0
DB 102,72,15,110,207
DB 102,73,15,110,218
call bn_sqr8x_internal
+
+
+
+ lea rbx,QWORD PTR[r9*1+rdi]
+ mov rcx,r9
+ mov rdx,r9
+DB 102,72,15,126,207
+ sar rcx,3+2
+ jmp $L$sqr8x_sub
+
+ALIGN 32
+$L$sqr8x_sub::
+ mov r12,QWORD PTR[rbx]
+ mov r13,QWORD PTR[8+rbx]
+ mov r14,QWORD PTR[16+rbx]
+ mov r15,QWORD PTR[24+rbx]
+ lea rbx,QWORD PTR[32+rbx]
+ sbb r12,QWORD PTR[rbp]
+ sbb r13,QWORD PTR[8+rbp]
+ sbb r14,QWORD PTR[16+rbp]
+ sbb r15,QWORD PTR[24+rbp]
+ lea rbp,QWORD PTR[32+rbp]
+ mov QWORD PTR[rdi],r12
+ mov QWORD PTR[8+rdi],r13
+ mov QWORD PTR[16+rdi],r14
+ mov QWORD PTR[24+rdi],r15
+ lea rdi,QWORD PTR[32+rdi]
+ inc rcx
+ jnz $L$sqr8x_sub
+
+ sbb rax,0
+ lea rbx,QWORD PTR[r9*1+rbx]
+ lea rdi,QWORD PTR[r9*1+rdi]
+
+DB 102,72,15,110,200
pxor xmm0,xmm0
- lea rax,QWORD PTR[48+rsp]
- lea rdx,QWORD PTR[64+r9*2+rsp]
- shr r9,3+2
+ pshufd xmm1,xmm1,0
mov rsi,QWORD PTR[40+rsp]
- jmp $L$sqr8x_zero
+ jmp $L$sqr8x_cond_copy
ALIGN 32
-$L$sqr8x_zero::
- movdqa XMMWORD PTR[rax],xmm0
- movdqa XMMWORD PTR[16+rax],xmm0
- movdqa XMMWORD PTR[32+rax],xmm0
- movdqa XMMWORD PTR[48+rax],xmm0
- lea rax,QWORD PTR[64+rax]
- movdqa XMMWORD PTR[rdx],xmm0
- movdqa XMMWORD PTR[16+rdx],xmm0
- movdqa XMMWORD PTR[32+rdx],xmm0
- movdqa XMMWORD PTR[48+rdx],xmm0
- lea rdx,QWORD PTR[64+rdx]
- dec r9
- jnz $L$sqr8x_zero
+$L$sqr8x_cond_copy::
+ movdqa xmm2,XMMWORD PTR[rbx]
+ movdqa xmm3,XMMWORD PTR[16+rbx]
+ lea rbx,QWORD PTR[32+rbx]
+ movdqu xmm4,XMMWORD PTR[rdi]
+ movdqu xmm5,XMMWORD PTR[16+rdi]
+ lea rdi,QWORD PTR[32+rdi]
+ movdqa XMMWORD PTR[(-32)+rbx],xmm0
+ movdqa XMMWORD PTR[(-16)+rbx],xmm0
+ movdqa XMMWORD PTR[(-32)+rdx*1+rbx],xmm0
+ movdqa XMMWORD PTR[(-16)+rdx*1+rbx],xmm0
+ pcmpeqd xmm0,xmm1
+ pand xmm2,xmm1
+ pand xmm3,xmm1
+ pand xmm4,xmm0
+ pand xmm5,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqu XMMWORD PTR[(-32)+rdi],xmm4
+ movdqu XMMWORD PTR[(-16)+rdi],xmm5
+ add r9,32
+ jnz $L$sqr8x_cond_copy
mov rax,1
mov r15,QWORD PTR[((-48))+rsi]
diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm
index 86acef32ea..503e2d6a03 100644
--- a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm
+++ b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm
@@ -27,49 +27,151 @@ ALIGN 16
$L$mul_enter::
mov r9d,r9d
mov rax,rsp
- mov r10d,DWORD PTR[56+rsp]
+ movd xmm5,DWORD PTR[56+rsp]
+ lea r10,QWORD PTR[$L$inc]
push rbx
push rbp
push r12
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
+
lea r11,QWORD PTR[2+r9]
neg r11
- lea rsp,QWORD PTR[r11*8+rsp]
+ lea rsp,QWORD PTR[((-264))+r11*8+rsp]
and rsp,-1024
mov QWORD PTR[8+r9*8+rsp],rax
$L$mul_body::
- mov r12,rdx
- mov r11,r10
- shr r10,3
- and r11,7
- not r10
- lea rax,QWORD PTR[$L$magic_masks]
- and r10,3
- lea r12,QWORD PTR[96+r11*8+r12]
- movq xmm4,QWORD PTR[r10*8+rax]
- movq xmm5,QWORD PTR[8+r10*8+rax]
- movq xmm6,QWORD PTR[16+r10*8+rax]
- movq xmm7,QWORD PTR[24+r10*8+rax]
-
- movq xmm0,QWORD PTR[((-96))+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[96+r12]
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
+ lea r12,QWORD PTR[128+rdx]
+ movdqa xmm0,XMMWORD PTR[r10]
+ movdqa xmm1,XMMWORD PTR[16+r10]
+ lea r10,QWORD PTR[((24-112))+r9*8+rsp]
+ and r10,-16
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+DB 067h
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[112+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[128+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[144+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[160+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[176+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[192+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[208+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[224+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[240+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[256+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[272+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[288+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[304+r10],xmm0
+
+ paddd xmm3,xmm2
+DB 067h
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[320+r10],xmm1
+
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[336+r10],xmm2
+ pand xmm0,XMMWORD PTR[64+r12]
+
+ pand xmm1,XMMWORD PTR[80+r12]
+ pand xmm2,XMMWORD PTR[96+r12]
+ movdqa XMMWORD PTR[352+r10],xmm3
+ pand xmm3,XMMWORD PTR[112+r12]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-128))+r12]
+ movdqa xmm5,XMMWORD PTR[((-112))+r12]
+ movdqa xmm2,XMMWORD PTR[((-96))+r12]
+ pand xmm4,XMMWORD PTR[112+r10]
+ movdqa xmm3,XMMWORD PTR[((-80))+r12]
+ pand xmm5,XMMWORD PTR[128+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[144+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[160+r10]
por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-64))+r12]
+ movdqa xmm5,XMMWORD PTR[((-48))+r12]
+ movdqa xmm2,XMMWORD PTR[((-32))+r12]
+ pand xmm4,XMMWORD PTR[176+r10]
+ movdqa xmm3,XMMWORD PTR[((-16))+r12]
+ pand xmm5,XMMWORD PTR[192+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[208+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[224+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[r12]
+ movdqa xmm5,XMMWORD PTR[16+r12]
+ movdqa xmm2,XMMWORD PTR[32+r12]
+ pand xmm4,XMMWORD PTR[240+r10]
+ movdqa xmm3,XMMWORD PTR[48+r12]
+ pand xmm5,XMMWORD PTR[256+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[272+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[288+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ por xmm0,xmm1
+ pshufd xmm1,xmm0,04eh
+ por xmm0,xmm1
lea r12,QWORD PTR[256+r12]
- por xmm0,xmm3
-
DB 102,72,15,126,195
mov r8,QWORD PTR[r8]
@@ -78,29 +180,14 @@ DB 102,72,15,126,195
xor r14,r14
xor r15,r15
- movq xmm0,QWORD PTR[((-96))+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
-
mov rbp,r8
mul rbx
mov r10,rax
mov rax,QWORD PTR[rcx]
- movq xmm3,QWORD PTR[96+r12]
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
-
imul rbp,r10
mov r11,rdx
- por xmm0,xmm2
- lea r12,QWORD PTR[256+r12]
- por xmm0,xmm3
-
mul rbp
add r10,rax
mov rax,QWORD PTR[8+rsi]
@@ -133,14 +220,12 @@ $L$1st_enter::
cmp r15,r9
jne $L$1st
-DB 102,72,15,126,195
add r13,rax
- mov rax,QWORD PTR[rsi]
adc rdx,0
add r13,r11
adc rdx,0
- mov QWORD PTR[((-16))+r15*8+rsp],r13
+ mov QWORD PTR[((-16))+r9*8+rsp],r13
mov r13,rdx
mov r11,r10
@@ -154,33 +239,78 @@ DB 102,72,15,126,195
jmp $L$outer
ALIGN 16
$L$outer::
+ lea rdx,QWORD PTR[((24+128))+r9*8+rsp]
+ and rdx,-16
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD PTR[((-128))+r12]
+ movdqa xmm1,XMMWORD PTR[((-112))+r12]
+ movdqa xmm2,XMMWORD PTR[((-96))+r12]
+ movdqa xmm3,XMMWORD PTR[((-80))+r12]
+ pand xmm0,XMMWORD PTR[((-128))+rdx]
+ pand xmm1,XMMWORD PTR[((-112))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-96))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-80))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[((-64))+r12]
+ movdqa xmm1,XMMWORD PTR[((-48))+r12]
+ movdqa xmm2,XMMWORD PTR[((-32))+r12]
+ movdqa xmm3,XMMWORD PTR[((-16))+r12]
+ pand xmm0,XMMWORD PTR[((-64))+rdx]
+ pand xmm1,XMMWORD PTR[((-48))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-32))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-16))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[r12]
+ movdqa xmm1,XMMWORD PTR[16+r12]
+ movdqa xmm2,XMMWORD PTR[32+r12]
+ movdqa xmm3,XMMWORD PTR[48+r12]
+ pand xmm0,XMMWORD PTR[rdx]
+ pand xmm1,XMMWORD PTR[16+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[32+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[48+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[64+r12]
+ movdqa xmm1,XMMWORD PTR[80+r12]
+ movdqa xmm2,XMMWORD PTR[96+r12]
+ movdqa xmm3,XMMWORD PTR[112+r12]
+ pand xmm0,XMMWORD PTR[64+rdx]
+ pand xmm1,XMMWORD PTR[80+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[96+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[112+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+ pshufd xmm0,xmm4,04eh
+ por xmm0,xmm4
+ lea r12,QWORD PTR[256+r12]
+
+ mov rax,QWORD PTR[rsi]
+DB 102,72,15,126,195
+
xor r15,r15
mov rbp,r8
mov r10,QWORD PTR[rsp]
- movq xmm0,QWORD PTR[((-96))+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
-
mul rbx
add r10,rax
mov rax,QWORD PTR[rcx]
adc rdx,0
- movq xmm3,QWORD PTR[96+r12]
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
-
imul rbp,r10
mov r11,rdx
- por xmm0,xmm2
- lea r12,QWORD PTR[256+r12]
- por xmm0,xmm3
-
mul rbp
add r10,rax
mov rax,QWORD PTR[8+rsi]
@@ -216,15 +346,12 @@ $L$inner_enter::
cmp r15,r9
jne $L$inner
-DB 102,72,15,126,195
-
add r13,rax
- mov rax,QWORD PTR[rsi]
adc rdx,0
add r13,r10
- mov r10,QWORD PTR[r15*8+rsp]
+ mov r10,QWORD PTR[r9*8+rsp]
adc rdx,0
- mov QWORD PTR[((-16))+r15*8+rsp],r13
+ mov QWORD PTR[((-16))+r9*8+rsp],r13
mov r13,rdx
xor rdx,rdx
@@ -271,8 +398,7 @@ $L$copy::
mov rsi,QWORD PTR[8+r9*8+rsp]
mov rax,1
- movaps xmm6,XMMWORD PTR[((-88))+rsi]
- movaps xmm7,XMMWORD PTR[((-72))+rsi]
+
mov r15,QWORD PTR[((-48))+rsi]
mov r14,QWORD PTR[((-40))+rsi]
mov r13,QWORD PTR[((-32))+rsi]
@@ -310,13 +436,10 @@ DB 067h
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
+
DB 067h
- mov r10d,r9d
shl r9d,3
- shl r10d,3+2
+ lea r10,QWORD PTR[r9*2+r9]
neg r9
@@ -326,19 +449,21 @@ DB 067h
- lea r11,QWORD PTR[((-64))+r9*2+rsp]
- sub r11,rsi
+
+
+ lea r11,QWORD PTR[((-320))+r9*2+rsp]
+ sub r11,rdi
and r11,4095
cmp r10,r11
jb $L$mul4xsp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
jmp $L$mul4xsp_done
ALIGN 32
$L$mul4xsp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*2]
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea r10,QWORD PTR[((4096-320))+r9*2]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -354,8 +479,7 @@ $L$mul4x_body::
mov rsi,QWORD PTR[40+rsp]
mov rax,1
- movaps xmm6,XMMWORD PTR[((-88))+rsi]
- movaps xmm7,XMMWORD PTR[((-72))+rsi]
+
mov r15,QWORD PTR[((-48))+rsi]
mov r14,QWORD PTR[((-40))+rsi]
mov r13,QWORD PTR[((-32))+rsi]
@@ -374,47 +498,141 @@ bn_mul4x_mont_gather5 ENDP
ALIGN 32
mul4x_internal PROC PRIVATE
shl r9,5
- mov r10d,DWORD PTR[56+rax]
- lea r13,QWORD PTR[256+r9*1+rdx]
+ movd xmm5,DWORD PTR[56+rax]
+ lea rax,QWORD PTR[$L$inc]
+ lea r13,QWORD PTR[128+r9*1+rdx]
shr r9,5
- mov r11,r10
- shr r10,3
- and r11,7
- not r10
- lea rax,QWORD PTR[$L$magic_masks]
- and r10,3
- lea r12,QWORD PTR[96+r11*8+rdx]
- movq xmm4,QWORD PTR[r10*8+rax]
- movq xmm5,QWORD PTR[8+r10*8+rax]
- add r11,7
- movq xmm6,QWORD PTR[16+r10*8+rax]
- movq xmm7,QWORD PTR[24+r10*8+rax]
- and r11,7
-
- movq xmm0,QWORD PTR[((-96))+r12]
- lea r14,QWORD PTR[256+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[96+r12]
- pand xmm2,xmm6
-DB 067h
- por xmm0,xmm1
- movq xmm1,QWORD PTR[((-96))+r14]
-DB 067h
- pand xmm3,xmm7
-DB 067h
- por xmm0,xmm2
- movq xmm2,QWORD PTR[((-32))+r14]
+ movdqa xmm0,XMMWORD PTR[rax]
+ movdqa xmm1,XMMWORD PTR[16+rax]
+ lea r10,QWORD PTR[((88-112))+r9*1+rsp]
+ lea r12,QWORD PTR[128+rdx]
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+DB 067h,067h
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
DB 067h
- pand xmm1,xmm4
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[112+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[128+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[144+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[160+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[176+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[192+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[208+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[224+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[240+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[256+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[272+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[288+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[304+r10],xmm0
+
+ paddd xmm3,xmm2
DB 067h
- por xmm0,xmm3
- movq xmm3,QWORD PTR[32+r14]
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[320+r10],xmm1
+
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[336+r10],xmm2
+ pand xmm0,XMMWORD PTR[64+r12]
+ pand xmm1,XMMWORD PTR[80+r12]
+ pand xmm2,XMMWORD PTR[96+r12]
+ movdqa XMMWORD PTR[352+r10],xmm3
+ pand xmm3,XMMWORD PTR[112+r12]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-128))+r12]
+ movdqa xmm5,XMMWORD PTR[((-112))+r12]
+ movdqa xmm2,XMMWORD PTR[((-96))+r12]
+ pand xmm4,XMMWORD PTR[112+r10]
+ movdqa xmm3,XMMWORD PTR[((-80))+r12]
+ pand xmm5,XMMWORD PTR[128+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[144+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[160+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[((-64))+r12]
+ movdqa xmm5,XMMWORD PTR[((-48))+r12]
+ movdqa xmm2,XMMWORD PTR[((-32))+r12]
+ pand xmm4,XMMWORD PTR[176+r10]
+ movdqa xmm3,XMMWORD PTR[((-16))+r12]
+ pand xmm5,XMMWORD PTR[192+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[208+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[224+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD PTR[r12]
+ movdqa xmm5,XMMWORD PTR[16+r12]
+ movdqa xmm2,XMMWORD PTR[32+r12]
+ pand xmm4,XMMWORD PTR[240+r10]
+ movdqa xmm3,XMMWORD PTR[48+r12]
+ pand xmm5,XMMWORD PTR[256+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD PTR[272+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD PTR[288+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ por xmm0,xmm1
+ pshufd xmm1,xmm0,04eh
+ por xmm0,xmm1
+ lea r12,QWORD PTR[256+r12]
DB 102,72,15,126,195
- movq xmm0,QWORD PTR[96+r14]
+
mov QWORD PTR[((16+8))+rsp],r13
mov QWORD PTR[((56+8))+rsp],rdi
@@ -428,26 +646,10 @@ DB 102,72,15,126,195
mov r10,rax
mov rax,QWORD PTR[rcx]
- pand xmm2,xmm5
- pand xmm3,xmm6
- por xmm1,xmm2
-
imul rbp,r10
-
-
-
-
-
-
-
- lea r14,QWORD PTR[((64+8))+r11*8+rsp]
+ lea r14,QWORD PTR[((64+8))+rsp]
mov r11,rdx
- pand xmm0,xmm7
- por xmm1,xmm3
- lea r12,QWORD PTR[512+r12]
- por xmm0,xmm1
-
mul rbp
add r10,rax
mov rax,QWORD PTR[8+r9*1+rsi]
@@ -456,7 +658,7 @@ DB 102,72,15,126,195
mul rbx
add r11,rax
- mov rax,QWORD PTR[16+rcx]
+ mov rax,QWORD PTR[8+rcx]
adc rdx,0
mov r10,rdx
@@ -466,7 +668,7 @@ DB 102,72,15,126,195
adc rdx,0
add rdi,r11
lea r15,QWORD PTR[32+r9]
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
adc rdx,0
mov QWORD PTR[r14],rdi
mov r13,rdx
@@ -476,7 +678,7 @@ ALIGN 32
$L$1st4x::
mul rbx
add r10,rax
- mov rax,QWORD PTR[((-32))+rcx]
+ mov rax,QWORD PTR[((-16))+rcx]
lea r14,QWORD PTR[32+r14]
adc rdx,0
mov r11,rdx
@@ -492,7 +694,7 @@ $L$1st4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[((-16))+rcx]
+ mov rax,QWORD PTR[((-8))+rcx]
adc rdx,0
mov r10,rdx
@@ -522,7 +724,7 @@ $L$1st4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[16+rcx]
+ mov rax,QWORD PTR[8+rcx]
adc rdx,0
mov r10,rdx
@@ -531,7 +733,7 @@ $L$1st4x::
mov rax,QWORD PTR[16+r15*1+rsi]
adc rdx,0
add rdi,r11
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
adc rdx,0
mov QWORD PTR[r14],rdi
mov r13,rdx
@@ -541,7 +743,7 @@ $L$1st4x::
mul rbx
add r10,rax
- mov rax,QWORD PTR[((-32))+rcx]
+ mov rax,QWORD PTR[((-16))+rcx]
lea r14,QWORD PTR[32+r14]
adc rdx,0
mov r11,rdx
@@ -557,7 +759,7 @@ $L$1st4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[((-16))+rcx]
+ mov rax,QWORD PTR[((-8))+rcx]
adc rdx,0
mov r10,rdx
@@ -570,8 +772,7 @@ $L$1st4x::
mov QWORD PTR[((-16))+r14],rdi
mov r13,rdx
-DB 102,72,15,126,195
- lea rcx,QWORD PTR[r9*2+rcx]
+ lea rcx,QWORD PTR[r9*1+rcx]
xor rdi,rdi
add r13,r10
@@ -582,6 +783,63 @@ DB 102,72,15,126,195
ALIGN 32
$L$outer4x::
+ lea rdx,QWORD PTR[((16+128))+r14]
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD PTR[((-128))+r12]
+ movdqa xmm1,XMMWORD PTR[((-112))+r12]
+ movdqa xmm2,XMMWORD PTR[((-96))+r12]
+ movdqa xmm3,XMMWORD PTR[((-80))+r12]
+ pand xmm0,XMMWORD PTR[((-128))+rdx]
+ pand xmm1,XMMWORD PTR[((-112))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-96))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-80))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[((-64))+r12]
+ movdqa xmm1,XMMWORD PTR[((-48))+r12]
+ movdqa xmm2,XMMWORD PTR[((-32))+r12]
+ movdqa xmm3,XMMWORD PTR[((-16))+r12]
+ pand xmm0,XMMWORD PTR[((-64))+rdx]
+ pand xmm1,XMMWORD PTR[((-48))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-32))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-16))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[r12]
+ movdqa xmm1,XMMWORD PTR[16+r12]
+ movdqa xmm2,XMMWORD PTR[32+r12]
+ movdqa xmm3,XMMWORD PTR[48+r12]
+ pand xmm0,XMMWORD PTR[rdx]
+ pand xmm1,XMMWORD PTR[16+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[32+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[48+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[64+r12]
+ movdqa xmm1,XMMWORD PTR[80+r12]
+ movdqa xmm2,XMMWORD PTR[96+r12]
+ movdqa xmm3,XMMWORD PTR[112+r12]
+ pand xmm0,XMMWORD PTR[64+rdx]
+ pand xmm1,XMMWORD PTR[80+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[96+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[112+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+ pshufd xmm0,xmm4,04eh
+ por xmm0,xmm4
+ lea r12,QWORD PTR[256+r12]
+DB 102,72,15,126,195
+
mov r10,QWORD PTR[r9*1+r14]
mov rbp,r8
mul rbx
@@ -589,25 +847,11 @@ $L$outer4x::
mov rax,QWORD PTR[rcx]
adc rdx,0
- movq xmm0,QWORD PTR[((-96))+r12]
- movq xmm1,QWORD PTR[((-32))+r12]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[32+r12]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[96+r12]
-
imul rbp,r10
-DB 067h
mov r11,rdx
mov QWORD PTR[r14],rdi
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
- por xmm0,xmm2
lea r14,QWORD PTR[r9*1+r14]
- lea r12,QWORD PTR[256+r12]
- por xmm0,xmm3
mul rbp
add r10,rax
@@ -617,7 +861,7 @@ DB 067h
mul rbx
add r11,rax
- mov rax,QWORD PTR[16+rcx]
+ mov rax,QWORD PTR[8+rcx]
adc rdx,0
add r11,QWORD PTR[8+r14]
adc rdx,0
@@ -629,7 +873,7 @@ DB 067h
adc rdx,0
add rdi,r11
lea r15,QWORD PTR[32+r9]
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
adc rdx,0
mov r13,rdx
jmp $L$inner4x
@@ -638,7 +882,7 @@ ALIGN 32
$L$inner4x::
mul rbx
add r10,rax
- mov rax,QWORD PTR[((-32))+rcx]
+ mov rax,QWORD PTR[((-16))+rcx]
adc rdx,0
add r10,QWORD PTR[16+r14]
lea r14,QWORD PTR[32+r14]
@@ -656,7 +900,7 @@ $L$inner4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[((-16))+rcx]
+ mov rax,QWORD PTR[((-8))+rcx]
adc rdx,0
add r11,QWORD PTR[((-8))+r14]
adc rdx,0
@@ -690,7 +934,7 @@ $L$inner4x::
mul rbx
add r11,rax
- mov rax,QWORD PTR[16+rcx]
+ mov rax,QWORD PTR[8+rcx]
adc rdx,0
add r11,QWORD PTR[8+r14]
adc rdx,0
@@ -701,7 +945,7 @@ $L$inner4x::
mov rax,QWORD PTR[16+r15*1+rsi]
adc rdx,0
add rdi,r11
- lea rcx,QWORD PTR[64+rcx]
+ lea rcx,QWORD PTR[32+rcx]
adc rdx,0
mov QWORD PTR[((-8))+r14],r13
mov r13,rdx
@@ -711,7 +955,7 @@ $L$inner4x::
mul rbx
add r10,rax
- mov rax,QWORD PTR[((-32))+rcx]
+ mov rax,QWORD PTR[((-16))+rcx]
adc rdx,0
add r10,QWORD PTR[16+r14]
lea r14,QWORD PTR[32+r14]
@@ -730,7 +974,7 @@ $L$inner4x::
mul rbx
add r11,rax
mov rax,rbp
- mov rbp,QWORD PTR[((-16))+rcx]
+ mov rbp,QWORD PTR[((-8))+rcx]
adc rdx,0
add r11,QWORD PTR[((-8))+r14]
adc rdx,0
@@ -745,9 +989,8 @@ $L$inner4x::
mov QWORD PTR[((-24))+r14],r13
mov r13,rdx
-DB 102,72,15,126,195
mov QWORD PTR[((-16))+r14],rdi
- lea rcx,QWORD PTR[r9*2+rcx]
+ lea rcx,QWORD PTR[r9*1+rcx]
xor rdi,rdi
add r13,r10
@@ -758,16 +1001,23 @@ DB 102,72,15,126,195
cmp r12,QWORD PTR[((16+8))+rsp]
jb $L$outer4x
+ xor rax,rax
sub rbp,r13
adc r15,r15
or rdi,r15
- xor rdi,1
+ sub rax,rdi
lea rbx,QWORD PTR[r9*1+r14]
- lea rbp,QWORD PTR[rdi*8+rcx]
+ mov r12,QWORD PTR[rcx]
+ lea rbp,QWORD PTR[rcx]
mov rcx,r9
sar rcx,3+2
mov rdi,QWORD PTR[((56+8))+rsp]
- jmp $L$sqr4x_sub
+ dec r12
+ xor r10,r10
+ mov r13,QWORD PTR[8+rbp]
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+ jmp $L$sqr4x_sub_entry
mul4x_internal ENDP
PUBLIC bn_power5
@@ -792,12 +1042,9 @@ $L$SEH_begin_bn_power5::
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
- mov r10d,r9d
+
shl r9d,3
- shl r10d,3+2
+ lea r10d,DWORD PTR[r9*2+r9]
neg r9
mov r8,QWORD PTR[r8]
@@ -807,19 +1054,20 @@ $L$SEH_begin_bn_power5::
- lea r11,QWORD PTR[((-64))+r9*2+rsp]
- sub r11,rsi
+
+ lea r11,QWORD PTR[((-320))+r9*2+rsp]
+ sub r11,rdi
and r11,4095
cmp r10,r11
jb $L$pwr_sp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
jmp $L$pwr_sp_done
ALIGN 32
$L$pwr_sp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*2]
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea r10,QWORD PTR[((4096-320))+r9*2]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -847,10 +1095,15 @@ DB 102,73,15,110,218
DB 102,72,15,110,226
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
DB 102,72,15,126,209
DB 102,72,15,126,226
@@ -1397,9 +1650,9 @@ DB 067h
mov QWORD PTR[((-16))+rdi],rbx
mov QWORD PTR[((-8))+rdi],r8
DB 102,72,15,126,213
-sqr8x_reduction::
+__bn_sqr8x_reduction::
xor rax,rax
- lea rcx,QWORD PTR[r9*2+rbp]
+ lea rcx,QWORD PTR[rbp*1+r9]
lea rdx,QWORD PTR[((48+8))+r9*2+rsp]
mov QWORD PTR[((0+8))+rsp],rcx
lea rdi,QWORD PTR[((48+8))+r9*1+rsp]
@@ -1432,14 +1685,14 @@ DB 067h
ALIGN 32
$L$8x_reduce::
mul rbx
- mov rax,QWORD PTR[16+rbp]
+ mov rax,QWORD PTR[8+rbp]
neg r8
mov r8,rdx
adc r8,0
mul rbx
add r9,rax
- mov rax,QWORD PTR[32+rbp]
+ mov rax,QWORD PTR[16+rbp]
adc rdx,0
add r8,r9
mov QWORD PTR[((48-8+8))+rcx*8+rsp],rbx
@@ -1448,7 +1701,7 @@ $L$8x_reduce::
mul rbx
add r10,rax
- mov rax,QWORD PTR[48+rbp]
+ mov rax,QWORD PTR[24+rbp]
adc rdx,0
add r9,r10
mov rsi,QWORD PTR[((32+8))+rsp]
@@ -1457,7 +1710,7 @@ $L$8x_reduce::
mul rbx
add r11,rax
- mov rax,QWORD PTR[64+rbp]
+ mov rax,QWORD PTR[32+rbp]
adc rdx,0
imul rsi,r8
add r10,r11
@@ -1466,7 +1719,7 @@ $L$8x_reduce::
mul rbx
add r12,rax
- mov rax,QWORD PTR[80+rbp]
+ mov rax,QWORD PTR[40+rbp]
adc rdx,0
add r11,r12
mov r12,rdx
@@ -1474,7 +1727,7 @@ $L$8x_reduce::
mul rbx
add r13,rax
- mov rax,QWORD PTR[96+rbp]
+ mov rax,QWORD PTR[48+rbp]
adc rdx,0
add r12,r13
mov r13,rdx
@@ -1482,7 +1735,7 @@ $L$8x_reduce::
mul rbx
add r14,rax
- mov rax,QWORD PTR[112+rbp]
+ mov rax,QWORD PTR[56+rbp]
adc rdx,0
add r13,r14
mov r14,rdx
@@ -1500,7 +1753,7 @@ $L$8x_reduce::
dec ecx
jnz $L$8x_reduce
- lea rbp,QWORD PTR[128+rbp]
+ lea rbp,QWORD PTR[64+rbp]
xor rax,rax
mov rdx,QWORD PTR[((8+8))+rsp]
cmp rbp,QWORD PTR[((0+8))+rsp]
@@ -1526,14 +1779,14 @@ ALIGN 32
$L$8x_tail::
mul rbx
add r8,rax
- mov rax,QWORD PTR[16+rbp]
+ mov rax,QWORD PTR[8+rbp]
mov QWORD PTR[rdi],r8
mov r8,rdx
adc r8,0
mul rbx
add r9,rax
- mov rax,QWORD PTR[32+rbp]
+ mov rax,QWORD PTR[16+rbp]
adc rdx,0
add r8,r9
lea rdi,QWORD PTR[8+rdi]
@@ -1542,7 +1795,7 @@ $L$8x_tail::
mul rbx
add r10,rax
- mov rax,QWORD PTR[48+rbp]
+ mov rax,QWORD PTR[24+rbp]
adc rdx,0
add r9,r10
mov r10,rdx
@@ -1550,7 +1803,7 @@ $L$8x_tail::
mul rbx
add r11,rax
- mov rax,QWORD PTR[64+rbp]
+ mov rax,QWORD PTR[32+rbp]
adc rdx,0
add r10,r11
mov r11,rdx
@@ -1558,7 +1811,7 @@ $L$8x_tail::
mul rbx
add r12,rax
- mov rax,QWORD PTR[80+rbp]
+ mov rax,QWORD PTR[40+rbp]
adc rdx,0
add r11,r12
mov r12,rdx
@@ -1566,7 +1819,7 @@ $L$8x_tail::
mul rbx
add r13,rax
- mov rax,QWORD PTR[96+rbp]
+ mov rax,QWORD PTR[48+rbp]
adc rdx,0
add r12,r13
mov r13,rdx
@@ -1574,7 +1827,7 @@ $L$8x_tail::
mul rbx
add r14,rax
- mov rax,QWORD PTR[112+rbp]
+ mov rax,QWORD PTR[56+rbp]
adc rdx,0
add r13,r14
mov r14,rdx
@@ -1592,7 +1845,7 @@ $L$8x_tail::
dec ecx
jnz $L$8x_tail
- lea rbp,QWORD PTR[128+rbp]
+ lea rbp,QWORD PTR[64+rbp]
mov rdx,QWORD PTR[((8+8))+rsp]
cmp rbp,QWORD PTR[((0+8))+rsp]
jae $L$8x_tail_done
@@ -1638,7 +1891,7 @@ $L$8x_no_tail::
adc r14,QWORD PTR[48+rdi]
adc r15,QWORD PTR[56+rdi]
adc rax,0
- mov rcx,QWORD PTR[((-16))+rbp]
+ mov rcx,QWORD PTR[((-8))+rbp]
xor rsi,rsi
DB 102,72,15,126,213
@@ -1656,44 +1909,62 @@ DB 102,73,15,126,217
cmp rdi,rdx
jb $L$8x_reduction_loop
+ DB 0F3h,0C3h ;repret
+bn_sqr8x_internal ENDP
- sub rcx,r15
+ALIGN 32
+__bn_post4x_internal PROC PRIVATE
+ mov r12,QWORD PTR[rbp]
lea rbx,QWORD PTR[r9*1+rdi]
- adc rsi,rsi
mov rcx,r9
- or rax,rsi
DB 102,72,15,126,207
- xor rax,1
+ neg rax
DB 102,72,15,126,206
- lea rbp,QWORD PTR[rax*8+rbp]
sar rcx,3+2
- jmp $L$sqr4x_sub
+ dec r12
+ xor r10,r10
+ mov r13,QWORD PTR[8+rbp]
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+ jmp $L$sqr4x_sub_entry
-ALIGN 32
+ALIGN 16
$L$sqr4x_sub::
-DB 066h
- mov r12,QWORD PTR[rbx]
- mov r13,QWORD PTR[8+rbx]
- sbb r12,QWORD PTR[rbp]
- mov r14,QWORD PTR[16+rbx]
- sbb r13,QWORD PTR[16+rbp]
- mov r15,QWORD PTR[24+rbx]
- lea rbx,QWORD PTR[32+rbx]
- sbb r14,QWORD PTR[32+rbp]
+ mov r12,QWORD PTR[rbp]
+ mov r13,QWORD PTR[8+rbp]
+ mov r14,QWORD PTR[16+rbp]
+ mov r15,QWORD PTR[24+rbp]
+$L$sqr4x_sub_entry::
+ lea rbp,QWORD PTR[32+rbp]
+ not r12
+ not r13
+ not r14
+ not r15
+ and r12,rax
+ and r13,rax
+ and r14,rax
+ and r15,rax
+
+ neg r10
+ adc r12,QWORD PTR[rbx]
+ adc r13,QWORD PTR[8+rbx]
+ adc r14,QWORD PTR[16+rbx]
+ adc r15,QWORD PTR[24+rbx]
mov QWORD PTR[rdi],r12
- sbb r15,QWORD PTR[48+rbp]
- lea rbp,QWORD PTR[64+rbp]
+ lea rbx,QWORD PTR[32+rbx]
mov QWORD PTR[8+rdi],r13
+ sbb r10,r10
mov QWORD PTR[16+rdi],r14
mov QWORD PTR[24+rdi],r15
lea rdi,QWORD PTR[32+rdi]
inc rcx
jnz $L$sqr4x_sub
+
mov r10,r9
neg r9
DB 0F3h,0C3h ;repret
-bn_sqr8x_internal ENDP
+__bn_post4x_internal ENDP
PUBLIC bn_from_montgomery
ALIGN 32
@@ -1727,13 +1998,9 @@ DB 067h
push r13
push r14
push r15
- lea rsp,QWORD PTR[((-40))+rsp]
- movaps XMMWORD PTR[rsp],xmm6
- movaps XMMWORD PTR[16+rsp],xmm7
-DB 067h
- mov r10d,r9d
+
shl r9d,3
- shl r10d,3+2
+ lea r10,QWORD PTR[r9*2+r9]
neg r9
mov r8,QWORD PTR[r8]
@@ -1743,19 +2010,20 @@ DB 067h
- lea r11,QWORD PTR[((-64))+r9*2+rsp]
- sub r11,rsi
+
+ lea r11,QWORD PTR[((-320))+r9*2+rsp]
+ sub r11,rdi
and r11,4095
cmp r10,r11
jb $L$from_sp_alt
sub rsp,r11
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
jmp $L$from_sp_done
ALIGN 32
$L$from_sp_alt::
- lea r10,QWORD PTR[((4096-64))+r9*2]
- lea rsp,QWORD PTR[((-64))+r9*2+rsp]
+ lea r10,QWORD PTR[((4096-320))+r9*2]
+ lea rsp,QWORD PTR[((-320))+r9*2+rsp]
sub r11,r10
mov r10,0
cmovc r11,r10
@@ -1806,7 +2074,8 @@ DB 102,72,15,110,209
DB 067h
mov rbp,rcx
DB 102,73,15,110,218
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor xmm0,xmm0
lea rax,QWORD PTR[48+rsp]
@@ -1876,55 +2145,171 @@ bn_scatter5 ENDP
PUBLIC bn_gather5
-ALIGN 16
+ALIGN 32
bn_gather5 PROC PUBLIC
$L$SEH_begin_bn_gather5::
-DB 048h,083h,0ech,028h
-DB 00fh,029h,034h,024h
-DB 00fh,029h,07ch,024h,010h
- mov r11d,r9d
- shr r9d,3
- and r11,7
- not r9d
- lea rax,QWORD PTR[$L$magic_masks]
- and r9d,3
- lea r8,QWORD PTR[128+r11*8+r8]
- movq xmm4,QWORD PTR[r9*8+rax]
- movq xmm5,QWORD PTR[8+r9*8+rax]
- movq xmm6,QWORD PTR[16+r9*8+rax]
- movq xmm7,QWORD PTR[24+r9*8+rax]
+DB 04ch,08dh,014h,024h
+DB 048h,081h,0ech,008h,001h,000h,000h
+ lea rax,QWORD PTR[$L$inc]
+ and rsp,-16
+
+ movd xmm5,r9d
+ movdqa xmm0,XMMWORD PTR[rax]
+ movdqa xmm1,XMMWORD PTR[16+rax]
+ lea r11,QWORD PTR[128+r8]
+ lea rax,QWORD PTR[128+rsp]
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[(-128)+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[(-112)+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[(-96)+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[(-80)+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[(-64)+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[(-48)+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[(-32)+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[(-16)+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[16+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[32+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD PTR[48+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD PTR[64+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD PTR[80+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD PTR[96+rax],xmm2
+ movdqa xmm2,xmm4
+ movdqa XMMWORD PTR[112+rax],xmm3
jmp $L$gather
-ALIGN 16
-$L$gather::
- movq xmm0,QWORD PTR[((-128))+r8]
- movq xmm1,QWORD PTR[((-64))+r8]
- pand xmm0,xmm4
- movq xmm2,QWORD PTR[r8]
- pand xmm1,xmm5
- movq xmm3,QWORD PTR[64+r8]
- pand xmm2,xmm6
- por xmm0,xmm1
- pand xmm3,xmm7
-DB 067h,067h
- por xmm0,xmm2
- lea r8,QWORD PTR[256+r8]
- por xmm0,xmm3
+ALIGN 32
+$L$gather::
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD PTR[((-128))+r11]
+ movdqa xmm1,XMMWORD PTR[((-112))+r11]
+ movdqa xmm2,XMMWORD PTR[((-96))+r11]
+ pand xmm0,XMMWORD PTR[((-128))+rax]
+ movdqa xmm3,XMMWORD PTR[((-80))+r11]
+ pand xmm1,XMMWORD PTR[((-112))+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-96))+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-80))+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[((-64))+r11]
+ movdqa xmm1,XMMWORD PTR[((-48))+r11]
+ movdqa xmm2,XMMWORD PTR[((-32))+r11]
+ pand xmm0,XMMWORD PTR[((-64))+rax]
+ movdqa xmm3,XMMWORD PTR[((-16))+r11]
+ pand xmm1,XMMWORD PTR[((-48))+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[((-32))+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[((-16))+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[r11]
+ movdqa xmm1,XMMWORD PTR[16+r11]
+ movdqa xmm2,XMMWORD PTR[32+r11]
+ pand xmm0,XMMWORD PTR[rax]
+ movdqa xmm3,XMMWORD PTR[48+r11]
+ pand xmm1,XMMWORD PTR[16+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[32+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[48+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD PTR[64+r11]
+ movdqa xmm1,XMMWORD PTR[80+r11]
+ movdqa xmm2,XMMWORD PTR[96+r11]
+ pand xmm0,XMMWORD PTR[64+rax]
+ movdqa xmm3,XMMWORD PTR[112+r11]
+ pand xmm1,XMMWORD PTR[80+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD PTR[96+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD PTR[112+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+ lea r11,QWORD PTR[256+r11]
+ pshufd xmm0,xmm4,04eh
+ por xmm0,xmm4
movq QWORD PTR[rcx],xmm0
lea rcx,QWORD PTR[8+rcx]
sub edx,1
jnz $L$gather
- movaps xmm6,XMMWORD PTR[rsp]
- movaps xmm7,XMMWORD PTR[16+rsp]
- lea rsp,QWORD PTR[40+rsp]
+
+ lea rsp,QWORD PTR[r10]
DB 0F3h,0C3h ;repret
$L$SEH_end_bn_gather5::
bn_gather5 ENDP
ALIGN 64
-$L$magic_masks::
- DD 0,0,0,0,0,0,-1,-1
- DD 0,0,0,0,0,0,0,0
+$L$inc::
+ DD 0,0,1,1
+ DD 2,2,2,2
DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
@@ -1966,19 +2351,16 @@ mul_handler PROC PRIVATE
lea r10,QWORD PTR[$L$mul_epilogue]
cmp rbx,r10
- jb $L$body_40
+ ja $L$body_40
mov r10,QWORD PTR[192+r8]
mov rax,QWORD PTR[8+r10*8+rax]
+
jmp $L$body_proceed
$L$body_40::
mov rax,QWORD PTR[40+rax]
$L$body_proceed::
-
- movaps xmm0,XMMWORD PTR[((-88))+rax]
- movaps xmm1,XMMWORD PTR[((-72))+rax]
-
mov rbx,QWORD PTR[((-8))+rax]
mov rbp,QWORD PTR[((-16))+rax]
mov r12,QWORD PTR[((-24))+rax]
@@ -1991,8 +2373,6 @@ $L$body_proceed::
mov QWORD PTR[224+r8],r13
mov QWORD PTR[232+r8],r14
mov QWORD PTR[240+r8],r15
- movups XMMWORD PTR[512+r8],xmm0
- movups XMMWORD PTR[528+r8],xmm1
$L$common_seh_tail::
mov rdi,QWORD PTR[8+rax]
@@ -2080,10 +2460,9 @@ DB 9,0,0,0
DD imagerel $L$from_body,imagerel $L$from_epilogue
ALIGN 8
$L$SEH_info_bn_gather5::
-DB 001h,00dh,005h,000h
-DB 00dh,078h,001h,000h
-DB 008h,068h,000h,000h
-DB 004h,042h,000h,000h
+DB 001h,00bh,003h,00ah
+DB 00bh,001h,021h,000h
+DB 004h,0a3h,000h,000h
ALIGN 8
.xdata ENDS
diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm b/deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm
index ef9b22fbfd..ca78bd52cc 100644
--- a/deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm
+++ b/deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm
@@ -1303,6 +1303,7 @@ $L$SEH_begin_ecp_nistz256_point_double::
push r15
sub rsp,32*5+8
+$L$point_double_shortcutq::
movdqu xmm0,XMMWORD PTR[rsi]
mov rbx,rsi
movdqu xmm1,XMMWORD PTR[16+rsi]
@@ -1577,6 +1578,7 @@ DB 102,72,15,110,199
mov r14,QWORD PTR[((64+8))+rbx]
mov r15,QWORD PTR[((64+16))+rbx]
mov r8,QWORD PTR[((64+24))+rbx]
+DB 102,72,15,110,203
lea rsi,QWORD PTR[((64-0))+rbx]
lea rdi,QWORD PTR[32+rsp]
@@ -1668,7 +1670,7 @@ DB 102,73,15,126,217
test r8,r8
jnz $L$add_proceedq
test r9,r9
- jz $L$add_proceedq
+ jz $L$add_doubleq
DB 102,72,15,126,199
pxor xmm0,xmm0
@@ -1681,6 +1683,13 @@ DB 102,72,15,126,199
jmp $L$add_doneq
ALIGN 32
+$L$add_doubleq::
+DB 102,72,15,126,206
+DB 102,72,15,126,199
+ add rsp,416
+ jmp $L$point_double_shortcutq
+
+ALIGN 32
$L$add_proceedq::
mov rax,QWORD PTR[((0+64))+rsp]
mov r14,QWORD PTR[((8+64))+rsp]
diff --git a/deps/openssl/openssl/CHANGES b/deps/openssl/openssl/CHANGES
index aa1f60d144..df4b6064dd 100644
--- a/deps/openssl/openssl/CHANGES
+++ b/deps/openssl/openssl/CHANGES
@@ -2,6 +2,138 @@
OpenSSL CHANGES
_______________
+ Changes between 1.0.2f and 1.0.2g [1 Mar 2016]
+
+ * Disable weak ciphers in SSLv3 and up in default builds of OpenSSL.
+ Builds that are not configured with "enable-weak-ssl-ciphers" will not
+ provide any "EXPORT" or "LOW" strength ciphers.
+ [Viktor Dukhovni]
+
+ * Disable SSLv2 default build, default negotiation and weak ciphers. SSLv2
+ is by default disabled at build-time. Builds that are not configured with
+ "enable-ssl2" will not support SSLv2. Even if "enable-ssl2" is used,
+ users who want to negotiate SSLv2 via the version-flexible SSLv23_method()
+ will need to explicitly call either of:
+
+ SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2);
+ or
+ SSL_clear_options(ssl, SSL_OP_NO_SSLv2);
+
+ as appropriate. Even if either of those is used, or the application
+ explicitly uses the version-specific SSLv2_method() or its client and
+ server variants, SSLv2 ciphers vulnerable to exhaustive search key
+ recovery have been removed. Specifically, the SSLv2 40-bit EXPORT
+ ciphers, and SSLv2 56-bit DES are no longer available.
+ (CVE-2016-0800)
+ [Viktor Dukhovni]
+
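      A minimal sketch of the calls named in the entry above, assuming an
      application built with "enable-ssl2"; the helper name make_legacy_ctx is
      hypothetical and the snippet is illustrative only, not part of the patch:

          #include <openssl/ssl.h>

          /* Re-enable SSLv2 negotiation on a version-flexible context.
           * SSLv2 is off by default, so the option must be cleared
           * explicitly (not recommended outside of testing). */
          SSL_CTX *make_legacy_ctx(void)
          {
              SSL_CTX *ctx = SSL_CTX_new(SSLv23_method());
              if (ctx == NULL)
                  return NULL;
              SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2);
              return ctx;
          }

      Even with this, the 40-bit EXPORT and 56-bit DES SSLv2 ciphers removed by
      the change above remain unavailable.
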
+ *) Fix a double-free in DSA code
+
+ A double free bug was discovered when OpenSSL parses malformed DSA private
+ keys and could lead to a DoS attack or memory corruption for applications
+ that receive DSA private keys from untrusted sources. This scenario is
+ considered rare.
+
+ This issue was reported to OpenSSL by Adam Langley (Google/BoringSSL) using
+ libFuzzer.
+ (CVE-2016-0705)
+ [Stephen Henson]
+
+ *) Disable SRP fake user seed to address a server memory leak.
+
+ Add a new method SRP_VBASE_get1_by_user that handles the seed properly.
+
+ SRP_VBASE_get_by_user had inconsistent memory management behaviour.
+ In order to fix an unavoidable memory leak, SRP_VBASE_get_by_user
+ was changed to ignore the "fake user" SRP seed, even if the seed
+ is configured.
+
+ Users should use SRP_VBASE_get1_by_user instead. Note that in
+ SRP_VBASE_get1_by_user, caller must free the returned value. Note
+ also that even though configuring the SRP seed attempts to hide
+ invalid usernames by continuing the handshake with fake
+ credentials, this behaviour is not constant time and no strong
+ guarantees are made that the handshake is indistinguishable from
+ that of a valid user.
+ (CVE-2016-0798)
+ [Emilia Käsper]
+
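      The ownership rule described above can be shown with a brief, hedged
      sketch; the function name lookup_srp_user and its arguments are
      illustrative only:

          #include <openssl/srp.h>

          /* SRP_VBASE_get1_by_user returns a value the caller owns and
           * must release with SRP_user_pwd_free, unlike the older
           * SRP_VBASE_get_by_user. */
          void lookup_srp_user(SRP_VBASE *vb, char *login)
          {
              SRP_user_pwd *user = SRP_VBASE_get1_by_user(vb, login);
              if (user == NULL)
                  return;
              /* ... use user->N, user->g, user->s, user->v here ... */
              SRP_user_pwd_free(user);
          }
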
+ *) Fix BN_hex2bn/BN_dec2bn NULL pointer deref/heap corruption
+
+ In the BN_hex2bn function the number of hex digits is calculated using an
+ int value |i|. Later |bn_expand| is called with a value of |i * 4|. For
+ large values of |i| this can result in |bn_expand| not allocating any
+ memory because |i * 4| is negative. This can leave the internal BIGNUM data
+ field as NULL leading to a subsequent NULL ptr deref. For very large values
+ of |i|, the calculation |i * 4| could be a positive value smaller than |i|.
+ In this case memory is allocated to the internal BIGNUM data field, but it
+ is insufficiently sized leading to heap corruption. A similar issue exists
+ in BN_dec2bn. This could have security consequences if BN_hex2bn/BN_dec2bn
+ is ever called by user applications with very large untrusted hex/dec data.
+ This is anticipated to be a rare occurrence.
+
+ All OpenSSL internal usage of these functions use data that is not expected
+ to be untrusted, e.g. config file data or application command line
+ arguments. If user developed applications generate config file data based
+ on untrusted data then it is possible that this could also lead to security
+ consequences. This is also anticipated to be rare.
+
+ This issue was reported to OpenSSL by Guido Vranken.
+ (CVE-2016-0797)
+ [Matt Caswell]
+
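      A short sketch of the defensive pattern this implies for applications
      that do pass untrusted hex data to BN_hex2bn; the length cap below is an
      arbitrary illustrative value, not a limit taken from OpenSSL:

          #include <string.h>
          #include <openssl/bn.h>

          /* Cap the size of untrusted hex input and check the return
           * value; BN_hex2bn returns the number of digits processed,
           * or 0 on failure. */
          BIGNUM *hex_to_bn_checked(const char *hex)
          {
              BIGNUM *bn = NULL;

              if (hex == NULL || strlen(hex) > 16384)
                  return NULL;
              if (BN_hex2bn(&bn, hex) == 0) {
                  BN_free(bn);
                  return NULL;
              }
              return bn;
          }
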
+ *) Fix memory issues in BIO_*printf functions
+
+ The internal |fmtstr| function used in processing a "%s" format string in
+ the BIO_*printf functions could overflow while calculating the length of a
+ string and cause an OOB read when printing very long strings.
+
+ Additionally the internal |doapr_outch| function can attempt to write to an
+ OOB memory location (at an offset from the NULL pointer) in the event of a
+ memory allocation failure. In 1.0.2 and below this could be caused where
+ the size of a buffer to be allocated is greater than INT_MAX. E.g. this
+ could be in processing a very long "%s" format string. Memory leaks can
+ also occur.
+
+ The first issue may mask the second issue dependent on compiler behaviour.
+ These problems could enable attacks where large amounts of untrusted data
+ is passed to the BIO_*printf functions. If applications use these functions
+ in this way then they could be vulnerable. OpenSSL itself uses these
+ functions when printing out human-readable dumps of ASN.1 data. Therefore
+ applications that print this data could be vulnerable if the data is from
+ untrusted sources. OpenSSL command line applications could also be
+ vulnerable where they print out ASN.1 data, or if untrusted data is passed
+ as command line arguments.
+
+ Libssl is not considered directly vulnerable. Additionally certificates etc
+ received via remote connections via libssl are also unlikely to be able to
+ trigger these issues because of message size limits enforced within libssl.
+
+ This issue was reported to OpenSSL by Guido Vranken.
+ (CVE-2016-0799)
+ [Matt Caswell]
+
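      For applications that must format untrusted strings with the BIO_*printf
      family, one hedged mitigation sketch is to bound the "%s" conversion
      explicitly; the precision value below is an arbitrary example:

          #include <openssl/bio.h>

          /* An explicit precision caps how many bytes of the untrusted
           * string are formatted. */
          void print_subject(BIO *out, const char *untrusted)
          {
              BIO_printf(out, "subject: %.256s\n", untrusted);
          }
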
+ *) Side channel attack on modular exponentiation
+
+ A side-channel attack was found which makes use of cache-bank conflicts on
+ the Intel Sandy-Bridge microarchitecture which could lead to the recovery
+ of RSA keys. The ability to exploit this issue is limited as it relies on
+ an attacker who has control of code in a thread running on the same
+ hyper-threaded core as the victim thread which is performing decryptions.
+
+ This issue was reported to OpenSSL by Yuval Yarom, The University of
+ Adelaide and NICTA, Daniel Genkin, Technion and Tel Aviv University, and
+ Nadia Heninger, University of Pennsylvania with more information at
+ http://cachebleed.info.
+ (CVE-2016-0702)
+ [Andy Polyakov]
+
+ *) Change the req app to generate a 2048-bit RSA/DSA key by default,
+ if no keysize is specified with default_bits. This fixes an
+ omission in an earlier change that changed all RSA/DSA key generation
+ apps to use 2048 bits by default.
+ [Emilia Käsper]
+
Changes between 1.0.2e and 1.0.2f [28 Jan 2016]
*) DH small subgroups
@@ -105,7 +237,7 @@
[Emilia Käsper]
*) In DSA_generate_parameters_ex, if the provided seed is too short,
- return an error
+ use a random seed, as already documented.
[Rich Salz and Ismo Puustinen <ismo.puustinen@intel.com>]
Changes between 1.0.2c and 1.0.2d [9 Jul 2015]
diff --git a/deps/openssl/openssl/Configure b/deps/openssl/openssl/Configure
index 4a715dc437..c98107a487 100755
--- a/deps/openssl/openssl/Configure
+++ b/deps/openssl/openssl/Configure
@@ -58,6 +58,10 @@ my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [experimenta
# library and will be loaded in run-time by the OpenSSL library.
# sctp include SCTP support
# 386 generate 80386 code
+# enable-weak-ssl-ciphers
+# Enable EXPORT and LOW SSLv3 ciphers that are disabled by
+# default. Note, weak SSLv2 ciphers are unconditionally
+# disabled.
# no-sse2 disables IA-32 SSE2 code, above option implies no-sse2
# no-<cipher> build without specified algorithm (rsa, idea, rc5, ...)
# -<xxx> +<xxx> compiler options are passed through
@@ -781,11 +785,13 @@ my %disabled = ( # "what" => "comment" [or special keyword "experimental
"md2" => "default",
"rc5" => "default",
"rfc3779" => "default",
- "sctp" => "default",
+ "sctp" => "default",
"shared" => "default",
"ssl-trace" => "default",
+ "ssl2" => "default",
"store" => "experimental",
"unit-test" => "default",
+ "weak-ssl-ciphers" => "default",
"zlib" => "default",
"zlib-dynamic" => "default"
);
diff --git a/deps/openssl/openssl/Makefile.shared b/deps/openssl/openssl/Makefile.shared
index e753f44e18..a2aa9804c1 100644
--- a/deps/openssl/openssl/Makefile.shared
+++ b/deps/openssl/openssl/Makefile.shared
@@ -272,7 +272,7 @@ link_o.cygwin:
SHLIB_SOVER=${LIBVERSION:+"-$(LIBVERSION)"}; \
ALLSYMSFLAGS='-Wl,--whole-archive'; \
NOALLSYMSFLAGS='-Wl,--no-whole-archive'; \
- SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base $$deffile -Wl,-s,-Bsymbolic"; \
+ SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base $$deffile -Wl,-Bsymbolic"; \
$(LINK_SO_O)
#for mingw target if def-file is in use dll-name should match library-name
link_a.cygwin:
@@ -289,7 +289,7 @@ link_a.cygwin:
SHLIB_SOVER=32; \
extras="$(LIBNAME).def"; \
$(PERL) util/mkdef.pl 32 $$SHLIB > $$extras; \
- base=; [ $(LIBNAME) = "crypto" ] && base=-Wl,--image-base,0x63000000; \
+ base=; [ $(LIBNAME) = "crypto" -a -n "$(FIPSCANLIB)" ] && base=-Wl,--image-base,0x63000000; \
fi; \
dll_name=$$SHLIB$$SHLIB_SOVER$$SHLIB_SUFFIX; \
$(PERL) util/mkrc.pl $$dll_name | \
@@ -297,7 +297,7 @@ link_a.cygwin:
extras="$$extras rc.o"; \
ALLSYMSFLAGS='-Wl,--whole-archive'; \
NOALLSYMSFLAGS='-Wl,--no-whole-archive'; \
- SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base -Wl,-s,-Bsymbolic -Wl,--out-implib,lib$(LIBNAME).dll.a $$extras"; \
+ SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base -Wl,-Bsymbolic -Wl,--out-implib,lib$(LIBNAME).dll.a $$extras"; \
[ -f apps/$$dll_name ] && rm apps/$$dll_name; \
[ -f test/$$dll_name ] && rm test/$$dll_name; \
$(LINK_SO_A) || exit 1; \
diff --git a/deps/openssl/openssl/NEWS b/deps/openssl/openssl/NEWS
index 06c77025e9..33242c8362 100644
--- a/deps/openssl/openssl/NEWS
+++ b/deps/openssl/openssl/NEWS
@@ -5,6 +5,19 @@
This file gives a brief overview of the major changes between each OpenSSL
release. For more details please read the CHANGES file.
+ Major changes between OpenSSL 1.0.2f and OpenSSL 1.0.2g [1 Mar 2016]
+
+ o Disable weak ciphers in SSLv3 and up in default builds of OpenSSL.
+ o Disable SSLv2 default build, default negotiation and weak ciphers
+ (CVE-2016-0800)
+ o Fix a double-free in DSA code (CVE-2016-0705)
+ o Disable SRP fake user seed to address a server memory leak
+ (CVE-2016-0798)
+ o Fix BN_hex2bn/BN_dec2bn NULL pointer deref/heap corruption
+ (CVE-2016-0797)
+ o Fix memory issues in BIO_*printf functions (CVE-2016-0799)
+ o Fix side channel attack on modular exponentiation (CVE-2016-0702)
+
Major changes between OpenSSL 1.0.2e and OpenSSL 1.0.2f [28 Jan 2016]
o DH small subgroups (CVE-2016-0701)
diff --git a/deps/openssl/openssl/README b/deps/openssl/openssl/README
index 1e9869daee..2077b04eb2 100644
--- a/deps/openssl/openssl/README
+++ b/deps/openssl/openssl/README
@@ -1,5 +1,5 @@
- OpenSSL 1.0.2f 28 Jan 2016
+ OpenSSL 1.0.2g 1 Mar 2016
Copyright (c) 1998-2015 The OpenSSL Project
Copyright (c) 1995-1998 Eric A. Young, Tim J. Hudson
diff --git a/deps/openssl/openssl/apps/apps.c b/deps/openssl/openssl/apps/apps.c
index 2e778054ca..b1dd97038f 100644
--- a/deps/openssl/openssl/apps/apps.c
+++ b/deps/openssl/openssl/apps/apps.c
@@ -2442,7 +2442,11 @@ int bio_to_mem(unsigned char **out, int maxlen, BIO *in)
else
len = 1024;
len = BIO_read(in, tbuf, len);
- if (len <= 0)
+ if (len < 0) {
+ BIO_free(mem);
+ return -1;
+ }
+ if (len == 0)
break;
if (BIO_write(mem, tbuf, len) != len) {
BIO_free(mem);
@@ -2459,7 +2463,7 @@ int bio_to_mem(unsigned char **out, int maxlen, BIO *in)
return ret;
}
-int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value)
+int pkey_ctrl_string(EVP_PKEY_CTX *ctx, const char *value)
{
int rv;
char *stmp, *vtmp = NULL;
diff --git a/deps/openssl/openssl/apps/apps.h b/deps/openssl/openssl/apps/apps.h
index 8276e70869..19bf5cc333 100644
--- a/deps/openssl/openssl/apps/apps.h
+++ b/deps/openssl/openssl/apps/apps.h
@@ -321,7 +321,7 @@ int args_verify(char ***pargs, int *pargc,
int *badarg, BIO *err, X509_VERIFY_PARAM **pm);
void policies_print(BIO *out, X509_STORE_CTX *ctx);
int bio_to_mem(unsigned char **out, int maxlen, BIO *in);
-int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value);
+int pkey_ctrl_string(EVP_PKEY_CTX *ctx, const char *value);
int init_gen_str(BIO *err, EVP_PKEY_CTX **pctx,
const char *algname, ENGINE *e, int do_param);
int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md,
diff --git a/deps/openssl/openssl/apps/pkeyutl.c b/deps/openssl/openssl/apps/pkeyutl.c
index 501fd6304a..39faa451ab 100644
--- a/deps/openssl/openssl/apps/pkeyutl.c
+++ b/deps/openssl/openssl/apps/pkeyutl.c
@@ -73,7 +73,7 @@ static void usage(void);
#define PROG pkeyutl_main
static EVP_PKEY_CTX *init_ctx(int *pkeysize,
- char *keyfile, int keyform, int key_type,
+ const char *keyfile, int keyform, int key_type,
char *passargin, int pkey_op, ENGINE *e,
int impl);
@@ -99,10 +99,12 @@ int MAIN(int argc, char **argv)
char *passargin = NULL;
int keysize = -1;
int engine_impl = 0;
-
unsigned char *buf_in = NULL, *buf_out = NULL, *sig = NULL;
- size_t buf_outlen;
+ size_t buf_outlen = 0;
int buf_inlen = 0, siglen = -1;
+ const char *inkey = NULL;
+ const char *peerkey = NULL;
+ STACK_OF(OPENSSL_STRING) *pkeyopts = NULL;
int ret = 1, rv = -1;
@@ -136,21 +138,13 @@ int MAIN(int argc, char **argv)
} else if (!strcmp(*argv, "-inkey")) {
if (--argc < 1)
badarg = 1;
- else {
- ctx = init_ctx(&keysize,
- *(++argv), keyform, key_type,
- passargin, pkey_op, e, engine_impl);
- if (!ctx) {
- BIO_puts(bio_err, "Error initializing context\n");
- ERR_print_errors(bio_err);
- badarg = 1;
- }
- }
+ else
+ inkey = *++argv;
} else if (!strcmp(*argv, "-peerkey")) {
if (--argc < 1)
badarg = 1;
- else if (!setup_peer(bio_err, ctx, peerform, *(++argv), e))
- badarg = 1;
+ else
+ peerkey = *++argv;
} else if (!strcmp(*argv, "-passin")) {
if (--argc < 1)
badarg = 1;
@@ -191,23 +185,21 @@ int MAIN(int argc, char **argv)
pkey_op = EVP_PKEY_OP_VERIFY;
else if (!strcmp(*argv, "-verifyrecover"))
pkey_op = EVP_PKEY_OP_VERIFYRECOVER;
- else if (!strcmp(*argv, "-rev"))
- rev = 1;
else if (!strcmp(*argv, "-encrypt"))
pkey_op = EVP_PKEY_OP_ENCRYPT;
else if (!strcmp(*argv, "-decrypt"))
pkey_op = EVP_PKEY_OP_DECRYPT;
else if (!strcmp(*argv, "-derive"))
pkey_op = EVP_PKEY_OP_DERIVE;
+ else if (!strcmp(*argv, "-rev"))
+ rev = 1;
else if (strcmp(*argv, "-pkeyopt") == 0) {
if (--argc < 1)
badarg = 1;
- else if (!ctx) {
- BIO_puts(bio_err, "-pkeyopt command before -inkey\n");
- badarg = 1;
- } else if (pkey_ctrl_string(ctx, *(++argv)) <= 0) {
- BIO_puts(bio_err, "parameter setting error\n");
- ERR_print_errors(bio_err);
+ else if ((pkeyopts == NULL &&
+ (pkeyopts = sk_OPENSSL_STRING_new_null()) == NULL) ||
+ sk_OPENSSL_STRING_push(pkeyopts, *++argv) == 0) {
+ BIO_puts(bio_err, "out of memory\n");
goto end;
}
} else
@@ -220,10 +212,37 @@ int MAIN(int argc, char **argv)
argv++;
}
- if (!ctx) {
+ if (inkey == NULL ||
+ (peerkey != NULL && pkey_op != EVP_PKEY_OP_DERIVE)) {
usage();
goto end;
}
+ ctx = init_ctx(&keysize, inkey, keyform, key_type,
+ passargin, pkey_op, e, engine_impl);
+ if (!ctx) {
+ BIO_puts(bio_err, "Error initializing context\n");
+ ERR_print_errors(bio_err);
+ goto end;
+ }
+ if (peerkey != NULL && !setup_peer(bio_err, ctx, peerform, peerkey, e)) {
+ BIO_puts(bio_err, "Error setting up peer key\n");
+ ERR_print_errors(bio_err);
+ goto end;
+ }
+ if (pkeyopts != NULL) {
+ int num = sk_OPENSSL_STRING_num(pkeyopts);
+ int i;
+
+ for (i = 0; i < num; ++i) {
+ const char *opt = sk_OPENSSL_STRING_value(pkeyopts, i);
+
+ if (pkey_ctrl_string(ctx, opt) <= 0) {
+ BIO_puts(bio_err, "parameter setting error\n");
+ ERR_print_errors(bio_err);
+ goto end;
+ }
+ }
+ }
if (sigfile && (pkey_op != EVP_PKEY_OP_VERIFY)) {
BIO_puts(bio_err, "Signature file specified for non verify\n");
@@ -273,7 +292,7 @@ int MAIN(int argc, char **argv)
}
siglen = bio_to_mem(&sig, keysize * 10, sigbio);
BIO_free(sigbio);
- if (siglen <= 0) {
+ if (siglen < 0) {
BIO_printf(bio_err, "Error reading signature data\n");
goto end;
}
@@ -282,7 +301,7 @@ int MAIN(int argc, char **argv)
if (in) {
/* Read the input data */
buf_inlen = bio_to_mem(&buf_in, keysize * 10, in);
- if (buf_inlen <= 0) {
+ if (buf_inlen < 0) {
BIO_printf(bio_err, "Error reading input Data\n");
exit(1);
}
@@ -310,7 +329,7 @@ int MAIN(int argc, char **argv)
} else {
rv = do_keyop(ctx, pkey_op, NULL, (size_t *)&buf_outlen,
buf_in, (size_t)buf_inlen);
- if (rv > 0) {
+ if (rv > 0 && buf_outlen != 0) {
buf_out = OPENSSL_malloc(buf_outlen);
if (!buf_out)
rv = -1;
@@ -340,12 +359,14 @@ int MAIN(int argc, char **argv)
EVP_PKEY_CTX_free(ctx);
BIO_free(in);
BIO_free_all(out);
- if (buf_in)
+ if (buf_in != NULL)
OPENSSL_free(buf_in);
- if (buf_out)
+ if (buf_out != NULL)
OPENSSL_free(buf_out);
- if (sig)
+ if (sig != NULL)
OPENSSL_free(sig);
+ if (pkeyopts != NULL)
+ sk_OPENSSL_STRING_free(pkeyopts);
return ret;
}
@@ -380,7 +401,7 @@ static void usage()
}
static EVP_PKEY_CTX *init_ctx(int *pkeysize,
- char *keyfile, int keyform, int key_type,
+ const char *keyfile, int keyform, int key_type,
char *passargin, int pkey_op, ENGINE *e,
int engine_impl)
{
@@ -484,14 +505,9 @@ static int setup_peer(BIO *err, EVP_PKEY_CTX *ctx, int peerform,
EVP_PKEY *peer = NULL;
ENGINE* engine = NULL;
int ret;
- if (!ctx) {
- BIO_puts(err, "-peerkey command before -inkey\n");
- return 0;
- }
if (peerform == FORMAT_ENGINE)
- engine = e;
-
+ engine = e;
peer = load_pubkey(bio_err, file, peerform, 0, NULL, engine, "Peer Key");
if (!peer) {
diff --git a/deps/openssl/openssl/apps/req.c b/deps/openssl/openssl/apps/req.c
index 57781c93c4..e818bd2976 100644
--- a/deps/openssl/openssl/apps/req.c
+++ b/deps/openssl/openssl/apps/req.c
@@ -101,8 +101,8 @@
#define STRING_MASK "string_mask"
#define UTF8_IN "utf8"
-#define DEFAULT_KEY_LENGTH 512
-#define MIN_KEY_LENGTH 384
+#define DEFAULT_KEY_LENGTH 2048
+#define MIN_KEY_LENGTH 512
#undef PROG
#define PROG req_main
diff --git a/deps/openssl/openssl/apps/rsautl.c b/deps/openssl/openssl/apps/rsautl.c
index d642f9ad97..5b6f849ea7 100644
--- a/deps/openssl/openssl/apps/rsautl.c
+++ b/deps/openssl/openssl/apps/rsautl.c
@@ -250,7 +250,7 @@ int MAIN(int argc, char **argv)
if (outfile) {
if (!(out = BIO_new_file(outfile, "wb"))) {
- BIO_printf(bio_err, "Error Reading Output File\n");
+ BIO_printf(bio_err, "Error Writing Output File\n");
ERR_print_errors(bio_err);
goto end;
}
@@ -276,7 +276,7 @@ int MAIN(int argc, char **argv)
/* Read the input data */
rsa_inlen = BIO_read(in, rsa_in, keysize * 2);
- if (rsa_inlen <= 0) {
+ if (rsa_inlen < 0) {
BIO_printf(bio_err, "Error reading input Data\n");
exit(1);
}
@@ -311,7 +311,7 @@ int MAIN(int argc, char **argv)
}
- if (rsa_outlen <= 0) {
+ if (rsa_outlen < 0) {
BIO_printf(bio_err, "RSA operation error\n");
ERR_print_errors(bio_err);
goto end;
diff --git a/deps/openssl/openssl/apps/s_client.c b/deps/openssl/openssl/apps/s_client.c
index 2abef8869a..bc8004a555 100644
--- a/deps/openssl/openssl/apps/s_client.c
+++ b/deps/openssl/openssl/apps/s_client.c
@@ -399,8 +399,6 @@ static void sc_usage(void)
BIO_printf(bio_err,
" -bugs - Switch on all SSL implementation bug workarounds\n");
BIO_printf(bio_err,
- " -serverpref - Use server's cipher preferences (only SSLv2)\n");
- BIO_printf(bio_err,
" -cipher - preferred cipher to use, use the 'openssl ciphers'\n");
BIO_printf(bio_err,
" command to see what is available\n");
diff --git a/deps/openssl/openssl/apps/s_server.c b/deps/openssl/openssl/apps/s_server.c
index 65cbaaf6eb..09c755b55c 100644
--- a/deps/openssl/openssl/apps/s_server.c
+++ b/deps/openssl/openssl/apps/s_server.c
@@ -429,6 +429,8 @@ typedef struct srpsrvparm_st {
static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg)
{
srpsrvparm *p = (srpsrvparm *) arg;
+ int ret = SSL3_AL_FATAL;
+
if (p->login == NULL && p->user == NULL) {
p->login = SSL_get_srp_username(s);
BIO_printf(bio_err, "SRP username = \"%s\"\n", p->login);
@@ -437,21 +439,25 @@ static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg)
if (p->user == NULL) {
BIO_printf(bio_err, "User %s doesn't exist\n", p->login);
- return SSL3_AL_FATAL;
+ goto err;
}
+
if (SSL_set_srp_server_param
(s, p->user->N, p->user->g, p->user->s, p->user->v,
p->user->info) < 0) {
*ad = SSL_AD_INTERNAL_ERROR;
- return SSL3_AL_FATAL;
+ goto err;
}
BIO_printf(bio_err,
"SRP parameters set: username = \"%s\" info=\"%s\" \n",
p->login, p->user->info);
- /* need to check whether there are memory leaks */
+ ret = SSL_ERROR_NONE;
+
+err:
+ SRP_user_pwd_free(p->user);
p->user = NULL;
p->login = NULL;
- return SSL_ERROR_NONE;
+ return ret;
}
#endif
@@ -2452,9 +2458,10 @@ static int sv_body(char *hostname, int s, int stype, unsigned char *context)
#ifndef OPENSSL_NO_SRP
while (SSL_get_error(con, k) == SSL_ERROR_WANT_X509_LOOKUP) {
BIO_printf(bio_s_out, "LOOKUP renego during write\n");
+ SRP_user_pwd_free(srp_callback_parm.user);
srp_callback_parm.user =
- SRP_VBASE_get_by_user(srp_callback_parm.vb,
- srp_callback_parm.login);
+ SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+ srp_callback_parm.login);
if (srp_callback_parm.user)
BIO_printf(bio_s_out, "LOOKUP done %s\n",
srp_callback_parm.user->info);
@@ -2508,9 +2515,10 @@ static int sv_body(char *hostname, int s, int stype, unsigned char *context)
#ifndef OPENSSL_NO_SRP
while (SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) {
BIO_printf(bio_s_out, "LOOKUP renego during read\n");
+ SRP_user_pwd_free(srp_callback_parm.user);
srp_callback_parm.user =
- SRP_VBASE_get_by_user(srp_callback_parm.vb,
- srp_callback_parm.login);
+ SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+ srp_callback_parm.login);
if (srp_callback_parm.user)
BIO_printf(bio_s_out, "LOOKUP done %s\n",
srp_callback_parm.user->info);
@@ -2605,9 +2613,10 @@ static int init_ssl_connection(SSL *con)
while (i <= 0 && SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) {
BIO_printf(bio_s_out, "LOOKUP during accept %s\n",
srp_callback_parm.login);
+ SRP_user_pwd_free(srp_callback_parm.user);
srp_callback_parm.user =
- SRP_VBASE_get_by_user(srp_callback_parm.vb,
- srp_callback_parm.login);
+ SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+ srp_callback_parm.login);
if (srp_callback_parm.user)
BIO_printf(bio_s_out, "LOOKUP done %s\n",
srp_callback_parm.user->info);
@@ -2849,9 +2858,10 @@ static int www_body(char *hostname, int s, int stype, unsigned char *context)
&& SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) {
BIO_printf(bio_s_out, "LOOKUP during accept %s\n",
srp_callback_parm.login);
+ SRP_user_pwd_free(srp_callback_parm.user);
srp_callback_parm.user =
- SRP_VBASE_get_by_user(srp_callback_parm.vb,
- srp_callback_parm.login);
+ SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+ srp_callback_parm.login);
if (srp_callback_parm.user)
BIO_printf(bio_s_out, "LOOKUP done %s\n",
srp_callback_parm.user->info);
@@ -2891,9 +2901,10 @@ static int www_body(char *hostname, int s, int stype, unsigned char *context)
if (BIO_should_io_special(io)
&& BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) {
BIO_printf(bio_s_out, "LOOKUP renego during read\n");
+ SRP_user_pwd_free(srp_callback_parm.user);
srp_callback_parm.user =
- SRP_VBASE_get_by_user(srp_callback_parm.vb,
- srp_callback_parm.login);
+ SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+ srp_callback_parm.login);
if (srp_callback_parm.user)
BIO_printf(bio_s_out, "LOOKUP done %s\n",
srp_callback_parm.user->info);
@@ -3236,9 +3247,10 @@ static int rev_body(char *hostname, int s, int stype, unsigned char *context)
if (BIO_should_io_special(io)
&& BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) {
BIO_printf(bio_s_out, "LOOKUP renego during accept\n");
+ SRP_user_pwd_free(srp_callback_parm.user);
srp_callback_parm.user =
- SRP_VBASE_get_by_user(srp_callback_parm.vb,
- srp_callback_parm.login);
+ SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+ srp_callback_parm.login);
if (srp_callback_parm.user)
BIO_printf(bio_s_out, "LOOKUP done %s\n",
srp_callback_parm.user->info);
@@ -3264,9 +3276,10 @@ static int rev_body(char *hostname, int s, int stype, unsigned char *context)
if (BIO_should_io_special(io)
&& BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) {
BIO_printf(bio_s_out, "LOOKUP renego during read\n");
+ SRP_user_pwd_free(srp_callback_parm.user);
srp_callback_parm.user =
- SRP_VBASE_get_by_user(srp_callback_parm.vb,
- srp_callback_parm.login);
+ SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+ srp_callback_parm.login);
if (srp_callback_parm.user)
BIO_printf(bio_s_out, "LOOKUP done %s\n",
srp_callback_parm.user->info);
diff --git a/deps/openssl/openssl/config b/deps/openssl/openssl/config
index 77f730f093..bba370c4f3 100755
--- a/deps/openssl/openssl/config
+++ b/deps/openssl/openssl/config
@@ -852,7 +852,8 @@ case "$GUESSOS" in
# *-dgux) OUT="dgux" ;;
mips-sony-newsos4) OUT="newsos4-gcc" ;;
*-*-cygwin_pre1.3) OUT="Cygwin-pre1.3" ;;
- *-*-cygwin) OUT="Cygwin" ;;
+ i[3456]86-*-cygwin) OUT="Cygwin" ;;
+ *-*-cygwin) OUT="Cygwin-${MACHINE}" ;;
t3e-cray-unicosmk) OUT="cray-t3e" ;;
j90-cray-unicos) OUT="cray-j90" ;;
nsr-tandem-nsk) OUT="tandem-c89" ;;
diff --git a/deps/openssl/openssl/crypto/asn1/tasn_dec.c b/deps/openssl/openssl/crypto/asn1/tasn_dec.c
index 9256049d15..5a507967c8 100644
--- a/deps/openssl/openssl/crypto/asn1/tasn_dec.c
+++ b/deps/openssl/openssl/crypto/asn1/tasn_dec.c
@@ -717,7 +717,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
long plen;
char cst, inf, free_cont = 0;
const unsigned char *p;
- BUF_MEM buf;
+ BUF_MEM buf = { 0, NULL, 0 };
const unsigned char *cont = NULL;
long len;
if (!pval) {
@@ -793,7 +793,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
} else {
len = p - cont + plen;
p += plen;
- buf.data = NULL;
}
} else if (cst) {
if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN
@@ -802,9 +801,9 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE);
return 0;
}
- buf.length = 0;
- buf.max = 0;
- buf.data = NULL;
+
+ /* Free any returned 'buf' content */
+ free_cont = 1;
/*
* Should really check the internal tags are correct but some things
* may get this wrong. The relevant specs say that constructed string
@@ -812,18 +811,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
* So instead just check for UNIVERSAL class and ignore the tag.
*/
if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) {
- free_cont = 1;
goto err;
}
len = buf.length;
/* Append a final null to string */
if (!BUF_MEM_grow_clean(&buf, len + 1)) {
ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE);
- return 0;
+ goto err;
}
buf.data[len] = 0;
cont = (const unsigned char *)buf.data;
- free_cont = 1;
} else {
cont = p;
len = plen;
@@ -831,6 +828,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
}
/* We now have content length and type: translate into a structure */
+ /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */
if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it))
goto err;
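The asn1_d2i_ex_primitive changes above replace the scattered clearing of the local BUF_MEM with a single zero initializer, set free_cont before asn1_collect so partial allocations are released, and turn the early return on BUF_MEM_grow_clean failure into a goto err. A small C sketch of that shape, assuming a hypothetical helper that only copies bytes; it is not the tasn_dec.c logic itself, just the same init-once, single-error-path pattern.

    #include <string.h>
    #include <openssl/buffer.h>
    #include <openssl/crypto.h>

    static int copy_with_cleanup(const unsigned char *in, size_t len)
    {
        BUF_MEM buf = { 0, NULL, 0 };   /* length, data, max: safe to free from err */
        int ok = 0;

        if (!BUF_MEM_grow_clean(&buf, len + 1))
            goto err;                   /* no early return, so nothing leaks */
        memcpy(buf.data, in, len);
        buf.data[len] = 0;
        ok = 1;
     err:
        OPENSSL_free(buf.data);         /* NULL or allocated, both are handled */
        return ok;
    }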
diff --git a/deps/openssl/openssl/crypto/bio/b_print.c b/deps/openssl/openssl/crypto/bio/b_print.c
index 7c81e25d48..90248fa2aa 100644
--- a/deps/openssl/openssl/crypto/bio/b_print.c
+++ b/deps/openssl/openssl/crypto/bio/b_print.c
@@ -125,16 +125,16 @@
# define LLONG long
#endif
-static void fmtstr(char **, char **, size_t *, size_t *,
- const char *, int, int, int);
-static void fmtint(char **, char **, size_t *, size_t *,
- LLONG, int, int, int, int);
-static void fmtfp(char **, char **, size_t *, size_t *,
- LDOUBLE, int, int, int);
-static void doapr_outch(char **, char **, size_t *, size_t *, int);
-static void _dopr(char **sbuffer, char **buffer,
- size_t *maxlen, size_t *retlen, int *truncated,
- const char *format, va_list args);
+static int fmtstr(char **, char **, size_t *, size_t *,
+ const char *, int, int, int);
+static int fmtint(char **, char **, size_t *, size_t *,
+ LLONG, int, int, int, int);
+static int fmtfp(char **, char **, size_t *, size_t *,
+ LDOUBLE, int, int, int);
+static int doapr_outch(char **, char **, size_t *, size_t *, int);
+static int _dopr(char **sbuffer, char **buffer,
+ size_t *maxlen, size_t *retlen, int *truncated,
+ const char *format, va_list args);
/* format read states */
#define DP_S_DEFAULT 0
@@ -165,7 +165,7 @@ static void _dopr(char **sbuffer, char **buffer,
#define char_to_int(p) (p - '0')
#define OSSL_MAX(p,q) ((p >= q) ? p : q)
-static void
+static int
_dopr(char **sbuffer,
char **buffer,
size_t *maxlen,
@@ -196,7 +196,8 @@ _dopr(char **sbuffer,
if (ch == '%')
state = DP_S_FLAGS;
else
- doapr_outch(sbuffer, buffer, &currlen, maxlen, ch);
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+ return 0;
ch = *format++;
break;
case DP_S_FLAGS:
@@ -302,8 +303,9 @@ _dopr(char **sbuffer,
value = va_arg(args, int);
break;
}
- fmtint(sbuffer, buffer, &currlen, maxlen,
- value, 10, min, max, flags);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, 10, min,
+ max, flags))
+ return 0;
break;
case 'X':
flags |= DP_F_UP;
@@ -326,17 +328,19 @@ _dopr(char **sbuffer,
value = (LLONG) va_arg(args, unsigned int);
break;
}
- fmtint(sbuffer, buffer, &currlen, maxlen, value,
- ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
- min, max, flags);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen, value,
+ ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
+ min, max, flags))
+ return 0;
break;
case 'f':
if (cflags == DP_C_LDOUBLE)
fvalue = va_arg(args, LDOUBLE);
else
fvalue = va_arg(args, double);
- fmtfp(sbuffer, buffer, &currlen, maxlen,
- fvalue, min, max, flags);
+ if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max,
+ flags))
+ return 0;
break;
case 'E':
flags |= DP_F_UP;
@@ -355,8 +359,9 @@ _dopr(char **sbuffer,
fvalue = va_arg(args, double);
break;
case 'c':
- doapr_outch(sbuffer, buffer, &currlen, maxlen,
- va_arg(args, int));
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen,
+ va_arg(args, int)))
+ return 0;
break;
case 's':
strvalue = va_arg(args, char *);
@@ -366,13 +371,15 @@ _dopr(char **sbuffer,
else
max = *maxlen;
}
- fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
- flags, min, max);
+ if (!fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
+ flags, min, max))
+ return 0;
break;
case 'p':
value = (long)va_arg(args, void *);
- fmtint(sbuffer, buffer, &currlen, maxlen,
- value, 16, min, max, flags | DP_F_NUM);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen,
+ value, 16, min, max, flags | DP_F_NUM))
+ return 0;
break;
case 'n': /* XXX */
if (cflags == DP_C_SHORT) {
@@ -394,7 +401,8 @@ _dopr(char **sbuffer,
}
break;
case '%':
- doapr_outch(sbuffer, buffer, &currlen, maxlen, ch);
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+ return 0;
break;
case 'w':
/* not supported yet, treat as next char */
@@ -418,46 +426,56 @@ _dopr(char **sbuffer,
*truncated = (currlen > *maxlen - 1);
if (*truncated)
currlen = *maxlen - 1;
- doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0');
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0'))
+ return 0;
*retlen = currlen - 1;
- return;
+ return 1;
}
-static void
+static int
fmtstr(char **sbuffer,
char **buffer,
size_t *currlen,
size_t *maxlen, const char *value, int flags, int min, int max)
{
- int padlen, strln;
+ int padlen;
+ size_t strln;
int cnt = 0;
if (value == 0)
value = "<NULL>";
- for (strln = 0; value[strln]; ++strln) ;
+
+ strln = strlen(value);
+ if (strln > INT_MAX)
+ strln = INT_MAX;
+
padlen = min - strln;
- if (padlen < 0)
+ if (min < 0 || padlen < 0)
padlen = 0;
if (flags & DP_F_MINUS)
padlen = -padlen;
while ((padlen > 0) && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--padlen;
++cnt;
}
while (*value && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, *value++);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *value++))
+ return 0;
++cnt;
}
while ((padlen < 0) && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++padlen;
++cnt;
}
+ return 1;
}
-static void
+static int
fmtint(char **sbuffer,
char **buffer,
size_t *currlen,
@@ -517,37 +535,44 @@ fmtint(char **sbuffer,
/* spaces */
while (spadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--spadlen;
}
/* sign */
if (signvalue)
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
/* prefix */
while (*prefix) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix))
+ return 0;
prefix++;
}
/* zeros */
if (zpadlen > 0) {
while (zpadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--zpadlen;
}
}
/* digits */
- while (place > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]);
+ while (place > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]))
+ return 0;
+ }
/* left justified spaces */
while (spadlen < 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++spadlen;
}
- return;
+ return 1;
}
static LDOUBLE abs_val(LDOUBLE value)
@@ -578,7 +603,7 @@ static long roundv(LDOUBLE value)
return intpart;
}
-static void
+static int
fmtfp(char **sbuffer,
char **buffer,
size_t *currlen,
@@ -657,47 +682,61 @@ fmtfp(char **sbuffer,
if ((flags & DP_F_ZERO) && (padlen > 0)) {
if (signvalue) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
--padlen;
signvalue = 0;
}
while (padlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--padlen;
}
}
while (padlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--padlen;
}
- if (signvalue)
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if (signvalue && !doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
- while (iplace > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]);
+ while (iplace > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]))
+ return 0;
+ }
/*
* Decimal point. This should probably use locale to find the correct
* char to print out.
*/
if (max > 0 || (flags & DP_F_NUM)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '.');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '.'))
+ return 0;
- while (fplace > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, fconvert[--fplace]);
+ while (fplace > 0) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen,
+ fconvert[--fplace]))
+ return 0;
+ }
}
while (zpadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--zpadlen;
}
while (padlen < 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++padlen;
}
+ return 1;
}
-static void
+#define BUFFER_INC 1024
+
+static int
doapr_outch(char **sbuffer,
char **buffer, size_t *currlen, size_t *maxlen, int c)
{
@@ -708,24 +747,25 @@ doapr_outch(char **sbuffer,
assert(*currlen <= *maxlen);
if (buffer && *currlen == *maxlen) {
- *maxlen += 1024;
+ if (*maxlen > INT_MAX - BUFFER_INC)
+ return 0;
+
+ *maxlen += BUFFER_INC;
if (*buffer == NULL) {
*buffer = OPENSSL_malloc(*maxlen);
- if (!*buffer) {
- /* Panic! Can't really do anything sensible. Just return */
- return;
- }
+ if (*buffer == NULL)
+ return 0;
if (*currlen > 0) {
assert(*sbuffer != NULL);
memcpy(*buffer, *sbuffer, *currlen);
}
*sbuffer = NULL;
} else {
- *buffer = OPENSSL_realloc(*buffer, *maxlen);
- if (!*buffer) {
- /* Panic! Can't really do anything sensible. Just return */
- return;
- }
+ char *tmpbuf;
+ tmpbuf = OPENSSL_realloc(*buffer, *maxlen);
+ if (tmpbuf == NULL)
+ return 0;
+ *buffer = tmpbuf;
}
}
@@ -736,7 +776,7 @@ doapr_outch(char **sbuffer,
(*buffer)[(*currlen)++] = (char)c;
}
- return;
+ return 1;
}
/***************************************************************************/
@@ -768,7 +808,11 @@ int BIO_vprintf(BIO *bio, const char *format, va_list args)
dynbuf = NULL;
CRYPTO_push_info("doapr()");
- _dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format, args);
+ if (!_dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format,
+ args)) {
+ OPENSSL_free(dynbuf);
+ return -1;
+ }
if (dynbuf) {
ret = BIO_write(bio, dynbuf, (int)retlen);
OPENSSL_free(dynbuf);
@@ -803,7 +847,8 @@ int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args)
size_t retlen;
int truncated;
- _dopr(&buf, NULL, &n, &retlen, &truncated, format, args);
+ if(!_dopr(&buf, NULL, &n, &retlen, &truncated, format, args))
+ return -1;
if (truncated)
/*
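The b_print.c rewrite turns the formatting helpers into int-returning functions so allocation failures propagate out of BIO_printf instead of being swallowed, and doapr_outch now bounds growth at INT_MAX and reallocates through a temporary. A short, self-contained sketch of that growth idiom; grow_buffer, plain realloc and the fixed BUFFER_INC stand in for the OPENSSL_realloc-based code in the patch.

    #include <limits.h>
    #include <stdlib.h>

    #define BUFFER_INC 1024

    /* Grow *buf by BUFFER_INC bytes; return 1 on success, 0 on failure.
     * On failure *buf and *maxlen are left untouched, so the caller can
     * still free the original allocation. */
    static int grow_buffer(char **buf, size_t *maxlen)
    {
        char *tmp;

        if (*maxlen > INT_MAX - BUFFER_INC)
            return 0;                          /* refuse to overflow the bound */
        tmp = realloc(*buf, *maxlen + BUFFER_INC);
        if (tmp == NULL)
            return 0;                          /* original pointer is not lost */
        *buf = tmp;
        *maxlen += BUFFER_INC;
        return 1;
    }

Callers check the return value and unwind, which is what the new if (!doapr_outch(...)) return 0 chains above do all the way up to BIO_vprintf, where the dynamic buffer is freed and -1 is returned.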
diff --git a/deps/openssl/openssl/crypto/bio/bio.h b/deps/openssl/openssl/crypto/bio/bio.h
index 6e2293bc66..6790aed28e 100644
--- a/deps/openssl/openssl/crypto/bio/bio.h
+++ b/deps/openssl/openssl/crypto/bio/bio.h
@@ -479,7 +479,7 @@ struct bio_dgram_sctp_prinfo {
# define BIO_get_conn_hostname(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,0)
# define BIO_get_conn_port(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1)
# define BIO_get_conn_ip(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2)
-# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,0,NULL)
+# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL)
# define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL)
@@ -689,7 +689,7 @@ long BIO_debug_callback(BIO *bio, int cmd, const char *argp, int argi,
long argl, long ret);
BIO_METHOD *BIO_s_mem(void);
-BIO *BIO_new_mem_buf(void *buf, int len);
+BIO *BIO_new_mem_buf(const void *buf, int len);
BIO_METHOD *BIO_s_socket(void);
BIO_METHOD *BIO_s_connect(void);
BIO_METHOD *BIO_s_accept(void);
diff --git a/deps/openssl/openssl/crypto/bio/bss_mem.c b/deps/openssl/openssl/crypto/bio/bss_mem.c
index d190765dc2..b0394a960d 100644
--- a/deps/openssl/openssl/crypto/bio/bss_mem.c
+++ b/deps/openssl/openssl/crypto/bio/bss_mem.c
@@ -91,7 +91,8 @@ BIO_METHOD *BIO_s_mem(void)
return (&mem_method);
}
-BIO *BIO_new_mem_buf(void *buf, int len)
+
+BIO *BIO_new_mem_buf(const void *buf, int len)
{
BIO *ret;
BUF_MEM *b;
@@ -105,7 +106,8 @@ BIO *BIO_new_mem_buf(void *buf, int len)
if (!(ret = BIO_new(BIO_s_mem())))
return NULL;
b = (BUF_MEM *)ret->ptr;
- b->data = buf;
+ /* Cast away const and trust in the MEM_RDONLY flag. */
+ b->data = (void *)buf;
b->length = sz;
b->max = sz;
ret->flags |= BIO_FLAGS_MEM_RDONLY;
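With the const-qualified prototype above, read-only data can be handed to BIO_new_mem_buf without a cast; the BIO is marked BIO_FLAGS_MEM_RDONLY and never writes through or frees the caller's buffer. A small usage sketch (read_from_literal is illustrative, the BIO calls are the real API):

    #include <openssl/bio.h>

    static int read_from_literal(void)
    {
        static const char msg[] = "hello, memory BIO";
        char out[32];
        int n;
        BIO *mem = BIO_new_mem_buf(msg, -1);   /* -1: take strlen(msg) */

        if (mem == NULL)
            return 0;
        n = BIO_read(mem, out, (int)sizeof(out) - 1);
        BIO_free(mem);
        return n > 0;
    }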
diff --git a/deps/openssl/openssl/crypto/bn/Makefile b/deps/openssl/openssl/crypto/bn/Makefile
index 215855ecae..c4c6409517 100644
--- a/deps/openssl/openssl/crypto/bn/Makefile
+++ b/deps/openssl/openssl/crypto/bn/Makefile
@@ -252,8 +252,8 @@ bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
-bn_exp.o: rsaz_exp.h
+bn_exp.o: ../../include/openssl/symhacks.h ../constant_time_locl.h
+bn_exp.o: ../cryptlib.h bn_exp.c bn_lcl.h rsaz_exp.h
bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
diff --git a/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl b/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl
index 3b6ccf83d1..712a77fe8c 100755
--- a/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl
@@ -443,7 +443,7 @@ $TEMP2 = $B2;
$TEMP3 = $Y1;
$TEMP4 = $Y2;
$code.=<<___;
- #we need to fix indexes 32-39 to avoid overflow
+ # we need to fix indices 32-39 to avoid overflow
vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
@@ -1592,68 +1592,128 @@ rsaz_1024_scatter5_avx2:
.type rsaz_1024_gather5_avx2,\@abi-omnipotent
.align 32
rsaz_1024_gather5_avx2:
+ vzeroupper
+ mov %rsp,%r11
___
$code.=<<___ if ($win64);
lea -0x88(%rsp),%rax
- vzeroupper
.LSEH_begin_rsaz_1024_gather5:
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
- .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
- .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
- .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
- .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
- .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
- .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
- .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
- .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
- .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
- .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
+ .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
___
$code.=<<___;
- lea .Lgather_table(%rip),%r11
- mov $power,%eax
- and \$3,$power
- shr \$2,%eax # cache line number
- shl \$4,$power # offset within cache line
-
- vmovdqu -32(%r11),%ymm7 # .Lgather_permd
- vpbroadcastb 8(%r11,%rax), %xmm8
- vpbroadcastb 7(%r11,%rax), %xmm9
- vpbroadcastb 6(%r11,%rax), %xmm10
- vpbroadcastb 5(%r11,%rax), %xmm11
- vpbroadcastb 4(%r11,%rax), %xmm12
- vpbroadcastb 3(%r11,%rax), %xmm13
- vpbroadcastb 2(%r11,%rax), %xmm14
- vpbroadcastb 1(%r11,%rax), %xmm15
-
- lea 64($inp,$power),$inp
- mov \$64,%r11 # size optimization
- mov \$9,%eax
- jmp .Loop_gather_1024
+ lea -0x100(%rsp),%rsp
+ and \$-32, %rsp
+ lea .Linc(%rip), %r10
+ lea -128(%rsp),%rax # control u-op density
+
+ vmovd $power, %xmm4
+ vmovdqa (%r10),%ymm0
+ vmovdqa 32(%r10),%ymm1
+ vmovdqa 64(%r10),%ymm5
+ vpbroadcastd %xmm4,%ymm4
+
+ vpaddd %ymm5, %ymm0, %ymm2
+ vpcmpeqd %ymm4, %ymm0, %ymm0
+ vpaddd %ymm5, %ymm1, %ymm3
+ vpcmpeqd %ymm4, %ymm1, %ymm1
+ vmovdqa %ymm0, 32*0+128(%rax)
+ vpaddd %ymm5, %ymm2, %ymm0
+ vpcmpeqd %ymm4, %ymm2, %ymm2
+ vmovdqa %ymm1, 32*1+128(%rax)
+ vpaddd %ymm5, %ymm3, %ymm1
+ vpcmpeqd %ymm4, %ymm3, %ymm3
+ vmovdqa %ymm2, 32*2+128(%rax)
+ vpaddd %ymm5, %ymm0, %ymm2
+ vpcmpeqd %ymm4, %ymm0, %ymm0
+ vmovdqa %ymm3, 32*3+128(%rax)
+ vpaddd %ymm5, %ymm1, %ymm3
+ vpcmpeqd %ymm4, %ymm1, %ymm1
+ vmovdqa %ymm0, 32*4+128(%rax)
+ vpaddd %ymm5, %ymm2, %ymm8
+ vpcmpeqd %ymm4, %ymm2, %ymm2
+ vmovdqa %ymm1, 32*5+128(%rax)
+ vpaddd %ymm5, %ymm3, %ymm9
+ vpcmpeqd %ymm4, %ymm3, %ymm3
+ vmovdqa %ymm2, 32*6+128(%rax)
+ vpaddd %ymm5, %ymm8, %ymm10
+ vpcmpeqd %ymm4, %ymm8, %ymm8
+ vmovdqa %ymm3, 32*7+128(%rax)
+ vpaddd %ymm5, %ymm9, %ymm11
+ vpcmpeqd %ymm4, %ymm9, %ymm9
+ vpaddd %ymm5, %ymm10, %ymm12
+ vpcmpeqd %ymm4, %ymm10, %ymm10
+ vpaddd %ymm5, %ymm11, %ymm13
+ vpcmpeqd %ymm4, %ymm11, %ymm11
+ vpaddd %ymm5, %ymm12, %ymm14
+ vpcmpeqd %ymm4, %ymm12, %ymm12
+ vpaddd %ymm5, %ymm13, %ymm15
+ vpcmpeqd %ymm4, %ymm13, %ymm13
+ vpcmpeqd %ymm4, %ymm14, %ymm14
+ vpcmpeqd %ymm4, %ymm15, %ymm15
+
+ vmovdqa -32(%r10),%ymm7 # .Lgather_permd
+ lea 128($inp), $inp
+ mov \$9,$power
-.align 32
.Loop_gather_1024:
- vpand -64($inp), %xmm8,%xmm0
- vpand ($inp), %xmm9,%xmm1
- vpand 64($inp), %xmm10,%xmm2
- vpand ($inp,%r11,2), %xmm11,%xmm3
- vpor %xmm0,%xmm1,%xmm1
- vpand 64($inp,%r11,2), %xmm12,%xmm4
- vpor %xmm2,%xmm3,%xmm3
- vpand ($inp,%r11,4), %xmm13,%xmm5
- vpor %xmm1,%xmm3,%xmm3
- vpand 64($inp,%r11,4), %xmm14,%xmm6
- vpor %xmm4,%xmm5,%xmm5
- vpand -128($inp,%r11,8), %xmm15,%xmm2
- lea ($inp,%r11,8),$inp
- vpor %xmm3,%xmm5,%xmm5
- vpor %xmm2,%xmm6,%xmm6
- vpor %xmm5,%xmm6,%xmm6
- vpermd %ymm6,%ymm7,%ymm6
- vmovdqu %ymm6,($out)
+ vmovdqa 32*0-128($inp), %ymm0
+ vmovdqa 32*1-128($inp), %ymm1
+ vmovdqa 32*2-128($inp), %ymm2
+ vmovdqa 32*3-128($inp), %ymm3
+ vpand 32*0+128(%rax), %ymm0, %ymm0
+ vpand 32*1+128(%rax), %ymm1, %ymm1
+ vpand 32*2+128(%rax), %ymm2, %ymm2
+ vpor %ymm0, %ymm1, %ymm4
+ vpand 32*3+128(%rax), %ymm3, %ymm3
+ vmovdqa 32*4-128($inp), %ymm0
+ vmovdqa 32*5-128($inp), %ymm1
+ vpor %ymm2, %ymm3, %ymm5
+ vmovdqa 32*6-128($inp), %ymm2
+ vmovdqa 32*7-128($inp), %ymm3
+ vpand 32*4+128(%rax), %ymm0, %ymm0
+ vpand 32*5+128(%rax), %ymm1, %ymm1
+ vpand 32*6+128(%rax), %ymm2, %ymm2
+ vpor %ymm0, %ymm4, %ymm4
+ vpand 32*7+128(%rax), %ymm3, %ymm3
+ vpand 32*8-128($inp), %ymm8, %ymm0
+ vpor %ymm1, %ymm5, %ymm5
+ vpand 32*9-128($inp), %ymm9, %ymm1
+ vpor %ymm2, %ymm4, %ymm4
+ vpand 32*10-128($inp),%ymm10, %ymm2
+ vpor %ymm3, %ymm5, %ymm5
+ vpand 32*11-128($inp),%ymm11, %ymm3
+ vpor %ymm0, %ymm4, %ymm4
+ vpand 32*12-128($inp),%ymm12, %ymm0
+ vpor %ymm1, %ymm5, %ymm5
+ vpand 32*13-128($inp),%ymm13, %ymm1
+ vpor %ymm2, %ymm4, %ymm4
+ vpand 32*14-128($inp),%ymm14, %ymm2
+ vpor %ymm3, %ymm5, %ymm5
+ vpand 32*15-128($inp),%ymm15, %ymm3
+ lea 32*16($inp), $inp
+ vpor %ymm0, %ymm4, %ymm4
+ vpor %ymm1, %ymm5, %ymm5
+ vpor %ymm2, %ymm4, %ymm4
+ vpor %ymm3, %ymm5, %ymm5
+
+ vpor %ymm5, %ymm4, %ymm4
+ vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
+ vpor %xmm4, %xmm5, %xmm5
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqu %ymm5,($out)
lea 32($out),$out
- dec %eax
+ dec $power
jnz .Loop_gather_1024
vpxor %ymm0,%ymm0,%ymm0
@@ -1661,20 +1721,20 @@ $code.=<<___;
vzeroupper
___
$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- movaps 0x40(%rsp),%xmm10
- movaps 0x50(%rsp),%xmm11
- movaps 0x60(%rsp),%xmm12
- movaps 0x70(%rsp),%xmm13
- movaps 0x80(%rsp),%xmm14
- movaps 0x90(%rsp),%xmm15
- lea 0xa8(%rsp),%rsp
+ movaps -0xa8(%r11),%xmm6
+ movaps -0x98(%r11),%xmm7
+ movaps -0x88(%r11),%xmm8
+ movaps -0x78(%r11),%xmm9
+ movaps -0x68(%r11),%xmm10
+ movaps -0x58(%r11),%xmm11
+ movaps -0x48(%r11),%xmm12
+ movaps -0x38(%r11),%xmm13
+ movaps -0x28(%r11),%xmm14
+ movaps -0x18(%r11),%xmm15
.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
+ lea (%r11),%rsp
ret
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
@@ -1708,8 +1768,10 @@ $code.=<<___;
.long 0,2,4,6,7,7,7,7
.Lgather_permd:
.long 0,7,1,7,2,7,3,7
-.Lgather_table:
- .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
+.Linc:
+ .long 0,0,0,0, 1,1,1,1
+ .long 2,2,2,2, 3,3,3,3
+ .long 4,4,4,4, 4,4,4,4
.align 64
___
@@ -1837,18 +1899,19 @@ rsaz_se_handler:
.rva rsaz_se_handler
.rva .Lmul_1024_body,.Lmul_1024_epilogue
.LSEH_info_rsaz_1024_gather5:
- .byte 0x01,0x33,0x16,0x00
- .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
- .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
- .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
- .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
- .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
- .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
- .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
- .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
- .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
- .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
+ .byte 0x01,0x36,0x17,0x0b
+ .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
+ .byte 0x00,0xb3,0x00,0x00 # set_frame r11
___
}
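The rewritten rsaz_1024_gather5_avx2 above no longer indexes a cache line by the secret power; it precomputes equality masks for indices 0..15, then walks the whole table, ANDing each entry with its mask and ORing the results, so the load pattern is the same for every index. A scalar C model of that select-by-mask gather (the names, widths and 16-limb row shape are simplifications, not the AVX2 data layout):

    #include <stdint.h>
    #include <stddef.h>

    /* Copy table[index] into out without an index-dependent memory access. */
    static void gather_const_time(uint64_t out[16],
                                  const uint64_t table[][16],
                                  size_t nentries, size_t index)
    {
        size_t i, j;

        for (j = 0; j < 16; j++)
            out[j] = 0;
        for (i = 0; i < nentries; i++) {
            /* all-ones when i == index, all-zeroes otherwise */
            uint64_t mask = (uint64_t)0 - (uint64_t)(i == index);
            for (j = 0; j < 16; j++)
                out[j] |= table[i][j] & mask;   /* every entry is touched */
        }
    }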
diff --git a/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl b/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl
index 091cdc2069..87ce2c34d9 100755
--- a/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl
@@ -915,9 +915,76 @@ rsaz_512_mul_gather4:
push %r14
push %r15
- mov $pwr, $pwr
- subq \$128+24, %rsp
+ subq \$`128+24+($win64?0xb0:0)`, %rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0xa0(%rsp)
+ movaps %xmm7,0xb0(%rsp)
+ movaps %xmm8,0xc0(%rsp)
+ movaps %xmm9,0xd0(%rsp)
+ movaps %xmm10,0xe0(%rsp)
+ movaps %xmm11,0xf0(%rsp)
+ movaps %xmm12,0x100(%rsp)
+ movaps %xmm13,0x110(%rsp)
+ movaps %xmm14,0x120(%rsp)
+ movaps %xmm15,0x130(%rsp)
+___
+$code.=<<___;
.Lmul_gather4_body:
+ movd $pwr,%xmm8
+ movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
+ movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
+
+ pshufd \$0,%xmm8,%xmm8 # broadcast $power
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+ movdqa %xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 16*0($bp),%xmm8
+ movdqa 16*1($bp),%xmm9
+ movdqa 16*2($bp),%xmm10
+ movdqa 16*3($bp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4($bp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5($bp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6($bp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7($bp),%xmm15
+ leaq 128($bp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
@@ -926,45 +993,38 @@ $code.=<<___ if ($addx);
je .Lmulx_gather
___
$code.=<<___;
- movl 64($bp,$pwr,4), %eax
- movq $out, %xmm0 # off-load arguments
- movl ($bp,$pwr,4), %ebx
- movq $mod, %xmm1
- movq $n0, 128(%rsp)
+ movq %xmm8,%rbx
+
+ movq $n0, 128(%rsp) # off-load arguments
+ movq $out, 128+8(%rsp)
+ movq $mod, 128+16(%rsp)
- shlq \$32, %rax
- or %rax, %rbx
movq ($ap), %rax
movq 8($ap), %rcx
- leaq 128($bp,$pwr,4), %rbp
mulq %rbx # 0 iteration
movq %rax, (%rsp)
movq %rcx, %rax
movq %rdx, %r8
mulq %rbx
- movd (%rbp), %xmm4
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
- movd 64(%rbp), %xmm5
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
- pslldq \$4, %xmm5
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
- por %xmm5, %xmm4
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
@@ -977,14 +1037,12 @@ $code.=<<___;
adcq \$0, %r13
mulq %rbx
- leaq 128(%rbp), %rbp
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
- movq %xmm4, %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
@@ -996,6 +1054,35 @@ $code.=<<___;
.align 32
.Loop_mul_gather:
+ movdqa 16*0(%rbp),%xmm8
+ movdqa 16*1(%rbp),%xmm9
+ movdqa 16*2(%rbp),%xmm10
+ movdqa 16*3(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7(%rbp),%xmm15
+ leaq 128(%rbp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,%rbx
+
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
@@ -1004,7 +1091,6 @@ $code.=<<___;
adcq \$0, %r8
mulq %rbx
- movd (%rbp), %xmm4
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
@@ -1013,7 +1099,6 @@ $code.=<<___;
adcq \$0, %r9
mulq %rbx
- movd 64(%rbp), %xmm5
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
@@ -1022,7 +1107,6 @@ $code.=<<___;
adcq \$0, %r10
mulq %rbx
- pslldq \$4, %xmm5
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
@@ -1031,7 +1115,6 @@ $code.=<<___;
adcq \$0, %r11
mulq %rbx
- por %xmm5, %xmm4
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
@@ -1056,7 +1139,6 @@ $code.=<<___;
adcq \$0, %r14
mulq %rbx
- movq %xmm4, %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
@@ -1064,7 +1146,6 @@ $code.=<<___;
movq %rdx, %r15
adcq \$0, %r15
- leaq 128(%rbp), %rbp
leaq 8(%rdi), %rdi
decl %ecx
@@ -1079,8 +1160,8 @@ $code.=<<___;
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
- movq %xmm0, $out
- movq %xmm1, %rbp
+ movq 128+8(%rsp), $out
+ movq 128+16(%rsp), %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
@@ -1098,45 +1179,37 @@ $code.=<<___ if ($addx);
.align 32
.Lmulx_gather:
- mov 64($bp,$pwr,4), %eax
- movq $out, %xmm0 # off-load arguments
- lea 128($bp,$pwr,4), %rbp
- mov ($bp,$pwr,4), %edx
- movq $mod, %xmm1
- mov $n0, 128(%rsp)
+ movq %xmm8,%rdx
+
+ mov $n0, 128(%rsp) # off-load arguments
+ mov $out, 128+8(%rsp)
+ mov $mod, 128+16(%rsp)
- shl \$32, %rax
- or %rax, %rdx
mulx ($ap), %rbx, %r8 # 0 iteration
mov %rbx, (%rsp)
xor %edi, %edi # cf=0, of=0
mulx 8($ap), %rax, %r9
- movd (%rbp), %xmm4
mulx 16($ap), %rbx, %r10
- movd 64(%rbp), %xmm5
adcx %rax, %r8
mulx 24($ap), %rax, %r11
- pslldq \$4, %xmm5
adcx %rbx, %r9
mulx 32($ap), %rbx, %r12
- por %xmm5, %xmm4
adcx %rax, %r10
mulx 40($ap), %rax, %r13
adcx %rbx, %r11
mulx 48($ap), %rbx, %r14
- lea 128(%rbp), %rbp
adcx %rax, %r12
mulx 56($ap), %rax, %r15
- movq %xmm4, %rdx
adcx %rbx, %r13
adcx %rax, %r14
+ .byte 0x67
mov %r8, %rbx
adcx %rdi, %r15 # %rdi is 0
@@ -1145,24 +1218,48 @@ $code.=<<___ if ($addx);
.align 32
.Loop_mulx_gather:
- mulx ($ap), %rax, %r8
+ movdqa 16*0(%rbp),%xmm8
+ movdqa 16*1(%rbp),%xmm9
+ movdqa 16*2(%rbp),%xmm10
+ movdqa 16*3(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7(%rbp),%xmm15
+ leaq 128(%rbp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,%rdx
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
adcx %rax, %rbx
adox %r9, %r8
mulx 8($ap), %rax, %r9
- .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
adcx %rax, %r8
adox %r10, %r9
mulx 16($ap), %rax, %r10
- movd 64(%rbp), %xmm5
- lea 128(%rbp), %rbp
adcx %rax, %r9
adox %r11, %r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
- pslldq \$4, %xmm5
- por %xmm5, %xmm4
adcx %rax, %r10
adox %r12, %r11
@@ -1176,10 +1273,10 @@ $code.=<<___ if ($addx);
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
adcx %rax, %r13
+ .byte 0x67
adox %r15, %r14
mulx 56($ap), %rax, %r15
- movq %xmm4, %rdx
mov %rbx, 64(%rsp,%rcx,8)
adcx %rax, %r14
adox %rdi, %r15
@@ -1198,10 +1295,10 @@ $code.=<<___ if ($addx);
mov %r14, 64+48(%rsp)
mov %r15, 64+56(%rsp)
- movq %xmm0, $out
- movq %xmm1, %rbp
+ mov 128(%rsp), %rdx # pull arguments
+ mov 128+8(%rsp), $out
+ mov 128+16(%rsp), %rbp
- mov 128(%rsp), %rdx # pull $n0
mov (%rsp), %r8
mov 8(%rsp), %r9
mov 16(%rsp), %r10
@@ -1229,6 +1326,21 @@ $code.=<<___;
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
+___
+$code.=<<___ if ($win64);
+ movaps 0xa0-0xc8(%rax),%xmm6
+ movaps 0xb0-0xc8(%rax),%xmm7
+ movaps 0xc0-0xc8(%rax),%xmm8
+ movaps 0xd0-0xc8(%rax),%xmm9
+ movaps 0xe0-0xc8(%rax),%xmm10
+ movaps 0xf0-0xc8(%rax),%xmm11
+ movaps 0x100-0xc8(%rax),%xmm12
+ movaps 0x110-0xc8(%rax),%xmm13
+ movaps 0x120-0xc8(%rax),%xmm14
+ movaps 0x130-0xc8(%rax),%xmm15
+ lea 0xb0(%rax),%rax
+___
+$code.=<<___;
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
@@ -1258,7 +1370,7 @@ rsaz_512_mul_scatter4:
mov $pwr, $pwr
subq \$128+24, %rsp
.Lmul_scatter4_body:
- leaq ($tbl,$pwr,4), $tbl
+ leaq ($tbl,$pwr,8), $tbl
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $tbl, %xmm2
@@ -1329,30 +1441,14 @@ $code.=<<___;
call __rsaz_512_subtract
- movl %r8d, 64*0($inp) # scatter
- shrq \$32, %r8
- movl %r9d, 64*2($inp)
- shrq \$32, %r9
- movl %r10d, 64*4($inp)
- shrq \$32, %r10
- movl %r11d, 64*6($inp)
- shrq \$32, %r11
- movl %r12d, 64*8($inp)
- shrq \$32, %r12
- movl %r13d, 64*10($inp)
- shrq \$32, %r13
- movl %r14d, 64*12($inp)
- shrq \$32, %r14
- movl %r15d, 64*14($inp)
- shrq \$32, %r15
- movl %r8d, 64*1($inp)
- movl %r9d, 64*3($inp)
- movl %r10d, 64*5($inp)
- movl %r11d, 64*7($inp)
- movl %r12d, 64*9($inp)
- movl %r13d, 64*11($inp)
- movl %r14d, 64*13($inp)
- movl %r15d, 64*15($inp)
+ movq %r8, 128*0($inp) # scatter
+ movq %r9, 128*1($inp)
+ movq %r10, 128*2($inp)
+ movq %r11, 128*3($inp)
+ movq %r12, 128*4($inp)
+ movq %r13, 128*5($inp)
+ movq %r14, 128*6($inp)
+ movq %r15, 128*7($inp)
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
@@ -1956,16 +2052,14 @@ $code.=<<___;
.type rsaz_512_scatter4,\@abi-omnipotent
.align 16
rsaz_512_scatter4:
- leaq ($out,$power,4), $out
+ leaq ($out,$power,8), $out
movl \$8, %r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq ($inp), %rax
leaq 8($inp), $inp
- movl %eax, ($out)
- shrq \$32, %rax
- movl %eax, 64($out)
+ movq %rax, ($out)
leaq 128($out), $out
decl %r9d
jnz .Loop_scatter
@@ -1976,22 +2070,106 @@ rsaz_512_scatter4:
.type rsaz_512_gather4,\@abi-omnipotent
.align 16
rsaz_512_gather4:
- leaq ($inp,$power,4), $inp
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_rsaz_512_gather4:
+ .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
+ .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
+ .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
+ .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
+ .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
+ .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
+ .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
+ .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
+ .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
+ .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
+___
+$code.=<<___;
+ movd $power,%xmm8
+ movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
+ movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
+
+ pshufd \$0,%xmm8,%xmm8 # broadcast $power
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+ movdqa %xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+ pcmpeqd %xmm8,%xmm7
movl \$8, %r9d
jmp .Loop_gather
.align 16
.Loop_gather:
- movl ($inp), %eax
- movl 64($inp), %r8d
+ movdqa 16*0($inp),%xmm8
+ movdqa 16*1($inp),%xmm9
+ movdqa 16*2($inp),%xmm10
+ movdqa 16*3($inp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4($inp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5($inp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6($inp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7($inp),%xmm15
leaq 128($inp), $inp
- shlq \$32, %r8
- or %r8, %rax
- movq %rax, ($out)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,($out)
leaq 8($out), $out
decl %r9d
jnz .Loop_gather
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ add \$0xa8,%rsp
+___
+$code.=<<___;
ret
+.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
___
}
@@ -2039,6 +2217,18 @@ se_handler:
lea 128+24+48(%rax),%rax
+ lea .Lmul_gather4_epilogue(%rip),%rbx
+ cmp %r10,%rbx
+ jne .Lse_not_in_mul_gather4
+
+ lea 0xb0(%rax),%rax
+
+ lea -48-0xa8(%rax),%rsi
+ lea 512($context),%rdi
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lse_not_in_mul_gather4:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -2090,7 +2280,7 @@ se_handler:
pop %rdi
pop %rsi
ret
-.size sqr_handler,.-sqr_handler
+.size se_handler,.-se_handler
.section .pdata
.align 4
@@ -2114,6 +2304,10 @@ se_handler:
.rva .LSEH_end_rsaz_512_mul_by_one
.rva .LSEH_info_rsaz_512_mul_by_one
+ .rva .LSEH_begin_rsaz_512_gather4
+ .rva .LSEH_end_rsaz_512_gather4
+ .rva .LSEH_info_rsaz_512_gather4
+
.section .xdata
.align 8
.LSEH_info_rsaz_512_sqr:
@@ -2136,6 +2330,19 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_gather4:
+ .byte 0x01,0x46,0x16,0x00
+ .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
___
}
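rsaz_512_scatter4 and rsaz_512_gather4 above switch from storing 32-bit halves at a 64-byte stride to storing whole 64-bit limbs at a 128-byte stride, and the gather now reads all sixteen table entries and selects with pcmpeqd/pand/por masks, matching the AVX2 path. A rough C model of the scatter addressing only, assuming a flat uint64_t table of 16 powers of 8 limbs each (the constants mirror the 128-byte stride, everything else is illustrative):

    #include <stdint.h>

    #define NLIMBS  8     /* 512-bit value */
    #define NPOWERS 16    /* table entries */

    /* Store limb i of power 'power' at byte offset 8*power + 128*i. */
    static void scatter4(uint64_t table[NLIMBS * NPOWERS],
                         const uint64_t val[NLIMBS], int power)
    {
        int i;

        for (i = 0; i < NLIMBS; i++)
            table[i * NPOWERS + power] = val[i];
    }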
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl
index e82e451388..29ba1224e3 100755
--- a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl
@@ -775,100 +775,126 @@ bn_sqr8x_mont:
# 4096. this is done to allow memory disambiguation logic
# do its job.
#
- lea -64(%rsp,$num,4),%r11
+ lea -64(%rsp,$num,2),%r11
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
- lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
- lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
.Lsqr8x_sp_done:
and \$-64,%rsp
- mov $num,%r10
+ mov $num,%r10
neg $num
- lea 64(%rsp,$num,2),%r11 # copy of modulus
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lsqr8x_body:
- mov $num,$i
- movq %r11, %xmm2 # save pointer to modulus copy
- shr \$3+2,$i
- mov OPENSSL_ia32cap_P+8(%rip),%eax
- jmp .Lsqr8x_copy_n
-
-.align 32
-.Lsqr8x_copy_n:
- movq 8*0($nptr),%xmm0
- movq 8*1($nptr),%xmm1
- movq 8*2($nptr),%xmm3
- movq 8*3($nptr),%xmm4
- lea 8*4($nptr),$nptr
- movdqa %xmm0,16*0(%r11)
- movdqa %xmm1,16*1(%r11)
- movdqa %xmm3,16*2(%r11)
- movdqa %xmm4,16*3(%r11)
- lea 16*4(%r11),%r11
- dec $i
- jnz .Lsqr8x_copy_n
-
+ movq $nptr, %xmm2 # save pointer to modulus
pxor %xmm0,%xmm0
movq $rptr,%xmm1 # save $rptr
movq %r10, %xmm3 # -$num
___
$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
and \$0x80100,%eax
cmp \$0x80100,%eax
jne .Lsqr8x_nox
call bn_sqrx8x_internal # see x86_64-mont5 module
-
- pxor %xmm0,%xmm0
- lea 48(%rsp),%rax
- lea 64(%rsp,$num,2),%rdx
- shr \$3+2,$num
- mov 40(%rsp),%rsi # restore %rsp
- jmp .Lsqr8x_zero
+ # %rax top-most carry
+ # %rbp nptr
+ # %rcx -8*num
+ # %r8 end of tp[2*num]
+ lea (%r8,%rcx),%rbx
+ mov %rcx,$num
+ mov %rcx,%rdx
+ movq %xmm1,$rptr
+ sar \$3+2,%rcx # %cf=0
+ jmp .Lsqr8x_sub
.align 32
.Lsqr8x_nox:
___
$code.=<<___;
call bn_sqr8x_internal # see x86_64-mont5 module
+ # %rax top-most carry
+ # %rbp nptr
+ # %r8 -8*num
+ # %rdi end of tp[2*num]
+ lea (%rdi,$num),%rbx
+ mov $num,%rcx
+ mov $num,%rdx
+ movq %xmm1,$rptr
+ sar \$3+2,%rcx # %cf=0
+ jmp .Lsqr8x_sub
+.align 32
+.Lsqr8x_sub:
+ mov 8*0(%rbx),%r12
+ mov 8*1(%rbx),%r13
+ mov 8*2(%rbx),%r14
+ mov 8*3(%rbx),%r15
+ lea 8*4(%rbx),%rbx
+ sbb 8*0(%rbp),%r12
+ sbb 8*1(%rbp),%r13
+ sbb 8*2(%rbp),%r14
+ sbb 8*3(%rbp),%r15
+ lea 8*4(%rbp),%rbp
+ mov %r12,8*0($rptr)
+ mov %r13,8*1($rptr)
+ mov %r14,8*2($rptr)
+ mov %r15,8*3($rptr)
+ lea 8*4($rptr),$rptr
+ inc %rcx # preserves %cf
+ jnz .Lsqr8x_sub
+
+ sbb \$0,%rax # top-most carry
+ lea (%rbx,$num),%rbx # rewind
+ lea ($rptr,$num),$rptr # rewind
+
+ movq %rax,%xmm1
pxor %xmm0,%xmm0
- lea 48(%rsp),%rax
- lea 64(%rsp,$num,2),%rdx
- shr \$3+2,$num
+ pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp
- jmp .Lsqr8x_zero
+ jmp .Lsqr8x_cond_copy
.align 32
-.Lsqr8x_zero:
- movdqa %xmm0,16*0(%rax) # wipe t
- movdqa %xmm0,16*1(%rax)
- movdqa %xmm0,16*2(%rax)
- movdqa %xmm0,16*3(%rax)
- lea 16*4(%rax),%rax
- movdqa %xmm0,16*0(%rdx) # wipe n
- movdqa %xmm0,16*1(%rdx)
- movdqa %xmm0,16*2(%rdx)
- movdqa %xmm0,16*3(%rdx)
- lea 16*4(%rdx),%rdx
- dec $num
- jnz .Lsqr8x_zero
+.Lsqr8x_cond_copy:
+ movdqa 16*0(%rbx),%xmm2
+ movdqa 16*1(%rbx),%xmm3
+ lea 16*2(%rbx),%rbx
+ movdqu 16*0($rptr),%xmm4
+ movdqu 16*1($rptr),%xmm5
+ lea 16*2($rptr),$rptr
+ movdqa %xmm0,-16*2(%rbx) # zero tp
+ movdqa %xmm0,-16*1(%rbx)
+ movdqa %xmm0,-16*2(%rbx,%rdx)
+ movdqa %xmm0,-16*1(%rbx,%rdx)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-16*2($rptr)
+ movdqu %xmm5,-16*1($rptr)
+ add \$32,$num
+ jnz .Lsqr8x_cond_copy
mov \$1,%rax
mov -48(%rsi),%r15
@@ -1135,64 +1161,75 @@ $code.=<<___;
adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$zero # pull top-most carry
adc %r15,%r14
- mov -8($nptr),$mi
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
cmp 16(%rsp),$bptr
jne .Lmulx4x_outer
- sub %r14,$mi # compare top-most words
- sbb $mi,$mi
- or $mi,%r15
-
- neg $num
- xor %rdx,%rdx
+ lea 64(%rsp),$tptr
+ sub $num,$nptr # rewind $nptr
+ neg %r15
+ mov $num,%rdx
+ shr \$3+2,$num # %cf=0
mov 32(%rsp),$rptr # restore rp
+ jmp .Lmulx4x_sub
+
+.align 32
+.Lmulx4x_sub:
+ mov 8*0($tptr),%r11
+ mov 8*1($tptr),%r12
+ mov 8*2($tptr),%r13
+ mov 8*3($tptr),%r14
+ lea 8*4($tptr),$tptr
+ sbb 8*0($nptr),%r11
+ sbb 8*1($nptr),%r12
+ sbb 8*2($nptr),%r13
+ sbb 8*3($nptr),%r14
+ lea 8*4($nptr),$nptr
+ mov %r11,8*0($rptr)
+ mov %r12,8*1($rptr)
+ mov %r13,8*2($rptr)
+ mov %r14,8*3($rptr)
+ lea 8*4($rptr),$rptr
+ dec $num # preserves %cf
+ jnz .Lmulx4x_sub
+
+ sbb \$0,%r15 # top-most carry
lea 64(%rsp),$tptr
+ sub %rdx,$rptr # rewind
+ movq %r15,%xmm1
pxor %xmm0,%xmm0
- mov 0*8($nptr,$num),%r8
- mov 1*8($nptr,$num),%r9
- neg %r8
- jmp .Lmulx4x_sub_entry
+ pshufd \$0,%xmm1,%xmm1
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lmulx4x_cond_copy
.align 32
-.Lmulx4x_sub:
- mov 0*8($nptr,$num),%r8
- mov 1*8($nptr,$num),%r9
- not %r8
-.Lmulx4x_sub_entry:
- mov 2*8($nptr,$num),%r10
- not %r9
- and %r15,%r8
- mov 3*8($nptr,$num),%r11
- not %r10
- and %r15,%r9
- not %r11
- and %r15,%r10
- and %r15,%r11
-
- neg %rdx # mov %rdx,%cf
- adc 0*8($tptr),%r8
- adc 1*8($tptr),%r9
- movdqa %xmm0,($tptr)
- adc 2*8($tptr),%r10
- adc 3*8($tptr),%r11
- movdqa %xmm0,16($tptr)
- lea 4*8($tptr),$tptr
- sbb %rdx,%rdx # mov %cf,%rdx
+.Lmulx4x_cond_copy:
+ movdqa 16*0($tptr),%xmm2
+ movdqa 16*1($tptr),%xmm3
+ lea 16*2($tptr),$tptr
+ movdqu 16*0($rptr),%xmm4
+ movdqu 16*1($rptr),%xmm5
+ lea 16*2($rptr),$rptr
+ movdqa %xmm0,-16*2($tptr) # zero tp
+ movdqa %xmm0,-16*1($tptr)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-16*2($rptr)
+ movdqu %xmm5,-16*1($rptr)
+ sub \$32,%rdx
+ jnz .Lmulx4x_cond_copy
- mov %r8,0*8($rptr)
- mov %r9,1*8($rptr)
- mov %r10,2*8($rptr)
- mov %r11,3*8($rptr)
- lea 4*8($rptr),$rptr
+ mov %rdx,($tptr)
- add \$32,$num
- jnz .Lmulx4x_sub
-
- mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
mov -48(%rsi),%r15
mov -40(%rsi),%r14
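In x86_64-mont.pl above, the final reduction no longer branches on the comparison: the code subtracts the modulus unconditionally into rp, broadcasts the resulting borrow into a mask, and the .Lsqr8x_cond_copy / .Lmulx4x_cond_copy loops pick either the subtracted value or the original for every limb while also wiping the temporary. A plain C model of that selection step, with rp holding t - n and tp the unreduced t; the mask arithmetic is the point, the rest is simplified:

    #include <stdint.h>
    #include <stddef.h>

    /* Keep rp[] = t - n when the subtraction did not borrow,
     * otherwise restore tp[] = t, without a data-dependent branch. */
    static void cond_copy(uint64_t *rp, const uint64_t *tp,
                          size_t num, int borrow)
    {
        uint64_t mask = (uint64_t)0 - (uint64_t)(borrow != 0);
        size_t i;

        for (i = 0; i < num; i++)
            rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
    }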
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl
index 292409c4ff..2e8c9db32c 100755
--- a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl
@@ -99,58 +99,111 @@ $code.=<<___;
.Lmul_enter:
mov ${num}d,${num}d
mov %rsp,%rax
- mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
+ lea .Linc(%rip),%r10
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
lea 2($num),%r11
neg %r11
- lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
and \$-1024,%rsp # minimize TLB usage
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
- mov $bp,%r12 # reassign $bp
+ lea 128($bp),%r12 # reassign $bp (+size optimization)
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bp # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
+ lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
+ and \$-16,%r10
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+ .byte 0x67
+ movdqa %xmm4,%xmm3
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($k+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($k+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($k+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($k+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($k+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
+ .byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($k+1)+112`(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($k+2)+112`(%r10)
+ pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
+
+ pand `16*($k+1)-128`($bp),%xmm1
+ pand `16*($k+2)-128`($bp),%xmm2
+ movdqa %xmm3,`16*($k+3)+112`(%r10)
+ pand `16*($k+3)-128`($bp),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+ movdqa `16*($k+0)-128`($bp),%xmm4
+ movdqa `16*($k+1)-128`($bp),%xmm5
+ movdqa `16*($k+2)-128`($bp),%xmm2
+ pand `16*($k+0)+112`(%r10),%xmm4
+ movdqa `16*($k+3)-128`($bp),%xmm3
+ pand `16*($k+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($k+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($k+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ por %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
movq %xmm0,$m0 # m0=bp[0]
mov ($n0),$n0 # pull n0[0] value
@@ -159,29 +212,14 @@ $code.=<<___;
xor $i,$i # i=0
xor $j,$j # j=0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
-
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
mov ($np),%rax
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq $lo0,$m1 # "tp[0]"*n0
mov %rdx,$hi0
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
@@ -212,16 +250,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1
cmp $num,$j
- jne .L1st
-
- movq %xmm0,$m0 # bp[1]
+ jne .L1st # note that upon exit $j==$num, so
+ # they can be used interchangeably
add %rax,$hi1
- mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1
mov $lo0,$hi0
@@ -235,33 +271,48 @@ $code.=<<___;
jmp .Louter
.align 16
.Louter:
+ lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
+ and \$-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($k=0;$k<$STRIDE/16;$k+=4) {
+$code.=<<___;
+ movdqa `16*($k+0)-128`($bp),%xmm0
+ movdqa `16*($k+1)-128`($bp),%xmm1
+ movdqa `16*($k+2)-128`($bp),%xmm2
+ movdqa `16*($k+3)-128`($bp),%xmm3
+ pand `16*($k+0)-128`(%rdx),%xmm0
+ pand `16*($k+1)-128`(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($k+2)-128`(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($k+3)-128`(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bp),$bp
+
+ mov ($ap),%rax # ap[0]
+ movq %xmm0,$m0 # m0=bp[i]
+
xor $j,$j # j=0
mov $n0,$m1
mov (%rsp),$lo0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
-
mulq $m0 # ap[0]*bp[i]
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq $lo0,$m1 # tp[0]*n0
mov %rdx,$hi0
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
@@ -295,17 +346,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1
cmp $num,$j
- jne .Linner
-
- movq %xmm0,$m0 # bp[i+1]
-
+ jne .Linner # note that upon exit $j==$num, so
+ # they can be used interchangeably
add %rax,$hi1
- mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
- mov (%rsp,$j,8),$lo0
+ mov (%rsp,$num,8),$lo0
adc \$0,%rdx
- mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1
xor %rdx,%rdx
@@ -352,12 +400,7 @@ $code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -379,8 +422,8 @@ bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lmulx4x_enter
___
$code.=<<___;
@@ -392,39 +435,34 @@ $code.=<<___;
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
.byte 0x67
- mov ${num}d,%r10d
- shl \$3,${num}d
- shl \$3+2,%r10d # 4*$num
+ shl \$3,${num}d # convert $num to bytes
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic. [excessive frame is allocated in order
- # to allow bn_from_mont8x to clear it.]
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra [num] is allocated in order
+ # to align with bn_power5's frame, which is cleansed after
+ # completing exponentiation. Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $ap,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
- sub %r11,%rsp # align with $ap
- lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ sub %r11,%rsp # align with $rp
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
- lea 4096-64(,$num,2),%r10
- lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -440,12 +478,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -460,9 +493,10 @@ $code.=<<___;
.type mul4x_internal,\@abi-omnipotent
.align 32
mul4x_internal:
- shl \$5,$num
- mov `($win64?56:8)`(%rax),%r10d # load 7th argument
- lea 256(%rdx,$num),%r13
+ shl \$5,$num # $num was in bytes
+ movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
+ lea .Linc(%rip),%rax
+ lea 128(%rdx,$num),%r13 # end of powers table (+size optimization)
shr \$5,$num # restore $num
___
$bp="%r12";
@@ -470,44 +504,92 @@ ___
$N=$STRIDE/4; # should match cache line size
$tp=$i;
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- add \$7,%r11
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
- and \$7,%r11
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- lea $STRIDE($bp),$tp # borrow $tp
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- .byte 0x67
- por %xmm1,%xmm0
- movq `0*$STRIDE/4-96`($tp),%xmm1
- .byte 0x67
- pand %xmm7,%xmm3
- .byte 0x67
- por %xmm2,%xmm0
- movq `1*$STRIDE/4-96`($tp),%xmm2
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization)
+ lea 128(%rdx),$bp # size optimization
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ .byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq `2*$STRIDE/4-96`($tp),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register
+
+ pand `16*($i+1)-128`($bp),%xmm1
+ pand `16*($i+2)-128`($bp),%xmm2
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ pand `16*($i+3)-128`($bp),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bp),%xmm4
+ movdqa `16*($i+1)-128`($bp),%xmm5
+ movdqa `16*($i+2)-128`($bp),%xmm2
+ pand `16*($i+0)+112`(%r10),%xmm4
+ movdqa `16*($i+3)-128`($bp),%xmm3
+ pand `16*($i+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($i+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($i+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ por %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[0]
- movq `3*$STRIDE/4-96`($tp),%xmm0
+
mov %r13,16+8(%rsp) # save end of b[num]
mov $rp, 56+8(%rsp) # save $rp
@@ -521,26 +603,10 @@ $code.=<<___;
mov %rax,$A[0]
mov ($np),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq $A[0],$m1 # "tp[0]"*n0
- ##############################################################
- # $tp is chosen so that writing to top-most element of the
- # vector occurs just "above" references to powers table,
- # "above" modulo cache-line size, which effectively precludes
- # possibility of memory disambiguation logic failure when
- # accessing the table.
- #
- lea 64+8(%rsp,%r11,8),$tp
+ lea 64+8(%rsp),$tp
mov %rdx,$A[1]
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- lea 2*$STRIDE($bp),$bp
- por %xmm1,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap,$num),%rax
@@ -549,7 +615,7 @@ $code.=<<___;
mulq $m0
add %rax,$A[1]
- mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -559,7 +625,7 @@ $code.=<<___;
adc \$0,%rdx
add $A[1],$N[1]
lea 4*8($num),$j # j=4
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp)
mov %rdx,$N[0]
@@ -569,7 +635,7 @@ $code.=<<___;
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
@@ -585,7 +651,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -600,7 +666,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov 16*0($np),%rax
+ mov 8*0($np),%rax
adc \$0,%rdx
mov %rdx,$A[1]
@@ -615,7 +681,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov 16*1($np),%rax
+ mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -624,7 +690,7 @@ $code.=<<___;
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp) # tp[j-1]
mov %rdx,$N[0]
@@ -634,7 +700,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
@@ -650,7 +716,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -663,8 +729,7 @@ $code.=<<___;
mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
- movq %xmm0,$m0 # bp[1]
- lea ($np,$num,2),$np # rewind $np
+ lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
@@ -675,6 +740,33 @@ $code.=<<___;
.align 32
.Louter4x:
+ lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization)
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bp),%xmm0
+ movdqa `16*($i+1)-128`($bp),%xmm1
+ movdqa `16*($i+2)-128`($bp),%xmm2
+ movdqa `16*($i+3)-128`($bp),%xmm3
+ pand `16*($i+0)-128`(%rdx),%xmm0
+ pand `16*($i+1)-128`(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)-128`(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)-128`(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bp),$bp
+ movq %xmm0,$m0 # m0=bp[i]
+
mov ($tp,$num),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
@@ -682,25 +774,11 @@ $code.=<<___;
mov ($np),%rax
adc \$0,%rdx
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
-
imulq $A[0],$m1 # tp[0]*n0
- .byte 0x67
mov %rdx,$A[1]
mov $N[1],($tp) # store upmost overflow bit
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
lea ($tp,$num),$tp # rewind $tp
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
@@ -710,7 +788,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1] # +tp[1]
adc \$0,%rdx
@@ -722,7 +800,7 @@ $code.=<<___;
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4*8($num),$j # j=4
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov %rdx,$N[0]
jmp .Linner4x
@@ -731,7 +809,7 @@ $code.=<<___;
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
@@ -749,7 +827,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
@@ -766,7 +844,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov 16*0($np),%rax
+ mov 8*0($np),%rax
adc \$0,%rdx
add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
@@ -783,7 +861,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 16*1($np),%rax
+ mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1]
adc \$0,%rdx
@@ -794,7 +872,7 @@ $code.=<<___;
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[0]
@@ -804,7 +882,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
@@ -823,7 +901,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov $m1,%rax
- mov -16*1($np),$m1
+ mov -8*1($np),$m1
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
@@ -838,9 +916,8 @@ $code.=<<___;
mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
- movq %xmm0,$m0 # bp[i+1]
mov $N[1],-16($tp) # tp[j-1]
- lea ($np,$num,2),$np # rewind $np
+ lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
@@ -854,16 +931,23 @@ $code.=<<___;
___
if (1) {
$code.=<<___;
+ xor %rax,%rax
sub $N[0],$m1 # compare top-most words
adc $j,$j # $j is zero
or $j,$N[1]
- xor \$1,$N[1]
+ sub $N[1],%rax # %rax=-$N[1]
lea ($tp,$num),%rbx # tptr in .sqr4x_sub
- lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
+ mov ($np),%r12
+ lea ($np),%rbp # nptr in .sqr4x_sub
mov %r9,%rcx
- sar \$3+2,%rcx # cf=0
+ sar \$3+2,%rcx
mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
- jmp .Lsqr4x_sub
+ dec %r12 # so that after 'not' we get -n[0]
+ xor %r10,%r10
+ mov 8*1(%rbp),%r13
+ mov 8*2(%rbp),%r14
+ mov 8*3(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
___
} else {
my @ri=("%rax",$bp,$m0,$m1);
@@ -930,8 +1014,8 @@ bn_power5:
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lpowerx5_enter
___
$code.=<<___;
@@ -942,38 +1026,32 @@ $code.=<<___;
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10d # 3*$num
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -995,16 +1073,21 @@ $code.=<<___;
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lpower5_body:
- movq $rptr,%xmm1 # save $rptr
+ movq $rptr,%xmm1 # save $rptr, used in sqr8x
movq $nptr,%xmm2 # save $nptr
- movq %r10, %xmm3 # -$num
+ movq %r10, %xmm3 # -$num, used in sqr8x
movq $bptr,%xmm4
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
movq %xmm2,$nptr
movq %xmm4,$bptr
@@ -1565,9 +1648,9 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
$code.=<<___;
movq %xmm2,$nptr
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xor %rax,%rax
- lea ($nptr,$num,2),%rcx # end of n[]
+ lea ($nptr,$num),%rcx # end of n[]
lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
mov %rcx,0+8(%rsp)
lea 48+8(%rsp,$num),$tptr # end of initial t[] window
@@ -1593,21 +1676,21 @@ sqr8x_reduction:
.byte 0x67
mov $m0,%r8
imulq 32+8(%rsp),$m0 # n0*a[0]
- mov 16*0($nptr),%rax # n[0]
+ mov 8*0($nptr),%rax # n[0]
mov \$8,%ecx
jmp .L8x_reduce
.align 32
.L8x_reduce:
mulq $m0
- mov 16*1($nptr),%rax # n[1]
+ mov 8*1($nptr),%rax # n[1]
neg %r8
mov %rdx,%r8
adc \$0,%r8
mulq $m0
add %rax,%r9
- mov 16*2($nptr),%rax
+ mov 8*2($nptr),%rax
adc \$0,%rdx
add %r9,%r8
mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
@@ -1616,7 +1699,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r10
- mov 16*3($nptr),%rax
+ mov 8*3($nptr),%rax
adc \$0,%rdx
add %r10,%r9
mov 32+8(%rsp),$carry # pull n0, borrow $carry
@@ -1625,7 +1708,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r11
- mov 16*4($nptr),%rax
+ mov 8*4($nptr),%rax
adc \$0,%rdx
imulq %r8,$carry # modulo-scheduled
add %r11,%r10
@@ -1634,7 +1717,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r12
- mov 16*5($nptr),%rax
+ mov 8*5($nptr),%rax
adc \$0,%rdx
add %r12,%r11
mov %rdx,%r12
@@ -1642,7 +1725,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r13
- mov 16*6($nptr),%rax
+ mov 8*6($nptr),%rax
adc \$0,%rdx
add %r13,%r12
mov %rdx,%r13
@@ -1650,7 +1733,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r14
- mov 16*7($nptr),%rax
+ mov 8*7($nptr),%rax
adc \$0,%rdx
add %r14,%r13
mov %rdx,%r14
@@ -1659,7 +1742,7 @@ sqr8x_reduction:
mulq $m0
mov $carry,$m0 # n0*a[i]
add %rax,%r15
- mov 16*0($nptr),%rax # n[0]
+ mov 8*0($nptr),%rax # n[0]
adc \$0,%rdx
add %r15,%r14
mov %rdx,%r15
@@ -1668,7 +1751,7 @@ sqr8x_reduction:
dec %ecx
jnz .L8x_reduce
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
xor %rax,%rax
mov 8+8(%rsp),%rdx # pull end of t[]
cmp 0+8(%rsp),$nptr # end of n[]?
@@ -1687,21 +1770,21 @@ sqr8x_reduction:
mov 48+56+8(%rsp),$m0 # pull n0*a[0]
mov \$8,%ecx
- mov 16*0($nptr),%rax
+ mov 8*0($nptr),%rax
jmp .L8x_tail
.align 32
.L8x_tail:
mulq $m0
add %rax,%r8
- mov 16*1($nptr),%rax
+ mov 8*1($nptr),%rax
mov %r8,($tptr) # save result
mov %rdx,%r8
adc \$0,%r8
mulq $m0
add %rax,%r9
- mov 16*2($nptr),%rax
+ mov 8*2($nptr),%rax
adc \$0,%rdx
add %r9,%r8
lea 8($tptr),$tptr # $tptr++
@@ -1710,7 +1793,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r10
- mov 16*3($nptr),%rax
+ mov 8*3($nptr),%rax
adc \$0,%rdx
add %r10,%r9
mov %rdx,%r10
@@ -1718,7 +1801,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r11
- mov 16*4($nptr),%rax
+ mov 8*4($nptr),%rax
adc \$0,%rdx
add %r11,%r10
mov %rdx,%r11
@@ -1726,7 +1809,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r12
- mov 16*5($nptr),%rax
+ mov 8*5($nptr),%rax
adc \$0,%rdx
add %r12,%r11
mov %rdx,%r12
@@ -1734,7 +1817,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r13
- mov 16*6($nptr),%rax
+ mov 8*6($nptr),%rax
adc \$0,%rdx
add %r13,%r12
mov %rdx,%r13
@@ -1742,7 +1825,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r14
- mov 16*7($nptr),%rax
+ mov 8*7($nptr),%rax
adc \$0,%rdx
add %r14,%r13
mov %rdx,%r14
@@ -1753,14 +1836,14 @@ sqr8x_reduction:
add %rax,%r15
adc \$0,%rdx
add %r15,%r14
- mov 16*0($nptr),%rax # pull n[0]
+ mov 8*0($nptr),%rax # pull n[0]
mov %rdx,%r15
adc \$0,%r15
dec %ecx
jnz .L8x_tail
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
mov 8+8(%rsp),%rdx # pull end of t[]
cmp 0+8(%rsp),$nptr # end of n[]?
jae .L8x_tail_done # break out of loop
@@ -1806,7 +1889,7 @@ sqr8x_reduction:
adc 8*6($tptr),%r14
adc 8*7($tptr),%r15
adc \$0,%rax # top-most carry
- mov -16($nptr),%rcx # np[num-1]
+ mov -8($nptr),%rcx # np[num-1]
xor $carry,$carry
movq %xmm2,$nptr # restore $nptr
@@ -1824,6 +1907,8 @@ sqr8x_reduction:
cmp %rdx,$tptr # end of t[]?
jb .L8x_reduction_loop
+ ret
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
___
}
##############################################################
@@ -1832,48 +1917,62 @@ ___
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
- #xor %rsi,%rsi # %rsi was $carry above
- sub %r15,%rcx # compare top-most words
+.type __bn_post4x_internal,\@abi-omnipotent
+.align 32
+__bn_post4x_internal:
+ mov 8*0($nptr),%r12
lea (%rdi,$num),$tptr # %rdi was $tptr above
- adc %rsi,%rsi
mov $num,%rcx
- or %rsi,%rax
movq %xmm1,$rptr # restore $rptr
- xor \$1,%rax
+ neg %rax
movq %xmm1,$aptr # prepare for back-to-back call
- lea ($nptr,%rax,8),$nptr
- sar \$3+2,%rcx # cf=0
- jmp .Lsqr4x_sub
+ sar \$3+2,%rcx
+ dec %r12 # so that after 'not' we get -n[0]
+ xor %r10,%r10
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+ jmp .Lsqr4x_sub_entry
-.align 32
+.align 16
.Lsqr4x_sub:
- .byte 0x66
- mov 8*0($tptr),%r12
- mov 8*1($tptr),%r13
- sbb 16*0($nptr),%r12
- mov 8*2($tptr),%r14
- sbb 16*1($nptr),%r13
- mov 8*3($tptr),%r15
- lea 8*4($tptr),$tptr
- sbb 16*2($nptr),%r14
+ mov 8*0($nptr),%r12
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+.Lsqr4x_sub_entry:
+ lea 8*4($nptr),$nptr
+ not %r12
+ not %r13
+ not %r14
+ not %r15
+ and %rax,%r12
+ and %rax,%r13
+ and %rax,%r14
+ and %rax,%r15
+
+ neg %r10 # mov %r10,%cf
+ adc 8*0($tptr),%r12
+ adc 8*1($tptr),%r13
+ adc 8*2($tptr),%r14
+ adc 8*3($tptr),%r15
mov %r12,8*0($rptr)
- sbb 16*3($nptr),%r15
- lea 16*4($nptr),$nptr
+ lea 8*4($tptr),$tptr
mov %r13,8*1($rptr)
+ sbb %r10,%r10 # mov %cf,%r10
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx # pass %cf
jnz .Lsqr4x_sub
-___
-}
-$code.=<<___;
+
mov $num,%r10 # prepare for back-to-back call
neg $num # restore $num
ret
-.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.size __bn_post4x_internal,.-__bn_post4x_internal
___
+}
{
$code.=<<___;
.globl bn_from_montgomery
@@ -1897,39 +1996,32 @@ bn_from_mont8x:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+	# (see bn_exp.c). The stack is allocated to be aligned with
+ # bn_power5's frame, and as bn_from_montgomery happens to be
+ # last operation, we use the opportunity to cleanse it.
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -1983,12 +2075,13 @@ $code.=<<___;
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
jne .Lfrom_mont_nox
lea (%rax,$num),$rptr
- call sqrx8x_reduction
+ call __bn_sqrx8x_reduction
+ call __bn_postx4x_internal
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
@@ -1999,7 +2092,8 @@ $code.=<<___ if ($addx);
.Lfrom_mont_nox:
___
$code.=<<___;
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
@@ -2039,7 +2133,6 @@ $code.=<<___;
.align 32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
- .byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
@@ -2047,40 +2140,33 @@ bn_mulx4x_mont_gather5:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers a[num], ret[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic. [excessive frame is allocated in order
- # to allow bn_from_mont8x to clear it.]
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra [num] is allocated in order
+ # to align with bn_power5's frame, which is cleansed after
+ # completing exponentiation. Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $ap,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
-.align 32
.Lmulx4xsp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -2106,12 +2192,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -2126,14 +2207,16 @@ $code.=<<___;
.type mulx4x_internal,\@abi-omnipotent
.align 32
mulx4x_internal:
- .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num
- .byte 0x67
+ mov $num,8(%rsp) # save -$num (it was in bytes)
+ mov $num,%r10
neg $num # restore $num
shl \$5,$num
- lea 256($bp,$num),%r13
+ neg %r10 # restore $num
+ lea 128($bp,$num),%r13 # end of powers table (+size optimization)
shr \$5+5,$num
- mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument
sub \$1,$num
+ lea .Linc(%rip),%rax
mov %r13,16+8(%rsp) # end of b[num]
mov $num,24+8(%rsp) # inner counter
mov $rp, 56+8(%rsp) # save $rp
@@ -2144,52 +2227,92 @@ my $rptr=$bptr;
my $STRIDE=2**5*8; # 5 is "window size"
my $N=$STRIDE/4; # should match cache line size
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- add \$7,%r11
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
- and \$7,%r11
-
- movq `0*$STRIDE/4-96`($bptr),%xmm0
- lea $STRIDE($bptr),$tptr # borrow $tptr
- movq `1*$STRIDE/4-96`($bptr),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bptr),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bptr),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- movq `0*$STRIDE/4-96`($tptr),%xmm1
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
- movq `1*$STRIDE/4-96`($tptr),%xmm2
- por %xmm3,%xmm0
- .byte 0x67,0x67
- pand %xmm4,%xmm1
- movq `2*$STRIDE/4-96`($tptr),%xmm3
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
+ lea 128($bp),$bptr # size optimization
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ .byte 0x67
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ .byte 0x67
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+ movdqa %xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ .byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+
+ pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register
+ pand `16*($i+1)-128`($bptr),%xmm1
+ pand `16*($i+2)-128`($bptr),%xmm2
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ pand `16*($i+3)-128`($bptr),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bptr),%xmm4
+ movdqa `16*($i+1)-128`($bptr),%xmm5
+ movdqa `16*($i+2)-128`($bptr),%xmm2
+ pand `16*($i+0)+112`(%r10),%xmm4
+ movdqa `16*($i+3)-128`($bptr),%xmm3
+ pand `16*($i+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($i+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($i+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ pxor %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ lea $STRIDE($bptr),$bptr
movq %xmm0,%rdx # bp[0]
- movq `3*$STRIDE/4-96`($tptr),%xmm0
- lea 2*$STRIDE($bptr),$bptr # next &b[i]
- pand %xmm5,%xmm2
- .byte 0x67,0x67
- pand %xmm6,%xmm3
- ##############################################################
- # $tptr is chosen so that writing to top-most element of the
- # vector occurs just "above" references to powers table,
- # "above" modulo cache-line size, which effectively precludes
- # possibility of memory disambiguation logic failure when
- # accessing the table.
- #
- lea 64+8*4+8(%rsp,%r11,8),$tptr
+ lea 64+8*4+8(%rsp),$tptr
mov %rdx,$bi
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
@@ -2205,37 +2328,31 @@ $code.=<<___;
xor $zero,$zero # cf=0, of=0
mov $mi,%rdx
- por %xmm2,%xmm1
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
mov $bptr,8+8(%rsp) # off-load &b[i]
- por %xmm1,%xmm0
- .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
+ lea 4*8($aptr),$aptr
adcx %rax,%r13
adcx $zero,%r14 # cf=0
- mulx 0*16($nptr),%rax,%r10
+ mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
- mulx 1*16($nptr),%rax,%r11
+ mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*16($nptr),%rax,%r12
+ mulx 2*8($nptr),%rax,%r12
mov 24+8(%rsp),$bptr # counter value
- .byte 0x66
mov %r10,-8*4($tptr)
adcx %rax,%r11
adox %r13,%r12
- mulx 3*16($nptr),%rax,%r15
- .byte 0x67,0x67
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-8*3($tptr)
adcx %rax,%r12
adox $zero,%r15 # of=0
- .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r12,-8*2($tptr)
- #jmp .Lmulx4x_1st
+ jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
@@ -2255,30 +2372,29 @@ $code.=<<___;
lea 4*8($tptr),$tptr
adox %r15,%r10
- mulx 0*16($nptr),%rax,%r15
+ mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
- mulx 1*16($nptr),%rax,%r15
+ mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
- mulx 2*16($nptr),%rax,%r15
+ mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
mov %r11,-4*8($tptr)
adox %r15,%r13
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r13,-2*8($tptr)
dec $bptr # of=0, pass cf
jnz .Lmulx4x_1st
mov 8(%rsp),$num # load -num
- movq %xmm0,%rdx # bp[1]
adc $zero,%r15 # modulo-scheduled
lea ($aptr,$num),$aptr # rewind $aptr
add %r15,%r14
@@ -2289,6 +2405,34 @@ $code.=<<___;
.align 32
.Lmulx4x_outer:
+ lea 16-256($tptr),%r10 # where 256-byte mask is (+density control)
+ pxor %xmm4,%xmm4
+ .byte 0x67,0x67
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bptr),%xmm0
+ movdqa `16*($i+1)-128`($bptr),%xmm1
+ movdqa `16*($i+2)-128`($bptr),%xmm2
+ pand `16*($i+0)+256`(%r10),%xmm0
+ movdqa `16*($i+3)-128`($bptr),%xmm3
+ pand `16*($i+1)+256`(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)+256`(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)+256`(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bptr),$bptr
+ movq %xmm0,%rdx # m0=bp[i]
+
mov $zero,($tptr) # save top-most carry
lea 4*8($tptr,$num),$tptr # rewind $tptr
mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
@@ -2303,54 +2447,37 @@ $code.=<<___;
mulx 3*8($aptr),%rdx,%r14
adox -2*8($tptr),%r12
adcx %rdx,%r13
- lea ($nptr,$num,2),$nptr # rewind $nptr
+ lea ($nptr,$num),$nptr # rewind $nptr
lea 4*8($aptr),$aptr
adox -1*8($tptr),%r13
adcx $zero,%r14
adox $zero,%r14
- .byte 0x67
mov $mi,%r15
imulq 32+8(%rsp),$mi # "t[0]"*n0
- movq `0*$STRIDE/4-96`($bptr),%xmm0
- .byte 0x67,0x67
mov $mi,%rdx
- movq `1*$STRIDE/4-96`($bptr),%xmm1
- .byte 0x67
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bptr),%xmm2
- .byte 0x67
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bptr),%xmm3
- add \$$STRIDE,$bptr # next &b[i]
- .byte 0x67
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
xor $zero,$zero # cf=0, of=0
mov $bptr,8+8(%rsp) # off-load &b[i]
- mulx 0*16($nptr),%rax,%r10
+ mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
- mulx 1*16($nptr),%rax,%r11
+ mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*16($nptr),%rax,%r12
+ mulx 2*8($nptr),%rax,%r12
adcx %rax,%r11
adox %r13,%r12
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
- por %xmm2,%xmm0
mov 24+8(%rsp),$bptr # counter value
mov %r10,-8*4($tptr)
- por %xmm3,%xmm0
adcx %rax,%r12
mov %r11,-8*3($tptr)
adox $zero,%r15 # of=0
mov %r12,-8*2($tptr)
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
jmp .Lmulx4x_inner
.align 32
@@ -2375,20 +2502,20 @@ $code.=<<___;
adcx $zero,%r14 # cf=0
adox %r15,%r10
- mulx 0*16($nptr),%rax,%r15
+ mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
- mulx 1*16($nptr),%rax,%r15
+ mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
- mulx 2*16($nptr),%rax,%r15
+ mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
adox %r15,%r13
mov %r11,-4*8($tptr)
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
@@ -2398,7 +2525,6 @@ $code.=<<___;
jnz .Lmulx4x_inner
mov 0+8(%rsp),$num # load -num
- movq %xmm0,%rdx # bp[i+1]
adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$bptr # pull top-most carry to %cf
mov 8+8(%rsp),$bptr # re-load &b[i]
@@ -2411,20 +2537,26 @@ $code.=<<___;
cmp %r10,$bptr
jb .Lmulx4x_outer
- mov -16($nptr),%r10
+ mov -8($nptr),%r10
+ mov $zero,%r8
+ mov ($nptr,$num),%r12
+ lea ($nptr,$num),%rbp # rewind $nptr
+ mov $num,%rcx
+ lea ($tptr,$num),%rdi # rewind $tptr
+ xor %eax,%eax
xor %r15,%r15
sub %r14,%r10 # compare top-most words
adc %r15,%r15
- or %r15,$zero
- xor \$1,$zero
- lea ($tptr,$num),%rdi # rewind $tptr
- lea ($nptr,$num,2),$nptr # rewind $nptr
- .byte 0x67,0x67
- sar \$3+2,$num # cf=0
- lea ($nptr,$zero,8),%rbp
+ or %r15,%r8
+ sar \$3+2,%rcx
+ sub %r8,%rax # %rax=-%r8
mov 56+8(%rsp),%rdx # restore rp
- mov $num,%rcx
- jmp .Lsqrx4x_sub # common post-condition
+ dec %r12 # so that after 'not' we get -n[0]
+ mov 8*1(%rbp),%r13
+ xor %r8,%r8
+ mov 8*2(%rbp),%r14
+ mov 8*3(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry # common post-condition
.size mulx4x_internal,.-mulx4x_internal
___
} {
@@ -2448,7 +2580,6 @@ $code.=<<___;
.align 32
bn_powerx5:
.Lpowerx5_enter:
- .byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
@@ -2456,39 +2587,32 @@ bn_powerx5:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -2519,10 +2643,15 @@ $code.=<<___;
.Lpowerx5_body:
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
mov %r10,$num # -num
mov $aptr,$rptr
@@ -2534,12 +2663,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -2973,11 +3097,11 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
$code.=<<___;
movq %xmm2,$nptr
-sqrx8x_reduction:
+__bn_sqrx8x_reduction:
xor %eax,%eax # initial top-most carry bit
mov 32+8(%rsp),%rbx # n0
mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
- lea -128($nptr,$num,2),%rcx # end of n[]
+ lea -8*8($nptr,$num),%rcx # end of n[]
#lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
mov %rcx, 0+8(%rsp) # save end of n[]
mov $tptr,8+8(%rsp) # save end of t[]
@@ -3006,23 +3130,23 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_reduce:
mov %r8, %rbx
- mulx 16*0($nptr),%rax,%r8 # n[0]
+ mulx 8*0($nptr),%rax,%r8 # n[0]
adcx %rbx,%rax # discarded
adox %r9,%r8
- mulx 16*1($nptr),%rbx,%r9 # n[1]
+ mulx 8*1($nptr),%rbx,%r9 # n[1]
adcx %rbx,%r8
adox %r10,%r9
- mulx 16*2($nptr),%rbx,%r10
+ mulx 8*2($nptr),%rbx,%r10
adcx %rbx,%r9
adox %r11,%r10
- mulx 16*3($nptr),%rbx,%r11
+ mulx 8*3($nptr),%rbx,%r11
adcx %rbx,%r10
adox %r12,%r11
- .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12
mov %rdx,%rax
mov %r8,%rdx
adcx %rbx,%r11
@@ -3032,15 +3156,15 @@ sqrx8x_reduction:
mov %rax,%rdx
mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
- mulx 16*5($nptr),%rax,%r13
+ mulx 8*5($nptr),%rax,%r13
adcx %rax,%r12
adox %r14,%r13
- mulx 16*6($nptr),%rax,%r14
+ mulx 8*6($nptr),%rax,%r14
adcx %rax,%r13
adox %r15,%r14
- mulx 16*7($nptr),%rax,%r15
+ mulx 8*7($nptr),%rax,%r15
mov %rbx,%rdx
adcx %rax,%r14
adox $carry,%r15 # $carry is 0
@@ -3056,7 +3180,7 @@ sqrx8x_reduction:
mov 48+8(%rsp),%rdx # pull n0*a[0]
add 8*0($tptr),%r8
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
mov \$-8,%rcx
adcx 8*1($tptr),%r9
adcx 8*2($tptr),%r10
@@ -3075,35 +3199,35 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_tail:
mov %r8,%rbx
- mulx 16*0($nptr),%rax,%r8
+ mulx 8*0($nptr),%rax,%r8
adcx %rax,%rbx
adox %r9,%r8
- mulx 16*1($nptr),%rax,%r9
+ mulx 8*1($nptr),%rax,%r9
adcx %rax,%r8
adox %r10,%r9
- mulx 16*2($nptr),%rax,%r10
+ mulx 8*2($nptr),%rax,%r10
adcx %rax,%r9
adox %r11,%r10
- mulx 16*3($nptr),%rax,%r11
+ mulx 8*3($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12
adcx %rax,%r11
adox %r13,%r12
- mulx 16*5($nptr),%rax,%r13
+ mulx 8*5($nptr),%rax,%r13
adcx %rax,%r12
adox %r14,%r13
- mulx 16*6($nptr),%rax,%r14
+ mulx 8*6($nptr),%rax,%r14
adcx %rax,%r13
adox %r15,%r14
- mulx 16*7($nptr),%rax,%r15
+ mulx 8*7($nptr),%rax,%r15
mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
adcx %rax,%r14
adox $carry,%r15
@@ -3119,7 +3243,7 @@ sqrx8x_reduction:
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
mov 48+8(%rsp),%rdx # pull n0*a[0]
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
adc 8*0($tptr),%r8
adc 8*1($tptr),%r9
adc 8*2($tptr),%r10
@@ -3155,7 +3279,7 @@ sqrx8x_reduction:
adc 8*0($tptr),%r8
movq %xmm3,%rcx
adc 8*1($tptr),%r9
- mov 16*7($nptr),$carry
+ mov 8*7($nptr),$carry
movq %xmm2,$nptr # restore $nptr
adc 8*2($tptr),%r10
adc 8*3($tptr),%r11
@@ -3181,6 +3305,8 @@ sqrx8x_reduction:
lea 8*8($tptr,%rcx),$tptr # start of current t[] window
cmp 8+8(%rsp),%r8 # end of t[]?
jb .Lsqrx8x_reduction_loop
+ ret
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}
##############################################################
@@ -3188,52 +3314,59 @@ ___
#
{
my ($rptr,$nptr)=("%rdx","%rbp");
-my @ri=map("%r$_",(10..13));
-my @ni=map("%r$_",(14..15));
$code.=<<___;
- xor %ebx,%ebx
- sub %r15,%rsi # compare top-most words
- adc %rbx,%rbx
+.align 32
+__bn_postx4x_internal:
+ mov 8*0($nptr),%r12
mov %rcx,%r10 # -$num
- or %rbx,%rax
mov %rcx,%r9 # -$num
- xor \$1,%rax
- sar \$3+2,%rcx # cf=0
+ neg %rax
+ sar \$3+2,%rcx
#lea 48+8(%rsp,%r9),$tptr
- lea ($nptr,%rax,8),$nptr
movq %xmm1,$rptr # restore $rptr
movq %xmm1,$aptr # prepare for back-to-back call
- jmp .Lsqrx4x_sub
+ dec %r12 # so that after 'not' we get -n[0]
+ mov 8*1($nptr),%r13
+ xor %r8,%r8
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+ jmp .Lsqrx4x_sub_entry
-.align 32
+.align 16
.Lsqrx4x_sub:
- .byte 0x66
- mov 8*0($tptr),%r12
- mov 8*1($tptr),%r13
- sbb 16*0($nptr),%r12
- mov 8*2($tptr),%r14
- sbb 16*1($nptr),%r13
- mov 8*3($tptr),%r15
- lea 8*4($tptr),$tptr
- sbb 16*2($nptr),%r14
+ mov 8*0($nptr),%r12
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+.Lsqrx4x_sub_entry:
+ andn %rax,%r12,%r12
+ lea 8*4($nptr),$nptr
+ andn %rax,%r13,%r13
+ andn %rax,%r14,%r14
+ andn %rax,%r15,%r15
+
+ neg %r8 # mov %r8,%cf
+ adc 8*0($tptr),%r12
+ adc 8*1($tptr),%r13
+ adc 8*2($tptr),%r14
+ adc 8*3($tptr),%r15
mov %r12,8*0($rptr)
- sbb 16*3($nptr),%r15
- lea 16*4($nptr),$nptr
+ lea 8*4($tptr),$tptr
mov %r13,8*1($rptr)
+ sbb %r8,%r8 # mov %cf,%r8
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx
jnz .Lsqrx4x_sub
-___
-}
-$code.=<<___;
+
neg %r9 # restore $num
ret
-.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.size __bn_postx4x_internal,.-__bn_postx4x_internal
___
+}
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
@@ -3282,56 +3415,91 @@ bn_scatter5:
.globl bn_gather5
.type bn_gather5,\@abi-omnipotent
-.align 16
+.align 32
bn_gather5:
-___
-$code.=<<___ if ($win64);
-.LSEH_begin_bn_gather5:
+.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
+ .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
+ .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
+ lea .Linc(%rip),%rax
+ and \$-16,%rsp # shouldn't be formally required
+
+ movd $idx,%xmm5
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 128($tbl),%r11 # size optimization
+ lea 128(%rsp),%rax # size optimization
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast $idx
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
___
+########################################################################
+# calculate mask by comparing 0..31 to $idx and save result to stack
+#
+for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
- mov $idx,%r11d
- shr \$`log($N/8)/log(2)`,$idx
- and \$`$N/8-1`,%r11
- not $idx
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
- lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line
- movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
- movq 8(%rax,$idx,8),%xmm5 # cache line contains element
- movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,$idx,8),%xmm7
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+___
+$code.=<<___ if ($i);
+ movdqa %xmm3,`16*($i-1)-128`(%rax)
+___
+$code.=<<___;
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)-128`(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)-128`(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)-128`(%rax)
+ movdqa %xmm4,%xmm2
+___
+}
+$code.=<<___;
+ movdqa %xmm3,`16*($i-1)-128`(%rax)
jmp .Lgather
-.align 16
-.Lgather:
- movq `0*$STRIDE/4-128`($tbl),%xmm0
- movq `1*$STRIDE/4-128`($tbl),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-128`($tbl),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-128`($tbl),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- .byte 0x67,0x67
- por %xmm2,%xmm0
- lea $STRIDE($tbl),$tbl
- por %xmm3,%xmm0
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`(%r11),%xmm0
+ movdqa `16*($i+1)-128`(%r11),%xmm1
+ movdqa `16*($i+2)-128`(%r11),%xmm2
+ pand `16*($i+0)-128`(%rax),%xmm0
+ movdqa `16*($i+3)-128`(%r11),%xmm3
+ pand `16*($i+1)-128`(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)-128`(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)-128`(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ lea $STRIDE(%r11),%r11
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,($out) # m0=bp[0]
lea 8($out),$out
sub \$1,$num
jnz .Lgather
-___
-$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- lea 0x28(%rsp),%rsp
-___
-$code.=<<___;
+
+ lea (%r10),%rsp
ret
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
@@ -3339,9 +3507,9 @@ ___
}
$code.=<<___;
.align 64
-.Lmagic_masks:
- .long 0,0, 0,0, 0,0, -1,-1
- .long 0,0, 0,0, 0,0, 0,0
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
@@ -3389,19 +3557,16 @@ mul_handler:
lea .Lmul_epilogue(%rip),%r10
cmp %r10,%rbx
- jb .Lbody_40
+ ja .Lbody_40
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
jmp .Lbody_proceed
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
.Lbody_proceed:
-
- movaps -88(%rax),%xmm0
- movaps -72(%rax),%xmm1
-
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -3414,8 +3579,6 @@ mul_handler:
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
- movups %xmm0,512($context) # restore context->Xmm6
- movups %xmm1,528($context) # restore context->Xmm7
.Lcommon_seh_tail:
mov 8(%rax),%rdi
@@ -3526,10 +3689,9 @@ ___
$code.=<<___;
.align 8
.LSEH_info_bn_gather5:
- .byte 0x01,0x0d,0x05,0x00
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+ .byte 0x01,0x0b,0x03,0x0a
+ .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
+ .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
.align 8
___
}
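
The net effect of the x86_64-mont5.pl changes above is a different gather strategy: instead of loading only the cache line that holds the selected power (the old .Lmagic_masks scheme), the code now compares every index 0..31 against the secret index with pcmpeqd, stores the resulting all-ones/all-zeros masks on the stack, and then ANDs every table entry with its mask and ORs everything together, so the memory access pattern no longer depends on the index. A minimal scalar C sketch of the same idea, with an illustrative function name that is not part of OpenSSL's API:

    #include <stddef.h>
    #include <stdint.h>

    /* Constant-time gather: touch every one of the nelem table rows and keep
     * only the row whose index equals idx.  The access pattern is independent
     * of idx, which is what the pcmpeqd/pand/por sequences above achieve with
     * SSE2.  Illustrative sketch only. */
    static void gather_const_time(uint64_t *out, const uint64_t *table,
                                  size_t nelem, size_t width, size_t idx)
    {
        size_t i, j;

        for (j = 0; j < width; j++)
            out[j] = 0;

        for (i = 0; i < nelem; i++) {
            /* all-ones when i == idx, all-zeros otherwise; the real code
             * builds this mask branch-free (pcmpeqd, constant_time_eq_int) */
            uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);

            for (j = 0; j < width; j++)
                out[j] |= table[i * width + j] & mask;
        }
    }

Sweeping the whole table costs more loads than the old one-cache-line read, but it is precisely that sweep which keeps the secret index out of the cache access pattern.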
diff --git a/deps/openssl/openssl/crypto/bn/bn.h b/deps/openssl/openssl/crypto/bn/bn.h
index 5696965e9a..86264ae631 100644
--- a/deps/openssl/openssl/crypto/bn/bn.h
+++ b/deps/openssl/openssl/crypto/bn/bn.h
@@ -125,6 +125,7 @@
#ifndef HEADER_BN_H
# define HEADER_BN_H
+# include <limits.h>
# include <openssl/e_os2.h>
# ifndef OPENSSL_NO_FP_API
# include <stdio.h> /* FILE */
@@ -721,8 +722,17 @@ const BIGNUM *BN_get0_nist_prime_521(void);
/* library internal functions */
-# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
- (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2))
+# define bn_expand(a,bits) \
+ ( \
+ bits > (INT_MAX - BN_BITS2 + 1) ? \
+ NULL \
+ : \
+ (((bits+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? \
+ (a) \
+ : \
+ bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2) \
+ )
+
# define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
BIGNUM *bn_expand2(BIGNUM *a, int words);
# ifndef OPENSSL_NO_DEPRECATED
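
The rewritten bn_expand macro adds an overflow guard: with bits close to INT_MAX, the old expression (bits+BN_BITS2-1)/BN_BITS2 could overflow before the division. A small stand-alone helper showing the same check, where word_bits plays the role of BN_BITS2 and the names are illustrative:

    #include <limits.h>

    /* Round a bit count up to a word count without overflowing int, i.e. the
     * guard the new bn_expand macro performs before calling bn_expand2().
     * Returns 0 when the request is too large.  Illustrative only. */
    static int bits_to_words(int bits, int word_bits, int *words)
    {
        if (bits < 0 || bits > INT_MAX - word_bits + 1)
            return 0;                 /* bits + word_bits - 1 would overflow */
        *words = (bits + word_bits - 1) / word_bits;
        return 1;
    }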
diff --git a/deps/openssl/openssl/crypto/bn/bn_exp.c b/deps/openssl/openssl/crypto/bn/bn_exp.c
index 6d30d1e0ff..1670f01d1d 100644
--- a/deps/openssl/openssl/crypto/bn/bn_exp.c
+++ b/deps/openssl/openssl/crypto/bn/bn_exp.c
@@ -110,6 +110,7 @@
*/
#include "cryptlib.h"
+#include "constant_time_locl.h"
#include "bn_lcl.h"
#include <stdlib.h>
@@ -606,15 +607,17 @@ static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
unsigned char *buf, int idx,
- int width)
+ int window)
{
- size_t i, j;
+ int i, j;
+ int width = 1 << window;
+ BN_ULONG *table = (BN_ULONG *)buf;
if (top > b->top)
top = b->top; /* this works because 'buf' is explicitly
* zeroed */
- for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
- buf[j] = ((unsigned char *)b->d)[i];
+ for (i = 0, j = idx; i < top; i++, j += width) {
+ table[j] = b->d[i];
}
return 1;
@@ -622,15 +625,51 @@ static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
unsigned char *buf, int idx,
- int width)
+ int window)
{
- size_t i, j;
+ int i, j;
+ int width = 1 << window;
+ volatile BN_ULONG *table = (volatile BN_ULONG *)buf;
if (bn_wexpand(b, top) == NULL)
return 0;
- for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
- ((unsigned char *)b->d)[i] = buf[j];
+ if (window <= 3) {
+ for (i = 0; i < top; i++, table += width) {
+ BN_ULONG acc = 0;
+
+ for (j = 0; j < width; j++) {
+ acc |= table[j] &
+ ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
+ }
+
+ b->d[i] = acc;
+ }
+ } else {
+ int xstride = 1 << (window - 2);
+ BN_ULONG y0, y1, y2, y3;
+
+ i = idx >> (window - 2); /* equivalent of idx / xstride */
+ idx &= xstride - 1; /* equivalent of idx % xstride */
+
+ y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
+ y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
+ y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
+ y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);
+
+ for (i = 0; i < top; i++, table += width) {
+ BN_ULONG acc = 0;
+
+ for (j = 0; j < xstride; j++) {
+ acc |= ( (table[j + 0 * xstride] & y0) |
+ (table[j + 1 * xstride] & y1) |
+ (table[j + 2 * xstride] & y2) |
+ (table[j + 3 * xstride] & y3) )
+ & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
+ }
+
+ b->d[i] = acc;
+ }
}
b->top = top;
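
The new MOD_EXP_CTIME_COPY_FROM_PREBUF reads the whole window row and picks out the wanted entry with an arithmetic mask built from constant_time_eq_int (from constant_time_locl.h, included above), instead of indexing the buffer with the secret value. The core of the window <= 3 branch, pulled out as a stand-alone helper (the wrapper name is ours; it assumes the same headers as bn_exp.c):

    /* Select row[idx] without a secret-dependent memory access: every entry
     * is read, and the mask keeps only the one where j == idx.  Mirrors the
     * loop body in MOD_EXP_CTIME_COPY_FROM_PREBUF above. */
    static BN_ULONG select_word(const BN_ULONG *row, int width, int idx)
    {
        BN_ULONG acc = 0;
        int j;

        for (j = 0; j < width; j++)
            acc |= row[j] & ((BN_ULONG)0 - (constant_time_eq_int(j, idx) & 1));

        return acc;
    }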
@@ -749,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window >= 5) {
window = 5; /* ~5% improvement for RSA2048 sign, and even
* for RSA4096 */
- if ((top & 7) == 0)
- powerbufLen += 2 * top * sizeof(m->d[0]);
+ /* reserve space for mont->N.d[] copy */
+ powerbufLen += top * sizeof(mont->N.d[0]);
}
#endif
(void)0;
@@ -971,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BN_ULONG *not_used, const BN_ULONG *np,
const BN_ULONG *n0, int num);
- BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
+ BN_ULONG *n0 = mont->n0, *np;
/*
* BN_to_montgomery can contaminate words above .top [in
@@ -982,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i = tmp.top; i < top; i++)
tmp.d[i] = 0;
- if (top & 7)
- np2 = np;
- else
- for (np2 = am.d + top, i = 0; i < top; i++)
- np2[2 * i] = np[i];
+ /*
+ * copy mont->N.d[] to improve cache locality
+ */
+ for (np = am.d + top, i = 0; i < top; i++)
+ np[i] = mont->N.d[i];
bn_scatter5(tmp.d, top, powerbuf, 0);
bn_scatter5(am.d, am.top, powerbuf, 1);
@@ -996,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0
for (i = 3; i < 32; i++) {
/* Calculate a^i = a^(i-1) * a */
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# else
@@ -1007,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
for (i = 3; i < 8; i += 2) {
int j;
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
for (j = 2 * i; j < 32; j *= 2) {
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -1015,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
}
for (; i < 16; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
bn_scatter5(tmp.d, top, powerbuf, 2 * i);
}
for (; i < 32; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# endif
@@ -1050,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
while (bits >= 0) {
wvalue = bn_get_bits5(p->d, bits - 4);
bits -= 5;
- bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+ bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
}
}
- ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
+ ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
tmp.top = top;
bn_correct_top(&tmp);
if (ret) {
@@ -1065,9 +1104,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
} else
#endif
{
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window))
goto err;
/*
@@ -1079,15 +1118,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window > 1) {
if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF
- (&tmp, top, powerbuf, 2, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2,
+ window))
goto err;
for (i = 3; i < numPowers; i++) {
/* Calculate a^i = a^(i-1) * a */
if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF
- (&tmp, top, powerbuf, i, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i,
+ window))
goto err;
}
}
@@ -1095,8 +1134,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
bits--;
for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
- (&tmp, top, powerbuf, wvalue, numPowers))
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue,
+ window))
goto err;
/*
@@ -1116,8 +1155,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/*
* Fetch the appropriate pre-computed value from the pre-buf
*/
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
- (&am, top, powerbuf, wvalue, numPowers))
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue,
+ window))
goto err;
/* Multiply the result into the intermediate result */
diff --git a/deps/openssl/openssl/crypto/bn/bn_print.c b/deps/openssl/openssl/crypto/bn/bn_print.c
index ab10b957ba..bfa31efc56 100644
--- a/deps/openssl/openssl/crypto/bn/bn_print.c
+++ b/deps/openssl/openssl/crypto/bn/bn_print.c
@@ -58,6 +58,7 @@
#include <stdio.h>
#include <ctype.h>
+#include <limits.h>
#include "cryptlib.h"
#include <openssl/buffer.h>
#include "bn_lcl.h"
@@ -189,7 +190,11 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
a++;
}
- for (i = 0; isxdigit((unsigned char)a[i]); i++) ;
+ for (i = 0; i <= (INT_MAX/4) && isxdigit((unsigned char)a[i]); i++)
+ continue;
+
+ if (i > INT_MAX/4)
+ goto err;
num = i + neg;
if (bn == NULL)
@@ -204,7 +209,7 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
BN_zero(ret);
}
- /* i is the number of hex digests; */
+ /* i is the number of hex digits */
if (bn_expand(ret, i * 4) == NULL)
goto err;
@@ -260,7 +265,11 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
a++;
}
- for (i = 0; isdigit((unsigned char)a[i]); i++) ;
+ for (i = 0; i <= (INT_MAX/4) && isdigit((unsigned char)a[i]); i++)
+ continue;
+
+ if (i > INT_MAX/4)
+ goto err;
num = i + neg;
if (bn == NULL)
@@ -278,7 +287,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
BN_zero(ret);
}
- /* i is the number of digests, a bit of an over expand; */
+ /* i is the number of digits, a bit of an over expand */
if (bn_expand(ret, i * 4) == NULL)
goto err;
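
The INT_MAX/4 cap in both parsers exists because each digit later costs four bits in bn_expand(ret, i * 4). With a 32-bit int, INT_MAX/4 is 536870911; one more digit would make i * 4 equal 2147483648, which no longer fits in int, so the loop stops counting there and the function leaves via err before that multiplication can overflow.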
diff --git a/deps/openssl/openssl/crypto/bn/bn_recp.c b/deps/openssl/openssl/crypto/bn/bn_recp.c
index 7497ac624d..f047040efe 100644
--- a/deps/openssl/openssl/crypto/bn/bn_recp.c
+++ b/deps/openssl/openssl/crypto/bn/bn_recp.c
@@ -65,6 +65,7 @@ void BN_RECP_CTX_init(BN_RECP_CTX *recp)
BN_init(&(recp->N));
BN_init(&(recp->Nr));
recp->num_bits = 0;
+ recp->shift = 0;
recp->flags = 0;
}
diff --git a/deps/openssl/openssl/crypto/cmac/cmac.c b/deps/openssl/openssl/crypto/cmac/cmac.c
index 774e6dc919..2954b6eb7d 100644
--- a/deps/openssl/openssl/crypto/cmac/cmac.c
+++ b/deps/openssl/openssl/crypto/cmac/cmac.c
@@ -160,6 +160,14 @@ int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
return 0;
}
+
+ /* Switch to FIPS cipher implementation if possible */
+ if (cipher != NULL) {
+ const EVP_CIPHER *fcipher;
+ fcipher = FIPS_get_cipherbynid(EVP_CIPHER_nid(cipher));
+ if (fcipher != NULL)
+ cipher = fcipher;
+ }
/*
* Other algorithm blocking will be done in FIPS_cmac_init, via
* FIPS_cipherinit().
diff --git a/deps/openssl/openssl/crypto/cryptlib.c b/deps/openssl/openssl/crypto/cryptlib.c
index c9f674ba8e..1925428f5e 100644
--- a/deps/openssl/openssl/crypto/cryptlib.c
+++ b/deps/openssl/openssl/crypto/cryptlib.c
@@ -1016,11 +1016,11 @@ void *OPENSSL_stderr(void)
return stderr;
}
-int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len)
+int CRYPTO_memcmp(const volatile void *in_a, const volatile void *in_b, size_t len)
{
size_t i;
- const unsigned char *a = in_a;
- const unsigned char *b = in_b;
+ const volatile unsigned char *a = in_a;
+ const volatile unsigned char *b = in_b;
unsigned char x = 0;
for (i = 0; i < len; i++)
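
Qualifying the CRYPTO_memcmp parameters and locals as volatile discourages compilers from replacing the accumulating loop with an ordinary, early-exit memcmp. Typical use is comparing secret-derived values such as MACs; a minimal wrapper (our name, not OpenSSL API) might look like:

    #include <openssl/crypto.h>

    /* Returns 1 when the two buffers match.  CRYPTO_memcmp only promises a
     * zero/non-zero answer, not an ordering, and its running time does not
     * depend on where the buffers first differ. */
    static int mac_equal(const unsigned char *a, const unsigned char *b,
                         size_t len)
    {
        return CRYPTO_memcmp(a, b, len) == 0;
    }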
diff --git a/deps/openssl/openssl/crypto/crypto.h b/deps/openssl/openssl/crypto/crypto.h
index c450d7a3c3..6c644ce12a 100644
--- a/deps/openssl/openssl/crypto/crypto.h
+++ b/deps/openssl/openssl/crypto/crypto.h
@@ -628,7 +628,7 @@ void OPENSSL_init(void);
* into a defined order as the return value when a != b is undefined, other
* than to be non-zero.
*/
-int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+int CRYPTO_memcmp(const volatile void *a, const volatile void *b, size_t len);
/* BEGIN ERROR CODES */
/*
diff --git a/deps/openssl/openssl/crypto/dh/dh.h b/deps/openssl/openssl/crypto/dh/dh.h
index 5498a9dc10..a5bd9016aa 100644
--- a/deps/openssl/openssl/crypto/dh/dh.h
+++ b/deps/openssl/openssl/crypto/dh/dh.h
@@ -174,7 +174,7 @@ struct dh_st {
/* DH_check_pub_key error codes */
# define DH_CHECK_PUBKEY_TOO_SMALL 0x01
# define DH_CHECK_PUBKEY_TOO_LARGE 0x02
-# define DH_CHECK_PUBKEY_INVALID 0x03
+# define DH_CHECK_PUBKEY_INVALID 0x04
/*
* primes p where (p-1)/2 is prime too are called "safe"; we define this for
diff --git a/deps/openssl/openssl/crypto/dh/dh_check.c b/deps/openssl/openssl/crypto/dh/dh_check.c
index 5adedc0d26..0277041114 100644
--- a/deps/openssl/openssl/crypto/dh/dh_check.c
+++ b/deps/openssl/openssl/crypto/dh/dh_check.c
@@ -160,13 +160,12 @@ int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *ret)
goto err;
BN_CTX_start(ctx);
tmp = BN_CTX_get(ctx);
- if (tmp == NULL)
+ if (tmp == NULL || !BN_set_word(tmp, 1))
goto err;
- BN_set_word(tmp, 1);
if (BN_cmp(pub_key, tmp) <= 0)
*ret |= DH_CHECK_PUBKEY_TOO_SMALL;
- BN_copy(tmp, dh->p);
- BN_sub_word(tmp, 1);
+ if (BN_copy(tmp, dh->p) == NULL || !BN_sub_word(tmp, 1))
+ goto err;
if (BN_cmp(pub_key, tmp) >= 0)
*ret |= DH_CHECK_PUBKEY_TOO_LARGE;
diff --git a/deps/openssl/openssl/crypto/dsa/dsa_ameth.c b/deps/openssl/openssl/crypto/dsa/dsa_ameth.c
index c40e1777ad..cc83d6e6ad 100644
--- a/deps/openssl/openssl/crypto/dsa/dsa_ameth.c
+++ b/deps/openssl/openssl/crypto/dsa/dsa_ameth.c
@@ -191,6 +191,8 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
STACK_OF(ASN1_TYPE) *ndsa = NULL;
DSA *dsa = NULL;
+ int ret = 0;
+
if (!PKCS8_pkey_get0(NULL, &p, &pklen, &palg, p8))
return 0;
X509_ALGOR_get0(NULL, &ptype, &pval, palg);
@@ -262,23 +264,21 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
}
EVP_PKEY_assign_DSA(pkey, dsa);
- BN_CTX_free(ctx);
- if (ndsa)
- sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
- else
- ASN1_STRING_clear_free(privkey);
- return 1;
+ ret = 1;
+ goto done;
decerr:
- DSAerr(DSA_F_DSA_PRIV_DECODE, EVP_R_DECODE_ERROR);
+ DSAerr(DSA_F_DSA_PRIV_DECODE, DSA_R_DECODE_ERROR);
dsaerr:
+ DSA_free(dsa);
+ done:
BN_CTX_free(ctx);
- if (privkey)
+ if (ndsa)
+ sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
+ else
ASN1_STRING_clear_free(privkey);
- sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
- DSA_free(dsa);
- return 0;
+ return ret;
}
static int dsa_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey)
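
The dsa_priv_decode rework replaces separate success and error exits with one shared cleanup block: ret starts at 0 and is set to 1 only on success, both decerr and dsaerr fall through to done, DSA_free runs only on the error path (on success ownership has passed to pkey), and ctx, ndsa and privkey are released on every path. A generic sketch of the single-exit pattern, with plain malloc/free standing in for the ASN.1 and BN objects of the real function:

    #include <stdlib.h>

    /* Single-exit cleanup in the style dsa_priv_decode now uses: every path
     * funnels through one label, and each release call tolerates NULL so
     * partially-initialised state is freed safely.  Placeholder logic only. */
    static int decode_thing(size_t n)
    {
        int ret = 0;
        unsigned char *buf = NULL, *tmp = NULL;

        buf = malloc(n);
        if (buf == NULL)
            goto done;
        tmp = malloc(n);
        if (tmp == NULL)
            goto done;

        /* ... parse into buf/tmp ... */
        ret = 1;                /* success also leaves through done */

     done:
        free(tmp);              /* free(NULL) is a no-op */
        free(buf);
        return ret;
    }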
diff --git a/deps/openssl/openssl/crypto/dso/dso_lib.c b/deps/openssl/openssl/crypto/dso/dso_lib.c
index 3312450eae..2beb7c1ba5 100644
--- a/deps/openssl/openssl/crypto/dso/dso_lib.c
+++ b/deps/openssl/openssl/crypto/dso/dso_lib.c
@@ -122,6 +122,7 @@ DSO *DSO_new_method(DSO_METHOD *meth)
ret->meth = meth;
ret->references = 1;
if ((ret->meth->init != NULL) && !ret->meth->init(ret)) {
+ sk_void_free(ret->meth_data);
OPENSSL_free(ret);
ret = NULL;
}
diff --git a/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl b/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
index e6acfd59f0..7140860e24 100755
--- a/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -2001,6 +2001,7 @@ $code.=<<___;
push %r15
sub \$32*5+8, %rsp
+.Lpoint_double_shortcut$x:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
mov $a_ptr, $b_ptr # backup copy
movdqu 0x10($a_ptr), %xmm1
@@ -2291,6 +2292,7 @@ $code.=<<___;
mov 0x40+8*1($b_ptr), $acc6
mov 0x40+8*2($b_ptr), $acc7
mov 0x40+8*3($b_ptr), $acc0
+ movq $b_ptr, %xmm1
lea 0x40-$bias($b_ptr), $a_ptr
lea $Z1sqr(%rsp), $r_ptr # Z1^2
@@ -2346,7 +2348,7 @@ $code.=<<___;
test $acc0, $acc0
jnz .Ladd_proceed$x # (in1infty || in2infty)?
test $acc1, $acc1
- jz .Ladd_proceed$x # is_equal(S1,S2)?
+ jz .Ladd_double$x # is_equal(S1,S2)?
movq %xmm0, $r_ptr # restore $r_ptr
pxor %xmm0, %xmm0
@@ -2359,6 +2361,13 @@ $code.=<<___;
jmp .Ladd_done$x
.align 32
+.Ladd_double$x:
+ movq %xmm1, $a_ptr # restore $a_ptr
+ movq %xmm0, $r_ptr # restore $r_ptr
+ add \$`32*(18-5)`, %rsp # difference in frame sizes
+ jmp .Lpoint_double_shortcut$x
+
+.align 32
.Ladd_proceed$x:
`&load_for_sqr("$R(%rsp)", "$src0")`
lea $Rsqr(%rsp), $r_ptr # R^2
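
The ecp_nistz256 hunk above handles the case where the two inputs to point addition turn out to be the same point (neither at infinity, and the S1/S2 comparison shows equality). In that situation the generic Jacobian addition degenerates: with U1 = U2 and S1 = S2 the intermediates H = U2 - U1 and R = S2 - S1 are both zero, so X3 = R^2 - H^3 - 2*U1*H^2 and Z3 = H*Z1*Z2 collapse to zero and the sum would wrongly come out as the point at infinity instead of 2P. The new .Ladd_double branch therefore restores the saved pointers, adjusts for the difference in frame sizes, and jumps into the doubling routine at .Lpoint_double_shortcut so P + P is computed by doubling.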
diff --git a/deps/openssl/openssl/crypto/ec/ecp_nistp224.c b/deps/openssl/openssl/crypto/ec/ecp_nistp224.c
index ed09f97ade..d81cc9ce6b 100644
--- a/deps/openssl/openssl/crypto/ec/ecp_nistp224.c
+++ b/deps/openssl/openssl/crypto/ec/ecp_nistp224.c
@@ -1657,8 +1657,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) ||
(!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) ||
@@ -1736,6 +1735,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
nistp224_pre_comp_free,
nistp224_pre_comp_clear_free))
diff --git a/deps/openssl/openssl/crypto/ec/ecp_nistp256.c b/deps/openssl/openssl/crypto/ec/ecp_nistp256.c
index a5887086c6..78d191aac7 100644
--- a/deps/openssl/openssl/crypto/ec/ecp_nistp256.c
+++ b/deps/openssl/openssl/crypto/ec/ecp_nistp256.c
@@ -2249,8 +2249,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
(!BN_to_felem(y_tmp, &group->generator->Y)) ||
@@ -2337,6 +2336,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
nistp256_pre_comp_free,
nistp256_pre_comp_clear_free))
diff --git a/deps/openssl/openssl/crypto/ec/ecp_nistp521.c b/deps/openssl/openssl/crypto/ec/ecp_nistp521.c
index 360b9a3516..c53a61bbfb 100644
--- a/deps/openssl/openssl/crypto/ec/ecp_nistp521.c
+++ b/deps/openssl/openssl/crypto/ec/ecp_nistp521.c
@@ -2056,8 +2056,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
@@ -2115,6 +2114,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
nistp521_pre_comp_free,
nistp521_pre_comp_clear_free))
diff --git a/deps/openssl/openssl/crypto/ec/ectest.c b/deps/openssl/openssl/crypto/ec/ectest.c
index efab0b07b1..40a1f00325 100644
--- a/deps/openssl/openssl/crypto/ec/ectest.c
+++ b/deps/openssl/openssl/crypto/ec/ectest.c
@@ -1758,9 +1758,18 @@ static void nistp_single_test(const struct nistp_test_params *test)
if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx))
ABORT;
+ /*
+ * We have not performed precomputation so have_precompute_mult should be
+ * false
+ */
+ if (EC_GROUP_have_precompute_mult(NISTP))
+ ABORT;
+
/* now repeat all tests with precomputation */
if (!EC_GROUP_precompute_mult(NISTP, ctx))
ABORT;
+ if (!EC_GROUP_have_precompute_mult(NISTP))
+ ABORT;
/* fixed point multiplication */
EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
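The new assertions above check EC_GROUP_have_precompute_mult() both before and after EC_GROUP_precompute_mult(); the ecp_nistp224/256/521 changes make the second check pass by registering the precomputation even when the built-in generator is in use. A standalone sketch of the same round-trip (the curve choice and helper name are illustrative, not part of the patch):

    #include <openssl/bn.h>
    #include <openssl/ec.h>
    #include <openssl/objects.h>

    /* Precompute for a named group and confirm the flag flips. */
    static int check_precompute(void)
    {
        int ok = 0;
        EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_secp224r1);
        BN_CTX *ctx = BN_CTX_new();

        if (group == NULL || ctx == NULL)
            goto done;
        if (EC_GROUP_have_precompute_mult(group))  /* nothing precomputed yet */
            goto done;
        if (!EC_GROUP_precompute_mult(group, ctx))
            goto done;
        ok = EC_GROUP_have_precompute_mult(group); /* should now report 1 */
     done:
        EC_GROUP_free(group);
        BN_CTX_free(ctx);
        return ok;
    }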
diff --git a/deps/openssl/openssl/crypto/engine/eng_dyn.c b/deps/openssl/openssl/crypto/engine/eng_dyn.c
index 3169b09ad8..40f30e9d58 100644
--- a/deps/openssl/openssl/crypto/engine/eng_dyn.c
+++ b/deps/openssl/openssl/crypto/engine/eng_dyn.c
@@ -243,8 +243,10 @@ static int dynamic_set_data_ctx(ENGINE *e, dynamic_data_ctx **ctx)
* If we lost the race to set the context, c is non-NULL and *ctx is the
* context of the thread that won.
*/
- if (c)
+ if (c) {
+ sk_OPENSSL_STRING_free(c->dirs);
OPENSSL_free(c);
+ }
return 1;
}
diff --git a/deps/openssl/openssl/crypto/evp/e_des.c b/deps/openssl/openssl/crypto/evp/e_des.c
index aae13a6756..8ca65cd03a 100644
--- a/deps/openssl/openssl/crypto/evp/e_des.c
+++ b/deps/openssl/openssl/crypto/evp/e_des.c
@@ -71,12 +71,13 @@ typedef struct {
DES_key_schedule ks;
} ks;
union {
- void (*cbc) (const void *, void *, size_t, const void *, void *);
+ void (*cbc) (const void *, void *, size_t,
+ const DES_key_schedule *, unsigned char *);
} stream;
} EVP_DES_KEY;
# if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
-/* ---------^^^ this is not a typo, just a way to detect that
+/* ----------^^^ this is not a typo, just a way to detect that
* assembler support was in general requested... */
# include "sparc_arch.h"
@@ -86,9 +87,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[];
void des_t4_key_expand(const void *key, DES_key_schedule *ks);
void des_t4_cbc_encrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule *ks, unsigned char iv[8]);
void des_t4_cbc_decrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule *ks, unsigned char iv[8]);
# endif
static int des_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -130,7 +131,7 @@ static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
{
EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data;
- if (dat->stream.cbc) {
+ if (dat->stream.cbc != NULL) {
(*dat->stream.cbc) (in, out, inl, &dat->ks.ks, ctx->iv);
return 1;
}
diff --git a/deps/openssl/openssl/crypto/evp/e_des3.c b/deps/openssl/openssl/crypto/evp/e_des3.c
index bf6c1d2d3d..0e910d6d80 100644
--- a/deps/openssl/openssl/crypto/evp/e_des3.c
+++ b/deps/openssl/openssl/crypto/evp/e_des3.c
@@ -75,7 +75,8 @@ typedef struct {
DES_key_schedule ks[3];
} ks;
union {
- void (*cbc) (const void *, void *, size_t, const void *, void *);
+ void (*cbc) (const void *, void *, size_t,
+ const DES_key_schedule *, unsigned char *);
} stream;
} DES_EDE_KEY;
# define ks1 ks.ks[0]
@@ -93,9 +94,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[];
void des_t4_key_expand(const void *key, DES_key_schedule *ks);
void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule ks[3], unsigned char iv[8]);
void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule ks[3], unsigned char iv[8]);
# endif
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -162,7 +163,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
}
# endif /* KSSL_DEBUG */
if (dat->stream.cbc) {
- (*dat->stream.cbc) (in, out, inl, &dat->ks, ctx->iv);
+ (*dat->stream.cbc) (in, out, inl, dat->ks.ks, ctx->iv);
return 1;
}
@@ -395,7 +396,7 @@ static int des_ede3_unwrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
int rv = -1;
if (inl < 24)
return -1;
- if (!out)
+ if (out == NULL)
return inl - 16;
memcpy(ctx->iv, wrap_iv, 8);
/* Decrypt first block which will end up as icv */
@@ -438,7 +439,7 @@ static int des_ede3_wrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
unsigned char sha1tmp[SHA_DIGEST_LENGTH];
- if (!out)
+ if (out == NULL)
return inl + 16;
/* Copy input to output buffer + 8 so we have space for IV */
memmove(out + 8, in, inl);
diff --git a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
index bd6bf72fe4..980cfd23ef 100644
--- a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -43,7 +43,7 @@ die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
+ $avx = ($1>=2.20) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
@@ -489,7 +489,7 @@ $code.=<<___;
___
$code.=<<___ if ($win64);
movaps -0xd8(%rax),%xmm6
- movaps -0xd8(%rax),%xmm7
+ movaps -0xc8(%rax),%xmm7
movaps -0xb8(%rax),%xmm8
movaps -0xa8(%rax),%xmm9
movaps -0x98(%rax),%xmm10
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
index 4ff2d39aa7..f889f20187 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -92,7 +92,7 @@ die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
+ $avx = ($1>=2.20) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
diff --git a/deps/openssl/openssl/crypto/modes/ctr128.c b/deps/openssl/openssl/crypto/modes/ctr128.c
index f3bbcbf723..bcafd6b6bf 100644
--- a/deps/openssl/openssl/crypto/modes/ctr128.c
+++ b/deps/openssl/openssl/crypto/modes/ctr128.c
@@ -67,23 +67,20 @@
/* increment counter (128-bit int) by 1 */
static void ctr128_inc(unsigned char *counter)
{
- u32 n = 16;
- u8 c;
+ u32 n = 16, c = 1;
do {
--n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c)
- return;
+ c += counter[n];
+ counter[n] = (u8)c;
+ c >>= 8;
} while (n);
}
#if !defined(OPENSSL_SMALL_FOOTPRINT)
static void ctr128_inc_aligned(unsigned char *counter)
{
- size_t *data, c, n;
+ size_t *data, c, d, n;
const union {
long one;
char little;
@@ -91,20 +88,19 @@ static void ctr128_inc_aligned(unsigned char *counter)
1
};
- if (is_endian.little) {
+ if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) {
ctr128_inc(counter);
return;
}
data = (size_t *)counter;
+ c = 1;
n = 16 / sizeof(size_t);
do {
--n;
- c = data[n];
- ++c;
- data[n] = c;
- if (c)
- return;
+ d = data[n] += c;
+ /* did addition carry? */
+ c = ((d - c) ^ d) >> (sizeof(size_t) * 8 - 1);
} while (n);
}
#endif
@@ -144,14 +140,14 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
}
# if defined(STRICT_ALIGNMENT)
- if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
- 0)
+ if (((size_t)in | (size_t)out | (size_t)ecount_buf)
+ % sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ecount_buf, key);
ctr128_inc_aligned(ivec);
- for (; n < 16; n += sizeof(size_t))
+ for (n = 0; n < 16; n += sizeof(size_t))
*(size_t *)(out + n) =
*(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
len -= 16;
@@ -189,16 +185,13 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
/* increment upper 96 bits of 128-bit counter by 1 */
static void ctr96_inc(unsigned char *counter)
{
- u32 n = 12;
- u8 c;
+ u32 n = 12, c = 1;
do {
--n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c)
- return;
+ c += counter[n];
+ counter[n] = (u8)c;
+ c >>= 8;
} while (n);
}
diff --git a/deps/openssl/openssl/crypto/opensslv.h b/deps/openssl/openssl/crypto/opensslv.h
index 03b8c48437..4334fd15cd 100644
--- a/deps/openssl/openssl/crypto/opensslv.h
+++ b/deps/openssl/openssl/crypto/opensslv.h
@@ -30,11 +30,11 @@ extern "C" {
* (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
* major minor fix final patch/beta)
*/
-# define OPENSSL_VERSION_NUMBER 0x1000206fL
+# define OPENSSL_VERSION_NUMBER 0x1000207fL
# ifdef OPENSSL_FIPS
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f-fips 28 Jan 2016"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g-fips 1 Mar 2016"
# else
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f 28 Jan 2016"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g 1 Mar 2016"
# endif
# define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT
diff --git a/deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl b/deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl
index 9c70b8c2c6..ee04221c7e 100755
--- a/deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl
+++ b/deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl
@@ -198,8 +198,11 @@ my %globals;
if ($gas) {
# Solaris /usr/ccs/bin/as can't handle multiplications
# in $self->{value}
- $self->{value} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
- $self->{value} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+ my $value = $self->{value};
+ $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+ if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) {
+ $self->{value} = $value;
+ }
sprintf "\$%s",$self->{value};
} else {
$self->{value} =~ s/(0b[0-1]+)/oct($1)/eig;
diff --git a/deps/openssl/openssl/crypto/pkcs7/pk7_smime.c b/deps/openssl/openssl/crypto/pkcs7/pk7_smime.c
index c4d3724d2a..dc9b484078 100644
--- a/deps/openssl/openssl/crypto/pkcs7/pk7_smime.c
+++ b/deps/openssl/openssl/crypto/pkcs7/pk7_smime.c
@@ -274,12 +274,29 @@ int PKCS7_verify(PKCS7 *p7, STACK_OF(X509) *certs, X509_STORE *store,
PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_NO_CONTENT);
return 0;
}
+#if 0
+ /*
+ * NB: this test commented out because some versions of Netscape
+ * illegally include zero length content when signing data. Also
+ * Microsoft Authenticode includes a SpcIndirectDataContent data
+ * structure which describes the content to be protected by the
+ * signature, rather than directly embedding that content. So
+ * Authenticode implementations are also expected to use
+ * PKCS7_verify() with explicit external data, on non-detached
+ * PKCS#7 signatures.
+ *
+ * In OpenSSL 1.1 a new flag PKCS7_NO_DUAL_CONTENT has been
+ * introduced to disable this sanity check. For the 1.0.2 branch
+ * this change is not acceptable, so the check remains completely
+ * commented out (as it has been for a long time).
+ */
/* Check for data and content: two sets of data */
if (!PKCS7_get_detached(p7) && indata) {
PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_CONTENT_AND_DATA_PRESENT);
return 0;
}
+#endif
sinfos = PKCS7_get_signer_info(p7);
diff --git a/deps/openssl/openssl/crypto/rsa/rsa_sign.c b/deps/openssl/openssl/crypto/rsa/rsa_sign.c
index ed63a1d8b0..82ca8324df 100644
--- a/deps/openssl/openssl/crypto/rsa/rsa_sign.c
+++ b/deps/openssl/openssl/crypto/rsa/rsa_sign.c
@@ -84,7 +84,7 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len,
return 0;
}
#endif
- if (rsa->meth->rsa_sign) {
+ if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign) {
return rsa->meth->rsa_sign(type, m, m_len, sigret, siglen, rsa);
}
/* Special case: SSL signature, just check the length */
@@ -293,7 +293,7 @@ int RSA_verify(int dtype, const unsigned char *m, unsigned int m_len,
const unsigned char *sigbuf, unsigned int siglen, RSA *rsa)
{
- if (rsa->meth->rsa_verify) {
+ if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_verify) {
return rsa->meth->rsa_verify(dtype, m, m_len, sigbuf, siglen, rsa);
}
diff --git a/deps/openssl/openssl/crypto/srp/srp.h b/deps/openssl/openssl/crypto/srp/srp.h
index d072536fec..028892a1ff 100644
--- a/deps/openssl/openssl/crypto/srp/srp.h
+++ b/deps/openssl/openssl/crypto/srp/srp.h
@@ -82,16 +82,21 @@ typedef struct SRP_gN_cache_st {
DECLARE_STACK_OF(SRP_gN_cache)
typedef struct SRP_user_pwd_st {
+ /* Owned by us. */
char *id;
BIGNUM *s;
BIGNUM *v;
+ /* Not owned by us. */
const BIGNUM *g;
const BIGNUM *N;
+ /* Owned by us. */
char *info;
} SRP_user_pwd;
DECLARE_STACK_OF(SRP_user_pwd)
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd);
+
typedef struct SRP_VBASE_st {
STACK_OF(SRP_user_pwd) *users_pwd;
STACK_OF(SRP_gN_cache) *gN_cache;
@@ -115,7 +120,12 @@ DECLARE_STACK_OF(SRP_gN)
SRP_VBASE *SRP_VBASE_new(char *seed_key);
int SRP_VBASE_free(SRP_VBASE *vb);
int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file);
+
+/* This method ignores the configured seed and fails for an unknown user. */
SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+/* NOTE: unlike in SRP_VBASE_get_by_user, caller owns the returned pointer.*/
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username);
+
char *SRP_create_verifier(const char *user, const char *pass, char **salt,
char **verifier, const char *N, const char *g);
int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt,
diff --git a/deps/openssl/openssl/crypto/srp/srp_vfy.c b/deps/openssl/openssl/crypto/srp/srp_vfy.c
index a3f1a8a0a4..26ad3e07b4 100644
--- a/deps/openssl/openssl/crypto/srp/srp_vfy.c
+++ b/deps/openssl/openssl/crypto/srp/srp_vfy.c
@@ -185,7 +185,7 @@ static char *t_tob64(char *dst, const unsigned char *src, int size)
return olddst;
}
-static void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
{
if (user_pwd == NULL)
return;
@@ -247,6 +247,24 @@ static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v)
return (vinfo->s != NULL && vinfo->v != NULL);
}
+static SRP_user_pwd *srp_user_pwd_dup(SRP_user_pwd *src)
+{
+ SRP_user_pwd *ret;
+
+ if (src == NULL)
+ return NULL;
+ if ((ret = SRP_user_pwd_new()) == NULL)
+ return NULL;
+
+ SRP_user_pwd_set_gN(ret, src->g, src->N);
+ if (!SRP_user_pwd_set_ids(ret, src->id, src->info)
+ || !SRP_user_pwd_set_sv_BN(ret, BN_dup(src->s), BN_dup(src->v))) {
+ SRP_user_pwd_free(ret);
+ return NULL;
+ }
+ return ret;
+}
+
SRP_VBASE *SRP_VBASE_new(char *seed_key)
{
SRP_VBASE *vb = (SRP_VBASE *)OPENSSL_malloc(sizeof(SRP_VBASE));
@@ -468,21 +486,50 @@ int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file)
}
-SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+static SRP_user_pwd *find_user(SRP_VBASE *vb, char *username)
{
int i;
SRP_user_pwd *user;
- unsigned char digv[SHA_DIGEST_LENGTH];
- unsigned char digs[SHA_DIGEST_LENGTH];
- EVP_MD_CTX ctxt;
if (vb == NULL)
return NULL;
+
for (i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++) {
user = sk_SRP_user_pwd_value(vb->users_pwd, i);
if (strcmp(user->id, username) == 0)
return user;
}
+
+ return NULL;
+}
+
+/*
+ * This method ignores the configured seed and fails for an unknown user.
+ * Ownership of the returned pointer is not released to the caller.
+ * In other words, caller must not free the result.
+ */
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+{
+ return find_user(vb, username);
+}
+
+/*
+ * Ownership of the returned pointer is released to the caller.
+ * In other words, caller must free the result once done.
+ */
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username)
+{
+ SRP_user_pwd *user;
+ unsigned char digv[SHA_DIGEST_LENGTH];
+ unsigned char digs[SHA_DIGEST_LENGTH];
+ EVP_MD_CTX ctxt;
+
+ if (vb == NULL)
+ return NULL;
+
+ if ((user = find_user(vb, username)) != NULL)
+ return srp_user_pwd_dup(user);
+
if ((vb->seed_key == NULL) ||
(vb->default_g == NULL) || (vb->default_N == NULL))
return NULL;
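The comments added to srp.h spell out the ownership rules: SRP_VBASE_get_by_user() returns a pointer owned by the SRP_VBASE, while the new SRP_VBASE_get1_by_user() returns a duplicate that the caller must release with SRP_user_pwd_free(). A usage sketch (the surrounding helper is illustrative only):

    #include <openssl/srp.h>

    /* Look a user up by name; the *get1* variant returns a copy. */
    static int lookup_user(SRP_VBASE *vb, char *username)
    {
        SRP_user_pwd *pwd = SRP_VBASE_get1_by_user(vb, username);

        if (pwd == NULL)
            return 0;            /* unknown user: this variant ignores the seed */

        /* ... use pwd->s, pwd->v, pwd->g and pwd->N here ... */

        SRP_user_pwd_free(pwd);  /* the caller owns the duplicate */
        return 1;
    }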
diff --git a/deps/openssl/openssl/crypto/stack/stack.c b/deps/openssl/openssl/crypto/stack/stack.c
index de437acf6a..fa50083e22 100644
--- a/deps/openssl/openssl/crypto/stack/stack.c
+++ b/deps/openssl/openssl/crypto/stack/stack.c
@@ -360,7 +360,7 @@ void *sk_set(_STACK *st, int i, void *value)
void sk_sort(_STACK *st)
{
- if (st && !st->sorted) {
+ if (st && !st->sorted && st->comp != NULL) {
int (*comp_func) (const void *, const void *);
/*
diff --git a/deps/openssl/openssl/crypto/x509/x509_vfy.c b/deps/openssl/openssl/crypto/x509/x509_vfy.c
index 0429767032..4d34dbac93 100644
--- a/deps/openssl/openssl/crypto/x509/x509_vfy.c
+++ b/deps/openssl/openssl/crypto/x509/x509_vfy.c
@@ -194,6 +194,9 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
int num, j, retry;
int (*cb) (int xok, X509_STORE_CTX *xctx);
STACK_OF(X509) *sktmp = NULL;
+ int trust = X509_TRUST_UNTRUSTED;
+ int err;
+
if (ctx->cert == NULL) {
X509err(X509_F_X509_VERIFY_CERT, X509_R_NO_CERT_SET_FOR_US_TO_VERIFY);
return -1;
@@ -216,7 +219,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (((ctx->chain = sk_X509_new_null()) == NULL) ||
(!sk_X509_push(ctx->chain, ctx->cert))) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
CRYPTO_add(&ctx->cert->references, 1, CRYPTO_LOCK_X509);
ctx->last_untrusted = 1;
@@ -225,7 +229,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (ctx->untrusted != NULL
&& (sktmp = sk_X509_dup(ctx->untrusted)) == NULL) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
num = sk_X509_num(ctx->chain);
@@ -249,7 +254,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) {
ok = ctx->get_issuer(&xtmp, ctx, x);
if (ok < 0)
- goto end;
+ goto err;
/*
* If successful for now free up cert so it will be picked up
* again later.
@@ -266,7 +271,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (xtmp != NULL) {
if (!sk_X509_push(ctx->chain, xtmp)) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
CRYPTO_add(&xtmp->references, 1, CRYPTO_LOCK_X509);
(void)sk_X509_delete_ptr(sktmp, xtmp);
@@ -314,7 +320,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
bad_chain = 1;
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
} else {
/*
* We have a match: replace certificate with store
@@ -347,25 +353,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
ok = ctx->get_issuer(&xtmp, ctx, x);
if (ok < 0)
- goto end;
+ goto err;
if (ok == 0)
break;
x = xtmp;
if (!sk_X509_push(ctx->chain, x)) {
X509_free(xtmp);
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- ok = 0;
- goto end;
+ ok = -1;
+ goto err;
}
num++;
}
/* we now have our chain, lets check it... */
- i = check_trust(ctx);
+ if ((trust = check_trust(ctx)) == X509_TRUST_REJECTED) {
+ /* Callback already issued */
+ ok = 0;
+ goto err;
+ }
- /* If explicitly rejected error */
- if (i == X509_TRUST_REJECTED)
- goto end;
/*
* If it's not explicitly trusted then check if there is an alternative
* chain that could be used. We only do this if we haven't already
@@ -373,14 +380,14 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
* chain checking
*/
retry = 0;
- if (i != X509_TRUST_TRUSTED
+ if (trust != X509_TRUST_TRUSTED
&& !(ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST)
&& !(ctx->param->flags & X509_V_FLAG_NO_ALT_CHAINS)) {
while (j-- > 1) {
xtmp2 = sk_X509_value(ctx->chain, j - 1);
ok = ctx->get_issuer(&xtmp, ctx, xtmp2);
if (ok < 0)
- goto end;
+ goto err;
/* Check if we found an alternate chain */
if (ok > 0) {
/*
@@ -410,7 +417,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
* self signed certificate in which case we've indicated an error already
* and set bad_chain == 1
*/
- if (i != X509_TRUST_TRUSTED && !bad_chain) {
+ if (trust != X509_TRUST_TRUSTED && !bad_chain) {
if ((chain_ss == NULL) || !ctx->check_issued(ctx, x, chain_ss)) {
if (ctx->last_untrusted >= num)
ctx->error = X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY;
@@ -431,26 +438,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
bad_chain = 1;
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
}
/* We have the chain complete: now we need to check its purpose */
ok = check_chain_extensions(ctx);
if (!ok)
- goto end;
+ goto err;
/* Check name constraints */
ok = check_name_constraints(ctx);
if (!ok)
- goto end;
+ goto err;
ok = check_id(ctx);
if (!ok)
- goto end;
+ goto err;
/* We may as well copy down any DSA parameters that are required */
X509_get_pubkey_parameters(NULL, ctx->chain);
@@ -462,16 +469,16 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
ok = ctx->check_revocation(ctx);
if (!ok)
- goto end;
+ goto err;
- i = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
- ctx->param->flags);
- if (i != X509_V_OK) {
- ctx->error = i;
+ err = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
+ ctx->param->flags);
+ if (err != X509_V_OK) {
+ ctx->error = err;
ctx->current_cert = sk_X509_value(ctx->chain, ctx->error_depth);
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
}
/* At this point, we have a chain and need to verify it */
@@ -480,25 +487,28 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
else
ok = internal_verify(ctx);
if (!ok)
- goto end;
+ goto err;
#ifndef OPENSSL_NO_RFC3779
/* RFC 3779 path validation, now that CRL check has been done */
ok = v3_asid_validate_path(ctx);
if (!ok)
- goto end;
+ goto err;
ok = v3_addr_validate_path(ctx);
if (!ok)
- goto end;
+ goto err;
#endif
/* If we get this far evaluate policies */
if (!bad_chain && (ctx->param->flags & X509_V_FLAG_POLICY_CHECK))
ok = ctx->check_policy(ctx);
if (!ok)
- goto end;
+ goto err;
if (0) {
- end:
+ err:
+ /* Ensure we return an error */
+ if (ok > 0)
+ ok = 0;
X509_get_pubkey_parameters(NULL, ctx->chain);
}
if (sktmp != NULL)
diff --git a/deps/openssl/openssl/doc/apps/ciphers.pod b/deps/openssl/openssl/doc/apps/ciphers.pod
index 1c26e3b3da..9643b4d48c 100644
--- a/deps/openssl/openssl/doc/apps/ciphers.pod
+++ b/deps/openssl/openssl/doc/apps/ciphers.pod
@@ -38,25 +38,21 @@ SSL v2 and for SSL v3/TLS v1.
Like B<-v>, but include cipher suite codes in output (hex format).
-=item B<-ssl3>
+=item B<-ssl3>, B<-tls1>
-only include SSL v3 ciphers.
+This lists ciphers compatible with any of SSLv3, TLSv1, TLSv1.1 or TLSv1.2.
=item B<-ssl2>
-only include SSL v2 ciphers.
-
-=item B<-tls1>
-
-only include TLS v1 ciphers.
+Only include SSLv2 ciphers.
=item B<-h>, B<-?>
-print a brief usage message.
+Print a brief usage message.
=item B<cipherlist>
-a cipher list to convert to a cipher preference list. If it is not included
+A cipher list to convert to a cipher preference list. If it is not included
then the default cipher list will be used. The format is described below.
=back
@@ -109,9 +105,10 @@ The following is a list of all permitted cipher strings and their meanings.
=item B<DEFAULT>
-the default cipher list. This is determined at compile time and
-is normally B<ALL:!EXPORT:!aNULL:!eNULL:!SSLv2>. This must be the firstcipher string
-specified.
+The default cipher list.
+This is determined at compile time and is normally
+B<ALL:!EXPORT:!aNULL:!eNULL:!SSLv2>.
+When used, this must be the first cipherstring specified.
=item B<COMPLEMENTOFDEFAULT>
@@ -139,34 +136,46 @@ than 128 bits, and some cipher suites with 128-bit keys.
=item B<LOW>
-"low" encryption cipher suites, currently those using 64 or 56 bit encryption algorithms
-but excluding export cipher suites.
+Low strength encryption cipher suites, currently those using 64 or 56 bit
+encryption algorithms but excluding export cipher suites.
+As of OpenSSL 1.0.2g, these are disabled in default builds.
=item B<EXP>, B<EXPORT>
-export encryption algorithms. Including 40 and 56 bits algorithms.
+Export strength encryption algorithms, including 40- and 56-bit algorithms.
+As of OpenSSL 1.0.2g, these are disabled in default builds.
=item B<EXPORT40>
-40 bit export encryption algorithms
+40-bit export encryption algorithms.
+As of OpenSSL 1.0.2g, these are disabled in default builds.
=item B<EXPORT56>
-56 bit export encryption algorithms. In OpenSSL 0.9.8c and later the set of
+56-bit export encryption algorithms. In OpenSSL 0.9.8c and later the set of
56 bit export ciphers is empty unless OpenSSL has been explicitly configured
with support for experimental ciphers.
+As of OpenSSL 1.0.2g, these are disabled in default builds.
=item B<eNULL>, B<NULL>
-the "NULL" ciphers that is those offering no encryption. Because these offer no
-encryption at all and are a security risk they are disabled unless explicitly
-included.
+The "NULL" ciphers, that is, those offering no encryption. Because these offer no
+encryption at all and are a security risk, they are not enabled via either the
+B<DEFAULT> or B<ALL> cipher strings.
+Be careful when building cipherlists out of lower-level primitives such as
+B<kRSA> or B<aECDSA> as these do overlap with the B<eNULL> ciphers.
+When in doubt, include B<!eNULL> in your cipherlist.
=item B<aNULL>
-the cipher suites offering no authentication. This is currently the anonymous
+The cipher suites offering no authentication. This is currently the anonymous
DH algorithms and anonymous ECDH algorithms. These cipher suites are vulnerable
to a "man in the middle" attack and so their use is normally discouraged.
+These are excluded from the B<DEFAULT> ciphers, but included in the B<ALL>
+ciphers.
+Be careful when building cipherlists out of lower-level primitives such as
+B<kDHE> or B<AES> as these do overlap with the B<aNULL> ciphers.
+When in doubt, include B<!aNULL> in your cipherlist.
=item B<kRSA>, B<RSA>
@@ -582,11 +591,11 @@ Note: these ciphers can also be used in SSL v3.
=head2 Deprecated SSL v2.0 cipher suites.
SSL_CK_RC4_128_WITH_MD5 RC4-MD5
- SSL_CK_RC4_128_EXPORT40_WITH_MD5 EXP-RC4-MD5
- SSL_CK_RC2_128_CBC_WITH_MD5 RC2-MD5
- SSL_CK_RC2_128_CBC_EXPORT40_WITH_MD5 EXP-RC2-MD5
+ SSL_CK_RC4_128_EXPORT40_WITH_MD5 Not implemented.
+ SSL_CK_RC2_128_CBC_WITH_MD5 RC2-CBC-MD5
+ SSL_CK_RC2_128_CBC_EXPORT40_WITH_MD5 Not implemented.
SSL_CK_IDEA_128_CBC_WITH_MD5 IDEA-CBC-MD5
- SSL_CK_DES_64_CBC_WITH_MD5 DES-CBC-MD5
+ SSL_CK_DES_64_CBC_WITH_MD5 Not implemented.
SSL_CK_DES_192_EDE3_CBC_WITH_MD5 DES-CBC3-MD5
=head1 NOTES
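The eNULL/aNULL guidance above carries over directly to application code: exclude both explicitly whenever a cipher list is assembled from lower-level primitives. A minimal sketch, where the cipher string itself is only an example:

    #include <openssl/ssl.h>

    /* Follow the "when in doubt" advice: exclude unauthenticated and
     * unencrypted suites explicitly. */
    static int configure_ciphers(SSL_CTX *ctx)
    {
        return SSL_CTX_set_cipher_list(ctx, "DEFAULT:!aNULL:!eNULL");
    }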
diff --git a/deps/openssl/openssl/doc/apps/pkeyutl.pod b/deps/openssl/openssl/doc/apps/pkeyutl.pod
index 27be9a9007..5da347c97d 100644
--- a/deps/openssl/openssl/doc/apps/pkeyutl.pod
+++ b/deps/openssl/openssl/doc/apps/pkeyutl.pod
@@ -137,6 +137,19 @@ Unless otherwise mentioned all algorithms support the B<digest:alg> option
which specifies the digest in use for sign, verify and verifyrecover operations.
The value B<alg> should represent a digest name as used in the
EVP_get_digestbyname() function for example B<sha1>.
+This value is used only for sanity-checking the lengths of data passed in to
+B<pkeyutl> and for creating the structures that make up the signature
+(e.g. B<DigestInfo> in RSASSA PKCS#1 v1.5 signatures).
+In the case of RSA, ECDSA and DSA signatures, this utility
+will not perform hashing on the input data but rather use the data directly as
+the input to the signature algorithm. Depending on the key type, signature type
+and mode of padding, the maximum acceptable length of the input data differs.
+In general, with RSA the signed data can't be longer than the key modulus; with
+ECDSA and DSA the data shouldn't be longer than the field size, otherwise it
+will be silently truncated to the field size.
+
+In other words, if the value of digest is B<sha1> the input should be the
+20-byte binary encoding of the SHA-1 hash function output.
=head1 RSA ALGORITHM
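Since pkeyutl signs the supplied bytes as-is, the input for digest:sha1 must already be a 20-byte SHA-1 hash. The same contract holds for the underlying EVP_PKEY_sign() API; a sketch under that assumption (the helper name, buffer handling and error paths are illustrative):

    #include <openssl/crypto.h>
    #include <openssl/evp.h>

    /* Sign an already-computed 20-byte SHA-1 digest, mirroring
     * "pkeyutl -sign -pkeyopt digest:sha1". */
    static int sign_sha1_digest(EVP_PKEY *pkey, const unsigned char hash[20],
                                unsigned char **sig, size_t *siglen)
    {
        EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(pkey, NULL);
        int ok = 0;

        if (ctx == NULL)
            return 0;
        if (EVP_PKEY_sign_init(ctx) <= 0)
            goto done;
        if (EVP_PKEY_CTX_set_signature_md(ctx, EVP_sha1()) <= 0)
            goto done;
        if (EVP_PKEY_sign(ctx, NULL, siglen, hash, 20) <= 0)   /* query size */
            goto done;
        if ((*sig = OPENSSL_malloc(*siglen)) == NULL)
            goto done;
        ok = EVP_PKEY_sign(ctx, *sig, siglen, hash, 20) > 0;
     done:
        EVP_PKEY_CTX_free(ctx);
        return ok;
    }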
diff --git a/deps/openssl/openssl/doc/apps/req.pod b/deps/openssl/openssl/doc/apps/req.pod
index 54a4d394d2..30653e5093 100644
--- a/deps/openssl/openssl/doc/apps/req.pod
+++ b/deps/openssl/openssl/doc/apps/req.pod
@@ -347,9 +347,12 @@ configuration file values.
=item B<default_bits>
-This specifies the default key size in bits. If not specified then
-512 is used. It is used if the B<-new> option is used. It can be
-overridden by using the B<-newkey> option.
+Specifies the default key size in bits.
+
+This option is used in conjunction with the B<-new> option to generate
+a new key. It can be overridden by specifying an explicit key size in
+the B<-newkey> option. The smallest accepted key size is 512 bits. If
+no key size is specified then 2048 bits is used.
=item B<default_keyfile>
diff --git a/deps/openssl/openssl/doc/apps/s_client.pod b/deps/openssl/openssl/doc/apps/s_client.pod
index 84d0527069..618df9659d 100644
--- a/deps/openssl/openssl/doc/apps/s_client.pod
+++ b/deps/openssl/openssl/doc/apps/s_client.pod
@@ -201,15 +201,11 @@ Use the PSK key B<key> when using a PSK cipher suite. The key is
given as a hexadecimal number without leading 0x, for example -psk
1a2b3c4d.
-=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2>
+=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-tls1_1>, B<-tls1_2>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2>
-these options disable the use of certain SSL or TLS protocols. By default
-the initial handshake uses a method which should be compatible with all
-servers and permit them to use SSL v3, SSL v2 or TLS as appropriate.
-
-Unfortunately there are still ancient and broken servers in use which
-cannot handle this technique and will fail to connect. Some servers only
-work if TLS is turned off.
+These options require or disable the use of the specified SSL or TLS protocols.
+By default the initial handshake uses a I<version-flexible> method which will
+negotiate the highest mutually supported protocol version.
=item B<-fallback_scsv>
diff --git a/deps/openssl/openssl/doc/apps/s_server.pod b/deps/openssl/openssl/doc/apps/s_server.pod
index baca779244..6f4acb7006 100644
--- a/deps/openssl/openssl/doc/apps/s_server.pod
+++ b/deps/openssl/openssl/doc/apps/s_server.pod
@@ -217,11 +217,11 @@ Use the PSK key B<key> when using a PSK cipher suite. The key is
given as a hexadecimal number without leading 0x, for example -psk
1a2b3c4d.
-=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>
+=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-tls1_1>, B<-tls1_2>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2>
-these options disable the use of certain SSL or TLS protocols. By default
-the initial handshake uses a method which should be compatible with all
-servers and permit them to use SSL v3, SSL v2 or TLS as appropriate.
+These options require or disable the use of the specified SSL or TLS protocols.
+By default the initial handshake uses a I<version-flexible> method which will
+negotiate the highest mutually supported protocol version.
=item B<-bugs>
diff --git a/deps/openssl/openssl/doc/crypto/BIO_s_mem.pod b/deps/openssl/openssl/doc/crypto/BIO_s_mem.pod
index 8f85e0dcee..9f239648d7 100644
--- a/deps/openssl/openssl/doc/crypto/BIO_s_mem.pod
+++ b/deps/openssl/openssl/doc/crypto/BIO_s_mem.pod
@@ -16,7 +16,7 @@ BIO_get_mem_ptr, BIO_new_mem_buf - memory BIO
BIO_set_mem_buf(BIO *b,BUF_MEM *bm,int c)
BIO_get_mem_ptr(BIO *b,BUF_MEM **pp)
- BIO *BIO_new_mem_buf(void *buf, int len);
+ BIO *BIO_new_mem_buf(const void *buf, int len);
=head1 DESCRIPTION
@@ -61,7 +61,7 @@ BIO_get_mem_ptr() places the underlying BUF_MEM structure in B<pp>. It is
a macro.
BIO_new_mem_buf() creates a memory BIO using B<len> bytes of data at B<buf>,
-if B<len> is -1 then the B<buf> is assumed to be null terminated and its
+if B<len> is -1 then the B<buf> is assumed to be nul terminated and its
length is determined by B<strlen>. The BIO is set to a read only state and
as a result cannot be written to. This is useful when some data needs to be
made available from a static area of memory in the form of a BIO. The
diff --git a/deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod b/deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod
index 2bf1a60e90..e81d76ae77 100644
--- a/deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod
+++ b/deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod
@@ -74,7 +74,7 @@ B<prime256v1>). Curve names are case sensitive.
=item B<-named_curve>
-This sets the temporary curve used for ephemeral ECDH modes. Only used by
+This sets the temporary curve used for ephemeral ECDH modes. Only used by
servers
The B<value> argument is a curve name or the special value B<auto> which
@@ -85,7 +85,7 @@ can be either the B<NIST> name (e.g. B<P-256>) or an OpenSSL OID name
=item B<-cipher>
Sets the cipher suite list to B<value>. Note: syntax checking of B<value> is
-currently not performed unless a B<SSL> or B<SSL_CTX> structure is
+currently not performed unless a B<SSL> or B<SSL_CTX> structure is
associated with B<cctx>.
=item B<-cert>
@@ -111,9 +111,9 @@ operations are permitted.
=item B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2>
-Disables protocol support for SSLv2, SSLv3, TLS 1.0, TLS 1.1 or TLS 1.2
-by setting the corresponding options B<SSL_OP_NO_SSL2>, B<SSL_OP_NO_SSL3>,
-B<SSL_OP_NO_TLS1>, B<SSL_OP_NO_TLS1_1> and B<SSL_OP_NO_TLS1_2> respectively.
+Disables protocol support for SSLv2, SSLv3, TLSv1.0, TLSv1.1 or TLSv1.2
+by setting the corresponding options B<SSL_OP_NO_SSLv2>, B<SSL_OP_NO_SSLv3>,
+B<SSL_OP_NO_TLSv1>, B<SSL_OP_NO_TLSv1_1> and B<SSL_OP_NO_TLSv1_2> respectively.
=item B<-bugs>
@@ -177,7 +177,7 @@ Note: the command prefix (if set) alters the recognised B<cmd> values.
=item B<CipherString>
Sets the cipher suite list to B<value>. Note: syntax checking of B<value> is
-currently not performed unless an B<SSL> or B<SSL_CTX> structure is
+currently not performed unless an B<SSL> or B<SSL_CTX> structure is
associated with B<cctx>.
=item B<Certificate>
@@ -244,7 +244,7 @@ B<prime256v1>). Curve names are case sensitive.
=item B<ECDHParameters>
-This sets the temporary curve used for ephemeral ECDH modes. Only used by
+This sets the temporary curve used for ephemeral ECDH modes. Only used by
servers
The B<value> argument is a curve name or the special value B<Automatic> which
@@ -258,10 +258,11 @@ The supported versions of the SSL or TLS protocol.
The B<value> argument is a comma separated list of supported protocols to
enable or disable. If an protocol is preceded by B<-> that version is disabled.
-All versions are enabled by default, though applications may choose to
-explicitly disable some. Currently supported protocol values are B<SSLv2>,
-B<SSLv3>, B<TLSv1>, B<TLSv1.1> and B<TLSv1.2>. The special value B<ALL> refers
-to all supported versions.
+Currently supported protocol values are B<SSLv2>, B<SSLv3>, B<TLSv1>,
+B<TLSv1.1> and B<TLSv1.2>.
+All protocol versions other than B<SSLv2> are enabled by default.
+To avoid inadvertent enabling of B<SSLv2>, when SSLv2 is disabled, it is not
+possible to enable it via the B<Protocol> command.
=item B<Options>
@@ -339,16 +340,16 @@ The value is a directory name.
The order of operations is significant. This can be used to set either defaults
or values which cannot be overridden. For example if an application calls:
- SSL_CONF_cmd(ctx, "Protocol", "-SSLv2");
+ SSL_CONF_cmd(ctx, "Protocol", "-SSLv3");
SSL_CONF_cmd(ctx, userparam, uservalue);
-it will disable SSLv2 support by default but the user can override it. If
+it will disable SSLv3 support by default but the user can override it. If
however the call sequence is:
SSL_CONF_cmd(ctx, userparam, uservalue);
- SSL_CONF_cmd(ctx, "Protocol", "-SSLv2");
+ SSL_CONF_cmd(ctx, "Protocol", "-SSLv3");
-SSLv2 is B<always> disabled and attempt to override this by the user are
+then SSLv3 is B<always> disabled and attempts to override this by the user are
ignored.
By checking the return code of SSL_CTX_cmd() it is possible to query if a
@@ -372,7 +373,7 @@ can be checked instead. If -3 is returned a required argument is missing
and an error is indicated. If 0 is returned some other error occurred and
this can be reported back to the user.
-The function SSL_CONF_cmd_value_type() can be used by applications to
+The function SSL_CONF_cmd_value_type() can be used by applications to
check for the existence of a command or to perform additional syntax
checking or translation of the command value. For example if the return
value is B<SSL_CONF_TYPE_FILE> an application could translate a relative
diff --git a/deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod b/deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod
index 491ac8c172..b8cc879784 100644
--- a/deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod
+++ b/deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod
@@ -2,13 +2,55 @@
=head1 NAME
-SSL_CTX_new - create a new SSL_CTX object as framework for TLS/SSL enabled functions
+SSL_CTX_new,
+SSLv23_method, SSLv23_server_method, SSLv23_client_method,
+TLSv1_2_method, TLSv1_2_server_method, TLSv1_2_client_method,
+TLSv1_1_method, TLSv1_1_server_method, TLSv1_1_client_method,
+TLSv1_method, TLSv1_server_method, TLSv1_client_method,
+SSLv3_method, SSLv3_server_method, SSLv3_client_method,
+SSLv2_method, SSLv2_server_method, SSLv2_client_method,
+DTLS_method, DTLS_server_method, DTLS_client_method,
+DTLSv1_2_method, DTLSv1_2_server_method, DTLSv1_2_client_method,
+DTLSv1_method, DTLSv1_server_method, DTLSv1_client_method -
+create a new SSL_CTX object as framework for TLS/SSL enabled functions
=head1 SYNOPSIS
#include <openssl/ssl.h>
SSL_CTX *SSL_CTX_new(const SSL_METHOD *method);
+ const SSL_METHOD *SSLv23_method(void);
+ const SSL_METHOD *SSLv23_server_method(void);
+ const SSL_METHOD *SSLv23_client_method(void);
+ const SSL_METHOD *TLSv1_2_method(void);
+ const SSL_METHOD *TLSv1_2_server_method(void);
+ const SSL_METHOD *TLSv1_2_client_method(void);
+ const SSL_METHOD *TLSv1_1_method(void);
+ const SSL_METHOD *TLSv1_1_server_method(void);
+ const SSL_METHOD *TLSv1_1_client_method(void);
+ const SSL_METHOD *TLSv1_method(void);
+ const SSL_METHOD *TLSv1_server_method(void);
+ const SSL_METHOD *TLSv1_client_method(void);
+ #ifndef OPENSSL_NO_SSL3_METHOD
+ const SSL_METHOD *SSLv3_method(void);
+ const SSL_METHOD *SSLv3_server_method(void);
+ const SSL_METHOD *SSLv3_client_method(void);
+ #endif
+ #ifndef OPENSSL_NO_SSL2
+ const SSL_METHOD *SSLv2_method(void);
+ const SSL_METHOD *SSLv2_server_method(void);
+ const SSL_METHOD *SSLv2_client_method(void);
+ #endif
+
+ const SSL_METHOD *DTLS_method(void);
+ const SSL_METHOD *DTLS_server_method(void);
+ const SSL_METHOD *DTLS_client_method(void);
+ const SSL_METHOD *DTLSv1_2_method(void);
+ const SSL_METHOD *DTLSv1_2_server_method(void);
+ const SSL_METHOD *DTLSv1_2_client_method(void);
+ const SSL_METHOD *DTLSv1_method(void);
+ const SSL_METHOD *DTLSv1_server_method(void);
+ const SSL_METHOD *DTLSv1_client_method(void);
=head1 DESCRIPTION
@@ -23,65 +65,88 @@ client only type. B<method> can be of the following types:
=over 4
-=item SSLv2_method(void), SSLv2_server_method(void), SSLv2_client_method(void)
+=item SSLv23_method(), SSLv23_server_method(), SSLv23_client_method()
+
+These are the general-purpose I<version-flexible> SSL/TLS methods.
+The actual protocol version used will be negotiated to the highest version
+mutually supported by the client and the server.
+The supported protocols are SSLv2, SSLv3, TLSv1, TLSv1.1 and TLSv1.2.
+Most applications should use these methods, and avoid the version-specific
+methods described below.
+
+The list of protocols available can be further limited using the
+B<SSL_OP_NO_SSLv2>, B<SSL_OP_NO_SSLv3>, B<SSL_OP_NO_TLSv1>,
+B<SSL_OP_NO_TLSv1_1> and B<SSL_OP_NO_TLSv1_2> options of the
+L<SSL_CTX_set_options(3)> or L<SSL_set_options(3)> functions.
+Clients should avoid creating "holes" in the set of protocols they support,
+when disabling a protocol, make sure that you also disable either all previous
+or all subsequent protocol versions.
+In clients, when a protocol version is disabled without disabling I<all>
+previous protocol versions, the effect is to also disable all subsequent
+protocol versions.
+
+The SSLv2 and SSLv3 protocols are deprecated and should generally not be used.
+Applications should typically use L<SSL_CTX_set_options(3)> in combination with
+the B<SSL_OP_NO_SSLv3> flag to disable negotiation of SSLv3 via the above
+I<version-flexible> SSL/TLS methods.
+The B<SSL_OP_NO_SSLv2> option is set by default, and would need to be cleared
+via L<SSL_CTX_clear_options(3)> in order to enable negotiation of SSLv2.
+
+=item TLSv1_2_method(), TLSv1_2_server_method(), TLSv1_2_client_method()
-A TLS/SSL connection established with these methods will only understand
-the SSLv2 protocol. A client will send out SSLv2 client hello messages
-and will also indicate that it only understand SSLv2. A server will only
-understand SSLv2 client hello messages.
+A TLS/SSL connection established with these methods will only understand the
+TLSv1.2 protocol. A client will send out TLSv1.2 client hello messages and
+will also indicate that it only understands TLSv1.2. A server will only
+understand TLSv1.2 client hello messages.
-=item SSLv3_method(void), SSLv3_server_method(void), SSLv3_client_method(void)
+=item TLSv1_1_method(), TLSv1_1_server_method(), TLSv1_1_client_method()
A TLS/SSL connection established with these methods will only understand the
-SSLv3 protocol. A client will send out SSLv3 client hello messages
-and will indicate that it only understands SSLv3. A server will only understand
-SSLv3 client hello messages. This especially means, that it will
-not understand SSLv2 client hello messages which are widely used for
-compatibility reasons, see SSLv23_*_method().
+TLSv1.1 protocol. A client will send out TLSv1.1 client hello messages and
+will also indicate that it only understands TLSv1.1. A server will only
+understand TLSv1.1 client hello messages.
-=item TLSv1_method(void), TLSv1_server_method(void), TLSv1_client_method(void)
+=item TLSv1_method(), TLSv1_server_method(), TLSv1_client_method()
A TLS/SSL connection established with these methods will only understand the
-TLSv1 protocol. A client will send out TLSv1 client hello messages
-and will indicate that it only understands TLSv1. A server will only understand
-TLSv1 client hello messages. This especially means, that it will
-not understand SSLv2 client hello messages which are widely used for
-compatibility reasons, see SSLv23_*_method(). It will also not understand
-SSLv3 client hello messages.
-
-=item SSLv23_method(void), SSLv23_server_method(void), SSLv23_client_method(void)
-
-A TLS/SSL connection established with these methods may understand the SSLv2,
-SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols.
-
-If the cipher list does not contain any SSLv2 ciphersuites (the default
-cipher list does not) or extensions are required (for example server name)
-a client will send out TLSv1 client hello messages including extensions and
-will indicate that it also understands TLSv1.1, TLSv1.2 and permits a
-fallback to SSLv3. A server will support SSLv3, TLSv1, TLSv1.1 and TLSv1.2
-protocols. This is the best choice when compatibility is a concern.
-
-If any SSLv2 ciphersuites are included in the cipher list and no extensions
-are required then SSLv2 compatible client hellos will be used by clients and
-SSLv2 will be accepted by servers. This is B<not> recommended due to the
-insecurity of SSLv2 and the limited nature of the SSLv2 client hello
-prohibiting the use of extensions.
+TLSv1 protocol. A client will send out TLSv1 client hello messages and will
+indicate that it only understands TLSv1. A server will only understand TLSv1
+client hello messages.
-=back
+=item SSLv3_method(), SSLv3_server_method(), SSLv3_client_method()
+
+A TLS/SSL connection established with these methods will only understand the
+SSLv3 protocol. A client will send out SSLv3 client hello messages and will
+indicate that it only understands SSLv3. A server will only understand SSLv3
+client hello messages. The SSLv3 protocol is deprecated and should not be
+used.
+
+=item SSLv2_method(), SSLv2_server_method(), SSLv2_client_method()
+
+A TLS/SSL connection established with these methods will only understand the
+SSLv2 protocol. A client will send out SSLv2 client hello messages and will
+also indicate that it only understands SSLv2. A server will only understand
+SSLv2 client hello messages. The SSLv2 protocol offers little to no security
+and should not be used.
+As of OpenSSL 1.0.2g, EXPORT ciphers and 56-bit DES are no longer available
+with SSLv2.
-The list of protocols available can later be limited using the SSL_OP_NO_SSLv2,
-SSL_OP_NO_SSLv3, SSL_OP_NO_TLSv1, SSL_OP_NO_TLSv1_1 and SSL_OP_NO_TLSv1_2
-options of the SSL_CTX_set_options() or SSL_set_options() functions.
-Using these options it is possible to choose e.g. SSLv23_server_method() and
-be able to negotiate with all possible clients, but to only allow newer
-protocols like TLSv1, TLSv1.1 or TLS v1.2.
+=item DTLS_method(), DTLS_server_method(), DTLS_client_method()
-Applications which never want to support SSLv2 (even is the cipher string
-is configured to use SSLv2 ciphersuites) can set SSL_OP_NO_SSLv2.
+These are the version-flexible DTLS methods.
+
+=item DTLSv1_2_method(), DTLSv1_2_server_method(), DTLSv1_2_client_method()
+
+These are the version-specific methods for DTLSv1.2.
+
+=item DTLSv1_method(), DTLSv1_server_method(), DTLSv1_client_method()
+
+These are the version-specific methods for DTLSv1.
+
+=back
-SSL_CTX_new() initializes the list of ciphers, the session cache setting,
-the callbacks, the keys and certificates and the options to its default
-values.
+SSL_CTX_new() initializes the list of ciphers, the session cache setting, the
+callbacks, the keys and certificates and the options to its default values.
=head1 RETURN VALUES
@@ -91,8 +156,8 @@ The following return values can occur:
=item NULL
-The creation of a new SSL_CTX object failed. Check the error stack to
-find out the reason.
+The creation of a new SSL_CTX object failed. Check the error stack to find out
+the reason.
=item Pointer to an SSL_CTX object
@@ -102,6 +167,7 @@ The return value points to an allocated SSL_CTX object.
=head1 SEE ALSO
+L<SSL_CTX_set_options(3)>, L<SSL_CTX_clear_options(3)>, L<SSL_set_options(3)>,
L<SSL_CTX_free(3)|SSL_CTX_free(3)>, L<SSL_accept(3)|SSL_accept(3)>,
L<ssl(3)|ssl(3)>, L<SSL_set_connect_state(3)|SSL_set_connect_state(3)>
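A short sketch of the pattern the rewritten text recommends: use the version-flexible method and prune deprecated protocols with SSL_CTX_set_options() instead of selecting a version-specific method (error handling trimmed):

    #include <openssl/ssl.h>

    /* Version-flexible context with the deprecated protocols disabled. */
    static SSL_CTX *make_ctx(void)
    {
        SSL_CTX *ctx = SSL_CTX_new(SSLv23_method());

        if (ctx == NULL)
            return NULL;
        /* SSL_OP_NO_SSLv2 is already set by default as of 1.0.2g. */
        SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3);
        return ctx;
    }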
diff --git a/deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod b/deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod
index e80a72cd4d..9a7e98c1d4 100644
--- a/deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod
+++ b/deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod
@@ -189,15 +189,25 @@ browser has a cert, it will crash/hang. Works for 3.x and 4.xbeta
=item SSL_OP_NO_SSLv2
Do not use the SSLv2 protocol.
+As of OpenSSL 1.0.2g the B<SSL_OP_NO_SSLv2> option is set by default.
=item SSL_OP_NO_SSLv3
Do not use the SSLv3 protocol.
+It is recommended that applications should set this option.
=item SSL_OP_NO_TLSv1
Do not use the TLSv1 protocol.
+=item SSL_OP_NO_TLSv1_1
+
+Do not use the TLSv1.1 protocol.
+
+=item SSL_OP_NO_TLSv1_2
+
+Do not use the TLSv1.2 protocol.
+
=item SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION
When performing renegotiation as a server, always start a new session
diff --git a/deps/openssl/openssl/doc/ssl/ssl.pod b/deps/openssl/openssl/doc/ssl/ssl.pod
index 242087e691..70cca178a2 100644
--- a/deps/openssl/openssl/doc/ssl/ssl.pod
+++ b/deps/openssl/openssl/doc/ssl/ssl.pod
@@ -130,41 +130,86 @@ protocol methods defined in B<SSL_METHOD> structures.
=over 4
-=item const SSL_METHOD *B<SSLv2_client_method>(void);
+=item const SSL_METHOD *B<SSLv23_method>(void);
-Constructor for the SSLv2 SSL_METHOD structure for a dedicated client.
+Constructor for the I<version-flexible> SSL_METHOD structure for
+clients, servers or both.
+See L<SSL_CTX_new(3)> for details.
-=item const SSL_METHOD *B<SSLv2_server_method>(void);
+=item const SSL_METHOD *B<SSLv23_client_method>(void);
-Constructor for the SSLv2 SSL_METHOD structure for a dedicated server.
+Constructor for the I<version-flexible> SSL_METHOD structure for
+clients.
-=item const SSL_METHOD *B<SSLv2_method>(void);
+=item const SSL_METHOD *B<SSLv23_server_method>(void);
-Constructor for the SSLv2 SSL_METHOD structure for combined client and server.
+Constructor for the I<version-flexible> SSL_METHOD structure for
+servers.
-=item const SSL_METHOD *B<SSLv3_client_method>(void);
+=item const SSL_METHOD *B<TLSv1_2_method>(void);
-Constructor for the SSLv3 SSL_METHOD structure for a dedicated client.
+Constructor for the TLSv1.2 SSL_METHOD structure for clients, servers
+or both.
-=item const SSL_METHOD *B<SSLv3_server_method>(void);
+=item const SSL_METHOD *B<TLSv1_2_client_method>(void);
-Constructor for the SSLv3 SSL_METHOD structure for a dedicated server.
+Constructor for the TLSv1.2 SSL_METHOD structure for clients.
-=item const SSL_METHOD *B<SSLv3_method>(void);
+=item const SSL_METHOD *B<TLSv1_2_server_method>(void);
+
+Constructor for the TLSv1.2 SSL_METHOD structure for servers.
+
+=item const SSL_METHOD *B<TLSv1_1_method>(void);
-Constructor for the SSLv3 SSL_METHOD structure for combined client and server.
+Constructor for the TLSv1.1 SSL_METHOD structure for clients, servers
+or both.
+
+=item const SSL_METHOD *B<TLSv1_1_client_method>(void);
+
+Constructor for the TLSv1.1 SSL_METHOD structure for clients.
+
+=item const SSL_METHOD *B<TLSv1_1_server_method>(void);
+
+Constructor for the TLSv1.1 SSL_METHOD structure for servers.
+
+=item const SSL_METHOD *B<TLSv1_method>(void);
+
+Constructor for the TLSv1 SSL_METHOD structure for clients, servers
+or both.
=item const SSL_METHOD *B<TLSv1_client_method>(void);
-Constructor for the TLSv1 SSL_METHOD structure for a dedicated client.
+Constructor for the TLSv1 SSL_METHOD structure for clients.
=item const SSL_METHOD *B<TLSv1_server_method>(void);
-Constructor for the TLSv1 SSL_METHOD structure for a dedicated server.
+Constructor for the TLSv1 SSL_METHOD structure for servers.
-=item const SSL_METHOD *B<TLSv1_method>(void);
+=item const SSL_METHOD *B<SSLv3_method>(void);
+
+Constructor for the SSLv3 SSL_METHOD structure for clients, servers
+or both.
+
+=item const SSL_METHOD *B<SSLv3_client_method>(void);
+
+Constructor for the SSLv3 SSL_METHOD structure for clients.
+
+=item const SSL_METHOD *B<SSLv3_server_method>(void);
+
+Constructor for the SSLv3 SSL_METHOD structure for servers.
+
+=item const SSL_METHOD *B<SSLv2_method>(void);
+
+Constructor for the SSLv2 SSL_METHOD structure for clients, servers
+or both.
+
+=item const SSL_METHOD *B<SSLv2_client_method>(void);
+
+Constructor for the SSLv2 SSL_METHOD structure for clients.
+
+=item const SSL_METHOD *B<SSLv2_server_method>(void);
-Constructor for the TLSv1 SSL_METHOD structure for combined client and server.
+Constructor for the SSLv2 SSL_METHOD structure for servers.
=back
diff --git a/deps/openssl/openssl/engines/e_capi.c b/deps/openssl/openssl/engines/e_capi.c
index f4cd2ffe7f..6e524633f3 100644
--- a/deps/openssl/openssl/engines/e_capi.c
+++ b/deps/openssl/openssl/engines/e_capi.c
@@ -114,6 +114,26 @@
# define CERT_SYSTEM_STORE_CURRENT_USER 0x00010000
# endif
+# ifndef ALG_SID_SHA_256
+# define ALG_SID_SHA_256 12
+# endif
+# ifndef ALG_SID_SHA_384
+# define ALG_SID_SHA_384 13
+# endif
+# ifndef ALG_SID_SHA_512
+# define ALG_SID_SHA_512 14
+# endif
+
+# ifndef CALG_SHA_256
+# define CALG_SHA_256 (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_256)
+# endif
+# ifndef CALG_SHA_384
+# define CALG_SHA_384 (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_384)
+# endif
+# ifndef CALG_SHA_512
+# define CALG_SHA_512 (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_512)
+# endif
+
# include <openssl/engine.h>
# include <openssl/pem.h>
# include <openssl/x509v3.h>
@@ -800,6 +820,18 @@ int capi_rsa_sign(int dtype, const unsigned char *m, unsigned int m_len,
}
/* Convert the signature type to a CryptoAPI algorithm ID */
switch (dtype) {
+ case NID_sha256:
+ alg = CALG_SHA_256;
+ break;
+
+ case NID_sha384:
+ alg = CALG_SHA_384;
+ break;
+
+ case NID_sha512:
+ alg = CALG_SHA_512;
+ break;
+
case NID_sha1:
alg = CALG_SHA1;
break;
diff --git a/deps/openssl/openssl/include/openssl/bio.h b/deps/openssl/openssl/include/openssl/bio.h
index 6e2293bc66..6790aed28e 100644
--- a/deps/openssl/openssl/include/openssl/bio.h
+++ b/deps/openssl/openssl/include/openssl/bio.h
@@ -479,7 +479,7 @@ struct bio_dgram_sctp_prinfo {
# define BIO_get_conn_hostname(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,0)
# define BIO_get_conn_port(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1)
# define BIO_get_conn_ip(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2)
-# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,0,NULL)
+# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL)
# define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL)
@@ -689,7 +689,7 @@ long BIO_debug_callback(BIO *bio, int cmd, const char *argp, int argi,
long argl, long ret);
BIO_METHOD *BIO_s_mem(void);
-BIO *BIO_new_mem_buf(void *buf, int len);
+BIO *BIO_new_mem_buf(const void *buf, int len);
BIO_METHOD *BIO_s_socket(void);
BIO_METHOD *BIO_s_connect(void);
BIO_METHOD *BIO_s_accept(void);
diff --git a/deps/openssl/openssl/include/openssl/bn.h b/deps/openssl/openssl/include/openssl/bn.h
index 5696965e9a..86264ae631 100644
--- a/deps/openssl/openssl/include/openssl/bn.h
+++ b/deps/openssl/openssl/include/openssl/bn.h
@@ -125,6 +125,7 @@
#ifndef HEADER_BN_H
# define HEADER_BN_H
+# include <limits.h>
# include <openssl/e_os2.h>
# ifndef OPENSSL_NO_FP_API
# include <stdio.h> /* FILE */
@@ -721,8 +722,17 @@ const BIGNUM *BN_get0_nist_prime_521(void);
/* library internal functions */
-# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
- (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2))
+# define bn_expand(a,bits) \
+ ( \
+ bits > (INT_MAX - BN_BITS2 + 1) ? \
+ NULL \
+ : \
+ (((bits+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? \
+ (a) \
+ : \
+ bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2) \
+ )
+
# define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
BIGNUM *bn_expand2(BIGNUM *a, int words);
# ifndef OPENSSL_NO_DEPRECATED
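The reworked bn_expand macro rejects bit counts that would overflow the word-count arithmetic before dividing. A standalone illustration of the same guard, assuming a 64-bit build where BN_BITS2 is 64:

    #include <limits.h>
    #include <stdio.h>

    #define BN_BITS2 64    /* word size in bits on a typical 64-bit build */

    /* Refuse bit counts whose rounding-up would overflow a signed int,
     * mirroring the guarded macro above. */
    static int words_needed(int bits)
    {
        if (bits > INT_MAX - BN_BITS2 + 1)
            return -1;                      /* reject, as the macro returns NULL */
        return (bits + BN_BITS2 - 1) / BN_BITS2;
    }

    int main(void)
    {
        printf("%d\n", words_needed(521));      /* 9 words for a 521-bit number */
        printf("%d\n", words_needed(INT_MAX));  /* -1: rejected, no overflow */
        return 0;
    }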
diff --git a/deps/openssl/openssl/include/openssl/crypto.h b/deps/openssl/openssl/include/openssl/crypto.h
index c450d7a3c3..6c644ce12a 100644
--- a/deps/openssl/openssl/include/openssl/crypto.h
+++ b/deps/openssl/openssl/include/openssl/crypto.h
@@ -628,7 +628,7 @@ void OPENSSL_init(void);
* into a defined order as the return value when a != b is undefined, other
* than to be non-zero.
*/
-int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+int CRYPTO_memcmp(const volatile void *a, const volatile void *b, size_t len);
/* BEGIN ERROR CODES */
/*
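CRYPTO_memcmp() now takes volatile pointers; its purpose is a comparison whose timing does not depend on where the buffers differ. A generic sketch of that pattern, not necessarily the library's exact implementation:

    #include <stddef.h>

    /* Accumulate differences with OR so the work done does not depend on
     * where (or whether) the buffers differ; returns 0 iff equal. */
    static int memcmp_consttime(const volatile void *in_a,
                                const volatile void *in_b, size_t len)
    {
        const volatile unsigned char *a = in_a;
        const volatile unsigned char *b = in_b;
        unsigned char x = 0;
        size_t i;

        for (i = 0; i < len; i++)
            x |= a[i] ^ b[i];
        return x;
    }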
diff --git a/deps/openssl/openssl/include/openssl/dh.h b/deps/openssl/openssl/include/openssl/dh.h
index 5498a9dc10..a5bd9016aa 100644
--- a/deps/openssl/openssl/include/openssl/dh.h
+++ b/deps/openssl/openssl/include/openssl/dh.h
@@ -174,7 +174,7 @@ struct dh_st {
/* DH_check_pub_key error codes */
# define DH_CHECK_PUBKEY_TOO_SMALL 0x01
# define DH_CHECK_PUBKEY_TOO_LARGE 0x02
-# define DH_CHECK_PUBKEY_INVALID 0x03
+# define DH_CHECK_PUBKEY_INVALID 0x04
/*
* primes p where (p-1)/2 is prime too are called "safe"; we define this for
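DH_CHECK_PUBKEY_INVALID moves from 0x03 to 0x04 because the DH_check_pub_key() result is a bit mask and 0x03 would alias TOO_SMALL|TOO_LARGE. A short sketch of testing the codes as flags (the helper name is illustrative):

    #include <openssl/dh.h>

    /* Treat the DH_check_pub_key() result as a set of flag bits. */
    static int pub_key_ok(DH *dh, const BIGNUM *pub_key)
    {
        int codes = 0;

        if (!DH_check_pub_key(dh, pub_key, &codes))
            return 0;                            /* internal error */
        if (codes & (DH_CHECK_PUBKEY_TOO_SMALL |
                     DH_CHECK_PUBKEY_TOO_LARGE |
                     DH_CHECK_PUBKEY_INVALID))
            return 0;
        return 1;
    }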
diff --git a/deps/openssl/openssl/include/openssl/opensslv.h b/deps/openssl/openssl/include/openssl/opensslv.h
index 03b8c48437..4334fd15cd 100644
--- a/deps/openssl/openssl/include/openssl/opensslv.h
+++ b/deps/openssl/openssl/include/openssl/opensslv.h
@@ -30,11 +30,11 @@ extern "C" {
* (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
* major minor fix final patch/beta)
*/
-# define OPENSSL_VERSION_NUMBER 0x1000206fL
+# define OPENSSL_VERSION_NUMBER 0x1000207fL
# ifdef OPENSSL_FIPS
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f-fips 28 Jan 2016"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g-fips 1 Mar 2016"
# else
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f 28 Jan 2016"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g 1 Mar 2016"
# endif
# define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT
diff --git a/deps/openssl/openssl/include/openssl/srp.h b/deps/openssl/openssl/include/openssl/srp.h
index d072536fec..028892a1ff 100644
--- a/deps/openssl/openssl/include/openssl/srp.h
+++ b/deps/openssl/openssl/include/openssl/srp.h
@@ -82,16 +82,21 @@ typedef struct SRP_gN_cache_st {
DECLARE_STACK_OF(SRP_gN_cache)
typedef struct SRP_user_pwd_st {
+ /* Owned by us. */
char *id;
BIGNUM *s;
BIGNUM *v;
+ /* Not owned by us. */
const BIGNUM *g;
const BIGNUM *N;
+ /* Owned by us. */
char *info;
} SRP_user_pwd;
DECLARE_STACK_OF(SRP_user_pwd)
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd);
+
typedef struct SRP_VBASE_st {
STACK_OF(SRP_user_pwd) *users_pwd;
STACK_OF(SRP_gN_cache) *gN_cache;
@@ -115,7 +120,12 @@ DECLARE_STACK_OF(SRP_gN)
SRP_VBASE *SRP_VBASE_new(char *seed_key);
int SRP_VBASE_free(SRP_VBASE *vb);
int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file);
+
+/* This method ignores the configured seed and fails for an unknown user. */
SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+/* NOTE: unlike in SRP_VBASE_get_by_user, caller owns the returned pointer.*/
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username);
+
char *SRP_create_verifier(const char *user, const char *pass, char **salt,
char **verifier, const char *N, const char *g);
int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt,
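
The srp.h hunk adds SRP_VBASE_get1_by_user() and SRP_user_pwd_free(): unlike SRP_VBASE_get_by_user(), the "get1" variant hands back a record the caller owns and must free. A minimal sketch of the new ownership rule; the verifier file and user name are hypothetical, and error handling is elided.

/* Sketch: look up a user with the 1.0.2g API and release the result. */
#include <openssl/srp.h>

static int have_user(char *vfile, char *user)
{
    SRP_VBASE *vb = SRP_VBASE_new(NULL);
    int found = 0;

    if (vb == NULL)
        return 0;
    if (SRP_VBASE_init(vb, vfile) == 0) {       /* 0 == SRP_NO_ERROR */
        SRP_user_pwd *pwd = SRP_VBASE_get1_by_user(vb, user);
        if (pwd != NULL) {
            found = 1;
            SRP_user_pwd_free(pwd);             /* caller owns it, unlike get_by_user() */
        }
    }
    SRP_VBASE_free(vb);
    return found;
}
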
diff --git a/deps/openssl/openssl/include/openssl/ssl.h b/deps/openssl/openssl/include/openssl/ssl.h
index ae8c92575e..04d4007eeb 100644
--- a/deps/openssl/openssl/include/openssl/ssl.h
+++ b/deps/openssl/openssl/include/openssl/ssl.h
@@ -2713,7 +2713,6 @@ void ERR_load_SSL_strings(void);
# define SSL_F_SSL3_SETUP_KEY_BLOCK 157
# define SSL_F_SSL3_SETUP_READ_BUFFER 156
# define SSL_F_SSL3_SETUP_WRITE_BUFFER 291
-# define SSL_F_SSL3_SHUTDOWN 396
# define SSL_F_SSL3_WRITE_BYTES 158
# define SSL_F_SSL3_WRITE_PENDING 159
# define SSL_F_SSL_ADD_CERT_CHAIN 318
diff --git a/deps/openssl/openssl/ms/uplink-x86.pl b/deps/openssl/openssl/ms/uplink-x86.pl
index 0dffc14fcd..53b998d270 100755
--- a/deps/openssl/openssl/ms/uplink-x86.pl
+++ b/deps/openssl/openssl/ms/uplink-x86.pl
@@ -14,11 +14,11 @@ require "uplink-common.pl";
for ($i=1;$i<=$N;$i++) {
&function_begin_B("_\$lazy${i}");
&lea ("eax",&DWP(&label("OPENSSL_UplinkTable")));
- &push ("eax");
&push ($i);
+ &push ("eax");
&call (&label("OPENSSL_Uplink"));
- &add ("esp",8);
&pop ("eax");
+ &add ("esp",4);
&jmp_ptr(&DWP(4*$i,"eax"));
&function_end_B("_\$lazy${i}");
}
diff --git a/deps/openssl/openssl/openssl.spec b/deps/openssl/openssl/openssl.spec
index 72ace12c44..67fb0735e2 100644
--- a/deps/openssl/openssl/openssl.spec
+++ b/deps/openssl/openssl/openssl.spec
@@ -6,7 +6,7 @@ Release: 1
Summary: Secure Sockets Layer and cryptography libraries and tools
Name: openssl
-Version: 1.0.2f
+Version: 1.0.2g
Source0: ftp://ftp.openssl.org/source/%{name}-%{version}.tar.gz
License: OpenSSL
Group: System Environment/Libraries
diff --git a/deps/openssl/openssl/ssl/Makefile b/deps/openssl/openssl/ssl/Makefile
index 7b90fb0375..b6dee5b5ea 100644
--- a/deps/openssl/openssl/ssl/Makefile
+++ b/deps/openssl/openssl/ssl/Makefile
@@ -15,7 +15,7 @@ KRB5_INCLUDES=
CFLAGS= $(INCLUDES) $(CFLAG)
GENERAL=Makefile README ssl-lib.com install.com
-TEST=ssltest.c heartbeat_test.c clienthellotest.c
+TEST=ssltest.c heartbeat_test.c clienthellotest.c sslv2conftest.c
APPS=
LIB=$(TOP)/libssl.a
@@ -399,14 +399,14 @@ s2_clnt.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h
s2_clnt.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
s2_clnt.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
s2_clnt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
-s2_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
-s2_clnt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s2_clnt.o: ../include/openssl/sha.h ../include/openssl/srtp.h
-s2_clnt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s2_clnt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s2_clnt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s2_clnt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s2_clnt.o: ../include/openssl/x509_vfy.h s2_clnt.c ssl_locl.h
+s2_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
+s2_clnt.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+s2_clnt.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s2_clnt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s2_clnt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s2_clnt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s2_clnt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_clnt.c
+s2_clnt.o: ssl_locl.h
s2_enc.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
s2_enc.o: ../include/openssl/buffer.h ../include/openssl/comp.h
s2_enc.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -435,18 +435,18 @@ s2_lib.o: ../include/openssl/ec.h ../include/openssl/ecdh.h
s2_lib.o: ../include/openssl/ecdsa.h ../include/openssl/err.h
s2_lib.o: ../include/openssl/evp.h ../include/openssl/hmac.h
s2_lib.o: ../include/openssl/kssl.h ../include/openssl/lhash.h
-s2_lib.o: ../include/openssl/md5.h ../include/openssl/obj_mac.h
-s2_lib.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h
-s2_lib.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h
-s2_lib.o: ../include/openssl/pem.h ../include/openssl/pem2.h
-s2_lib.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
-s2_lib.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s2_lib.o: ../include/openssl/sha.h ../include/openssl/srtp.h
-s2_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s2_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s2_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s2_lib.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s2_lib.o: ../include/openssl/x509_vfy.h s2_lib.c ssl_locl.h
+s2_lib.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h
+s2_lib.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
+s2_lib.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
+s2_lib.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
+s2_lib.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
+s2_lib.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+s2_lib.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s2_lib.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s2_lib.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s2_lib.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s2_lib.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_lib.c
+s2_lib.o: ssl_locl.h
s2_meth.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
s2_meth.o: ../include/openssl/buffer.h ../include/openssl/comp.h
s2_meth.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -487,20 +487,19 @@ s2_pkt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
s2_pkt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
s2_pkt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_pkt.c
s2_pkt.o: ssl_locl.h
-s2_srvr.o: ../crypto/constant_time_locl.h ../e_os.h ../include/openssl/asn1.h
-s2_srvr.o: ../include/openssl/bio.h ../include/openssl/buffer.h
-s2_srvr.o: ../include/openssl/comp.h ../include/openssl/crypto.h
-s2_srvr.o: ../include/openssl/dsa.h ../include/openssl/dtls1.h
-s2_srvr.o: ../include/openssl/e_os2.h ../include/openssl/ec.h
-s2_srvr.o: ../include/openssl/ecdh.h ../include/openssl/ecdsa.h
-s2_srvr.o: ../include/openssl/err.h ../include/openssl/evp.h
-s2_srvr.o: ../include/openssl/hmac.h ../include/openssl/kssl.h
-s2_srvr.o: ../include/openssl/lhash.h ../include/openssl/obj_mac.h
-s2_srvr.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h
-s2_srvr.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h
-s2_srvr.o: ../include/openssl/pem.h ../include/openssl/pem2.h
-s2_srvr.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
-s2_srvr.o: ../include/openssl/rand.h ../include/openssl/rsa.h
+s2_srvr.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
+s2_srvr.o: ../include/openssl/buffer.h ../include/openssl/comp.h
+s2_srvr.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
+s2_srvr.o: ../include/openssl/dtls1.h ../include/openssl/e_os2.h
+s2_srvr.o: ../include/openssl/ec.h ../include/openssl/ecdh.h
+s2_srvr.o: ../include/openssl/ecdsa.h ../include/openssl/err.h
+s2_srvr.o: ../include/openssl/evp.h ../include/openssl/hmac.h
+s2_srvr.o: ../include/openssl/kssl.h ../include/openssl/lhash.h
+s2_srvr.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h
+s2_srvr.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
+s2_srvr.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
+s2_srvr.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
+s2_srvr.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
s2_srvr.o: ../include/openssl/safestack.h ../include/openssl/sha.h
s2_srvr.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
s2_srvr.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
diff --git a/deps/openssl/openssl/ssl/s2_lib.c b/deps/openssl/openssl/ssl/s2_lib.c
index d55b93f76b..a8036b357f 100644
--- a/deps/openssl/openssl/ssl/s2_lib.c
+++ b/deps/openssl/openssl/ssl/s2_lib.c
@@ -156,6 +156,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
128,
},
+# if 0
/* RC4_128_EXPORT40_WITH_MD5 */
{
1,
@@ -171,6 +172,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
40,
128,
},
+# endif
/* RC2_128_CBC_WITH_MD5 */
{
@@ -188,6 +190,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
128,
},
+# if 0
/* RC2_128_CBC_EXPORT40_WITH_MD5 */
{
1,
@@ -203,6 +206,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
40,
128,
},
+# endif
# ifndef OPENSSL_NO_IDEA
/* IDEA_128_CBC_WITH_MD5 */
@@ -222,6 +226,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
},
# endif
+# if 0
/* DES_64_CBC_WITH_MD5 */
{
1,
@@ -237,6 +242,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
56,
56,
},
+# endif
/* DES_192_EDE3_CBC_WITH_MD5 */
{
diff --git a/deps/openssl/openssl/ssl/s3_lib.c b/deps/openssl/openssl/ssl/s3_lib.c
index f846cb5b7b..4aac3b2792 100644
--- a/deps/openssl/openssl/ssl/s3_lib.c
+++ b/deps/openssl/openssl/ssl/s3_lib.c
@@ -198,6 +198,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
},
/* Cipher 03 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_RSA_RC4_40_MD5,
@@ -212,6 +213,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
128,
},
+#endif
/* Cipher 04 */
{
@@ -246,6 +248,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
},
/* Cipher 06 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_RSA_RC2_40_MD5,
@@ -260,6 +263,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
128,
},
+#endif
/* Cipher 07 */
#ifndef OPENSSL_NO_IDEA
@@ -280,6 +284,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
#endif
/* Cipher 08 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_RSA_DES_40_CBC_SHA,
@@ -294,8 +299,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
56,
},
+#endif
/* Cipher 09 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_RSA_DES_64_CBC_SHA,
@@ -310,6 +317,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+#endif
/* Cipher 0A */
{
@@ -329,6 +337,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
/* The DH ciphers */
/* Cipher 0B */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
0,
SSL3_TXT_DH_DSS_DES_40_CBC_SHA,
@@ -343,8 +352,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
56,
},
+#endif
/* Cipher 0C */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_DH_DSS_DES_64_CBC_SHA,
@@ -359,6 +370,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+#endif
/* Cipher 0D */
{
@@ -377,6 +389,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
},
/* Cipher 0E */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
0,
SSL3_TXT_DH_RSA_DES_40_CBC_SHA,
@@ -391,8 +404,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
56,
},
+#endif
/* Cipher 0F */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_DH_RSA_DES_64_CBC_SHA,
@@ -407,6 +422,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+#endif
/* Cipher 10 */
{
@@ -426,6 +442,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
/* The Ephemeral DH ciphers */
/* Cipher 11 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_EDH_DSS_DES_40_CBC_SHA,
@@ -440,8 +457,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
56,
},
+#endif
/* Cipher 12 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_EDH_DSS_DES_64_CBC_SHA,
@@ -456,6 +475,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+#endif
/* Cipher 13 */
{
@@ -474,6 +494,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
},
/* Cipher 14 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_EDH_RSA_DES_40_CBC_SHA,
@@ -488,8 +509,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
56,
},
+#endif
/* Cipher 15 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_EDH_RSA_DES_64_CBC_SHA,
@@ -504,6 +527,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+#endif
/* Cipher 16 */
{
@@ -522,6 +546,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
},
/* Cipher 17 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_ADH_RC4_40_MD5,
@@ -536,6 +561,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
128,
},
+#endif
/* Cipher 18 */
{
@@ -554,6 +580,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
},
/* Cipher 19 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_ADH_DES_40_CBC_SHA,
@@ -568,8 +595,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
128,
},
+#endif
/* Cipher 1A */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_ADH_DES_64_CBC_SHA,
@@ -584,6 +613,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+#endif
/* Cipher 1B */
{
@@ -655,6 +685,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
#ifndef OPENSSL_NO_KRB5
/* The Kerberos ciphers*/
/* Cipher 1E */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_KRB5_DES_64_CBC_SHA,
@@ -669,6 +700,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+# endif
/* Cipher 1F */
{
@@ -719,6 +751,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
},
/* Cipher 22 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_KRB5_DES_64_CBC_MD5,
@@ -733,6 +766,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+# endif
/* Cipher 23 */
{
@@ -783,6 +817,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
},
/* Cipher 26 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_KRB5_DES_40_CBC_SHA,
@@ -797,8 +832,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
56,
},
+# endif
/* Cipher 27 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_KRB5_RC2_40_CBC_SHA,
@@ -813,8 +850,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
128,
},
+# endif
/* Cipher 28 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_KRB5_RC4_40_SHA,
@@ -829,8 +868,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
128,
},
+# endif
/* Cipher 29 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_KRB5_DES_40_CBC_MD5,
@@ -845,8 +886,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
56,
},
+# endif
/* Cipher 2A */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_KRB5_RC2_40_CBC_MD5,
@@ -861,8 +904,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
128,
},
+# endif
/* Cipher 2B */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
SSL3_TXT_KRB5_RC4_40_MD5,
@@ -877,6 +922,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
40,
128,
},
+# endif
#endif /* OPENSSL_NO_KRB5 */
/* New AES ciphersuites */
@@ -1300,6 +1346,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
# endif
/* Cipher 62 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
TLS1_TXT_RSA_EXPORT1024_WITH_DES_CBC_SHA,
@@ -1314,8 +1361,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+# endif
/* Cipher 63 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
TLS1_TXT_DHE_DSS_EXPORT1024_WITH_DES_CBC_SHA,
@@ -1330,8 +1379,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
56,
},
+# endif
/* Cipher 64 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
TLS1_TXT_RSA_EXPORT1024_WITH_RC4_56_SHA,
@@ -1346,8 +1397,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
128,
},
+# endif
/* Cipher 65 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
{
1,
TLS1_TXT_DHE_DSS_EXPORT1024_WITH_RC4_56_SHA,
@@ -1362,6 +1415,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
56,
128,
},
+# endif
/* Cipher 66 */
{
@@ -4326,21 +4380,6 @@ int ssl3_shutdown(SSL *s)
}
#endif
} else if (!(s->shutdown & SSL_RECEIVED_SHUTDOWN)) {
- if (SSL_in_init(s)) {
- /*
- * We can't shutdown properly if we are in the middle of a
- * handshake. Doing so is problematic because the peer may send a
- * CCS before it acts on our close_notify. However we should not
- * continue to process received handshake messages or CCS once our
- * close_notify has been sent. Therefore any close_notify from
- * the peer will be unreadable because we have not moved to the next
- * cipher state. Its best just to avoid this can-of-worms. Return
- * an error if we are wanting to wait for a close_notify from the
- * peer and we are in init.
- */
- SSLerr(SSL_F_SSL3_SHUTDOWN, SSL_R_SHUTDOWN_WHILE_IN_INIT);
- return -1;
- }
/*
* If we are waiting for a close from our peer, we are closed
*/
diff --git a/deps/openssl/openssl/ssl/ssl.h b/deps/openssl/openssl/ssl/ssl.h
index ae8c92575e..04d4007eeb 100644
--- a/deps/openssl/openssl/ssl/ssl.h
+++ b/deps/openssl/openssl/ssl/ssl.h
@@ -2713,7 +2713,6 @@ void ERR_load_SSL_strings(void);
# define SSL_F_SSL3_SETUP_KEY_BLOCK 157
# define SSL_F_SSL3_SETUP_READ_BUFFER 156
# define SSL_F_SSL3_SETUP_WRITE_BUFFER 291
-# define SSL_F_SSL3_SHUTDOWN 396
# define SSL_F_SSL3_WRITE_BYTES 158
# define SSL_F_SSL3_WRITE_PENDING 159
# define SSL_F_SSL_ADD_CERT_CHAIN 318
diff --git a/deps/openssl/openssl/ssl/ssl_conf.c b/deps/openssl/openssl/ssl/ssl_conf.c
index 5478840dea..8d3709d2b6 100644
--- a/deps/openssl/openssl/ssl/ssl_conf.c
+++ b/deps/openssl/openssl/ssl/ssl_conf.c
@@ -330,11 +330,19 @@ static int cmd_Protocol(SSL_CONF_CTX *cctx, const char *value)
SSL_FLAG_TBL_INV("TLSv1.1", SSL_OP_NO_TLSv1_1),
SSL_FLAG_TBL_INV("TLSv1.2", SSL_OP_NO_TLSv1_2)
};
+ int ret;
+ int sslv2off;
+
if (!(cctx->flags & SSL_CONF_FLAG_FILE))
return -2;
cctx->tbl = ssl_protocol_list;
cctx->ntbl = sizeof(ssl_protocol_list) / sizeof(ssl_flag_tbl);
- return CONF_parse_list(value, ',', 1, ssl_set_option_list, cctx);
+
+ sslv2off = *cctx->poptions & SSL_OP_NO_SSLv2;
+ ret = CONF_parse_list(value, ',', 1, ssl_set_option_list, cctx);
+ /* Never turn on SSLv2 through configuration */
+ *cctx->poptions |= sslv2off;
+ return ret;
}
static int cmd_Options(SSL_CONF_CTX *cctx, const char *value)
diff --git a/deps/openssl/openssl/ssl/ssl_err.c b/deps/openssl/openssl/ssl/ssl_err.c
index dd3b2afd1e..704088dc46 100644
--- a/deps/openssl/openssl/ssl/ssl_err.c
+++ b/deps/openssl/openssl/ssl/ssl_err.c
@@ -206,7 +206,6 @@ static ERR_STRING_DATA SSL_str_functs[] = {
{ERR_FUNC(SSL_F_SSL3_SETUP_KEY_BLOCK), "ssl3_setup_key_block"},
{ERR_FUNC(SSL_F_SSL3_SETUP_READ_BUFFER), "ssl3_setup_read_buffer"},
{ERR_FUNC(SSL_F_SSL3_SETUP_WRITE_BUFFER), "ssl3_setup_write_buffer"},
- {ERR_FUNC(SSL_F_SSL3_SHUTDOWN), "ssl3_shutdown"},
{ERR_FUNC(SSL_F_SSL3_WRITE_BYTES), "ssl3_write_bytes"},
{ERR_FUNC(SSL_F_SSL3_WRITE_PENDING), "ssl3_write_pending"},
{ERR_FUNC(SSL_F_SSL_ADD_CERT_CHAIN), "ssl_add_cert_chain"},
diff --git a/deps/openssl/openssl/ssl/ssl_lib.c b/deps/openssl/openssl/ssl/ssl_lib.c
index 2744be8ad8..f1279bbf91 100644
--- a/deps/openssl/openssl/ssl/ssl_lib.c
+++ b/deps/openssl/openssl/ssl/ssl_lib.c
@@ -1060,7 +1060,12 @@ int SSL_shutdown(SSL *s)
return -1;
}
- return s->method->ssl_shutdown(s);
+ if (!SSL_in_init(s)) {
+ return s->method->ssl_shutdown(s);
+ } else {
+ SSLerr(SSL_F_SSL_SHUTDOWN, SSL_R_SHUTDOWN_WHILE_IN_INIT);
+ return -1;
+ }
}
int SSL_renegotiate(SSL *s)
@@ -2049,6 +2054,13 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *meth)
*/
ret->options |= SSL_OP_LEGACY_SERVER_CONNECT;
+ /*
+ * Disable SSLv2 by default, callers that want to enable SSLv2 will have to
+ * explicitly clear this option via either of SSL_CTX_clear_options() or
+ * SSL_clear_options().
+ */
+ ret->options |= SSL_OP_NO_SSLv2;
+
return (ret);
err:
SSLerr(SSL_F_SSL_CTX_NEW, ERR_R_MALLOC_FAILURE);
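
Two caller-visible changes in ssl_lib.c: SSL_shutdown() now fails with SSL_R_SHUTDOWN_WHILE_IN_INIT instead of dispatching into the record layer mid-handshake, and SSL_CTX_new() sets SSL_OP_NO_SSLv2 by default. A hedged sketch of how application code might react; the surrounding context setup is assumed, not shown in this patch.

/* Sketch: SSLv2 is off by default and must be re-enabled explicitly
 * (not recommended), and a shutdown during the handshake now errors out. */
#include <stdio.h>
#include <openssl/ssl.h>
#include <openssl/err.h>

static void shutdown_example(SSL_CTX *ctx, SSL *ssl)
{
    /* Opting back in to SSLv2 now requires clearing the option by hand. */
    SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2);

    if (SSL_shutdown(ssl) < 0) {
        /* With this patch, calling this while the handshake is still in
         * progress lands here with SSL_R_SHUTDOWN_WHILE_IN_INIT queued. */
        ERR_print_errors_fp(stderr);
    }
}
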
diff --git a/deps/openssl/openssl/ssl/sslv2conftest.c b/deps/openssl/openssl/ssl/sslv2conftest.c
new file mode 100644
index 0000000000..1fd748b118
--- /dev/null
+++ b/deps/openssl/openssl/ssl/sslv2conftest.c
@@ -0,0 +1,231 @@
+/* Written by Matt Caswell for the OpenSSL Project */
+/* ====================================================================
+ * Copyright (c) 2016 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdlib.h>
+#include <openssl/bio.h>
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+
+
+#define TOTAL_NUM_TESTS 2
+#define TEST_SSL_CTX 0
+
+#define SSLV2ON 1
+#define SSLV2OFF 0
+
+SSL_CONF_CTX *confctx;
+SSL_CTX *ctx;
+SSL *ssl;
+
+static int checksslv2(int test, int sslv2)
+{
+ int options;
+ if (test == TEST_SSL_CTX) {
+ options = SSL_CTX_get_options(ctx);
+ } else {
+ options = SSL_get_options(ssl);
+ }
+ return ((options & SSL_OP_NO_SSLv2) == 0) ^ (sslv2 == SSLV2OFF);
+}
+
+int main(int argc, char *argv[])
+{
+ BIO *err;
+ int testresult = 0;
+ int currtest;
+
+ SSL_library_init();
+ SSL_load_error_strings();
+
+ err = BIO_new_fp(stderr, BIO_NOCLOSE | BIO_FP_TEXT);
+
+ CRYPTO_malloc_debug_init();
+ CRYPTO_set_mem_debug_options(V_CRYPTO_MDEBUG_ALL);
+ CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_ON);
+
+
+ confctx = SSL_CONF_CTX_new();
+ ctx = SSL_CTX_new(SSLv23_method());
+ ssl = SSL_new(ctx);
+ if (confctx == NULL || ctx == NULL)
+ goto end;
+
+ SSL_CONF_CTX_set_flags(confctx, SSL_CONF_FLAG_FILE
+ | SSL_CONF_FLAG_CLIENT
+ | SSL_CONF_FLAG_SERVER);
+
+ /*
+ * For each test set up an SSL_CTX and SSL and see whether SSLv2 is enabled
+ * as expected after various SSL_CONF_cmd("Protocol", ...) calls.
+ */
+ for (currtest = 0; currtest < TOTAL_NUM_TESTS; currtest++) {
+ BIO_printf(err, "SSLv2 CONF Test number %d\n", currtest);
+ if (currtest == TEST_SSL_CTX)
+ SSL_CONF_CTX_set_ssl_ctx(confctx, ctx);
+ else
+ SSL_CONF_CTX_set_ssl(confctx, ssl);
+
+ /* SSLv2 should be off by default */
+ if (!checksslv2(currtest, SSLV2OFF)) {
+ BIO_printf(err, "SSLv2 CONF Test: Off by default test FAIL\n");
+ goto end;
+ }
+
+ if (SSL_CONF_cmd(confctx, "Protocol", "ALL") != 2
+ || !SSL_CONF_CTX_finish(confctx)) {
+ BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+ goto end;
+ }
+
+ /* Should still be off even after ALL Protocols on */
+ if (!checksslv2(currtest, SSLV2OFF)) {
+ BIO_printf(err, "SSLv2 CONF Test: Off after config #1 FAIL\n");
+ goto end;
+ }
+
+ if (SSL_CONF_cmd(confctx, "Protocol", "SSLv2") != 2
+ || !SSL_CONF_CTX_finish(confctx)) {
+ BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+ goto end;
+ }
+
+ /* Should still be off even if explicitly asked for */
+ if (!checksslv2(currtest, SSLV2OFF)) {
+ BIO_printf(err, "SSLv2 CONF Test: Off after config #2 FAIL\n");
+ goto end;
+ }
+
+ if (SSL_CONF_cmd(confctx, "Protocol", "-SSLv2") != 2
+ || !SSL_CONF_CTX_finish(confctx)) {
+            BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+ goto end;
+ }
+
+ if (!checksslv2(currtest, SSLV2OFF)) {
+ BIO_printf(err, "SSLv2 CONF Test: Off after config #3 FAIL\n");
+ goto end;
+ }
+
+ if (currtest == TEST_SSL_CTX)
+ SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2);
+ else
+ SSL_clear_options(ssl, SSL_OP_NO_SSLv2);
+
+ if (!checksslv2(currtest, SSLV2ON)) {
+ BIO_printf(err, "SSLv2 CONF Test: On after clear FAIL\n");
+ goto end;
+ }
+
+ if (SSL_CONF_cmd(confctx, "Protocol", "ALL") != 2
+ || !SSL_CONF_CTX_finish(confctx)) {
+ BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+ goto end;
+ }
+
+ /* Option has been cleared and config says have SSLv2 so should be on */
+ if (!checksslv2(currtest, SSLV2ON)) {
+ BIO_printf(err, "SSLv2 CONF Test: On after config #1 FAIL\n");
+ goto end;
+ }
+
+ if (SSL_CONF_cmd(confctx, "Protocol", "SSLv2") != 2
+ || !SSL_CONF_CTX_finish(confctx)) {
+ BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+ goto end;
+ }
+
+ /* Option has been cleared and config says have SSLv2 so should be on */
+ if (!checksslv2(currtest, SSLV2ON)) {
+ BIO_printf(err, "SSLv2 CONF Test: On after config #2 FAIL\n");
+ goto end;
+ }
+
+ if (SSL_CONF_cmd(confctx, "Protocol", "-SSLv2") != 2
+ || !SSL_CONF_CTX_finish(confctx)) {
+ BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+ goto end;
+ }
+
+ /* Option has been cleared but config says no SSLv2 so should be off */
+ if (!checksslv2(currtest, SSLV2OFF)) {
+ BIO_printf(err, "SSLv2 CONF Test: Off after config #4 FAIL\n");
+ goto end;
+ }
+
+ }
+
+ testresult = 1;
+
+ end:
+ SSL_free(ssl);
+ SSL_CTX_free(ctx);
+ SSL_CONF_CTX_free(confctx);
+
+ if (!testresult) {
+ printf("SSLv2 CONF test: FAILED (Test %d)\n", currtest);
+ ERR_print_errors(err);
+ } else {
+ printf("SSLv2 CONF test: PASSED\n");
+ }
+
+ ERR_free_strings();
+ ERR_remove_thread_state(NULL);
+ EVP_cleanup();
+ CRYPTO_cleanup_all_ex_data();
+ CRYPTO_mem_leaks(err);
+ BIO_free(err);
+
+ return testresult ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/deps/openssl/openssl/test/Makefile b/deps/openssl/openssl/test/Makefile
index b180971b28..e566babfa5 100644
--- a/deps/openssl/openssl/test/Makefile
+++ b/deps/openssl/openssl/test/Makefile
@@ -70,6 +70,7 @@ HEARTBEATTEST= heartbeat_test
CONSTTIMETEST= constant_time_test
VERIFYEXTRATEST= verify_extra_test
CLIENTHELLOTEST= clienthellotest
+SSLV2CONFTEST = sslv2conftest
TESTS= alltests
@@ -83,7 +84,7 @@ EXE= $(BNTEST)$(EXE_EXT) $(ECTEST)$(EXE_EXT) $(ECDSATEST)$(EXE_EXT) $(ECDHTEST)
$(EVPTEST)$(EXE_EXT) $(EVPEXTRATEST)$(EXE_EXT) $(IGETEST)$(EXE_EXT) $(JPAKETEST)$(EXE_EXT) $(SRPTEST)$(EXE_EXT) \
$(ASN1TEST)$(EXE_EXT) $(V3NAMETEST)$(EXE_EXT) $(HEARTBEATTEST)$(EXE_EXT) \
$(CONSTTIMETEST)$(EXE_EXT) $(VERIFYEXTRATEST)$(EXE_EXT) \
- $(CLIENTHELLOTEST)$(EXE_EXT)
+ $(CLIENTHELLOTEST)$(EXE_EXT) $(SSLV2CONFTEST)$(EXE_EXT)
# $(METHTEST)$(EXE_EXT)
@@ -97,7 +98,7 @@ OBJ= $(BNTEST).o $(ECTEST).o $(ECDSATEST).o $(ECDHTEST).o $(IDEATEST).o \
$(BFTEST).o $(SSLTEST).o $(DSATEST).o $(EXPTEST).o $(RSATEST).o \
$(EVPTEST).o $(EVPEXTRATEST).o $(IGETEST).o $(JPAKETEST).o $(ASN1TEST).o $(V3NAMETEST).o \
$(HEARTBEATTEST).o $(CONSTTIMETEST).o $(VERIFYEXTRATEST).o \
- $(CLIENTHELLOTEST).o
+ $(CLIENTHELLOTEST).o $(SSLV2CONFTEST).o
SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \
$(MD2TEST).c $(MD4TEST).c $(MD5TEST).c \
@@ -108,7 +109,7 @@ SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \
$(BFTEST).c $(SSLTEST).c $(DSATEST).c $(EXPTEST).c $(RSATEST).c \
$(EVPTEST).c $(EVPEXTRATEST).c $(IGETEST).c $(JPAKETEST).c $(SRPTEST).c $(ASN1TEST).c \
$(V3NAMETEST).c $(HEARTBEATTEST).c $(CONSTTIMETEST).c $(VERIFYEXTRATEST).c \
- $(CLIENTHELLOTEST).c
+ $(CLIENTHELLOTEST).c $(SSLV2CONFTEST).c
EXHEADER=
HEADER= testutil.h $(EXHEADER)
@@ -152,7 +153,7 @@ alltests: \
test_gen test_req test_pkcs7 test_verify test_dh test_dsa \
test_ss test_ca test_engine test_evp test_evp_extra test_ssl test_tsa test_ige \
test_jpake test_srp test_cms test_ocsp test_v3name test_heartbeat \
- test_constant_time test_verify_extra test_clienthello
+ test_constant_time test_verify_extra test_clienthello test_sslv2conftest
test_evp: $(EVPTEST)$(EXE_EXT) evptests.txt
../util/shlib_wrap.sh ./$(EVPTEST) evptests.txt
@@ -361,6 +362,10 @@ test_clienthello: $(CLIENTHELLOTEST)$(EXE_EXT)
@echo $(START) $@
../util/shlib_wrap.sh ./$(CLIENTHELLOTEST)
+test_sslv2conftest: $(SSLV2CONFTEST)$(EXE_EXT)
+ @echo $(START) $@
+ ../util/shlib_wrap.sh ./$(SSLV2CONFTEST)
+
lint:
lint -DLINT $(INCLUDES) $(SRC)>fluff
@@ -538,6 +543,9 @@ $(VERIFYEXTRATEST)$(EXE_EXT): $(VERIFYEXTRATEST).o
$(CLIENTHELLOTEST)$(EXE_EXT): $(CLIENTHELLOTEST).o
@target=$(CLIENTHELLOTEST) $(BUILD_CMD)
+$(SSLV2CONFTEST)$(EXE_EXT): $(SSLV2CONFTEST).o
+ @target=$(SSLV2CONFTEST) $(BUILD_CMD)
+
#$(AESTEST).o: $(AESTEST).c
# $(CC) -c $(CFLAGS) -DINTERMEDIATE_VALUE_KAT -DTRACE_KAT_MCT $(AESTEST).c
@@ -848,6 +856,25 @@ ssltest.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
ssltest.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
ssltest.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h
ssltest.o: ../include/openssl/x509v3.h ssltest.c
+sslv2conftest.o: ../include/openssl/asn1.h ../include/openssl/bio.h
+sslv2conftest.o: ../include/openssl/buffer.h ../include/openssl/comp.h
+sslv2conftest.o: ../include/openssl/crypto.h ../include/openssl/dtls1.h
+sslv2conftest.o: ../include/openssl/e_os2.h ../include/openssl/ec.h
+sslv2conftest.o: ../include/openssl/ecdh.h ../include/openssl/ecdsa.h
+sslv2conftest.o: ../include/openssl/err.h ../include/openssl/evp.h
+sslv2conftest.o: ../include/openssl/hmac.h ../include/openssl/kssl.h
+sslv2conftest.o: ../include/openssl/lhash.h ../include/openssl/obj_mac.h
+sslv2conftest.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h
+sslv2conftest.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h
+sslv2conftest.o: ../include/openssl/pem.h ../include/openssl/pem2.h
+sslv2conftest.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
+sslv2conftest.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+sslv2conftest.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+sslv2conftest.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+sslv2conftest.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+sslv2conftest.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+sslv2conftest.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h
+sslv2conftest.o: sslv2conftest.c
v3nametest.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
v3nametest.o: ../include/openssl/buffer.h ../include/openssl/conf.h
v3nametest.o: ../include/openssl/crypto.h ../include/openssl/e_os2.h
diff --git a/deps/openssl/openssl/util/libeay.num b/deps/openssl/openssl/util/libeay.num
index 7f7487df50..e5b3c6ea84 100755
--- a/deps/openssl/openssl/util/libeay.num
+++ b/deps/openssl/openssl/util/libeay.num
@@ -1807,6 +1807,8 @@ ASN1_UTCTIME_get 2350 NOEXIST::FUNCTION:
X509_REQ_digest 2362 EXIST::FUNCTION:EVP
X509_CRL_digest 2391 EXIST::FUNCTION:EVP
ASN1_STRING_clear_free 2392 EXIST::FUNCTION:
+SRP_VBASE_get1_by_user 2393 EXIST::FUNCTION:SRP
+SRP_user_pwd_free 2394 EXIST::FUNCTION:SRP
d2i_ASN1_SET_OF_PKCS7 2397 NOEXIST::FUNCTION:
X509_ALGOR_cmp 2398 EXIST::FUNCTION:
EVP_CIPHER_CTX_set_key_length 2399 EXIST::FUNCTION:
diff --git a/deps/openssl/openssl/util/mk1mf.pl b/deps/openssl/openssl/util/mk1mf.pl
index 99652aff91..2629a1c5dd 100755
--- a/deps/openssl/openssl/util/mk1mf.pl
+++ b/deps/openssl/openssl/util/mk1mf.pl
@@ -290,6 +290,7 @@ $cflags.=" -DOPENSSL_NO_HW" if $no_hw;
$cflags.=" -DOPENSSL_FIPS" if $fips;
$cflags.=" -DOPENSSL_NO_JPAKE" if $no_jpake;
$cflags.=" -DOPENSSL_NO_EC2M" if $no_ec2m;
+$cflags.=" -DOPENSSL_NO_WEAK_SSL_CIPHERS" if $no_weak_ssl;
$cflags.= " -DZLIB" if $zlib_opt;
$cflags.= " -DZLIB_SHARED" if $zlib_opt == 2;
@@ -482,7 +483,7 @@ EX_LIBS=$ex_libs
# The OpenSSL directory
SRC_D=$src_dir
-LINK=$link
+LINK_CMD=$link
LFLAGS=$lflags
RSC=$rsc
@@ -1205,6 +1206,7 @@ sub read_options
"no-jpake" => \$no_jpake,
"no-ec2m" => \$no_ec2m,
"no-ec_nistp_64_gcc_128" => 0,
+ "no-weak-ssl-ciphers" => \$no_weak_ssl,
"no-err" => \$no_err,
"no-sock" => \$no_sock,
"no-krb5" => \$no_krb5,
diff --git a/deps/openssl/openssl/util/pl/BC-32.pl b/deps/openssl/openssl/util/pl/BC-32.pl
index f7161d7bfe..375b0a76df 100644
--- a/deps/openssl/openssl/util/pl/BC-32.pl
+++ b/deps/openssl/openssl/util/pl/BC-32.pl
@@ -118,7 +118,7 @@ ___
{
local($ex)=($target =~ /O_SSL/)?' $(L_CRYPTO)':'';
$ex.=' ws2_32.lib gdi32.lib';
- $ret.="\t\$(LINK) \$(MLFLAGS) $efile$target /def:ms/${Name}.def @<<\n \$(SHLIB_EX_OBJ) $objs $ex\n<<\n";
+ $ret.="\t\$(LINK_CMD) \$(MLFLAGS) $efile$target /def:ms/${Name}.def @<<\n \$(SHLIB_EX_OBJ) $objs $ex\n<<\n";
}
$ret.="\n";
return($ret);
@@ -132,7 +132,7 @@ sub do_link_rule
$file =~ s/\//$o/g if $o ne '/';
$n=&bname($target);
$ret.="$target: $files $dep_libs\n";
- $ret.="\t\$(LINK) \$(LFLAGS) $files \$(APP_EX_OBJ), $target,, $libs\n\n";
+ $ret.="\t\$(LINK_CMD) \$(LFLAGS) $files \$(APP_EX_OBJ), $target,, $libs\n\n";
return($ret);
}
diff --git a/deps/openssl/openssl/util/pl/Mingw32.pl b/deps/openssl/openssl/util/pl/Mingw32.pl
index fe3fb27a78..55c85f6447 100644
--- a/deps/openssl/openssl/util/pl/Mingw32.pl
+++ b/deps/openssl/openssl/util/pl/Mingw32.pl
@@ -98,7 +98,7 @@ sub do_link_rule
$file =~ s/\//$o/g if $o ne '/';
$n=&bname($target);
$ret.="$target: $files $dep_libs\n";
- $ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n";
+ $ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n";
return($ret);
}
1;
diff --git a/deps/openssl/openssl/util/pl/OS2-EMX.pl b/deps/openssl/openssl/util/pl/OS2-EMX.pl
index 28cd116907..92a332e6e9 100644
--- a/deps/openssl/openssl/util/pl/OS2-EMX.pl
+++ b/deps/openssl/openssl/util/pl/OS2-EMX.pl
@@ -99,7 +99,7 @@ sub do_lib_rule
{
local($ex)=($target =~ /O_SSL/)?' $(L_CRYPTO)':'';
$ex.=' -lsocket';
- $ret.="\t\$(LINK) \$(SHLIB_CFLAGS) \$(MLFLAGS) $efile$target \$(SHLIB_EX_OBJ) \$(${Name}OBJ) $ex os2/${Name}.def\n";
+ $ret.="\t\$(LINK_CMD) \$(SHLIB_CFLAGS) \$(MLFLAGS) $efile$target \$(SHLIB_EX_OBJ) \$(${Name}OBJ) $ex os2/${Name}.def\n";
$ret.="\temximp -o $out_def/$name.a os2/${Name}.def\n";
$ret.="\temximp -o $out_def/$name.lib os2/${Name}.def\n\n";
}
@@ -113,7 +113,7 @@ sub do_link_rule
$file =~ s/\//$o/g if $o ne '/';
$n=&bname($target);
$ret.="$target: $files $dep_libs\n";
- $ret.="\t\$(LINK) ${efile}$target \$(CFLAG) \$(LFLAGS) $files $libs\n\n";
+ $ret.="\t\$(LINK_CMD) ${efile}$target \$(CFLAG) \$(LFLAGS) $files $libs\n\n";
return($ret);
}
diff --git a/deps/openssl/openssl/util/pl/VC-32.pl b/deps/openssl/openssl/util/pl/VC-32.pl
index 0f5547f056..dba96cba5e 100644
--- a/deps/openssl/openssl/util/pl/VC-32.pl
+++ b/deps/openssl/openssl/util/pl/VC-32.pl
@@ -330,7 +330,7 @@ sub do_lib_rule
if ($fips && $target =~ /O_CRYPTO/)
{
$ret.="$target: $objs \$(PREMAIN_DSO_EXE)";
- $ret.="\n\tSET FIPS_LINK=\$(LINK)\n";
+ $ret.="\n\tSET FIPS_LINK=\$(LINK_CMD)\n";
$ret.="\tSET FIPS_CC=\$(CC)\n";
$ret.="\tSET FIPS_CC_ARGS=/Fo\$(OBJ_D)${o}fips_premain.obj \$(SHLIB_CFLAGS) -c\n";
$ret.="\tSET PREMAIN_DSO_EXE=\$(PREMAIN_DSO_EXE)\n";
@@ -344,7 +344,7 @@ sub do_lib_rule
else
{
$ret.="$target: $objs";
- $ret.="\n\t\$(LINK) \$(MLFLAGS) $efile$target $name @<<\n \$(SHLIB_EX_OBJ) $objs $ex \$(EX_LIBS)\n<<\n";
+ $ret.="\n\t\$(LINK_CMD) \$(MLFLAGS) $efile$target $name @<<\n \$(SHLIB_EX_OBJ) $objs $ex \$(EX_LIBS)\n<<\n";
}
$ret.="\tIF EXIST \$@.manifest mt -nologo -manifest \$@.manifest -outputresource:\$@;2\n\n";
}
@@ -363,7 +363,7 @@ sub do_link_rule
{
$ret.=" \$(OBJ_D)${o}applink.obj" if $shlib;
$ret.="\n";
- $ret.=" \$(LINK) \$(LFLAGS) $efile$target @<<\n\t";
+ $ret.=" \$(LINK_CMD) \$(LFLAGS) $efile$target @<<\n\t";
if ($files =~ /O_FIPSCANISTER/ && !$fipscanisterbuild) {
$ret.= "\$(EX_LIBS) ";
$ret.= "\$(OBJ_D)${o}applink.obj " if $shlib;
@@ -373,7 +373,7 @@ sub do_link_rule
elsif ($standalone == 2)
{
$ret.="\n";
- $ret.="\tSET FIPS_LINK=\$(LINK)\n";
+ $ret.="\tSET FIPS_LINK=\$(LINK_CMD)\n";
$ret.="\tSET FIPS_CC=\$(CC)\n";
$ret.="\tSET FIPS_CC_ARGS=/Fo\$(OBJ_D)${o}fips_premain.obj \$(SHLIB_CFLAGS) -c\n";
$ret.="\tSET PREMAIN_DSO_EXE=\n";
@@ -386,7 +386,7 @@ sub do_link_rule
else
{
$ret.="\n";
- $ret.="\t\$(LINK) \$(LFLAGS) $efile$target @<<\n";
+ $ret.="\t\$(LINK_CMD) \$(LFLAGS) $efile$target @<<\n";
$ret.="\t\$(APP_EX_OBJ) $files $libs\n<<\n";
}
$ret.="\tIF EXIST \$@.manifest mt -nologo -manifest \$@.manifest -outputresource:\$@;1\n\n";
diff --git a/deps/openssl/openssl/util/pl/linux.pl b/deps/openssl/openssl/util/pl/linux.pl
index d24f7b7291..3362941f7b 100644
--- a/deps/openssl/openssl/util/pl/linux.pl
+++ b/deps/openssl/openssl/util/pl/linux.pl
@@ -78,7 +78,7 @@ sub do_link_rule
$file =~ s/\//$o/g if $o ne '/';
$n=&bname($target);
$ret.="$target: $files $dep_libs\n";
- $ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n";
+ $ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n";
return($ret);
}
diff --git a/deps/openssl/openssl/util/pl/netware.pl b/deps/openssl/openssl/util/pl/netware.pl
index fe80a9bb89..16f4f4ee37 100644
--- a/deps/openssl/openssl/util/pl/netware.pl
+++ b/deps/openssl/openssl/util/pl/netware.pl
@@ -506,22 +506,22 @@ sub do_link_rule
if ($gnuc)
{
$ret.="\t\$(MKLIB) $lib_flags \$(TMP_D)${o}\$(E_EXE).a \$(filter-out \$(TMP_D)${o}\$(E_EXE)${obj},$files)\n";
- $ret.="\t\$(LINK) \$(LFLAGS) $def_file2\n";
+ $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file2\n";
$ret.="\t\@$mv \$(E_EXE)2.nlm \$(TEST_D)\n";
}
else
{
- $ret.="\t\$(LINK) \$(LFLAGS) $def_file2 $files \"$prelude\" $libs -o $target2\n";
+ $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file2 $files \"$prelude\" $libs -o $target2\n";
}
}
if ($gnuc)
{
- $ret.="\t\$(LINK) \$(LFLAGS) $def_file\n";
+ $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file\n";
$ret.="\t\@$mv \$(\@F) \$(TEST_D)\n";
}
else
{
- $ret.="\t\$(LINK) \$(LFLAGS) $def_file $files \"$prelude\" $libs -o $target\n";
+ $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file $files \"$prelude\" $libs -o $target\n";
}
$ret.="\n";
diff --git a/deps/openssl/openssl/util/pl/ultrix.pl b/deps/openssl/openssl/util/pl/ultrix.pl
index ea370c71f9..0c76c83b4a 100644
--- a/deps/openssl/openssl/util/pl/ultrix.pl
+++ b/deps/openssl/openssl/util/pl/ultrix.pl
@@ -31,7 +31,7 @@ sub do_link_rule
$file =~ s/\//$o/g if $o ne '/';
$n=&bname($target);
$ret.="$target: $files $dep_libs\n";
- $ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n";
+ $ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n";
return($ret);
}
diff --git a/deps/openssl/openssl/util/pl/unix.pl b/deps/openssl/openssl/util/pl/unix.pl
index 1d4e9dc5df..8818c5bcb1 100644
--- a/deps/openssl/openssl/util/pl/unix.pl
+++ b/deps/openssl/openssl/util/pl/unix.pl
@@ -164,7 +164,7 @@ sub do_link_rule
$file =~ s/\//$o/g if $o ne '/';
$n=&bname($target);
$ret.="$target: $files $dep_libs\n";
- $ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n";
+ $ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n";
return($ret);
}