Diffstat (limited to 'deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto')
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s | 125
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s | 72
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s | 34
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s | 114
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s | 229
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s | 139
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s | 41
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s | 85
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s | 24
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s | 72
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s | 111
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h | 75
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s | 57
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s | 1507
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s | 1437
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/x25519-x86_64.s | 760
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h | 5
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s | 13
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s | 73
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s | 54
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s | 1762
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s | 16
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s | 9
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/keccak1600-x86_64.s | 492
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s | 46
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s | 164
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s | 46
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s | 152
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s | 152
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s | 36
-rw-r--r--  deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s | 74
31 files changed, 7401 insertions(+), 575 deletions(-)
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s
index 9a337fb897..72dade4a50 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s
@@ -332,15 +332,23 @@ L$enc_compact_done:
.private_extern _asm_AES_encrypt
_asm_AES_encrypt:
_AES_encrypt:
+
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r10
+
leaq -63(%rdx),%rcx
andq $-64,%rsp
subq %rsp,%rcx
@@ -350,7 +358,8 @@ _AES_encrypt:
subq $32,%rsp
movq %rsi,16(%rsp)
- movq %r10,24(%rsp)
+ movq %rax,24(%rsp)
+
L$enc_prologue:
movq %rdx,%r15
@@ -377,22 +386,31 @@ L$enc_prologue:
movq 16(%rsp),%r9
movq 24(%rsp),%rsi
+
movl %eax,0(%r9)
movl %ebx,4(%r9)
movl %ecx,8(%r9)
movl %edx,12(%r9)
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$enc_epilogue:
.byte 0xf3,0xc3
+
.p2align 4
_x86_64_AES_decrypt:
xorl 0(%r15),%eax
@@ -779,15 +797,23 @@ L$dec_compact_done:
.private_extern _asm_AES_decrypt
_asm_AES_decrypt:
_AES_decrypt:
+
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r10
+
leaq -63(%rdx),%rcx
andq $-64,%rsp
subq %rsp,%rcx
@@ -797,7 +823,8 @@ _AES_decrypt:
subq $32,%rsp
movq %rsi,16(%rsp)
- movq %r10,24(%rsp)
+ movq %rax,24(%rsp)
+
L$dec_prologue:
movq %rdx,%r15
@@ -826,44 +853,65 @@ L$dec_prologue:
movq 16(%rsp),%r9
movq 24(%rsp),%rsi
+
movl %eax,0(%r9)
movl %ebx,4(%r9)
movl %ecx,8(%r9)
movl %edx,12(%r9)
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$dec_epilogue:
.byte 0xf3,0xc3
+
.globl _AES_set_encrypt_key
.p2align 4
_AES_set_encrypt_key:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $8,%rsp
+
L$enc_key_prologue:
call _x86_64_AES_set_encrypt_key
movq 40(%rsp),%rbp
+
movq 48(%rsp),%rbx
+
addq $56,%rsp
+
L$enc_key_epilogue:
.byte 0xf3,0xc3
+
.p2align 4
_x86_64_AES_set_encrypt_key:
movl %esi,%ecx
@@ -1106,13 +1154,21 @@ L$exit:
.p2align 4
_AES_set_decrypt_key:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
pushq %rdx
+
L$dec_key_prologue:
call _x86_64_AES_set_encrypt_key
@@ -1280,15 +1336,23 @@ L$permute:
xorq %rax,%rax
L$abort:
movq 8(%rsp),%r15
+
movq 16(%rsp),%r14
+
movq 24(%rsp),%r13
+
movq 32(%rsp),%r12
+
movq 40(%rsp),%rbp
+
movq 48(%rsp),%rbx
+
addq $56,%rsp
+
L$dec_key_epilogue:
.byte 0xf3,0xc3
+
.globl _AES_cbc_encrypt
.p2align 4
@@ -1297,25 +1361,32 @@ L$dec_key_epilogue:
.private_extern _asm_AES_cbc_encrypt
_asm_AES_cbc_encrypt:
_AES_cbc_encrypt:
+
cmpq $0,%rdx
je L$cbc_epilogue
pushfq
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$cbc_prologue:
cld
movl %r9d,%r9d
leaq L$AES_Te(%rip),%r14
+ leaq L$AES_Td(%rip),%r10
cmpq $0,%r9
- jne L$cbc_picked_te
- leaq L$AES_Td(%rip),%r14
-L$cbc_picked_te:
+ cmoveq %r10,%r14
movl _OPENSSL_ia32cap_P(%rip),%r10d
cmpq $512,%rdx
@@ -1352,7 +1423,9 @@ L$cbc_te_ok:
xchgq %rsp,%r15
+
movq %r15,16(%rsp)
+
L$cbc_fast_body:
movq %rdi,24(%rsp)
movq %rsi,32(%rsp)
@@ -1734,18 +1807,28 @@ L$cbc_slow_dec_partial:
.p2align 4
L$cbc_exit:
movq 16(%rsp),%rsi
+
movq (%rsi),%r15
+
movq 8(%rsi),%r14
+
movq 16(%rsi),%r13
+
movq 24(%rsi),%r12
+
movq 32(%rsi),%rbp
+
movq 40(%rsi),%rbx
+
leaq 48(%rsi),%rsp
+
L$cbc_popfq:
popfq
+
L$cbc_epilogue:
.byte 0xf3,0xc3
+
.p2align 6
L$AES_Te:
.long 0xa56363c6,0xa56363c6
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s
index 75ce16175c..8f97b853a7 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s
@@ -6,6 +6,7 @@
.p2align 5
_aesni_multi_cbc_encrypt:
+
cmpl $2,%edx
jb L$enc_non_avx
movl _OPENSSL_ia32cap_P+4(%rip),%ecx
@@ -15,11 +16,17 @@ _aesni_multi_cbc_encrypt:
.p2align 4
L$enc_non_avx:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
@@ -27,10 +34,12 @@ L$enc_non_avx:
+
subq $48,%rsp
andq $-64,%rsp
movq %rax,16(%rsp)
+
L$enc4x_body:
movdqu (%rsi),%xmm12
leaq 120(%rsi),%rsi
@@ -239,6 +248,7 @@ L$enc4x_tail:
jnz L$oop_enc4x
movq 16(%rsp),%rax
+
movl 24(%rsp),%edx
@@ -256,20 +266,29 @@ L$enc4x_tail:
L$enc4x_done:
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$enc4x_epilogue:
.byte 0xf3,0xc3
+
.globl _aesni_multi_cbc_decrypt
.p2align 5
_aesni_multi_cbc_decrypt:
+
cmpl $2,%edx
jb L$dec_non_avx
movl _OPENSSL_ia32cap_P+4(%rip),%ecx
@@ -279,11 +298,17 @@ _aesni_multi_cbc_decrypt:
.p2align 4
L$dec_non_avx:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
@@ -291,10 +316,12 @@ L$dec_non_avx:
+
subq $48,%rsp
andq $-64,%rsp
movq %rax,16(%rsp)
+
L$dec4x_body:
movdqu (%rsi),%xmm12
leaq 120(%rsi),%rsi
@@ -503,6 +530,7 @@ L$dec4x_tail:
jnz L$oop_dec4x
movq 16(%rsp),%rax
+
movl 24(%rsp),%edx
leaq 160(%rdi),%rdi
@@ -511,25 +539,40 @@ L$dec4x_tail:
L$dec4x_done:
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$dec4x_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
aesni_multi_cbc_encrypt_avx:
+
_avx_cbc_enc_shortcut:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
@@ -539,10 +582,12 @@ _avx_cbc_enc_shortcut:
+
subq $192,%rsp
andq $-128,%rsp
movq %rax,16(%rsp)
+
L$enc8x_body:
vzeroupper
vmovdqu (%rsi),%xmm15
@@ -944,29 +989,45 @@ L$enc8x_tail:
+
L$enc8x_done:
vzeroupper
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$enc8x_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
aesni_multi_cbc_decrypt_avx:
+
_avx_cbc_dec_shortcut:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
@@ -977,11 +1038,13 @@ _avx_cbc_dec_shortcut:
+
subq $256,%rsp
andq $-256,%rsp
subq $192,%rsp
movq %rax,16(%rsp)
+
L$dec8x_body:
vzeroupper
vmovdqu (%rsi),%xmm15
@@ -1421,15 +1484,24 @@ L$dec8x_tail:
+
L$dec8x_done:
vzeroupper
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$dec8x_epilogue:
.byte 0xf3,0xc3
+
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s
index b14cf7691a..aed8f3a345 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s
@@ -21,18 +21,26 @@ _aesni_cbc_sha1_enc:
.p2align 5
aesni_cbc_sha1_enc_ssse3:
+
movq 8(%rsp),%r10
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
leaq -104(%rsp),%rsp
+
movq %rdi,%r12
movq %rsi,%r13
movq %rdx,%r14
@@ -1362,31 +1370,48 @@ L$aesenclast5:
movl %ebp,16(%r9)
movups %xmm2,(%r8)
leaq 104(%rsp),%rsi
+
movq 0(%rsi),%r15
+
movq 8(%rsi),%r14
+
movq 16(%rsi),%r13
+
movq 24(%rsi),%r12
+
movq 32(%rsi),%rbp
+
movq 40(%rsi),%rbx
+
leaq 48(%rsi),%rsp
+
L$epilogue_ssse3:
.byte 0xf3,0xc3
+
.p2align 5
aesni_cbc_sha1_enc_avx:
+
movq 8(%rsp),%r10
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
leaq -104(%rsp),%rsp
+
vzeroall
movq %rdi,%r12
movq %rsi,%r13
@@ -2660,16 +2685,25 @@ L$vaesenclast10:
vmovups %xmm12,(%r8)
vzeroall
leaq 104(%rsp),%rsi
+
movq 0(%rsi),%r15
+
movq 8(%rsi),%r14
+
movq 16(%rsi),%r13
+
movq 24(%rsi),%r12
+
movq 32(%rsi),%rbp
+
movq 40(%rsi),%rbx
+
leaq 48(%rsi),%rsp
+
L$epilogue_avx:
.byte 0xf3,0xc3
+
.p2align 6
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s
index 08025a0bae..28cf0768ca 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s
@@ -77,15 +77,23 @@ K256:
.p2align 6
aesni_cbc_sha256_enc_xop:
+
L$xop_shortcut:
movq 8(%rsp),%r10
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
subq $128,%rsp
andq $-64,%rsp
@@ -101,7 +109,8 @@ L$xop_shortcut:
movq %r8,64+32(%rsp)
movq %r9,64+40(%rsp)
movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
+ movq %rax,120(%rsp)
+
L$prologue_xop:
vzeroall
@@ -1207,31 +1216,48 @@ L$xop_00_47:
jb L$loop_xop
movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
+ movq 120(%rsp),%rsi
+
vmovdqu %xmm8,(%r8)
vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_xop:
.byte 0xf3,0xc3
+
.p2align 6
aesni_cbc_sha256_enc_avx:
+
L$avx_shortcut:
movq 8(%rsp),%r10
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
subq $128,%rsp
andq $-64,%rsp
@@ -1247,7 +1273,8 @@ L$avx_shortcut:
movq %r8,64+32(%rsp)
movq %r9,64+40(%rsp)
movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
+ movq %rax,120(%rsp)
+
L$prologue_avx:
vzeroall
@@ -2384,31 +2411,48 @@ L$avx_00_47:
jb L$loop_avx
movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
+ movq 120(%rsp),%rsi
+
vmovdqu %xmm8,(%r8)
vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_avx:
.byte 0xf3,0xc3
+
.p2align 6
aesni_cbc_sha256_enc_avx2:
+
L$avx2_shortcut:
movq 8(%rsp),%r10
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
subq $576,%rsp
andq $-1024,%rsp
addq $448,%rsp
@@ -2425,7 +2469,8 @@ L$avx2_shortcut:
movq %r8,64+32(%rsp)
movq %r9,64+40(%rsp)
movq %r10,64+48(%rsp)
- movq %r11,64+56(%rsp)
+ movq %rax,120(%rsp)
+
L$prologue_avx2:
vzeroall
@@ -3987,20 +4032,29 @@ L$ower_avx2:
L$done_avx2:
leaq (%rbp),%rsp
movq 64+32(%rsp),%r8
- movq 64+56(%rsp),%rsi
+ movq 120(%rsp),%rsi
+
vmovdqu %xmm8,(%r8)
vzeroall
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_avx2:
.byte 0xf3,0xc3
+
.p2align 5
aesni_cbc_sha256_enc_shaext:
movq 8(%rsp),%r10
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s
index 2c741239ef..8d76a18b85 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s
@@ -995,6 +995,7 @@ L$oop_enc1_6:
.p2align 4
_aesni_ctr32_encrypt_blocks:
+
cmpq $1,%rdx
jne L$ctr32_bulk
@@ -1024,11 +1025,12 @@ L$oop_enc1_7:
.p2align 4
L$ctr32_bulk:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
+
pushq %rbp
+
subq $128,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
@@ -1037,7 +1039,7 @@ L$ctr32_bulk:
movdqu (%rcx),%xmm0
movl 12(%r8),%r8d
pxor %xmm0,%xmm2
- movl 12(%rcx),%r11d
+ movl 12(%rcx),%ebp
movdqa %xmm2,0(%rsp)
bswapl %r8d
movdqa %xmm2,%xmm3
@@ -1053,8 +1055,8 @@ L$ctr32_bulk:
leaq 2(%r8),%rdx
bswapl %eax
bswapl %edx
- xorl %r11d,%eax
- xorl %r11d,%edx
+ xorl %ebp,%eax
+ xorl %ebp,%edx
.byte 102,15,58,34,216,3
leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
@@ -1063,25 +1065,25 @@ L$ctr32_bulk:
movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%eax
+ xorl %ebp,%eax
bswapl %r10d
.byte 102,15,58,34,232,3
- xorl %r11d,%r10d
+ xorl %ebp,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
movl 240(%rcx),%eax
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
- xorl %r11d,%r10d
+ xorl %ebp,%r10d
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
movl _OPENSSL_ia32cap_P+4(%rip),%r10d
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
andl $71303168,%r10d
movl %r9d,112+12(%rsp)
@@ -1105,7 +1107,7 @@ L$ctr32_bulk:
L$ctr32_6x:
shll $4,%eax
movl $48,%r10d
- bswapl %r11d
+ bswapl %ebp
leaq 32(%rcx,%rax,1),%rcx
subq %rax,%r10
jmp L$ctr32_loop6
@@ -1116,32 +1118,32 @@ L$ctr32_loop6:
movups -48(%rcx,%r10,1),%xmm0
.byte 102,15,56,220,209
movl %r8d,%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,217
.byte 0x0f,0x38,0xf1,0x44,0x24,12
leal 1(%r8),%eax
.byte 102,15,56,220,225
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 0x0f,0x38,0xf1,0x44,0x24,28
.byte 102,15,56,220,233
leal 2(%r8),%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,241
.byte 0x0f,0x38,0xf1,0x44,0x24,44
leal 3(%r8),%eax
.byte 102,15,56,220,249
movups -32(%rcx,%r10,1),%xmm1
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,208
.byte 0x0f,0x38,0xf1,0x44,0x24,60
leal 4(%r8),%eax
.byte 102,15,56,220,216
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 0x0f,0x38,0xf1,0x44,0x24,76
.byte 102,15,56,220,224
leal 5(%r8),%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,232
.byte 0x0f,0x38,0xf1,0x44,0x24,92
movq %r10,%rax
@@ -1202,7 +1204,7 @@ L$ctr32_loop8:
bswapl %r9d
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
@@ -1215,7 +1217,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1229,7 +1231,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1243,7 +1245,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1257,7 +1259,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1271,7 +1273,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1285,7 +1287,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1300,7 +1302,7 @@ L$ctr32_loop8:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
.byte 102,15,56,220,224
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
@@ -1535,7 +1537,7 @@ L$ctr32_loop3:
L$ctr32_done:
xorps %xmm0,%xmm0
- xorl %r11d,%r11d
+ xorl %ebp,%ebp
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
@@ -1559,20 +1561,25 @@ L$ctr32_done:
pxor %xmm14,%xmm14
movaps %xmm0,112(%rsp)
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+
+ leaq (%r11),%rsp
+
L$ctr32_epilogue:
.byte 0xf3,0xc3
+
.globl _aesni_xts_encrypt
.p2align 4
_aesni_xts_encrypt:
- leaq (%rsp),%rax
+
+ leaq (%rsp),%r11
+
pushq %rbp
+
subq $112,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1588,7 +1595,7 @@ L$oop_enc1_8:
jnz L$oop_enc1_8
.byte 102,15,56,221,209
movups (%rcx),%xmm0
- movq %rcx,%r11
+ movq %rcx,%rbp
movl %r10d,%eax
shll $4,%r10d
movq %rdx,%r9
@@ -1644,9 +1651,9 @@ L$oop_enc1_8:
jc L$xts_enc_short
movl $16+96,%eax
- leaq 32(%r11,%r10,1),%rcx
+ leaq 32(%rbp,%r10,1),%rcx
subq %r10,%rax
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
movq %rax,%r10
leaq L$xts_magic(%rip),%r8
jmp L$xts_enc_grandloop
@@ -1671,7 +1678,7 @@ L$xts_enc_grandloop:
movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
.byte 102,15,56,220,233
- movups 32(%r11),%xmm0
+ movups 32(%rbp),%xmm0
leaq 96(%rdi),%rdi
pxor %xmm8,%xmm7
@@ -1680,7 +1687,7 @@ L$xts_enc_grandloop:
pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
.byte 102,15,56,220,249
- movups 48(%r11),%xmm1
+ movups 48(%rbp),%xmm1
pxor %xmm9,%xmm12
.byte 102,15,56,220,208
@@ -1695,7 +1702,7 @@ L$xts_enc_grandloop:
movdqa %xmm14,64(%rsp)
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups 64(%r11),%xmm0
+ movups 64(%rbp),%xmm0
movdqa %xmm8,80(%rsp)
pshufd $0x5f,%xmm15,%xmm9
jmp L$xts_enc_loop6
@@ -1727,7 +1734,7 @@ L$xts_enc_loop6:
psrad $31,%xmm14
.byte 102,15,56,220,217
pand %xmm8,%xmm14
- movups (%r11),%xmm10
+ movups (%rbp),%xmm10
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@@ -1795,10 +1802,10 @@ L$xts_enc_loop6:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
pxor %xmm0,%xmm15
- movups (%r11),%xmm0
+ movups (%rbp),%xmm0
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
pxor %xmm15,%xmm14
.byte 102,15,56,221,84,36,0
@@ -1825,7 +1832,7 @@ L$xts_enc_loop6:
movl $16+96,%eax
subl %r10d,%eax
- movq %r11,%rcx
+ movq %rbp,%rcx
shrl $4,%eax
L$xts_enc_short:
@@ -1981,7 +1988,7 @@ L$xts_enc_steal:
jnz L$xts_enc_steal
subq %r9,%rsi
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups -16(%rsi),%xmm2
@@ -2024,20 +2031,25 @@ L$xts_enc_ret:
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+
+ leaq (%r11),%rsp
+
L$xts_enc_epilogue:
.byte 0xf3,0xc3
+
.globl _aesni_xts_decrypt
.p2align 4
_aesni_xts_decrypt:
- leaq (%rsp),%rax
+
+ leaq (%rsp),%r11
+
pushq %rbp
+
subq $112,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -2059,7 +2071,7 @@ L$oop_enc1_11:
subq %rax,%rdx
movups (%rcx),%xmm0
- movq %rcx,%r11
+ movq %rcx,%rbp
movl %r10d,%eax
shll $4,%r10d
movq %rdx,%r9
@@ -2115,9 +2127,9 @@ L$oop_enc1_11:
jc L$xts_dec_short
movl $16+96,%eax
- leaq 32(%r11,%r10,1),%rcx
+ leaq 32(%rbp,%r10,1),%rcx
subq %r10,%rax
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
movq %rax,%r10
leaq L$xts_magic(%rip),%r8
jmp L$xts_dec_grandloop
@@ -2142,7 +2154,7 @@ L$xts_dec_grandloop:
movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
.byte 102,15,56,222,233
- movups 32(%r11),%xmm0
+ movups 32(%rbp),%xmm0
leaq 96(%rdi),%rdi
pxor %xmm8,%xmm7
@@ -2151,7 +2163,7 @@ L$xts_dec_grandloop:
pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
.byte 102,15,56,222,249
- movups 48(%r11),%xmm1
+ movups 48(%rbp),%xmm1
pxor %xmm9,%xmm12
.byte 102,15,56,222,208
@@ -2166,7 +2178,7 @@ L$xts_dec_grandloop:
movdqa %xmm14,64(%rsp)
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups 64(%r11),%xmm0
+ movups 64(%rbp),%xmm0
movdqa %xmm8,80(%rsp)
pshufd $0x5f,%xmm15,%xmm9
jmp L$xts_dec_loop6
@@ -2198,7 +2210,7 @@ L$xts_dec_loop6:
psrad $31,%xmm14
.byte 102,15,56,222,217
pand %xmm8,%xmm14
- movups (%r11),%xmm10
+ movups (%rbp),%xmm10
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@@ -2266,10 +2278,10 @@ L$xts_dec_loop6:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
pxor %xmm0,%xmm15
- movups (%r11),%xmm0
+ movups (%rbp),%xmm0
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
pxor %xmm15,%xmm14
.byte 102,15,56,223,84,36,0
@@ -2296,7 +2308,7 @@ L$xts_dec_loop6:
movl $16+96,%eax
subl %r10d,%eax
- movq %r11,%rcx
+ movq %rbp,%rcx
shrl $4,%eax
L$xts_dec_short:
@@ -2453,7 +2465,7 @@ L$xts_dec_done:
jz L$xts_dec_ret
L$xts_dec_done2:
movq %r9,%rdx
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups (%rdi),%xmm2
@@ -2483,7 +2495,7 @@ L$xts_dec_steal:
jnz L$xts_dec_steal
subq %r9,%rsi
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups (%rsi),%xmm2
@@ -2526,21 +2538,30 @@ L$xts_dec_ret:
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+
+ leaq (%r11),%rsp
+
L$xts_dec_epilogue:
.byte 0xf3,0xc3
+
.globl _aesni_ocb_encrypt
.p2align 5
_aesni_ocb_encrypt:
+
leaq (%rsp),%rax
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
movq 8(%rax),%rbx
movq 8+8(%rax),%rbp
@@ -2716,16 +2737,26 @@ L$ocb_enc_done:
pxor %xmm13,%xmm13
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq 40(%rsp),%rax
+
+ movq -40(%rax),%r14
+
+ movq -32(%rax),%r13
+
+ movq -24(%rax),%r12
+
+ movq -16(%rax),%rbp
+
+ movq -8(%rax),%rbx
+
+ leaq (%rax),%rsp
+
L$ocb_enc_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
__ocb_encrypt6:
pxor %xmm9,%xmm15
@@ -2935,12 +2966,18 @@ L$ocb_enc_loop1:
.p2align 5
_aesni_ocb_decrypt:
+
leaq (%rsp),%rax
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
movq 8(%rax),%rbx
movq 8+8(%rax),%rbp
@@ -3138,16 +3175,26 @@ L$ocb_dec_done:
pxor %xmm13,%xmm13
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq 40(%rsp),%rax
+
+ movq -40(%rax),%r14
+
+ movq -32(%rax),%r13
+
+ movq -24(%rax),%r12
+
+ movq -16(%rax),%rbp
+
+ movq -8(%rax),%rbx
+
+ leaq (%rax),%rsp
+
L$ocb_dec_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
__ocb_decrypt6:
pxor %xmm9,%xmm15
@@ -3345,6 +3392,7 @@ L$ocb_dec_loop1:
.p2align 4
_aesni_cbc_encrypt:
+
testq %rdx,%rdx
jz L$cbc_ret
@@ -3437,11 +3485,13 @@ L$oop_dec1_16:
jmp L$cbc_ret
.p2align 4
L$cbc_decrypt_bulk:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
+
pushq %rbp
+
subq $16,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
+ movq %rcx,%rbp
movups (%r8),%xmm10
movl %r10d,%eax
cmpq $0x50,%rdx
@@ -3481,7 +3531,7 @@ L$cbc_dec_loop8_enter:
pxor %xmm0,%xmm3
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
- xorq %r11,%r11
+ movq $-1,%rbp
cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
@@ -3497,10 +3547,10 @@ L$cbc_dec_loop8_enter:
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
- setnc %r11b
- shlq $7,%r11
+ adcq $0,%rbp
+ andq $128,%rbp
.byte 102,68,15,56,222,201
- addq %rdi,%r11
+ addq %rdi,%rbp
movups 48-112(%rcx),%xmm1
.byte 102,15,56,222,208
.byte 102,15,56,222,216
@@ -3638,18 +3688,18 @@ L$cbc_dec_done:
movdqu 112(%rdi),%xmm0
.byte 102,65,15,56,223,228
leaq 128(%rdi),%rdi
- movdqu 0(%r11),%xmm11
+ movdqu 0(%rbp),%xmm11
.byte 102,65,15,56,223,237
.byte 102,65,15,56,223,246
- movdqu 16(%r11),%xmm12
- movdqu 32(%r11),%xmm13
+ movdqu 16(%rbp),%xmm12
+ movdqu 32(%rbp),%xmm13
.byte 102,65,15,56,223,255
.byte 102,68,15,56,223,193
- movdqu 48(%r11),%xmm14
- movdqu 64(%r11),%xmm15
+ movdqu 48(%rbp),%xmm14
+ movdqu 64(%rbp),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
- movdqu 80(%r11),%xmm1
+ movdqu 80(%rbp),%xmm1
movups -112(%rcx),%xmm0
movups %xmm2,(%rsi)
@@ -3768,7 +3818,7 @@ L$cbc_dec_loop6_enter:
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm14,%xmm6
- movq %r11,%rcx
+ movq %rbp,%rcx
movdqu %xmm5,48(%rsi)
pxor %xmm15,%xmm7
movl %r10d,%eax
@@ -3921,16 +3971,21 @@ L$cbc_dec_tail_partial:
L$cbc_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+
+ leaq (%r11),%rsp
+
L$cbc_ret:
.byte 0xf3,0xc3
+
.globl _aesni_set_decrypt_key
.p2align 4
_aesni_set_decrypt_key:
+
.byte 0x48,0x83,0xEC,0x08
+
call __aesni_set_encrypt_key
shll $4,%esi
testl %eax,%eax
@@ -3963,7 +4018,9 @@ L$dec_key_inverse:
pxor %xmm0,%xmm0
L$dec_key_ret:
addq $8,%rsp
+
.byte 0xf3,0xc3
+
L$SEH_end_set_decrypt_key:
.globl _aesni_set_encrypt_key
@@ -3971,7 +4028,9 @@ L$SEH_end_set_decrypt_key:
.p2align 4
_aesni_set_encrypt_key:
__aesni_set_encrypt_key:
+
.byte 0x48,0x83,0xEC,0x08
+
movq $-1,%rax
testq %rdi,%rdi
jz L$enc_key_ret
@@ -4264,7 +4323,9 @@ L$enc_key_ret:
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
addq $8,%rsp
+
.byte 0xf3,0xc3
+
L$SEH_end_set_encrypt_key:
.p2align 4
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s
index da5d1b1122..13920e2ace 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s
@@ -1067,6 +1067,7 @@ L$key_loop:
.p2align 4
_bsaes_cbc_encrypt:
+
cmpl $0,%r9d
jne _asm_AES_cbc_encrypt
cmpq $128,%rdx
@@ -1075,13 +1076,21 @@ _bsaes_cbc_encrypt:
movq %rsp,%rax
L$cbc_dec_prologue:
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
leaq -72(%rsp),%rsp
+
movq %rsp,%rbp
+
movl 240(%rcx),%eax
movq %rdi,%r12
movq %rsi,%r13
@@ -1300,33 +1309,50 @@ L$cbc_dec_bzero:
cmpq %rax,%rbp
ja L$cbc_dec_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+
+ movq -48(%rax),%r15
+
+ movq -40(%rax),%r14
+
+ movq -32(%rax),%r13
+
+ movq -24(%rax),%r12
+
+ movq -16(%rax),%rbx
+
+ movq -8(%rax),%rbp
+
+ leaq (%rax),%rsp
+
L$cbc_dec_epilogue:
.byte 0xf3,0xc3
+
.globl _bsaes_ctr32_encrypt_blocks
.p2align 4
_bsaes_ctr32_encrypt_blocks:
+
movq %rsp,%rax
L$ctr_enc_prologue:
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
leaq -72(%rsp),%rsp
+
movq %rsp,%rbp
+
movdqu (%r8),%xmm0
movl 240(%rcx),%eax
movq %rdi,%r12
@@ -1500,32 +1526,49 @@ L$ctr_enc_bzero:
cmpq %rax,%rbp
ja L$ctr_enc_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+
+ movq -48(%rax),%r15
+
+ movq -40(%rax),%r14
+
+ movq -32(%rax),%r13
+
+ movq -24(%rax),%r12
+
+ movq -16(%rax),%rbx
+
+ movq -8(%rax),%rbp
+
+ leaq (%rax),%rsp
+
L$ctr_enc_epilogue:
.byte 0xf3,0xc3
+
.globl _bsaes_xts_encrypt
.p2align 4
_bsaes_xts_encrypt:
+
movq %rsp,%rax
L$xts_enc_prologue:
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
leaq -72(%rsp),%rsp
+
movq %rsp,%rbp
+
movq %rdi,%r12
movq %rsi,%r13
movq %rdx,%r14
@@ -1951,32 +1994,48 @@ L$xts_enc_bzero:
cmpq %rax,%rbp
ja L$xts_enc_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+
+ movq -48(%rax),%r15
+
+ movq -40(%rax),%r14
+
+ movq -32(%rax),%r13
+
+ movq -24(%rax),%r12
+
+ movq -16(%rax),%rbx
+
+ movq -8(%rax),%rbp
+
+ leaq (%rax),%rsp
+
L$xts_enc_epilogue:
.byte 0xf3,0xc3
+
.globl _bsaes_xts_decrypt
.p2align 4
_bsaes_xts_decrypt:
+
movq %rsp,%rax
L$xts_dec_prologue:
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
leaq -72(%rsp),%rsp
+
movq %rsp,%rbp
movq %rdi,%r12
movq %rsi,%r13
@@ -2429,19 +2488,27 @@ L$xts_dec_bzero:
cmpq %rax,%rbp
ja L$xts_dec_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+
+ movq -48(%rax),%r15
+
+ movq -40(%rax),%r14
+
+ movq -32(%rax),%r13
+
+ movq -24(%rax),%r12
+
+ movq -16(%rax),%rbx
+
+ movq -8(%rax),%rbp
+
+ leaq (%rax),%rsp
+
L$xts_dec_epilogue:
.byte 0xf3,0xc3
+
.p2align 6
_bsaes_const:
L$M0ISR:
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s
index 785a35ac91..73aa8b7373 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s
@@ -4,15 +4,24 @@
.p2align 6
_rsaz_1024_sqr_avx2:
+
leaq (%rsp),%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
vzeroupper
movq %rax,%rbp
+
movq %rdx,%r13
subq $832,%rsp
movq %r13,%r15
@@ -625,28 +634,46 @@ L$OOP_REDUCE_1024:
vzeroall
movq %rbp,%rax
+
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$sqr_1024_epilogue:
.byte 0xf3,0xc3
+
.globl _rsaz_1024_mul_avx2
.p2align 6
_rsaz_1024_mul_avx2:
+
leaq (%rsp),%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
movq %rax,%rbp
+
vzeroall
movq %rdx,%r13
subq $64,%rsp
@@ -1162,16 +1189,25 @@ L$oop_mul_1024:
vzeroupper
movq %rbp,%rax
+
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$mul_1024_epilogue:
.byte 0xf3,0xc3
+
.globl _rsaz_1024_red2norm_avx2
.p2align 5
@@ -1555,8 +1591,10 @@ L$oop_scatter_1024:
.p2align 5
_rsaz_1024_gather5_avx2:
+
vzeroupper
movq %rsp,%r11
+
leaq -256(%rsp),%rsp
andq $-32,%rsp
leaq L$inc(%rip),%r10
@@ -1665,8 +1703,11 @@ L$oop_gather_1024:
vmovdqu %ymm0,(%rdi)
vzeroupper
leaq (%r11),%rsp
+
.byte 0xf3,0xc3
+L$SEH_end_rsaz_1024_gather5:
+
.globl _rsaz_avx2_eligible
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s
index 7f4a01109e..eab5b54b2c 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s
@@ -6,14 +6,22 @@
.p2align 5
_rsaz_512_sqr:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $128+24,%rsp
+
L$sqr_body:
movq %rdx,%rbp
movq (%rsi),%rdx
@@ -658,28 +666,45 @@ L$oop_sqrx:
L$sqr_tail:
leaq 128+24+48(%rsp),%rax
+
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$sqr_epilogue:
.byte 0xf3,0xc3
+
.globl _rsaz_512_mul
.p2align 5
_rsaz_512_mul:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $128+24,%rsp
+
L$mul_body:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
@@ -741,28 +766,45 @@ L$mul_tail:
call __rsaz_512_subtract
leaq 128+24+48(%rsp),%rax
+
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$mul_epilogue:
.byte 0xf3,0xc3
+
.globl _rsaz_512_mul_gather4
.p2align 5
_rsaz_512_mul_gather4:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $152,%rsp
+
L$mul_gather4_body:
movd %r9d,%xmm8
movdqa L$inc+16(%rip),%xmm1
@@ -1151,29 +1193,46 @@ L$mul_gather_tail:
call __rsaz_512_subtract
leaq 128+24+48(%rsp),%rax
+
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$mul_gather4_epilogue:
.byte 0xf3,0xc3
+
.globl _rsaz_512_mul_scatter4
.p2align 5
_rsaz_512_mul_scatter4:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
movl %r9d,%r9d
subq $128+24,%rsp
+
L$mul_scatter4_body:
leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
@@ -1248,28 +1307,45 @@ L$mul_scatter_tail:
movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
+
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$mul_scatter4_epilogue:
.byte 0xf3,0xc3
+
.globl _rsaz_512_mul_by_one
.p2align 5
_rsaz_512_mul_by_one:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $128+24,%rsp
+
L$mul_by_one_body:
movl _OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
@@ -1312,17 +1388,26 @@ L$by_one_tail:
movq %r15,56(%rdi)
leaq 128+24+48(%rsp),%rax
+
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$mul_by_one_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
__rsaz_512_reduce:
movq %r8,%rbx
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s
index af1ffdd59b..4137d5990a 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s
@@ -3,7 +3,9 @@
.p2align 4
_mul_1x1:
+
subq $128+8,%rsp
+
movq $-1,%r9
leaq (%rax,%rax,1),%rsi
shrq $3,%r9
@@ -193,16 +195,20 @@ _mul_1x1:
xorq %rdi,%rdx
addq $128+8,%rsp
+
.byte 0xf3,0xc3
L$end_mul_1x1:
+
.globl _bn_GF2m_mul_2x2
.p2align 4
_bn_GF2m_mul_2x2:
- movq _OPENSSL_ia32cap_P(%rip),%rax
- btq $33,%rax
+
+ movq %rsp,%rax
+ movq _OPENSSL_ia32cap_P(%rip),%r10
+ btq $33,%r10
jnc L$vanilla_mul_2x2
.byte 102,72,15,110,198
@@ -230,11 +236,17 @@ _bn_GF2m_mul_2x2:
.p2align 4
L$vanilla_mul_2x2:
leaq -136(%rsp),%rsp
+
movq %r14,80(%rsp)
+
movq %r13,88(%rsp)
+
movq %r12,96(%rsp)
+
movq %rbp,104(%rsp)
+
movq %rbx,112(%rsp)
+
L$body_mul_2x2:
movq %rdi,32(%rsp)
movq %rsi,40(%rsp)
@@ -279,13 +291,21 @@ L$body_mul_2x2:
movq %rax,8(%rbp)
movq 80(%rsp),%r14
+
movq 88(%rsp),%r13
+
movq 96(%rsp),%r12
+
movq 104(%rsp),%rbp
+
movq 112(%rsp),%rbx
+
leaq 136(%rsp),%rsp
+
+L$epilogue_mul_2x2:
.byte 0xf3,0xc3
L$end_mul_2x2:
+
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 4
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s
index dd43da0d86..5abe3696c4 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s
@@ -6,8 +6,10 @@
.p2align 4
_bn_mul_mont:
+
movl %r9d,%r9d
movq %rsp,%rax
+
testl $3,%r9d
jnz L$mul_enter
cmpl $8,%r9d
@@ -22,12 +24,18 @@ _bn_mul_mont:
.p2align 4
L$mul_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
negq %r9
movq %rsp,%r11
leaq -16(%rsp,%r9,8),%r10
@@ -59,6 +67,7 @@ L$mul_page_walk:
L$mul_page_walk_done:
movq %rax,8(%rsp,%r9,8)
+
L$mul_body:
movq %rdx,%r12
movq (%r8),%r8
@@ -226,33 +235,50 @@ L$copy:
jnz L$copy
movq 8(%rsp,%r9,8),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$mul_epilogue:
.byte 0xf3,0xc3
+
.p2align 4
bn_mul4x_mont:
+
movl %r9d,%r9d
movq %rsp,%rax
+
L$mul4x_enter:
andl $0x80100,%r11d
cmpl $0x80100,%r11d
je L$mulx4x_enter
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
negq %r9
movq %rsp,%r11
leaq -32(%rsp,%r9,8),%r10
@@ -275,6 +301,7 @@ L$mul4x_page_walk:
L$mul4x_page_walk_done:
movq %rax,8(%rsp,%r9,8)
+
L$mul4x_body:
movq %rdi,16(%rsp,%r9,8)
movq %rdx,%r12
@@ -642,14 +669,22 @@ L$copy4x:
decq %r15
jnz L$copy4x
movq 8(%rsp,%r9,8),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$mul4x_epilogue:
.byte 0xf3,0xc3
@@ -657,16 +692,25 @@ L$mul4x_epilogue:
+
.p2align 5
bn_sqr8x_mont:
+
movq %rsp,%rax
+
L$sqr8x_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$sqr8x_prologue:
movl %r9d,%r10d
@@ -722,6 +766,7 @@ L$sqr8x_page_walk_done:
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+
L$sqr8x_body:
.byte 102,72,15,110,209
@@ -787,6 +832,7 @@ L$sqr8x_sub:
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
+
jmp L$sqr8x_cond_copy
.p2align 5
@@ -816,26 +862,42 @@ L$sqr8x_cond_copy:
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$sqr8x_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
bn_mulx4x_mont:
+
movq %rsp,%rax
+
L$mulx4x_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$mulx4x_prologue:
shll $3,%r9d
@@ -881,6 +943,7 @@ L$mulx4x_page_walk_done:
movq %r8,24(%rsp)
movq %rdi,32(%rsp)
movq %rax,40(%rsp)
+
movq %r9,48(%rsp)
jmp L$mulx4x_body
@@ -1125,6 +1188,7 @@ L$mulx4x_sub:
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
+
jmp L$mulx4x_cond_copy
.p2align 5
@@ -1154,14 +1218,22 @@ L$mulx4x_cond_copy:
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$mulx4x_epilogue:
.byte 0xf3,0xc3
+
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 4
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s
index f415b8d80c..9cb256094b 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s
@@ -6,8 +6,10 @@
.p2align 6
_bn_mul_mont_gather5:
+
movl %r9d,%r9d
movq %rsp,%rax
+
testl $7,%r9d
jnz L$mul_enter
movl _OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -17,12 +19,18 @@ _bn_mul_mont_gather5:
L$mul_enter:
movd 8(%rsp),%xmm5
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
negq %r9
movq %rsp,%r11
leaq -280(%rsp,%r9,8),%r10
@@ -54,6 +62,7 @@ L$mul_page_walk_done:
leaq L$inc(%rip),%r10
movq %rax,8(%rsp,%r9,8)
+
L$mul_body:
leaq 128(%rdx),%r12
@@ -411,33 +420,50 @@ L$copy:
jnz L$copy
movq 8(%rsp,%r9,8),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$mul_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
bn_mul4x_mont_gather5:
+
.byte 0x67
movq %rsp,%rax
+
L$mul4x_enter:
andl $0x80108,%r11d
cmpl $0x80108,%r11d
je L$mulx4x_enter
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$mul4x_prologue:
.byte 0x67
@@ -493,25 +519,35 @@ L$mul4x_page_walk_done:
negq %r9
movq %rax,40(%rsp)
+
L$mul4x_body:
call mul4x_internal
movq 40(%rsp),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$mul4x_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
mul4x_internal:
shlq $5,%r9
@@ -1040,17 +1076,25 @@ L$inner4x:
.p2align 5
_bn_power5:
+
movq %rsp,%rax
+
movl _OPENSSL_ia32cap_P+8(%rip),%r11d
andl $0x80108,%r11d
cmpl $0x80108,%r11d
je L$powerx5_enter
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$power5_prologue:
shll $3,%r9d
@@ -1115,6 +1159,7 @@ L$pwr_page_walk_done:
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+
L$power5_body:
.byte 102,72,15,110,207
.byte 102,72,15,110,209
@@ -1141,18 +1186,27 @@ L$power5_body:
call mul4x_internal
movq 40(%rsp),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$power5_epilogue:
.byte 0xf3,0xc3
+
.globl _bn_sqr8x_internal
.private_extern _bn_sqr8x_internal
@@ -2001,14 +2055,22 @@ _bn_from_montgomery:
.p2align 5
bn_from_mont8x:
+
.byte 0x67
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$from_prologue:
shll $3,%r9d
@@ -2073,6 +2135,7 @@ L$from_page_walk_done:
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+
L$from_body:
movq %r9,%r11
leaq 48(%rsp),%rax
@@ -2114,7 +2177,6 @@ L$mul_by_1:
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
- movq 40(%rsp),%rsi
jmp L$from_mont_zero
.p2align 5
@@ -2124,11 +2186,12 @@ L$from_mont_nox:
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
- movq 40(%rsp),%rsi
jmp L$from_mont_zero
.p2align 5
L$from_mont_zero:
+ movq 40(%rsp),%rsi
+
movdqa %xmm0,0(%rax)
movdqa %xmm0,16(%rax)
movdqa %xmm0,32(%rax)
@@ -2139,26 +2202,42 @@ L$from_mont_zero:
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$from_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
bn_mulx4x_mont_gather5:
+
movq %rsp,%rax
+
L$mulx4x_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$mulx4x_prologue:
shll $3,%r9d
@@ -2224,24 +2303,34 @@ L$mulx4x_page_walk_done:
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+
L$mulx4x_body:
call mulx4x_internal
movq 40(%rsp),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$mulx4x_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
mulx4x_internal:
movq %r9,8(%rsp)
@@ -2666,14 +2755,22 @@ L$mulx4x_inner:
.p2align 5
bn_powerx5:
+
movq %rsp,%rax
+
L$powerx5_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$powerx5_prologue:
shll $3,%r9d
@@ -2745,6 +2842,7 @@ L$pwrx_page_walk_done:
.byte 102,72,15,110,226
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+
L$powerx5_body:
call __bn_sqrx8x_internal
@@ -2767,19 +2865,28 @@ L$powerx5_body:
call mulx4x_internal
movq 40(%rsp),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$powerx5_epilogue:
.byte 0xf3,0xc3
+
.globl _bn_sqrx8x_internal
.private_extern _bn_sqrx8x_internal
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h
index ecb27f8a3a..b6263000db 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h
@@ -1,38 +1,47 @@
-/* auto-generated by util/mkbuildinf.pl for crypto/cversion.c */
-#define CFLAGS cflags
/*
- * Generate CFLAGS as an array of individual characters. This is a
+ * WARNING: do not edit!
+ * Generated by util/mkbuildinf.pl
+ *
+ * Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#define PLATFORM "platform: darwin64-x86_64-cc"
+#define DATE "built on: Thu Nov 22 19:33:06 2018 UTC"
+
+/*
+ * Generate compiler_flags as an array of individual characters. This is a
* workaround for the situation where CFLAGS gets too long for a C90 string
* literal
*/
-static const char cflags[] = {
- 'c','o','m','p','i','l','e','r',':',' ','c','c',' ','-','D','D',
- 'S','O','_','D','L','F','C','N',' ','-','D','H','A','V','E','_',
- 'D','L','F','C','N','_','H',' ','-','D','N','D','E','B','U','G',
- ' ','-','D','O','P','E','N','S','S','L','_','T','H','R','E','A',
- 'D','S',' ','-','D','O','P','E','N','S','S','L','_','N','O','_',
- 'D','Y','N','A','M','I','C','_','E','N','G','I','N','E',' ','-',
- 'D','O','P','E','N','S','S','L','_','P','I','C',' ','-','D','O',
- 'P','E','N','S','S','L','_','I','A','3','2','_','S','S','E','2',
- ' ','-','D','O','P','E','N','S','S','L','_','B','N','_','A','S',
- 'M','_','M','O','N','T',' ','-','D','O','P','E','N','S','S','L',
- '_','B','N','_','A','S','M','_','M','O','N','T','5',' ','-','D',
- 'O','P','E','N','S','S','L','_','B','N','_','A','S','M','_','G',
- 'F','2','m',' ','-','D','S','H','A','1','_','A','S','M',' ','-',
- 'D','S','H','A','2','5','6','_','A','S','M',' ','-','D','S','H',
- 'A','5','1','2','_','A','S','M',' ','-','D','R','C','4','_','A',
- 'S','M',' ','-','D','M','D','5','_','A','S','M',' ','-','D','A',
- 'E','S','_','A','S','M',' ','-','D','V','P','A','E','S','_','A',
- 'S','M',' ','-','D','B','S','A','E','S','_','A','S','M',' ','-',
- 'D','G','H','A','S','H','_','A','S','M',' ','-','D','E','C','P',
- '_','N','I','S','T','Z','2','5','6','_','A','S','M',' ','-','D',
- 'P','A','D','L','O','C','K','_','A','S','M',' ','-','D','P','O',
- 'L','Y','1','3','0','5','_','A','S','M',' ','-','D','O','P','E',
- 'N','S','S','L','D','I','R','=','"','\\','"','/','u','s','r','/',
- 'l','o','c','a','l','/','s','s','l','\\','"','"',' ','-','D','E',
- 'N','G','I','N','E','S','D','I','R','=','"','\\','"','/','u','s',
- 'r','/','l','o','c','a','l','/','l','i','b','/','e','n','g','i',
- 'n','e','s','-','1','.','1','\\','"','"',' ','\0'
+static const char compiler_flags[] = {
+ 'c','o','m','p','i','l','e','r',':',' ','g','c','c',' ','-','f',
+ 'P','I','C',' ','-','a','r','c','h',' ','x','8','6','_','6','4',
+ ' ','-','W','a',',','-','-','n','o','e','x','e','c','s','t','a',
+ 'c','k',' ','-','O','3',' ','-','W','a','l','l',' ','-','D','L',
+ '_','E','N','D','I','A','N',' ','-','D','O','P','E','N','S','S',
+ 'L','_','P','I','C',' ','-','D','O','P','E','N','S','S','L','_',
+ 'C','P','U','I','D','_','O','B','J',' ','-','D','O','P','E','N',
+ 'S','S','L','_','I','A','3','2','_','S','S','E','2',' ','-','D',
+ 'O','P','E','N','S','S','L','_','B','N','_','A','S','M','_','M',
+ 'O','N','T',' ','-','D','O','P','E','N','S','S','L','_','B','N',
+ '_','A','S','M','_','M','O','N','T','5',' ','-','D','O','P','E',
+ 'N','S','S','L','_','B','N','_','A','S','M','_','G','F','2','m',
+ ' ','-','D','S','H','A','1','_','A','S','M',' ','-','D','S','H',
+ 'A','2','5','6','_','A','S','M',' ','-','D','S','H','A','5','1',
+ '2','_','A','S','M',' ','-','D','K','E','C','C','A','K','1','6',
+ '0','0','_','A','S','M',' ','-','D','R','C','4','_','A','S','M',
+ ' ','-','D','M','D','5','_','A','S','M',' ','-','D','A','E','S',
+ '_','A','S','M',' ','-','D','V','P','A','E','S','_','A','S','M',
+ ' ','-','D','B','S','A','E','S','_','A','S','M',' ','-','D','G',
+ 'H','A','S','H','_','A','S','M',' ','-','D','E','C','P','_','N',
+ 'I','S','T','Z','2','5','6','_','A','S','M',' ','-','D','X','2',
+ '5','5','1','9','_','A','S','M',' ','-','D','P','A','D','L','O',
+ 'C','K','_','A','S','M',' ','-','D','P','O','L','Y','1','3','0',
+ '5','_','A','S','M',' ','-','D','_','R','E','E','N','T','R','A',
+ 'N','T',' ','-','D','N','D','E','B','U','G','\0'
};
-#define PLATFORM "platform: darwin64-x86_64-cc"
-#define DATE "built on: Tue Nov 20 09:37:39 2018"
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s
index 35a3ea550a..2ae924deec 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s
@@ -17,11 +17,17 @@ _Camellia_EncryptBlock:
.p2align 4
L$enc_rounds:
_Camellia_EncryptBlock_Rounds:
+
pushq %rbx
+
pushq %rbp
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$enc_prologue:
@@ -53,16 +59,23 @@ L$enc_prologue:
movl %r11d,12(%r13)
movq 0(%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r13
+
movq 24(%rsp),%rbp
+
movq 32(%rsp),%rbx
+
leaq 40(%rsp),%rsp
+
L$enc_epilogue:
.byte 0xf3,0xc3
+
.p2align 4
_x86_64_Camellia_encrypt:
xorl 0(%r14),%r9d
@@ -286,11 +299,17 @@ _Camellia_DecryptBlock:
.p2align 4
L$dec_rounds:
_Camellia_DecryptBlock_Rounds:
+
pushq %rbx
+
pushq %rbp
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$dec_prologue:
@@ -322,16 +341,23 @@ L$dec_prologue:
movl %r11d,12(%r13)
movq 0(%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r13
+
movq 24(%rsp),%rbp
+
movq 32(%rsp),%rbx
+
leaq 40(%rsp),%rsp
+
L$dec_epilogue:
.byte 0xf3,0xc3
+
.p2align 4
_x86_64_Camellia_decrypt:
xorl 0(%r14),%r9d
@@ -542,11 +568,17 @@ L$ddone:
.p2align 4
_Camellia_Ekeygen:
+
pushq %rbx
+
pushq %rbp
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$key_prologue:
movl %edi,%r15d
@@ -1074,14 +1106,21 @@ L$2nd256:
movl $4,%eax
L$done:
movq 0(%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r13
+
movq 24(%rsp),%rbp
+
movq 32(%rsp),%rbx
+
leaq 40(%rsp),%rsp
+
L$key_epilogue:
.byte 0xf3,0xc3
+
.p2align 6
L$Camellia_SIGMA:
.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
@@ -1605,17 +1644,25 @@ L$Camellia_SBOX:
.p2align 4
_Camellia_cbc_encrypt:
+
cmpq $0,%rdx
je L$cbc_abort
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$cbc_prologue:
movq %rsp,%rbp
+
subq $64,%rsp
andq $-64,%rsp
@@ -1637,6 +1684,7 @@ L$cbc_prologue:
movq %r8,40(%rsp)
movq %rbp,48(%rsp)
+
L$cbc_body:
leaq L$Camellia_SBOX(%rip),%rbp
@@ -1824,15 +1872,24 @@ L$cbc_dec_popf:
.p2align 4
L$cbc_done:
movq 48(%rsp),%rcx
+
movq 0(%rcx),%r15
+
movq 8(%rcx),%r14
+
movq 16(%rcx),%r13
+
movq 24(%rcx),%r12
+
movq 32(%rcx),%rbp
+
movq 40(%rcx),%rbx
+
leaq 48(%rcx),%rsp
+
L$cbc_abort:
.byte 0xf3,0xc3
+
.byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54,95,54,52,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s
index afd47bdf68..edb6c28e4b 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s
@@ -19,6 +19,17 @@ L$rot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
L$rot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+L$twoy:
+.long 2,0,0,0, 2,0,0,0
+.p2align 6
+L$zeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+L$fourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+L$incz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+L$sixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
L$sigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
@@ -26,20 +37,33 @@ L$sigma:
.p2align 6
_ChaCha20_ctr32:
+
cmpq $0,%rdx
je L$no_data
movq _OPENSSL_ia32cap_P+4(%rip),%r10
+ btq $48,%r10
+ jc L$ChaCha20_avx512
+ testq %r10,%r10
+ js L$ChaCha20_avx512vl
testl $512,%r10d
jnz L$ChaCha20_ssse3
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $64+24,%rsp
+L$ctr32_body:
+
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
@@ -276,34 +300,41 @@ L$oop_tail:
jnz L$oop_tail
L$done:
- addq $64+24,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq 64+24+48(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$no_data:
.byte 0xf3,0xc3
+
.p2align 5
ChaCha20_ssse3:
+
L$ChaCha20_ssse3:
+ movq %rsp,%r9
+
testl $2048,%r10d
jnz L$ChaCha20_4xop
cmpq $128,%rdx
+ je L$ChaCha20_128
ja L$ChaCha20_4x
L$do_sse3_after_all:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- subq $64+24,%rsp
+ subq $64+8,%rsp
movdqa L$sigma(%rip),%xmm0
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
@@ -315,7 +346,7 @@ L$do_sse3_after_all:
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
- movl $10,%ebp
+ movq $10,%r8
jmp L$oop_ssse3
.p2align 5
@@ -325,7 +356,7 @@ L$oop_outer_ssse3:
movdqa 16(%rsp),%xmm1
movdqa 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3
- movl $10,%ebp
+ movq $10,%r8
movdqa %xmm3,48(%rsp)
jmp L$oop_ssse3
@@ -374,7 +405,7 @@ L$oop_ssse3:
pshufd $78,%xmm2,%xmm2
pshufd $147,%xmm1,%xmm1
pshufd $57,%xmm3,%xmm3
- decl %ebp
+ decq %r8
jnz L$oop_ssse3
paddd 0(%rsp),%xmm0
paddd 16(%rsp),%xmm1
@@ -411,31 +442,187 @@ L$tail_ssse3:
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
- xorq %rbx,%rbx
+ xorq %r8,%r8
L$oop_tail_ssse3:
- movzbl (%rsi,%rbx,1),%eax
- movzbl (%rsp,%rbx,1),%ecx
- leaq 1(%rbx),%rbx
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
xorl %ecx,%eax
- movb %al,-1(%rdi,%rbx,1)
+ movb %al,-1(%rdi,%r8,1)
decq %rdx
jnz L$oop_tail_ssse3
L$done_ssse3:
- addq $64+24,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq (%r9),%rsp
+
+L$ssse3_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+ChaCha20_128:
+
+L$ChaCha20_128:
+ movq %rsp,%r9
+
+ subq $64+8,%rsp
+ movdqa L$sigma(%rip),%xmm8
+ movdqu (%rcx),%xmm9
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa L$one(%rip),%xmm1
+ movdqa L$rot16(%rip),%xmm6
+ movdqa L$rot24(%rip),%xmm7
+
+ movdqa %xmm8,%xmm10
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,%xmm11
+ movdqa %xmm9,16(%rsp)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm2,32(%rsp)
+ paddd %xmm3,%xmm1
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp L$oop_128
+
+.p2align 5
+L$oop_128:
+ paddd %xmm9,%xmm8
+ pxor %xmm8,%xmm3
+ paddd %xmm11,%xmm10
+ pxor %xmm10,%xmm1
+.byte 102,15,56,0,222
+.byte 102,15,56,0,206
+ paddd %xmm3,%xmm2
+ paddd %xmm1,%xmm0
+ pxor %xmm2,%xmm9
+ pxor %xmm0,%xmm11
+ movdqa %xmm9,%xmm4
+ psrld $20,%xmm9
+ movdqa %xmm11,%xmm5
+ pslld $12,%xmm4
+ psrld $20,%xmm11
+ por %xmm4,%xmm9
+ pslld $12,%xmm5
+ por %xmm5,%xmm11
+ paddd %xmm9,%xmm8
+ pxor %xmm8,%xmm3
+ paddd %xmm11,%xmm10
+ pxor %xmm10,%xmm1
+.byte 102,15,56,0,223
+.byte 102,15,56,0,207
+ paddd %xmm3,%xmm2
+ paddd %xmm1,%xmm0
+ pxor %xmm2,%xmm9
+ pxor %xmm0,%xmm11
+ movdqa %xmm9,%xmm4
+ psrld $25,%xmm9
+ movdqa %xmm11,%xmm5
+ pslld $7,%xmm4
+ psrld $25,%xmm11
+ por %xmm4,%xmm9
+ pslld $7,%xmm5
+ por %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm9,%xmm9
+ pshufd $147,%xmm3,%xmm3
+ pshufd $78,%xmm0,%xmm0
+ pshufd $57,%xmm11,%xmm11
+ pshufd $147,%xmm1,%xmm1
+ paddd %xmm9,%xmm8
+ pxor %xmm8,%xmm3
+ paddd %xmm11,%xmm10
+ pxor %xmm10,%xmm1
+.byte 102,15,56,0,222
+.byte 102,15,56,0,206
+ paddd %xmm3,%xmm2
+ paddd %xmm1,%xmm0
+ pxor %xmm2,%xmm9
+ pxor %xmm0,%xmm11
+ movdqa %xmm9,%xmm4
+ psrld $20,%xmm9
+ movdqa %xmm11,%xmm5
+ pslld $12,%xmm4
+ psrld $20,%xmm11
+ por %xmm4,%xmm9
+ pslld $12,%xmm5
+ por %xmm5,%xmm11
+ paddd %xmm9,%xmm8
+ pxor %xmm8,%xmm3
+ paddd %xmm11,%xmm10
+ pxor %xmm10,%xmm1
+.byte 102,15,56,0,223
+.byte 102,15,56,0,207
+ paddd %xmm3,%xmm2
+ paddd %xmm1,%xmm0
+ pxor %xmm2,%xmm9
+ pxor %xmm0,%xmm11
+ movdqa %xmm9,%xmm4
+ psrld $25,%xmm9
+ movdqa %xmm11,%xmm5
+ pslld $7,%xmm4
+ psrld $25,%xmm11
+ por %xmm4,%xmm9
+ pslld $7,%xmm5
+ por %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm9,%xmm9
+ pshufd $57,%xmm3,%xmm3
+ pshufd $78,%xmm0,%xmm0
+ pshufd $147,%xmm11,%xmm11
+ pshufd $57,%xmm1,%xmm1
+ decq %r8
+ jnz L$oop_128
+ paddd 0(%rsp),%xmm8
+ paddd 16(%rsp),%xmm9
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ paddd L$one(%rip),%xmm1
+ paddd 0(%rsp),%xmm10
+ paddd 16(%rsp),%xmm11
+ paddd 32(%rsp),%xmm0
+ paddd 48(%rsp),%xmm1
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm8
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm9
+ movdqu 48(%rsi),%xmm5
+ pxor %xmm4,%xmm2
+ movdqu 64(%rsi),%xmm4
+ pxor %xmm5,%xmm3
+ movdqu 80(%rsi),%xmm5
+ pxor %xmm4,%xmm10
+ movdqu 96(%rsi),%xmm4
+ pxor %xmm5,%xmm11
+ movdqu 112(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ pxor %xmm5,%xmm1
+
+ movdqu %xmm8,0(%rdi)
+ movdqu %xmm9,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ movdqu %xmm10,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm0,96(%rdi)
+ movdqu %xmm1,112(%rdi)
+ leaq (%r9),%rsp
+
+L$128_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
ChaCha20_4x:
+
L$ChaCha20_4x:
+ movq %rsp,%r9
+
movq %r10,%r11
shrq $32,%r10
testq $32,%r10
@@ -448,8 +635,7 @@ L$ChaCha20_4x:
je L$do_sse3_after_all
L$proceed4x:
- leaq -120(%rsp),%r11
- subq $0x148+0,%rsp
+ subq $0x140+8,%rsp
movdqa L$sigma(%rip),%xmm11
movdqu (%rcx),%xmm15
movdqu 16(%rcx),%xmm7
@@ -976,15 +1162,20 @@ L$oop_tail4x:
jnz L$oop_tail4x
L$done4x:
- addq $0x148+0,%rsp
+ leaq (%r9),%rsp
+
+L$4x_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
ChaCha20_4xop:
+
L$ChaCha20_4xop:
- leaq -120(%rsp),%r11
- subq $0x148+0,%rsp
+ movq %rsp,%r9
+
+ subq $0x140+8,%rsp
vzeroupper
vmovdqa L$sigma(%rip),%xmm11
@@ -1386,18 +1577,22 @@ L$oop_tail4xop:
L$done4xop:
vzeroupper
- addq $0x148+0,%rsp
+ leaq (%r9),%rsp
+
+L$4xop_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
ChaCha20_8x:
+
L$ChaCha20_8x:
- movq %rsp,%r10
+ movq %rsp,%r9
+
subq $0x280+8,%rsp
andq $-32,%rsp
vzeroupper
- movq %r10,640(%rsp)
@@ -1988,6 +2183,1240 @@ L$oop_tail8x:
L$done8x:
vzeroall
- movq 640(%rsp),%rsp
+ leaq (%r9),%rsp
+
+L$8x_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+ChaCha20_avx512:
+
+L$ChaCha20_avx512:
+ movq %rsp,%r9
+
+ cmpq $512,%rdx
+ ja L$ChaCha20_16x
+
+ subq $64+8,%rsp
+ vbroadcasti32x4 L$sigma(%rip),%zmm0
+ vbroadcasti32x4 (%rcx),%zmm1
+ vbroadcasti32x4 16(%rcx),%zmm2
+ vbroadcasti32x4 (%r8),%zmm3
+
+ vmovdqa32 %zmm0,%zmm16
+ vmovdqa32 %zmm1,%zmm17
+ vmovdqa32 %zmm2,%zmm18
+ vpaddd L$zeroz(%rip),%zmm3,%zmm3
+ vmovdqa32 L$fourz(%rip),%zmm20
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp L$oop_avx512
+
+.p2align 4
+L$oop_outer_avx512:
+ vmovdqa32 %zmm16,%zmm0
+ vmovdqa32 %zmm17,%zmm1
+ vmovdqa32 %zmm18,%zmm2
+ vpaddd %zmm20,%zmm19,%zmm3
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp L$oop_avx512
+
+.p2align 5
+L$oop_avx512:
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $57,%zmm1,%zmm1
+ vpshufd $147,%zmm3,%zmm3
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $147,%zmm1,%zmm1
+ vpshufd $57,%zmm3,%zmm3
+ decq %r8
+ jnz L$oop_avx512
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ subq $64,%rdx
+ jb L$tail64_avx512
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz L$done_avx512
+
+ vextracti32x4 $1,%zmm0,%xmm4
+ vextracti32x4 $1,%zmm1,%xmm5
+ vextracti32x4 $1,%zmm2,%xmm6
+ vextracti32x4 $1,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb L$tail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz L$done_avx512
+
+ vextracti32x4 $2,%zmm0,%xmm4
+ vextracti32x4 $2,%zmm1,%xmm5
+ vextracti32x4 $2,%zmm2,%xmm6
+ vextracti32x4 $2,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb L$tail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz L$done_avx512
+
+ vextracti32x4 $3,%zmm0,%xmm4
+ vextracti32x4 $3,%zmm1,%xmm5
+ vextracti32x4 $3,%zmm2,%xmm6
+ vextracti32x4 $3,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb L$tail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jnz L$oop_outer_avx512
+
+ jmp L$done_avx512
+
+.p2align 4
+L$tail64_avx512:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp L$oop_tail_avx512
+
+.p2align 4
+L$tail_avx512:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+L$oop_tail_avx512:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz L$oop_tail_avx512
+
+ vmovdqu32 %zmm16,0(%rsp)
+
+L$done_avx512:
+ vzeroall
+ leaq (%r9),%rsp
+
+L$avx512_epilogue:
.byte 0xf3,0xc3
+
+
+.p2align 5
+ChaCha20_avx512vl:
+
+L$ChaCha20_avx512vl:
+ movq %rsp,%r9
+
+ cmpq $128,%rdx
+ ja L$ChaCha20_8xvl
+
+ subq $64+8,%rsp
+ vbroadcasti128 L$sigma(%rip),%ymm0
+ vbroadcasti128 (%rcx),%ymm1
+ vbroadcasti128 16(%rcx),%ymm2
+ vbroadcasti128 (%r8),%ymm3
+
+ vmovdqa32 %ymm0,%ymm16
+ vmovdqa32 %ymm1,%ymm17
+ vmovdqa32 %ymm2,%ymm18
+ vpaddd L$zeroz(%rip),%ymm3,%ymm3
+ vmovdqa32 L$twoy(%rip),%ymm20
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp L$oop_avx512vl
+
+.p2align 4
+L$oop_outer_avx512vl:
+ vmovdqa32 %ymm18,%ymm2
+ vpaddd %ymm20,%ymm19,%ymm3
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp L$oop_avx512vl
+
+.p2align 5
+L$oop_avx512vl:
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $57,%ymm1,%ymm1
+ vpshufd $147,%ymm3,%ymm3
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $147,%ymm1,%ymm1
+ vpshufd $57,%ymm3,%ymm3
+ decq %r8
+ jnz L$oop_avx512vl
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ subq $64,%rdx
+ jb L$tail64_avx512vl
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz L$done_avx512vl
+
+ vextracti128 $1,%ymm0,%xmm4
+ vextracti128 $1,%ymm1,%xmm5
+ vextracti128 $1,%ymm2,%xmm6
+ vextracti128 $1,%ymm3,%xmm7
+
+ subq $64,%rdx
+ jb L$tail_avx512vl
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ vmovdqa32 %ymm16,%ymm0
+ vmovdqa32 %ymm17,%ymm1
+ jnz L$oop_outer_avx512vl
+
+ jmp L$done_avx512vl
+
+.p2align 4
+L$tail64_avx512vl:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp L$oop_tail_avx512vl
+
+.p2align 4
+L$tail_avx512vl:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+L$oop_tail_avx512vl:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz L$oop_tail_avx512vl
+
+ vmovdqu32 %ymm16,0(%rsp)
+ vmovdqu32 %ymm16,32(%rsp)
+
+L$done_avx512vl:
+ vzeroall
+ leaq (%r9),%rsp
+
+L$avx512vl_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+ChaCha20_16x:
+
+L$ChaCha20_16x:
+ movq %rsp,%r9
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq L$sigma(%rip),%r10
+ vbroadcasti32x4 (%r10),%zmm3
+ vbroadcasti32x4 (%rcx),%zmm7
+ vbroadcasti32x4 16(%rcx),%zmm11
+ vbroadcasti32x4 (%r8),%zmm15
+
+ vpshufd $0x00,%zmm3,%zmm0
+ vpshufd $0x55,%zmm3,%zmm1
+ vpshufd $0xaa,%zmm3,%zmm2
+ vpshufd $0xff,%zmm3,%zmm3
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ vpshufd $0x00,%zmm7,%zmm4
+ vpshufd $0x55,%zmm7,%zmm5
+ vpshufd $0xaa,%zmm7,%zmm6
+ vpshufd $0xff,%zmm7,%zmm7
+ vmovdqa64 %zmm4,%zmm20
+ vmovdqa64 %zmm5,%zmm21
+ vmovdqa64 %zmm6,%zmm22
+ vmovdqa64 %zmm7,%zmm23
+
+ vpshufd $0x00,%zmm11,%zmm8
+ vpshufd $0x55,%zmm11,%zmm9
+ vpshufd $0xaa,%zmm11,%zmm10
+ vpshufd $0xff,%zmm11,%zmm11
+ vmovdqa64 %zmm8,%zmm24
+ vmovdqa64 %zmm9,%zmm25
+ vmovdqa64 %zmm10,%zmm26
+ vmovdqa64 %zmm11,%zmm27
+
+ vpshufd $0x00,%zmm15,%zmm12
+ vpshufd $0x55,%zmm15,%zmm13
+ vpshufd $0xaa,%zmm15,%zmm14
+ vpshufd $0xff,%zmm15,%zmm15
+ vpaddd L$incz(%rip),%zmm12,%zmm12
+ vmovdqa64 %zmm12,%zmm28
+ vmovdqa64 %zmm13,%zmm29
+ vmovdqa64 %zmm14,%zmm30
+ vmovdqa64 %zmm15,%zmm31
+
+ movl $10,%eax
+ jmp L$oop16x
+
+.p2align 5
+L$oop_outer16x:
+ vpbroadcastd 0(%r10),%zmm0
+ vpbroadcastd 4(%r10),%zmm1
+ vpbroadcastd 8(%r10),%zmm2
+ vpbroadcastd 12(%r10),%zmm3
+ vpaddd L$sixteen(%rip),%zmm28,%zmm28
+ vmovdqa64 %zmm20,%zmm4
+ vmovdqa64 %zmm21,%zmm5
+ vmovdqa64 %zmm22,%zmm6
+ vmovdqa64 %zmm23,%zmm7
+ vmovdqa64 %zmm24,%zmm8
+ vmovdqa64 %zmm25,%zmm9
+ vmovdqa64 %zmm26,%zmm10
+ vmovdqa64 %zmm27,%zmm11
+ vmovdqa64 %zmm28,%zmm12
+ vmovdqa64 %zmm29,%zmm13
+ vmovdqa64 %zmm30,%zmm14
+ vmovdqa64 %zmm31,%zmm15
+
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ movl $10,%eax
+ jmp L$oop16x
+
+.p2align 5
+L$oop16x:
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ decl %eax
+ jnz L$oop16x
+
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ vpunpckldq %zmm1,%zmm0,%zmm18
+ vpunpckldq %zmm3,%zmm2,%zmm19
+ vpunpckhdq %zmm1,%zmm0,%zmm0
+ vpunpckhdq %zmm3,%zmm2,%zmm2
+ vpunpcklqdq %zmm19,%zmm18,%zmm1
+ vpunpckhqdq %zmm19,%zmm18,%zmm18
+ vpunpcklqdq %zmm2,%zmm0,%zmm3
+ vpunpckhqdq %zmm2,%zmm0,%zmm0
+ vpaddd %zmm20,%zmm4,%zmm4
+ vpaddd %zmm21,%zmm5,%zmm5
+ vpaddd %zmm22,%zmm6,%zmm6
+ vpaddd %zmm23,%zmm7,%zmm7
+
+ vpunpckldq %zmm5,%zmm4,%zmm2
+ vpunpckldq %zmm7,%zmm6,%zmm19
+ vpunpckhdq %zmm5,%zmm4,%zmm4
+ vpunpckhdq %zmm7,%zmm6,%zmm6
+ vpunpcklqdq %zmm19,%zmm2,%zmm5
+ vpunpckhqdq %zmm19,%zmm2,%zmm2
+ vpunpcklqdq %zmm6,%zmm4,%zmm7
+ vpunpckhqdq %zmm6,%zmm4,%zmm4
+ vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19
+ vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
+ vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
+ vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
+ vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
+ vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
+ vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
+ vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
+ vpaddd %zmm24,%zmm8,%zmm8
+ vpaddd %zmm25,%zmm9,%zmm9
+ vpaddd %zmm26,%zmm10,%zmm10
+ vpaddd %zmm27,%zmm11,%zmm11
+
+ vpunpckldq %zmm9,%zmm8,%zmm6
+ vpunpckldq %zmm11,%zmm10,%zmm0
+ vpunpckhdq %zmm9,%zmm8,%zmm8
+ vpunpckhdq %zmm11,%zmm10,%zmm10
+ vpunpcklqdq %zmm0,%zmm6,%zmm9
+ vpunpckhqdq %zmm0,%zmm6,%zmm6
+ vpunpcklqdq %zmm10,%zmm8,%zmm11
+ vpunpckhqdq %zmm10,%zmm8,%zmm8
+ vpaddd %zmm28,%zmm12,%zmm12
+ vpaddd %zmm29,%zmm13,%zmm13
+ vpaddd %zmm30,%zmm14,%zmm14
+ vpaddd %zmm31,%zmm15,%zmm15
+
+ vpunpckldq %zmm13,%zmm12,%zmm10
+ vpunpckldq %zmm15,%zmm14,%zmm0
+ vpunpckhdq %zmm13,%zmm12,%zmm12
+ vpunpckhdq %zmm15,%zmm14,%zmm14
+ vpunpcklqdq %zmm0,%zmm10,%zmm13
+ vpunpckhqdq %zmm0,%zmm10,%zmm10
+ vpunpcklqdq %zmm14,%zmm12,%zmm15
+ vpunpckhqdq %zmm14,%zmm12,%zmm12
+ vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0
+ vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
+ vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
+ vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
+ vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
+ vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
+ vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
+ vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
+ vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16
+ vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
+ vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
+ vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
+ vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
+ vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
+ vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
+ vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
+ vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
+ vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
+ vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
+ vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
+ vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
+ vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
+ vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
+ vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
+ cmpq $1024,%rdx
+ jb L$tail16x
+
+ vpxord 0(%rsi),%zmm16,%zmm16
+ vpxord 64(%rsi),%zmm17,%zmm17
+ vpxord 128(%rsi),%zmm14,%zmm14
+ vpxord 192(%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm16,0(%rdi)
+ vmovdqu32 %zmm17,64(%rdi)
+ vmovdqu32 %zmm14,128(%rdi)
+ vmovdqu32 %zmm8,192(%rdi)
+
+ vpxord 256(%rsi),%zmm19,%zmm19
+ vpxord 320(%rsi),%zmm1,%zmm1
+ vpxord 384(%rsi),%zmm18,%zmm18
+ vpxord 448(%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm19,256(%rdi)
+ vmovdqu32 %zmm1,320(%rdi)
+ vmovdqu32 %zmm18,384(%rdi)
+ vmovdqu32 %zmm3,448(%rdi)
+
+ vpxord 512(%rsi),%zmm0,%zmm0
+ vpxord 576(%rsi),%zmm9,%zmm9
+ vpxord 640(%rsi),%zmm6,%zmm6
+ vpxord 704(%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm0,512(%rdi)
+ vmovdqu32 %zmm9,576(%rdi)
+ vmovdqu32 %zmm6,640(%rdi)
+ vmovdqu32 %zmm11,704(%rdi)
+
+ vpxord 768(%rsi),%zmm13,%zmm13
+ vpxord 832(%rsi),%zmm10,%zmm10
+ vpxord 896(%rsi),%zmm15,%zmm15
+ vpxord 960(%rsi),%zmm12,%zmm12
+ leaq 1024(%rsi),%rsi
+ vmovdqu32 %zmm13,768(%rdi)
+ vmovdqu32 %zmm10,832(%rdi)
+ vmovdqu32 %zmm15,896(%rdi)
+ vmovdqu32 %zmm12,960(%rdi)
+ leaq 1024(%rdi),%rdi
+
+ subq $1024,%rdx
+ jnz L$oop_outer16x
+
+ jmp L$done16x
+
+.p2align 5
+L$tail16x:
+ xorq %r10,%r10
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm16,%zmm16
+ vmovdqu32 %zmm16,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm17,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm17,%zmm17
+ vmovdqu32 %zmm17,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm14,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm14,%zmm14
+ vmovdqu32 %zmm14,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm8,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm8,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm19,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm19,%zmm19
+ vmovdqu32 %zmm19,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm1,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm1,%zmm1
+ vmovdqu32 %zmm1,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm18,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm18,%zmm18
+ vmovdqu32 %zmm18,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm3,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $512,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm3,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm0,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $576,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm0,%zmm0
+ vmovdqu32 %zmm0,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm9,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $640,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm9,%zmm9
+ vmovdqu32 %zmm9,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm6,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $704,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm6,%zmm6
+ vmovdqu32 %zmm6,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm11,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $768,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm11,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm13,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $832,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm13,%zmm13
+ vmovdqu32 %zmm13,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm10,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $896,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm10,%zmm10
+ vmovdqu32 %zmm10,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm15,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $960,%rdx
+ jb L$ess_than_64_16x
+ vpxord (%rsi),%zmm15,%zmm15
+ vmovdqu32 %zmm15,(%rdi,%rsi,1)
+ je L$done16x
+ vmovdqa32 %zmm12,%zmm16
+ leaq 64(%rsi),%rsi
+
+L$ess_than_64_16x:
+ vmovdqa32 %zmm16,0(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+L$oop_tail16x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz L$oop_tail16x
+
+ vpxord %zmm16,%zmm16,%zmm16
+ vmovdqa32 %zmm16,0(%rsp)
+
+L$done16x:
+ vzeroall
+ leaq (%r9),%rsp
+
+L$16x_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+ChaCha20_8xvl:
+
+L$ChaCha20_8xvl:
+ movq %rsp,%r9
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq L$sigma(%rip),%r10
+ vbroadcasti128 (%r10),%ymm3
+ vbroadcasti128 (%rcx),%ymm7
+ vbroadcasti128 16(%rcx),%ymm11
+ vbroadcasti128 (%r8),%ymm15
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vpshufd $0xaa,%ymm3,%ymm2
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpshufd $0xaa,%ymm7,%ymm6
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vpshufd $0xaa,%ymm11,%ymm10
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vpshufd $0xaa,%ymm15,%ymm14
+ vpshufd $0xff,%ymm15,%ymm15
+ vpaddd L$incy(%rip),%ymm12,%ymm12
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+ movl $10,%eax
+ jmp L$oop8xvl
+
+.p2align 5
+L$oop_outer8xvl:
+
+
+ vpbroadcastd 8(%r10),%ymm2
+ vpbroadcastd 12(%r10),%ymm3
+ vpaddd L$eight(%rip),%ymm28,%ymm28
+ vmovdqa64 %ymm20,%ymm4
+ vmovdqa64 %ymm21,%ymm5
+ vmovdqa64 %ymm22,%ymm6
+ vmovdqa64 %ymm23,%ymm7
+ vmovdqa64 %ymm24,%ymm8
+ vmovdqa64 %ymm25,%ymm9
+ vmovdqa64 %ymm26,%ymm10
+ vmovdqa64 %ymm27,%ymm11
+ vmovdqa64 %ymm28,%ymm12
+ vmovdqa64 %ymm29,%ymm13
+ vmovdqa64 %ymm30,%ymm14
+ vmovdqa64 %ymm31,%ymm15
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ movl $10,%eax
+ jmp L$oop8xvl
+
+.p2align 5
+L$oop8xvl:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ decl %eax
+ jnz L$oop8xvl
+
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm18
+ vpunpckldq %ymm3,%ymm2,%ymm19
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm19,%ymm18,%ymm1
+ vpunpckhqdq %ymm19,%ymm18,%ymm18
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm2
+ vpunpckldq %ymm7,%ymm6,%ymm19
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm19,%ymm2,%ymm5
+ vpunpckhqdq %ymm19,%ymm2,%ymm2
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vshufi32x4 $0,%ymm5,%ymm1,%ymm19
+ vshufi32x4 $3,%ymm5,%ymm1,%ymm5
+ vshufi32x4 $0,%ymm2,%ymm18,%ymm1
+ vshufi32x4 $3,%ymm2,%ymm18,%ymm2
+ vshufi32x4 $0,%ymm7,%ymm3,%ymm18
+ vshufi32x4 $3,%ymm7,%ymm3,%ymm7
+ vshufi32x4 $0,%ymm4,%ymm0,%ymm3
+ vshufi32x4 $3,%ymm4,%ymm0,%ymm4
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm6
+ vpunpckldq %ymm11,%ymm10,%ymm0
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm0,%ymm6,%ymm9
+ vpunpckhqdq %ymm0,%ymm6,%ymm6
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ vpunpckldq %ymm13,%ymm12,%ymm10
+ vpunpckldq %ymm15,%ymm14,%ymm0
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm0,%ymm10,%ymm13
+ vpunpckhqdq %ymm0,%ymm10,%ymm10
+ vpunpcklqdq %ymm14,%ymm12,%ymm15
+ vpunpckhqdq %ymm14,%ymm12,%ymm12
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+ vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+ cmpq $512,%rdx
+ jb L$tail8xvl
+
+ movl $0x80,%eax
+ vpxord 0(%rsi),%ymm19,%ymm19
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vpxor 64(%rsi),%ymm5,%ymm5
+ vpxor 96(%rsi),%ymm13,%ymm13
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm19,0(%rdi)
+ vmovdqu %ymm0,32(%rdi)
+ vmovdqu %ymm5,64(%rdi)
+ vmovdqu %ymm13,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm2,%ymm2
+ vpxor 96(%rsi),%ymm10,%ymm10
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm1,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm10,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vpxor 64(%rsi),%ymm7,%ymm7
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm18,0(%rdi)
+ vmovdqu %ymm6,32(%rdi)
+ vmovdqu %ymm7,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vpxor 64(%rsi),%ymm4,%ymm4
+ vpxor 96(%rsi),%ymm12,%ymm12
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vmovdqu %ymm12,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpbroadcastd 0(%r10),%ymm0
+ vpbroadcastd 4(%r10),%ymm1
+
+ subq $512,%rdx
+ jnz L$oop_outer8xvl
+
+ jmp L$done8xvl
+
+.p2align 5
+L$tail8xvl:
+ vmovdqa64 %ymm19,%ymm8
+ xorq %r10,%r10
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb L$ess_than_64_8xvl
+ vpxor 0(%rsi),%ymm8,%ymm8
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vmovdqu %ymm8,0(%rdi,%rsi,1)
+ vmovdqu %ymm0,32(%rdi,%rsi,1)
+ je L$done8xvl
+ vmovdqa %ymm5,%ymm8
+ vmovdqa %ymm13,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb L$ess_than_64_8xvl
+ vpxor 0(%rsi),%ymm5,%ymm5
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm5,0(%rdi,%rsi,1)
+ vmovdqu %ymm13,32(%rdi,%rsi,1)
+ je L$done8xvl
+ vmovdqa %ymm1,%ymm8
+ vmovdqa %ymm9,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb L$ess_than_64_8xvl
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm1,0(%rdi,%rsi,1)
+ vmovdqu %ymm9,32(%rdi,%rsi,1)
+ je L$done8xvl
+ vmovdqa %ymm2,%ymm8
+ vmovdqa %ymm10,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb L$ess_than_64_8xvl
+ vpxor 0(%rsi),%ymm2,%ymm2
+ vpxor 32(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm2,0(%rdi,%rsi,1)
+ vmovdqu %ymm10,32(%rdi,%rsi,1)
+ je L$done8xvl
+ vmovdqa32 %ymm18,%ymm8
+ vmovdqa %ymm6,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb L$ess_than_64_8xvl
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vmovdqu32 %ymm18,0(%rdi,%rsi,1)
+ vmovdqu %ymm6,32(%rdi,%rsi,1)
+ je L$done8xvl
+ vmovdqa %ymm7,%ymm8
+ vmovdqa %ymm15,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb L$ess_than_64_8xvl
+ vpxor 0(%rsi),%ymm7,%ymm7
+ vpxor 32(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm7,0(%rdi,%rsi,1)
+ vmovdqu %ymm15,32(%rdi,%rsi,1)
+ je L$done8xvl
+ vmovdqa %ymm3,%ymm8
+ vmovdqa %ymm11,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb L$ess_than_64_8xvl
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi,%rsi,1)
+ vmovdqu %ymm11,32(%rdi,%rsi,1)
+ je L$done8xvl
+ vmovdqa %ymm4,%ymm8
+ vmovdqa %ymm12,%ymm0
+ leaq 64(%rsi),%rsi
+
+L$ess_than_64_8xvl:
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm0,32(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+L$oop_tail8xvl:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz L$oop_tail8xvl
+
+ vpxor %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+
+L$done8xvl:
+ vzeroall
+ leaq (%r9),%rsp
+
+L$8xvl_epilogue:
+ .byte 0xf3,0xc3
+
+
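[Editorial note, not part of the diff] The ChaCha20 code paths added above (SSSE3, the 128-byte path, 4x/8x, AVX-512 and AVX-512VL) all vectorize the same scalar quarter-round: the vprold counts 16, 12, 8 and 7, and the SSSE3 pslld/psrld/por pairs with the same counts, are the standard ChaCha rotations, while the pshufd $57/$78/$147 shuffles perform the column-to-diagonal re-arrangement between half-rounds. As a reading aid only, here is a minimal C sketch of that quarter-round; the program, function and macro names are ours and are not part of the diff.

#include <stdint.h>
#include <stdio.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha20 quarter-round: the 16/12/8/7 rotations below are what the
 * vprold (AVX-512) and pslld/psrld/por (SSSE3) sequences above compute,
 * many 32-bit lanes at a time. */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}

int main(void)
{
    /* Quarter-round test vector input from RFC 8439, section 2.1.1. */
    uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;

    chacha_quarter_round(&a, &b, &c, &d);
    printf("a=%08x b=%08x c=%08x d=%08x\n", a, b, c, d);
    return 0;
}
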
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s
index 77102c6a41..302649aacc 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s
@@ -2393,13 +2393,23 @@ L$Three:
L$ONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+L$ord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+L$ordK:
+.quad 0xccd1c8aaee00bc4f
+
.globl _ecp_nistz256_mul_by_2
.p2align 6
_ecp_nistz256_mul_by_2:
+
pushq %r12
+
pushq %r13
+L$mul_by_2_body:
+
movq 0(%rsi),%r8
xorq %r13,%r13
movq 8(%rsi),%r9
@@ -2431,20 +2441,30 @@ _ecp_nistz256_mul_by_2:
movq %r10,16(%rdi)
movq %r11,24(%rdi)
- popq %r13
- popq %r12
+ movq 0(%rsp),%r13
+
+ movq 8(%rsp),%r12
+
+ leaq 16(%rsp),%rsp
+
+L$mul_by_2_epilogue:
.byte 0xf3,0xc3
+
.globl _ecp_nistz256_div_by_2
.p2align 5
_ecp_nistz256_div_by_2:
+
pushq %r12
+
pushq %r13
+L$div_by_2_body:
+
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
@@ -2491,20 +2511,30 @@ _ecp_nistz256_div_by_2:
movq %r10,16(%rdi)
movq %r11,24(%rdi)
- popq %r13
- popq %r12
+ movq 0(%rsp),%r13
+
+ movq 8(%rsp),%r12
+
+ leaq 16(%rsp),%rsp
+
+L$div_by_2_epilogue:
.byte 0xf3,0xc3
+
.globl _ecp_nistz256_mul_by_3
.p2align 5
_ecp_nistz256_mul_by_3:
+
pushq %r12
+
pushq %r13
+L$mul_by_3_body:
+
movq 0(%rsi),%r8
xorq %r13,%r13
movq 8(%rsi),%r9
@@ -2557,20 +2587,30 @@ _ecp_nistz256_mul_by_3:
movq %r10,16(%rdi)
movq %r11,24(%rdi)
- popq %r13
- popq %r12
+ movq 0(%rsp),%r13
+
+ movq 8(%rsp),%r12
+
+ leaq 16(%rsp),%rsp
+
+L$mul_by_3_epilogue:
.byte 0xf3,0xc3
+
.globl _ecp_nistz256_add
.p2align 5
_ecp_nistz256_add:
+
pushq %r12
+
pushq %r13
+L$add_body:
+
movq 0(%rsi),%r8
xorq %r13,%r13
movq 8(%rsi),%r9
@@ -2603,20 +2643,30 @@ _ecp_nistz256_add:
movq %r10,16(%rdi)
movq %r11,24(%rdi)
- popq %r13
- popq %r12
+ movq 0(%rsp),%r13
+
+ movq 8(%rsp),%r12
+
+ leaq 16(%rsp),%rsp
+
+L$add_epilogue:
.byte 0xf3,0xc3
+
.globl _ecp_nistz256_sub
.p2align 5
_ecp_nistz256_sub:
+
pushq %r12
+
pushq %r13
+L$sub_body:
+
movq 0(%rsi),%r8
xorq %r13,%r13
movq 8(%rsi),%r9
@@ -2649,20 +2699,30 @@ _ecp_nistz256_sub:
movq %r10,16(%rdi)
movq %r11,24(%rdi)
- popq %r13
- popq %r12
+ movq 0(%rsp),%r13
+
+ movq 8(%rsp),%r12
+
+ leaq 16(%rsp),%rsp
+
+L$sub_epilogue:
.byte 0xf3,0xc3
+
.globl _ecp_nistz256_neg
.p2align 5
_ecp_nistz256_neg:
+
pushq %r12
+
pushq %r13
+L$neg_body:
+
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
@@ -2695,14 +2755,1085 @@ _ecp_nistz256_neg:
movq %r10,16(%rdi)
movq %r11,24(%rdi)
- popq %r13
- popq %r12
+ movq 0(%rsp),%r13
+
+ movq 8(%rsp),%r12
+
+ leaq 16(%rsp),%rsp
+
+L$neg_epilogue:
.byte 0xf3,0xc3
+
+
+
+.globl _ecp_nistz256_ord_mul_mont
+
+.p2align 5
+_ecp_nistz256_ord_mul_mont:
+
+ movl $0x80100,%ecx
+ andl _OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je L$ecp_nistz256_ord_mul_montx
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$ord_mul_body:
+
+ movq 0(%rdx),%rax
+ movq %rdx,%rbx
+ leaq L$ord(%rip),%r14
+ movq L$ordK(%rip),%r15
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ movq %rax,%r8
+ movq %rcx,%rax
+ movq %rdx,%r9
+
+ mulq 8(%rsi)
+ addq %rax,%r9
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq 16(%rsi)
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r8,%r13
+ imulq %r15,%r8
+
+ movq %rdx,%r11
+ mulq 24(%rsi)
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq 0(%r14)
+ movq %r8,%rbp
+ addq %rax,%r13
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ subq %r8,%r10
+ sbbq $0,%r8
+
+ mulq 8(%r14)
+ addq %rcx,%r9
+ adcq $0,%rdx
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq %rdx,%r10
+ movq %rbp,%rdx
+ adcq $0,%r8
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r11
+ movq 8(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r8,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r9
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r9,%rcx
+ imulq %r15,%r9
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ xorq %r8,%r8
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+
+
+ mulq 0(%r14)
+ movq %r9,%rbp
+ addq %rax,%rcx
+ movq %r9,%rax
+ adcq %rdx,%rcx
+
+ subq %r9,%r11
+ sbbq $0,%r9
+
+ mulq 8(%r14)
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq %rdx,%r11
+ movq %rbp,%rdx
+ adcq $0,%r9
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r12
+ movq 16(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r9,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r10,%rcx
+ imulq %r15,%r10
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r13
+ adcq $0,%rdx
+ xorq %r9,%r9
+ addq %rax,%r13
+ movq %r10,%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+
+ mulq 0(%r14)
+ movq %r10,%rbp
+ addq %rax,%rcx
+ movq %r10,%rax
+ adcq %rdx,%rcx
+
+ subq %r10,%r12
+ sbbq $0,%r10
+
+ mulq 8(%r14)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq %rdx,%r12
+ movq %rbp,%rdx
+ adcq $0,%r10
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r13
+ movq 24(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r10,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r11,%rcx
+ imulq %r15,%r11
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r8
+ adcq $0,%rdx
+ xorq %r10,%r10
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ adcq $0,%r10
+
+
+ mulq 0(%r14)
+ movq %r11,%rbp
+ addq %rax,%rcx
+ movq %r11,%rax
+ adcq %rdx,%rcx
+
+ subq %r11,%r13
+ sbbq $0,%r11
+
+ mulq 8(%r14)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq %rdx,%r13
+ movq %rbp,%rdx
+ adcq $0,%r11
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r8
+ sbbq %rdx,%rbp
+
+ addq %r11,%r8
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+ movq %r12,%rsi
+ subq 0(%r14),%r12
+ movq %r13,%r11
+ sbbq 8(%r14),%r13
+ movq %r8,%rcx
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rsi,%r12
+ cmovcq %r11,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$ord_mul_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+
+
+.globl _ecp_nistz256_ord_sqr_mont
+
+.p2align 5
+_ecp_nistz256_ord_sqr_mont:
+
+ movl $0x80100,%ecx
+ andl _OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je L$ecp_nistz256_ord_sqr_montx
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$ord_sqr_body:
+
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%rax
+ movq 16(%rsi),%r14
+ movq 24(%rsi),%r15
+ leaq L$ord(%rip),%rsi
+ movq %rdx,%rbx
+ jmp L$oop_ord_sqr
+
+.p2align 5
+L$oop_ord_sqr:
+
+ movq %rax,%rbp
+ mulq %r8
+ movq %rax,%r9
+.byte 102,72,15,110,205
+ movq %r14,%rax
+ movq %rdx,%r10
+
+ mulq %r8
+ addq %rax,%r10
+ movq %r15,%rax
+.byte 102,73,15,110,214
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r8
+ addq %rax,%r11
+ movq %r15,%rax
+.byte 102,73,15,110,223
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq %r14
+ movq %rax,%r13
+ movq %r14,%rax
+ movq %rdx,%r14
+
+
+ mulq %rbp
+ addq %rax,%r11
+ movq %r15,%rax
+ adcq $0,%rdx
+ movq %rdx,%r15
+
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+
+ addq %r15,%r12
+ adcq %rdx,%r13
+ adcq $0,%r14
+
+
+ xorq %r15,%r15
+ movq %r8,%rax
+ addq %r9,%r9
+ adcq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq $0,%r15
+
+
+ mulq %rax
+ movq %rax,%r8
+.byte 102,72,15,126,200
+ movq %rdx,%rbp
+
+ mulq %rax
+ addq %rbp,%r9
+ adcq %rax,%r10
+.byte 102,72,15,126,208
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq %rax
+ addq %rbp,%r11
+ adcq %rax,%r12
+.byte 102,72,15,126,216
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ movq %r8,%rcx
+ imulq 32(%rsi),%r8
+
+ mulq %rax
+ addq %rbp,%r13
+ adcq %rax,%r14
+ movq 0(%rsi),%rax
+ adcq %rdx,%r15
+
+
+ mulq %r8
+ movq %r8,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r8,%r10
+ sbbq $0,%rbp
+
+ mulq %r8
+ addq %rcx,%r9
+ adcq $0,%rdx
+ addq %rax,%r9
+ movq %r8,%rax
+ adcq %rdx,%r10
+ movq %r8,%rdx
+ adcq $0,%rbp
+
+ movq %r9,%rcx
+ imulq 32(%rsi),%r9
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r11
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r8
+
+ addq %rbp,%r11
+ adcq $0,%r8
+
+
+ mulq %r9
+ movq %r9,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r9,%r11
+ sbbq $0,%rbp
+
+ mulq %r9
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r9,%rdx
+ adcq $0,%rbp
+
+ movq %r10,%rcx
+ imulq 32(%rsi),%r10
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r8
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r9
+
+ addq %rbp,%r8
+ adcq $0,%r9
+
+
+ mulq %r10
+ movq %r10,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r10,%r8
+ sbbq $0,%rbp
+
+ mulq %r10
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %r10,%rax
+ adcq %rdx,%r8
+ movq %r10,%rdx
+ adcq $0,%rbp
+
+ movq %r11,%rcx
+ imulq 32(%rsi),%r11
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r9
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r10
+
+ addq %rbp,%r9
+ adcq $0,%r10
+
+
+ mulq %r11
+ movq %r11,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r11,%r9
+ sbbq $0,%rbp
+
+ mulq %r11
+ addq %rcx,%r8
+ adcq $0,%rdx
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ movq %r11,%rdx
+ adcq $0,%rbp
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r10
+ sbbq %rdx,%r11
+
+ addq %rbp,%r10
+ adcq $0,%r11
+
+
+ xorq %rdx,%rdx
+ addq %r12,%r8
+ adcq %r13,%r9
+ movq %r8,%r12
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%rax
+ adcq $0,%rdx
+
+
+ subq 0(%rsi),%r8
+ movq %r10,%r14
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r15
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rdx
+
+ cmovcq %r12,%r8
+ cmovncq %r9,%rax
+ cmovncq %r10,%r14
+ cmovncq %r11,%r15
+
+ decq %rbx
+ jnz L$oop_ord_sqr
+
+ movq %r8,0(%rdi)
+ movq %rax,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r14,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r15,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$ord_sqr_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+
+.p2align 5
+ecp_nistz256_ord_mul_montx:
+
+L$ecp_nistz256_ord_mul_montx:
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$ord_mulx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+ leaq L$ord-128(%rip),%r14
+ movq L$ordK(%rip),%r15
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ mulxq %r11,%rbp,%r11
+ addq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ mulxq %r15,%rdx,%rax
+ adcq %rbp,%r10
+ adcq %rcx,%r11
+ adcq $0,%r12
+
+
+ xorq %r13,%r13
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+ adcxq %r8,%r12
+ adoxq %r8,%r13
+ adcq $0,%r13
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcxq %r9,%r13
+ adoxq %r9,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+ adcxq %r10,%r8
+ adoxq %r10,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ leaq 128(%r14),%r14
+ movq %r12,%rbx
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ movq %r13,%rdx
+ adcxq %r11,%r9
+ adoxq %r11,%r10
+ adcq $0,%r10
+
+
+
+ movq %r8,%rcx
+ subq 0(%r14),%r12
+ sbbq 8(%r14),%r13
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$ord_mulx_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+
+.p2align 5
+ecp_nistz256_ord_sqr_montx:
+
+L$ecp_nistz256_ord_sqr_montx:
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$ord_sqrx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq L$ord(%rip),%rsi
+ jmp L$oop_ord_sqrx
+
+.p2align 5
+L$oop_ord_sqrx:
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ movq %rdx,%rax
+.byte 102,73,15,110,206
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ addq %rcx,%r10
+.byte 102,73,15,110,215
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+ mulxq %r8,%rcx,%r14
+ movq %rax,%rdx
+.byte 102,73,15,110,216
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+
+ mulxq %rdx,%r8,%rbp
+.byte 102,72,15,126,202
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+.byte 102,72,15,126,210
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+ mulxq %rdx,%rcx,%rbp
+.byte 0x67
+.byte 102,72,15,126,218
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ adoxq %rbp,%r13
+ mulxq %rdx,%rcx,%rax
+ adoxq %rcx,%r14
+ adoxq %rax,%r15
+
+
+ movq %r8,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ xorq %rax,%rax
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ adcxq %rax,%r8
+
+
+ movq %r9,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ adoxq %rax,%r9
+
+
+ movq %r10,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ adcxq %rax,%r10
+
+
+ movq %r11,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ adoxq %rax,%r11
+
+
+ addq %r8,%r12
+ adcq %r13,%r9
+ movq %r12,%rdx
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%r14
+ adcq $0,%rax
+
+
+ subq 0(%rsi),%r12
+ movq %r10,%r15
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r8
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rax
+
+ cmovncq %r12,%rdx
+ cmovncq %r9,%r14
+ cmovncq %r10,%r15
+ cmovncq %r11,%r8
+
+ decq %rbx
+ jnz L$oop_ord_sqrx
+
+ movq %rdx,0(%rdi)
+ movq %r14,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r15,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r8,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$ord_sqrx_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+
+
+
.globl _ecp_nistz256_to_mont
.p2align 5
@@ -2723,15 +3854,23 @@ _ecp_nistz256_to_mont:
.p2align 5
_ecp_nistz256_mul_mont:
+
movl $0x80100,%ecx
andl _OPENSSL_ia32cap_P+8(%rip),%ecx
L$mul_mont:
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
+L$mul_body:
cmpl $0x80100,%ecx
je L$mul_montx
movq %rdx,%rbx
@@ -2756,16 +3895,26 @@ L$mul_montx:
call __ecp_nistz256_mul_montx
L$mul_mont_done:
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$mul_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
__ecp_nistz256_mul_montq:
@@ -2992,14 +4141,22 @@ __ecp_nistz256_mul_montq:
.p2align 5
_ecp_nistz256_sqr_mont:
+
movl $0x80100,%ecx
andl _OPENSSL_ia32cap_P+8(%rip),%ecx
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
+L$sqr_body:
cmpl $0x80100,%ecx
je L$sqr_montx
movq 0(%rsi),%rax
@@ -3020,16 +4177,26 @@ L$sqr_montx:
call __ecp_nistz256_sqr_montx
L$sqr_mont_done:
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$sqr_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
__ecp_nistz256_sqr_montq:
movq %rax,%r13
@@ -3494,9 +4661,13 @@ __ecp_nistz256_sqr_montx:
.p2align 5
_ecp_nistz256_from_mont:
+
pushq %r12
+
pushq %r13
+L$from_body:
+
movq 0(%rsi),%rax
movq L$poly+24(%rip),%r13
movq 8(%rsi),%r9
@@ -3576,12 +4747,18 @@ _ecp_nistz256_from_mont:
movq %r10,16(%rdi)
movq %r11,24(%rdi)
- popq %r13
- popq %r12
+ movq 0(%rsp),%r13
+
+ movq 8(%rsp),%r12
+
+ leaq 16(%rsp),%rsp
+
+L$from_epilogue:
.byte 0xf3,0xc3
+
.globl _ecp_nistz256_scatter_w5
.p2align 5
@@ -3664,6 +4841,7 @@ L$select_loop_sse_w5:
movdqu %xmm6,64(%rdi)
movdqu %xmm7,80(%rdi)
.byte 0xf3,0xc3
+L$SEH_end_ecp_nistz256_gather_w5:
@@ -3734,6 +4912,7 @@ L$select_loop_sse_w7:
movdqu %xmm4,32(%rdi)
movdqu %xmm5,48(%rdi)
.byte 0xf3,0xc3
+L$SEH_end_ecp_nistz256_gather_w7:
@@ -3794,6 +4973,7 @@ L$select_loop_avx2_w5:
vmovdqu %ymm4,64(%rdi)
vzeroupper
.byte 0xf3,0xc3
+L$SEH_end_ecp_nistz256_avx2_gather_w5:
@@ -3871,6 +5051,7 @@ L$select_loop_avx2_w7:
vmovdqu %ymm3,32(%rdi)
vzeroupper
.byte 0xf3,0xc3
+L$SEH_end_ecp_nistz256_avx2_gather_w7:
.p2align 5
@@ -3997,18 +5178,27 @@ __ecp_nistz256_mul_by_2q:
.p2align 5
_ecp_nistz256_point_double:
+
movl $0x80100,%ecx
andl _OPENSSL_ia32cap_P+8(%rip),%ecx
cmpl $0x80100,%ecx
je L$point_doublex
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $160+8,%rsp
+L$point_doubleq_body:
+
L$point_double_shortcutq:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
@@ -4190,31 +5380,51 @@ L$point_double_shortcutq:
.byte 102,72,15,126,207
call __ecp_nistz256_sub_fromq
- addq $160+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
+ leaq 160+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$point_doubleq_epilogue:
.byte 0xf3,0xc3
+
.globl _ecp_nistz256_point_add
.p2align 5
_ecp_nistz256_point_add:
+
movl $0x80100,%ecx
andl _OPENSSL_ia32cap_P+8(%rip),%ecx
cmpl $0x80100,%ecx
je L$point_addx
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $576+8,%rsp
+L$point_addq_body:
+
movdqu 0(%rsi),%xmm0
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
@@ -4590,31 +5800,51 @@ L$add_proceedq:
movdqu %xmm3,48(%rdi)
L$add_doneq:
- addq $576+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
+ leaq 576+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$point_addq_epilogue:
.byte 0xf3,0xc3
+
.globl _ecp_nistz256_point_add_affine
.p2align 5
_ecp_nistz256_point_add_affine:
+
movl $0x80100,%ecx
andl _OPENSSL_ia32cap_P+8(%rip),%ecx
cmpl $0x80100,%ecx
je L$point_add_affinex
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $480+8,%rsp
+L$add_affineq_body:
+
movdqu 0(%rsi),%xmm0
movq %rdx,%rbx
movdqu 16(%rsi),%xmm1
@@ -4896,16 +6126,27 @@ _ecp_nistz256_point_add_affine:
movdqu %xmm2,32(%rdi)
movdqu %xmm3,48(%rdi)
- addq $480+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
+ leaq 480+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$add_affineq_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
__ecp_nistz256_add_tox:
xorq %r11,%r11
@@ -5035,15 +6276,24 @@ __ecp_nistz256_mul_by_2x:
.p2align 5
ecp_nistz256_point_doublex:
+
L$point_doublex:
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $160+8,%rsp
+L$point_doublex_body:
+
L$point_double_shortcutx:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
@@ -5225,27 +6475,47 @@ L$point_double_shortcutx:
.byte 102,72,15,126,207
call __ecp_nistz256_sub_fromx
- addq $160+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
+ leaq 160+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$point_doublex_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
ecp_nistz256_point_addx:
+
L$point_addx:
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $576+8,%rsp
+L$point_addx_body:
+
movdqu 0(%rsi),%xmm0
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
@@ -5621,27 +6891,47 @@ L$add_proceedx:
movdqu %xmm3,48(%rdi)
L$add_donex:
- addq $576+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
+ leaq 576+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$point_addx_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
ecp_nistz256_point_add_affinex:
+
L$point_add_affinex:
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $480+8,%rsp
+L$add_affinex_body:
+
movdqu 0(%rsi),%xmm0
movq %rdx,%rbx
movdqu 16(%rsi),%xmm1
@@ -5923,12 +7213,23 @@ L$point_add_affinex:
movdqu %xmm2,32(%rdi)
movdqu %xmm3,48(%rdi)
- addq $480+8,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
+ leaq 480+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$add_affinex_epilogue:
.byte 0xf3,0xc3
+
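[Editorial note, not part of the diff] The ecp_nistz256 hunks above add the P-256 group order as L$ord (four little-endian 64-bit limbs) and the per-limb Montgomery factor L$ordK, consumed by the new _ecp_nistz256_ord_mul_mont and _ecp_nistz256_ord_sqr_mont routines, which reduce modulo the group order rather than the field prime. The small standalone C check below illustrates the defining relation between those two constants; the program and its names are ours and are not part of the diff.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* L$ord from the diff: the P-256 group order n, little-endian limbs. */
    static const uint64_t ord[4] = {
        0xf3b9cac2fc632551ULL, 0xbce6faada7179e84ULL,
        0xffffffffffffffffULL, 0xffffffff00000000ULL
    };
    /* L$ordK from the diff: the word-level Montgomery constant used by the
     * imulq %r15,... steps of the ord_mul_mont/ord_sqr_mont reductions. */
    const uint64_t ord_k = 0xccd1c8aaee00bc4fULL;

    /* Montgomery relation: n[0] * k == -1 (mod 2^64), i.e. the product
     * plus one wraps to zero in 64-bit unsigned arithmetic, so each
     * reduction step zeroes the low limb it folds in. */
    assert(ord[0] * ord_k + 1 == 0);
    printf("L$ordK satisfies ord[0] * k == -1 (mod 2^64)\n");
    return 0;
}
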
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/x25519-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/x25519-x86_64.s
new file mode 100644
index 0000000000..cdb602d4cc
--- /dev/null
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/x25519-x86_64.s
@@ -0,0 +1,760 @@
+.text
+
+.globl _x25519_fe51_mul
+
+.p2align 5
+_x25519_fe51_mul:
+
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ leaq -40(%rsp),%rsp
+
+L$fe51_mul_body:
+
+ movq 0(%rsi),%rax
+ movq 0(%rdx),%r11
+ movq 8(%rdx),%r12
+ movq 16(%rdx),%r13
+ movq 24(%rdx),%rbp
+ movq 32(%rdx),%r14
+
+ movq %rdi,32(%rsp)
+ movq %rax,%rdi
+ mulq %r11
+ movq %r11,0(%rsp)
+ movq %rax,%rbx
+ movq %rdi,%rax
+ movq %rdx,%rcx
+ mulq %r12
+ movq %r12,8(%rsp)
+ movq %rax,%r8
+ movq %rdi,%rax
+ leaq (%r14,%r14,8),%r15
+ movq %rdx,%r9
+ mulq %r13
+ movq %r13,16(%rsp)
+ movq %rax,%r10
+ movq %rdi,%rax
+ leaq (%r14,%r15,2),%rdi
+ movq %rdx,%r11
+ mulq %rbp
+ movq %rax,%r12
+ movq 0(%rsi),%rax
+ movq %rdx,%r13
+ mulq %r14
+ movq %rax,%r14
+ movq 8(%rsi),%rax
+ movq %rdx,%r15
+
+ mulq %rdi
+ addq %rax,%rbx
+ movq 16(%rsi),%rax
+ adcq %rdx,%rcx
+ mulq %rdi
+ addq %rax,%r8
+ movq 24(%rsi),%rax
+ adcq %rdx,%r9
+ mulq %rdi
+ addq %rax,%r10
+ movq 32(%rsi),%rax
+ adcq %rdx,%r11
+ mulq %rdi
+ imulq $19,%rbp,%rdi
+ addq %rax,%r12
+ movq 8(%rsi),%rax
+ adcq %rdx,%r13
+ mulq %rbp
+ movq 16(%rsp),%rbp
+ addq %rax,%r14
+ movq 16(%rsi),%rax
+ adcq %rdx,%r15
+
+ mulq %rdi
+ addq %rax,%rbx
+ movq 24(%rsi),%rax
+ adcq %rdx,%rcx
+ mulq %rdi
+ addq %rax,%r8
+ movq 32(%rsi),%rax
+ adcq %rdx,%r9
+ mulq %rdi
+ imulq $19,%rbp,%rdi
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq %rdx,%r11
+ mulq %rbp
+ addq %rax,%r12
+ movq 16(%rsi),%rax
+ adcq %rdx,%r13
+ mulq %rbp
+ movq 8(%rsp),%rbp
+ addq %rax,%r14
+ movq 24(%rsi),%rax
+ adcq %rdx,%r15
+
+ mulq %rdi
+ addq %rax,%rbx
+ movq 32(%rsi),%rax
+ adcq %rdx,%rcx
+ mulq %rdi
+ addq %rax,%r8
+ movq 8(%rsi),%rax
+ adcq %rdx,%r9
+ mulq %rbp
+ imulq $19,%rbp,%rdi
+ addq %rax,%r10
+ movq 16(%rsi),%rax
+ adcq %rdx,%r11
+ mulq %rbp
+ addq %rax,%r12
+ movq 24(%rsi),%rax
+ adcq %rdx,%r13
+ mulq %rbp
+ movq 0(%rsp),%rbp
+ addq %rax,%r14
+ movq 32(%rsi),%rax
+ adcq %rdx,%r15
+
+ mulq %rdi
+ addq %rax,%rbx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+ mulq %rbp
+ addq %rax,%r8
+ movq 16(%rsi),%rax
+ adcq %rdx,%r9
+ mulq %rbp
+ addq %rax,%r10
+ movq 24(%rsi),%rax
+ adcq %rdx,%r11
+ mulq %rbp
+ addq %rax,%r12
+ movq 32(%rsi),%rax
+ adcq %rdx,%r13
+ mulq %rbp
+ addq %rax,%r14
+ adcq %rdx,%r15
+
+ movq 32(%rsp),%rdi
+ jmp L$reduce51
+L$fe51_mul_epilogue:
+
+
+
+.globl _x25519_fe51_sqr
+
+.p2align 5
+_x25519_fe51_sqr:
+
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ leaq -40(%rsp),%rsp
+
+L$fe51_sqr_body:
+
+ movq 0(%rsi),%rax
+ movq 16(%rsi),%r15
+ movq 32(%rsi),%rbp
+
+ movq %rdi,32(%rsp)
+ leaq (%rax,%rax,1),%r14
+ mulq %rax
+ movq %rax,%rbx
+ movq 8(%rsi),%rax
+ movq %rdx,%rcx
+ mulq %r14
+ movq %rax,%r8
+ movq %r15,%rax
+ movq %r15,0(%rsp)
+ movq %rdx,%r9
+ mulq %r14
+ movq %rax,%r10
+ movq 24(%rsi),%rax
+ movq %rdx,%r11
+ imulq $19,%rbp,%rdi
+ mulq %r14
+ movq %rax,%r12
+ movq %rbp,%rax
+ movq %rdx,%r13
+ mulq %r14
+ movq %rax,%r14
+ movq %rbp,%rax
+ movq %rdx,%r15
+
+ mulq %rdi
+ addq %rax,%r12
+ movq 8(%rsi),%rax
+ adcq %rdx,%r13
+
+ movq 24(%rsi),%rsi
+ leaq (%rax,%rax,1),%rbp
+ mulq %rax
+ addq %rax,%r10
+ movq 0(%rsp),%rax
+ adcq %rdx,%r11
+ mulq %rbp
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq %rdx,%r13
+ mulq %rsi
+ addq %rax,%r14
+ movq %rbp,%rax
+ adcq %rdx,%r15
+ imulq $19,%rsi,%rbp
+ mulq %rdi
+ addq %rax,%rbx
+ leaq (%rsi,%rsi,1),%rax
+ adcq %rdx,%rcx
+
+ mulq %rdi
+ addq %rax,%r10
+ movq %rsi,%rax
+ adcq %rdx,%r11
+ mulq %rbp
+ addq %rax,%r8
+ movq 0(%rsp),%rax
+ adcq %rdx,%r9
+
+ leaq (%rax,%rax,1),%rsi
+ mulq %rax
+ addq %rax,%r14
+ movq %rbp,%rax
+ adcq %rdx,%r15
+ mulq %rsi
+ addq %rax,%rbx
+ movq %rsi,%rax
+ adcq %rdx,%rcx
+ mulq %rdi
+ addq %rax,%r8
+ adcq %rdx,%r9
+
+ movq 32(%rsp),%rdi
+ jmp L$reduce51
+
+.p2align 5
+L$reduce51:
+ movq $0x7ffffffffffff,%rbp
+
+ movq %r10,%rdx
+ shrq $51,%r10
+ shlq $13,%r11
+ andq %rbp,%rdx
+ orq %r10,%r11
+ addq %r11,%r12
+ adcq $0,%r13
+
+ movq %rbx,%rax
+ shrq $51,%rbx
+ shlq $13,%rcx
+ andq %rbp,%rax
+ orq %rbx,%rcx
+ addq %rcx,%r8
+ adcq $0,%r9
+
+ movq %r12,%rbx
+ shrq $51,%r12
+ shlq $13,%r13
+ andq %rbp,%rbx
+ orq %r12,%r13
+ addq %r13,%r14
+ adcq $0,%r15
+
+ movq %r8,%rcx
+ shrq $51,%r8
+ shlq $13,%r9
+ andq %rbp,%rcx
+ orq %r8,%r9
+ addq %r9,%rdx
+
+ movq %r14,%r10
+ shrq $51,%r14
+ shlq $13,%r15
+ andq %rbp,%r10
+ orq %r14,%r15
+
+ leaq (%r15,%r15,8),%r14
+ leaq (%r15,%r14,2),%r15
+ addq %r15,%rax
+
+ movq %rdx,%r8
+ andq %rbp,%rdx
+ shrq $51,%r8
+ addq %r8,%rbx
+
+ movq %rax,%r9
+ andq %rbp,%rax
+ shrq $51,%r9
+ addq %r9,%rcx
+
+ movq %rax,0(%rdi)
+ movq %rcx,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %rbx,24(%rdi)
+ movq %r10,32(%rdi)
+
+ movq 40(%rsp),%r15
+
+ movq 48(%rsp),%r14
+
+ movq 56(%rsp),%r13
+
+ movq 64(%rsp),%r12
+
+ movq 72(%rsp),%rbx
+
+ movq 80(%rsp),%rbp
+
+ leaq 88(%rsp),%rsp
+
+L$fe51_sqr_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.globl _x25519_fe51_mul121666
+
+.p2align 5
+_x25519_fe51_mul121666:
+
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ leaq -40(%rsp),%rsp
+
+L$fe51_mul121666_body:
+ movl $121666,%eax
+
+ mulq 0(%rsi)
+ movq %rax,%rbx
+ movl $121666,%eax
+ movq %rdx,%rcx
+ mulq 8(%rsi)
+ movq %rax,%r8
+ movl $121666,%eax
+ movq %rdx,%r9
+ mulq 16(%rsi)
+ movq %rax,%r10
+ movl $121666,%eax
+ movq %rdx,%r11
+ mulq 24(%rsi)
+ movq %rax,%r12
+ movl $121666,%eax
+ movq %rdx,%r13
+ mulq 32(%rsi)
+ movq %rax,%r14
+ movq %rdx,%r15
+
+ jmp L$reduce51
+L$fe51_mul121666_epilogue:
+
+
+
+.globl _x25519_fe64_eligible
+
+.p2align 5
+_x25519_fe64_eligible:
+ movl _OPENSSL_ia32cap_P+8(%rip),%ecx
+ xorl %eax,%eax
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ cmovel %ecx,%eax
+ .byte 0xf3,0xc3
+
+
+.globl _x25519_fe64_mul
+
+.p2align 5
+_x25519_fe64_mul:
+
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ pushq %rdi
+
+ leaq -16(%rsp),%rsp
+
+L$fe64_mul_body:
+
+ movq %rdx,%rax
+ movq 0(%rdx),%rbp
+ movq 0(%rsi),%rdx
+ movq 8(%rax),%rcx
+ movq 16(%rax),%r14
+ movq 24(%rax),%r15
+
+ mulxq %rbp,%r8,%rax
+ xorl %edi,%edi
+ mulxq %rcx,%r9,%rbx
+ adcxq %rax,%r9
+ mulxq %r14,%r10,%rax
+ adcxq %rbx,%r10
+ mulxq %r15,%r11,%r12
+ movq 8(%rsi),%rdx
+ adcxq %rax,%r11
+ movq %r14,(%rsp)
+ adcxq %rdi,%r12
+
+ mulxq %rbp,%rax,%rbx
+ adoxq %rax,%r9
+ adcxq %rbx,%r10
+ mulxq %rcx,%rax,%rbx
+ adoxq %rax,%r10
+ adcxq %rbx,%r11
+ mulxq %r14,%rax,%rbx
+ adoxq %rax,%r11
+ adcxq %rbx,%r12
+ mulxq %r15,%rax,%r13
+ movq 16(%rsi),%rdx
+ adoxq %rax,%r12
+ adcxq %rdi,%r13
+ adoxq %rdi,%r13
+
+ mulxq %rbp,%rax,%rbx
+ adcxq %rax,%r10
+ adoxq %rbx,%r11
+ mulxq %rcx,%rax,%rbx
+ adcxq %rax,%r11
+ adoxq %rbx,%r12
+ mulxq %r14,%rax,%rbx
+ adcxq %rax,%r12
+ adoxq %rbx,%r13
+ mulxq %r15,%rax,%r14
+ movq 24(%rsi),%rdx
+ adcxq %rax,%r13
+ adoxq %rdi,%r14
+ adcxq %rdi,%r14
+
+ mulxq %rbp,%rax,%rbx
+ adoxq %rax,%r11
+ adcxq %rbx,%r12
+ mulxq %rcx,%rax,%rbx
+ adoxq %rax,%r12
+ adcxq %rbx,%r13
+ mulxq (%rsp),%rax,%rbx
+ adoxq %rax,%r13
+ adcxq %rbx,%r14
+ mulxq %r15,%rax,%r15
+ movl $38,%edx
+ adoxq %rax,%r14
+ adcxq %rdi,%r15
+ adoxq %rdi,%r15
+
+ jmp L$reduce64
+L$fe64_mul_epilogue:
+
+
+
+.globl _x25519_fe64_sqr
+
+.p2align 5
+_x25519_fe64_sqr:
+
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ pushq %rdi
+
+ leaq -16(%rsp),%rsp
+
+L$fe64_sqr_body:
+
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%rcx
+ movq 16(%rsi),%rbp
+ movq 24(%rsi),%rsi
+
+
+ mulxq %rdx,%r8,%r15
+ mulxq %rcx,%r9,%rax
+ xorl %edi,%edi
+ mulxq %rbp,%r10,%rbx
+ adcxq %rax,%r10
+ mulxq %rsi,%r11,%r12
+ movq %rcx,%rdx
+ adcxq %rbx,%r11
+ adcxq %rdi,%r12
+
+
+ mulxq %rbp,%rax,%rbx
+ adoxq %rax,%r11
+ adcxq %rbx,%r12
+ mulxq %rsi,%rax,%r13
+ movq %rbp,%rdx
+ adoxq %rax,%r12
+ adcxq %rdi,%r13
+
+
+ mulxq %rsi,%rax,%r14
+ movq %rcx,%rdx
+ adoxq %rax,%r13
+ adcxq %rdi,%r14
+ adoxq %rdi,%r14
+
+ adcxq %r9,%r9
+ adoxq %r15,%r9
+ adcxq %r10,%r10
+ mulxq %rdx,%rax,%rbx
+ movq %rbp,%rdx
+ adcxq %r11,%r11
+ adoxq %rax,%r10
+ adcxq %r12,%r12
+ adoxq %rbx,%r11
+ mulxq %rdx,%rax,%rbx
+ movq %rsi,%rdx
+ adcxq %r13,%r13
+ adoxq %rax,%r12
+ adcxq %r14,%r14
+ adoxq %rbx,%r13
+ mulxq %rdx,%rax,%r15
+ movl $38,%edx
+ adoxq %rax,%r14
+ adcxq %rdi,%r15
+ adoxq %rdi,%r15
+ jmp L$reduce64
+
+.p2align 5
+L$reduce64:
+ mulxq %r12,%rax,%rbx
+ adcxq %rax,%r8
+ adoxq %rbx,%r9
+ mulxq %r13,%rax,%rbx
+ adcxq %rax,%r9
+ adoxq %rbx,%r10
+ mulxq %r14,%rax,%rbx
+ adcxq %rax,%r10
+ adoxq %rbx,%r11
+ mulxq %r15,%rax,%r12
+ adcxq %rax,%r11
+ adoxq %rdi,%r12
+ adcxq %rdi,%r12
+
+ movq 16(%rsp),%rdi
+ imulq %rdx,%r12
+
+ addq %r12,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+
+ sbbq %rax,%rax
+ andq $38,%rax
+
+ addq %rax,%r8
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r8,0(%rdi)
+
+ movq 24(%rsp),%r15
+
+ movq 32(%rsp),%r14
+
+ movq 40(%rsp),%r13
+
+ movq 48(%rsp),%r12
+
+ movq 56(%rsp),%rbx
+
+ movq 64(%rsp),%rbp
+
+ leaq 72(%rsp),%rsp
+
+L$fe64_sqr_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.globl _x25519_fe64_mul121666
+
+.p2align 5
+_x25519_fe64_mul121666:
+L$fe64_mul121666_body:
+ movl $121666,%edx
+ mulxq 0(%rsi),%r8,%rcx
+ mulxq 8(%rsi),%r9,%rax
+ addq %rcx,%r9
+ mulxq 16(%rsi),%r10,%rcx
+ adcq %rax,%r10
+ mulxq 24(%rsi),%r11,%rax
+ adcq %rcx,%r11
+ adcq $0,%rax
+
+ imulq $38,%rax,%rax
+
+ addq %rax,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+
+ sbbq %rax,%rax
+ andq $38,%rax
+
+ addq %rax,%r8
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r8,0(%rdi)
+
+L$fe64_mul121666_epilogue:
+ .byte 0xf3,0xc3
+
+
+.globl _x25519_fe64_add
+
+.p2align 5
+_x25519_fe64_add:
+L$fe64_add_body:
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+
+ addq 0(%rdx),%r8
+ adcq 8(%rdx),%r9
+ adcq 16(%rdx),%r10
+ adcq 24(%rdx),%r11
+
+ sbbq %rax,%rax
+ andq $38,%rax
+
+ addq %rax,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ movq %r9,8(%rdi)
+ adcq $0,%r11
+ movq %r10,16(%rdi)
+ sbbq %rax,%rax
+ movq %r11,24(%rdi)
+ andq $38,%rax
+
+ addq %rax,%r8
+ movq %r8,0(%rdi)
+
+L$fe64_add_epilogue:
+ .byte 0xf3,0xc3
+
+
+.globl _x25519_fe64_sub
+
+.p2align 5
+_x25519_fe64_sub:
+L$fe64_sub_body:
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+
+ subq 0(%rdx),%r8
+ sbbq 8(%rdx),%r9
+ sbbq 16(%rdx),%r10
+ sbbq 24(%rdx),%r11
+
+ sbbq %rax,%rax
+ andq $38,%rax
+
+ subq %rax,%r8
+ sbbq $0,%r9
+ sbbq $0,%r10
+ movq %r9,8(%rdi)
+ sbbq $0,%r11
+ movq %r10,16(%rdi)
+ sbbq %rax,%rax
+ movq %r11,24(%rdi)
+ andq $38,%rax
+
+ subq %rax,%r8
+ movq %r8,0(%rdi)
+
+L$fe64_sub_epilogue:
+ .byte 0xf3,0xc3
+
+
+.globl _x25519_fe64_tobytes
+
+.p2align 5
+_x25519_fe64_tobytes:
+L$fe64_to_body:
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+
+
+ leaq (%r11,%r11,1),%rax
+ sarq $63,%r11
+ shrq $1,%rax
+ andq $19,%r11
+ addq $19,%r11
+
+ addq %r11,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%rax
+
+ leaq (%rax,%rax,1),%r11
+ sarq $63,%rax
+ shrq $1,%r11
+ notq %rax
+ andq $19,%rax
+
+ subq %rax,%r8
+ sbbq $0,%r9
+ sbbq $0,%r10
+ sbbq $0,%r11
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+
+L$fe64_to_epilogue:
+ .byte 0xf3,0xc3
+
+.byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h
index 64f9685dd5..83576dc732 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h
@@ -1,7 +1,7 @@
/* WARNING: do not edit! */
/* Generated by Makefile from crypto/include/internal/dso_conf.h.in */
/*
- * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
@@ -12,5 +12,8 @@
#ifndef HEADER_DSO_CONF_H
# define HEADER_DSO_CONF_H
+# define DSO_DLFCN
+# define HAVE_DLFCN_H
# define DSO_EXTENSION ".dylib"
+
#endif
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s
index f385ea2a3f..285c32ca88 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s
@@ -4,11 +4,17 @@
.globl _md5_block_asm_data_order
_md5_block_asm_data_order:
+
pushq %rbp
+
pushq %rbx
+
pushq %r12
+
pushq %r14
+
pushq %r15
+
L$prologue:
@@ -655,11 +661,18 @@ L$end:
movl %edx,12(%rbp)
movq (%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r12
+
movq 24(%rsp),%rbx
+
movq 32(%rsp),%rbp
+
addq $40,%rsp
+
L$epilogue:
.byte 0xf3,0xc3
+
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s
index f01a002363..d5920186ce 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s
@@ -31,23 +31,6 @@ L$resume_ctr32:
vpxor %xmm15,%xmm12,%xmm12
vmovups 16-128(%rcx),%xmm2
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
xorq %r12,%r12
cmpq %r14,%r15
@@ -332,20 +315,25 @@ L$6x_done:
.p2align 5
_aesni_gcm_decrypt:
- xorq %r10,%r10
-
-
+ xorq %r10,%r10
cmpq $0x60,%rdx
jb L$gcm_dec_abort
leaq (%rsp),%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
vzeroupper
vmovdqu (%r8),%xmm1
@@ -374,15 +362,7 @@ L$dec_no_key_aliasing:
vmovdqu 80(%rdi),%xmm7
leaq (%rdi),%r14
vmovdqu 64(%rdi),%xmm4
-
-
-
-
-
-
-
leaq -192(%rdi,%rdx,1),%r15
-
vmovdqu 48(%rdi),%xmm5
shrq $4,%rdx
xorq %r10,%r10
@@ -415,17 +395,25 @@ L$dec_no_key_aliasing:
vzeroupper
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$gcm_dec_abort:
movq %r10,%rax
.byte 0xf3,0xc3
+
.p2align 5
_aesni_ctr32_6x:
vmovdqu 0-128(%rcx),%xmm4
@@ -520,21 +508,25 @@ L$handle_ctr32_2:
.p2align 5
_aesni_gcm_encrypt:
- xorq %r10,%r10
-
-
-
+ xorq %r10,%r10
cmpq $288,%rdx
jb L$gcm_enc_abort
leaq (%rsp),%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
vzeroupper
vmovdqu (%r8),%xmm1
@@ -558,16 +550,7 @@ _aesni_gcm_encrypt:
L$enc_no_key_aliasing:
leaq (%rsi),%r14
-
-
-
-
-
-
-
-
leaq -192(%rsi,%rdx,1),%r15
-
shrq $4,%rdx
call _aesni_ctr32_6x
@@ -769,16 +752,24 @@ L$enc_no_key_aliasing:
vzeroupper
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$gcm_enc_abort:
movq %r10,%rax
.byte 0xf3,0xc3
+
.p2align 6
L$bswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s
index 502af78349..d182d45cfb 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s
@@ -5,9 +5,21 @@
.p2align 4
_gcm_gmult_4bit:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ subq $280,%rsp
+
L$gmult_prologue:
movzbq 15(%rdi),%r8
@@ -84,22 +96,35 @@ L$break1:
movq %r8,8(%rdi)
movq %r9,(%rdi)
- movq 16(%rsp),%rbx
- leaq 24(%rsp),%rsp
+ leaq 280+48(%rsp),%rsi
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$gmult_epilogue:
.byte 0xf3,0xc3
+
.globl _gcm_ghash_4bit
.p2align 4
_gcm_ghash_4bit:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $280,%rsp
+
L$ghash_prologue:
movq %rdx,%r14
movq %rcx,%r15
@@ -644,17 +669,26 @@ L$outer_loop:
movq %r8,8(%rdi)
movq %r9,(%rdi)
- leaq 280(%rsp),%rsi
- movq 0(%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ leaq 280+48(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq 0(%rsi),%rsp
+
L$ghash_epilogue:
.byte 0xf3,0xc3
+
.globl _gcm_init_clmul
.p2align 4
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s
index c68f5a6fbe..cbc8c80816 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s
@@ -31,6 +31,11 @@ _poly1305_init:
leaq poly1305_blocks_avx2(%rip),%rax
btq $37,%r9
cmovcq %rax,%r10
+ movq $2149646336,%rax
+ shrq $32,%r9
+ andq %rax,%r9
+ cmpq %rax,%r9
+ je L$init_base2_44
movq $0x0ffffffc0fffffff,%rax
movq $0x0ffffffc0ffffffc,%rcx
andq 0(%rsi),%rax
@@ -47,16 +52,23 @@ L$no_key:
.p2align 5
_poly1305_blocks:
+
L$blocks:
shrq $4,%rdx
jz L$no_data
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$blocks_body:
movq %rdx,%r15
@@ -127,18 +139,26 @@ L$oop:
movq %rbp,16(%rdi)
movq 0(%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r13
+
movq 24(%rsp),%r12
+
movq 32(%rsp),%rbp
+
movq 40(%rsp),%rbx
+
leaq 48(%rsp),%rsp
+
L$no_data:
L$blocks_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
_poly1305_emit:
L$emit:
@@ -371,6 +391,7 @@ __poly1305_init_avx:
.p2align 5
poly1305_blocks_avx:
+
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae L$blocks_avx
@@ -390,11 +411,17 @@ L$blocks_avx:
jz L$even_avx
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$blocks_avx_body:
movq %rdx,%r15
@@ -497,24 +524,39 @@ L$store_base2_26_avx:
.p2align 4
L$done_avx:
movq 0(%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r13
+
movq 24(%rsp),%r12
+
movq 32(%rsp),%rbp
+
movq 40(%rsp),%rbx
+
leaq 48(%rsp),%rsp
+
L$no_data_avx:
L$blocks_avx_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
L$base2_64_avx:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$base2_64_avx_body:
movq %rdx,%r15
@@ -574,18 +616,27 @@ L$proceed_avx:
movq %r15,%rdx
movq 0(%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r13
+
movq 24(%rsp),%r12
+
movq 32(%rsp),%rbp
+
movq 40(%rsp),%rbx
+
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
+
L$base2_64_avx_epilogue:
jmp L$do_avx
+
.p2align 5
L$even_avx:
+
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
@@ -594,6 +645,7 @@ L$even_avx:
L$do_avx:
leaq -88(%rsp),%r11
+
subq $0x178,%rsp
subq $64,%rdx
leaq -32(%rsi),%rax
@@ -1153,11 +1205,13 @@ L$short_tail_avx:
vmovd %xmm13,-100(%rdi)
vmovd %xmm14,-96(%rdi)
leaq 88(%r11),%rsp
+
vzeroupper
.byte 0xf3,0xc3
+
.p2align 5
poly1305_emit_avx:
cmpl $0,20(%rdi)
@@ -1214,6 +1268,7 @@ poly1305_emit_avx:
.p2align 5
poly1305_blocks_avx2:
+
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae L$blocks_avx2
@@ -1233,11 +1288,17 @@ L$blocks_avx2:
jz L$even_avx2
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$blocks_avx2_body:
movq %rdx,%r15
@@ -1346,24 +1407,39 @@ L$store_base2_26_avx2:
.p2align 4
L$done_avx2:
movq 0(%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r13
+
movq 24(%rsp),%r12
+
movq 32(%rsp),%rbp
+
movq 40(%rsp),%rbx
+
leaq 48(%rsp),%rsp
+
L$no_data_avx2:
L$blocks_avx2_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
L$base2_64_avx2:
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
L$base2_64_avx2_body:
movq %rdx,%r15
@@ -1426,20 +1502,32 @@ L$init_avx2:
L$proceed_avx2:
movq %r15,%rdx
+ movl _OPENSSL_ia32cap_P+8(%rip),%r10d
+ movl $3221291008,%r11d
movq 0(%rsp),%r15
+
movq 8(%rsp),%r14
+
movq 16(%rsp),%r13
+
movq 24(%rsp),%r12
+
movq 32(%rsp),%rbp
+
movq 40(%rsp),%rbx
+
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
+
L$base2_64_avx2_epilogue:
jmp L$do_avx2
+
.p2align 5
L$even_avx2:
+
+ movl _OPENSSL_ia32cap_P+8(%rip),%r10d
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
@@ -1447,10 +1535,18 @@ L$even_avx2:
vmovd 16(%rdi),%xmm4
L$do_avx2:
+ cmpq $512,%rdx
+ jb L$skip_avx512
+ andl %r11d,%r10d
+ testl $65536,%r10d
+ jnz L$blocks_avx512
+L$skip_avx512:
leaq -8(%rsp),%r11
+
subq $0x128,%rsp
- leaq 48+64(%rdi),%rdi
leaq L$const(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm7
vmovdqu -64(%rdi),%xmm9
@@ -1460,36 +1556,28 @@ L$do_avx2:
vmovdqu -16(%rdi),%xmm11
vmovdqu 0(%rdi),%xmm12
vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax
vmovdqu 32(%rdi),%xmm14
- vpermq $0x15,%ymm9,%ymm9
+ vpermd %ymm9,%ymm7,%ymm9
vmovdqu 48(%rdi),%xmm15
- vpermq $0x15,%ymm10,%ymm10
- vpshufd $0xc8,%ymm9,%ymm9
+ vpermd %ymm10,%ymm7,%ymm10
vmovdqu 64(%rdi),%xmm5
- vpermq $0x15,%ymm6,%ymm6
- vpshufd $0xc8,%ymm10,%ymm10
+ vpermd %ymm6,%ymm7,%ymm6
vmovdqa %ymm9,0(%rsp)
- vpermq $0x15,%ymm11,%ymm11
- vpshufd $0xc8,%ymm6,%ymm6
- vmovdqa %ymm10,32(%rsp)
- vpermq $0x15,%ymm12,%ymm12
- vpshufd $0xc8,%ymm11,%ymm11
- vmovdqa %ymm6,64(%rsp)
- vpermq $0x15,%ymm13,%ymm13
- vpshufd $0xc8,%ymm12,%ymm12
- vmovdqa %ymm11,96(%rsp)
- vpermq $0x15,%ymm14,%ymm14
- vpshufd $0xc8,%ymm13,%ymm13
- vmovdqa %ymm12,128(%rsp)
- vpermq $0x15,%ymm15,%ymm15
- vpshufd $0xc8,%ymm14,%ymm14
- vmovdqa %ymm13,160(%rsp)
- vpermq $0x15,%ymm5,%ymm5
- vpshufd $0xc8,%ymm15,%ymm15
- vmovdqa %ymm14,192(%rsp)
- vpshufd $0xc8,%ymm5,%ymm5
- vmovdqa %ymm15,224(%rsp)
- vmovdqa %ymm5,256(%rsp)
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax)
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax)
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax)
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax)
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax)
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax)
+ vmovdqa %ymm15,224-144(%rax)
+ vmovdqa %ymm5,256-144(%rax)
vmovdqa 64(%rcx),%ymm5
@@ -1516,7 +1604,6 @@ L$do_avx2:
vpand %ymm5,%ymm10,%ymm10
vpor 32(%rcx),%ymm6,%ymm6
- leaq 144(%rsp),%rax
vpaddq %ymm2,%ymm9,%ymm2
subq $64,%rdx
jz L$tail_avx2
@@ -1811,9 +1898,1506 @@ L$tail_avx2:
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
leaq 8(%r11),%rsp
+
vzeroupper
.byte 0xf3,0xc3
+
+
+.p2align 5
+poly1305_blocks_avx512:
+
+L$blocks_avx512:
+ movl $15,%eax
+ kmovw %eax,%k2
+ leaq -8(%rsp),%r11
+
+ subq $0x128,%rsp
+ leaq L$const(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm9
+
+
+ vmovdqu -64(%rdi),%xmm11
+ andq $-512,%rsp
+ vmovdqu -48(%rdi),%xmm12
+ movq $0x20,%rax
+ vmovdqu -32(%rdi),%xmm7
+ vmovdqu -16(%rdi),%xmm13
+ vmovdqu 0(%rdi),%xmm8
+ vmovdqu 16(%rdi),%xmm14
+ vmovdqu 32(%rdi),%xmm10
+ vmovdqu 48(%rdi),%xmm15
+ vmovdqu 64(%rdi),%xmm6
+ vpermd %zmm11,%zmm9,%zmm16
+ vpbroadcastq 64(%rcx),%zmm5
+ vpermd %zmm12,%zmm9,%zmm17
+ vpermd %zmm7,%zmm9,%zmm21
+ vpermd %zmm13,%zmm9,%zmm18
+ vmovdqa64 %zmm16,0(%rsp){%k2}
+ vpsrlq $32,%zmm16,%zmm7
+ vpermd %zmm8,%zmm9,%zmm22
+ vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
+ vpsrlq $32,%zmm17,%zmm8
+ vpermd %zmm14,%zmm9,%zmm19
+ vmovdqa64 %zmm21,64(%rsp){%k2}
+ vpermd %zmm10,%zmm9,%zmm23
+ vpermd %zmm15,%zmm9,%zmm20
+ vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
+ vpermd %zmm6,%zmm9,%zmm24
+ vmovdqa64 %zmm22,128(%rsp){%k2}
+ vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm23,192(%rsp){%k2}
+ vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm24,256(%rsp){%k2}
+
+
+
+
+
+
+
+
+
+
+ vpmuludq %zmm7,%zmm16,%zmm11
+ vpmuludq %zmm7,%zmm17,%zmm12
+ vpmuludq %zmm7,%zmm18,%zmm13
+ vpmuludq %zmm7,%zmm19,%zmm14
+ vpmuludq %zmm7,%zmm20,%zmm15
+ vpsrlq $32,%zmm18,%zmm9
+
+ vpmuludq %zmm8,%zmm24,%zmm25
+ vpmuludq %zmm8,%zmm16,%zmm26
+ vpmuludq %zmm8,%zmm17,%zmm27
+ vpmuludq %zmm8,%zmm18,%zmm28
+ vpmuludq %zmm8,%zmm19,%zmm29
+ vpsrlq $32,%zmm19,%zmm10
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+
+ vpmuludq %zmm9,%zmm23,%zmm25
+ vpmuludq %zmm9,%zmm24,%zmm26
+ vpmuludq %zmm9,%zmm17,%zmm28
+ vpmuludq %zmm9,%zmm18,%zmm29
+ vpmuludq %zmm9,%zmm16,%zmm27
+ vpsrlq $32,%zmm20,%zmm6
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm10,%zmm22,%zmm25
+ vpmuludq %zmm10,%zmm16,%zmm28
+ vpmuludq %zmm10,%zmm17,%zmm29
+ vpmuludq %zmm10,%zmm23,%zmm26
+ vpmuludq %zmm10,%zmm24,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm6,%zmm24,%zmm28
+ vpmuludq %zmm6,%zmm16,%zmm29
+ vpmuludq %zmm6,%zmm21,%zmm25
+ vpmuludq %zmm6,%zmm22,%zmm26
+ vpmuludq %zmm6,%zmm23,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+
+
+ vmovdqu64 0(%rsi),%zmm10
+ vmovdqu64 64(%rsi),%zmm6
+ leaq 128(%rsi),%rsi
+
+
+
+
+ vpsrlq $26,%zmm14,%zmm28
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm28,%zmm15,%zmm15
+
+ vpsrlq $26,%zmm11,%zmm25
+ vpandq %zmm5,%zmm11,%zmm11
+ vpaddq %zmm25,%zmm12,%zmm12
+
+ vpsrlq $26,%zmm15,%zmm29
+ vpandq %zmm5,%zmm15,%zmm15
+
+ vpsrlq $26,%zmm12,%zmm26
+ vpandq %zmm5,%zmm12,%zmm12
+ vpaddq %zmm26,%zmm13,%zmm13
+
+ vpaddq %zmm29,%zmm11,%zmm11
+ vpsllq $2,%zmm29,%zmm29
+ vpaddq %zmm29,%zmm11,%zmm11
+
+ vpsrlq $26,%zmm13,%zmm27
+ vpandq %zmm5,%zmm13,%zmm13
+ vpaddq %zmm27,%zmm14,%zmm14
+
+ vpsrlq $26,%zmm11,%zmm25
+ vpandq %zmm5,%zmm11,%zmm11
+ vpaddq %zmm25,%zmm12,%zmm12
+
+ vpsrlq $26,%zmm14,%zmm28
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm28,%zmm15,%zmm15
+
+
+
+
+
+ vpunpcklqdq %zmm6,%zmm10,%zmm7
+ vpunpckhqdq %zmm6,%zmm10,%zmm6
+
+
+
+
+
+
+ vmovdqa32 128(%rcx),%zmm25
+ movl $0x7777,%eax
+ kmovw %eax,%k1
+
+ vpermd %zmm16,%zmm25,%zmm16
+ vpermd %zmm17,%zmm25,%zmm17
+ vpermd %zmm18,%zmm25,%zmm18
+ vpermd %zmm19,%zmm25,%zmm19
+ vpermd %zmm20,%zmm25,%zmm20
+
+ vpermd %zmm11,%zmm25,%zmm16{%k1}
+ vpermd %zmm12,%zmm25,%zmm17{%k1}
+ vpermd %zmm13,%zmm25,%zmm18{%k1}
+ vpermd %zmm14,%zmm25,%zmm19{%k1}
+ vpermd %zmm15,%zmm25,%zmm20{%k1}
+
+ vpslld $2,%zmm17,%zmm21
+ vpslld $2,%zmm18,%zmm22
+ vpslld $2,%zmm19,%zmm23
+ vpslld $2,%zmm20,%zmm24
+ vpaddd %zmm17,%zmm21,%zmm21
+ vpaddd %zmm18,%zmm22,%zmm22
+ vpaddd %zmm19,%zmm23,%zmm23
+ vpaddd %zmm20,%zmm24,%zmm24
+
+ vpbroadcastq 32(%rcx),%zmm30
+
+ vpsrlq $52,%zmm7,%zmm9
+ vpsllq $12,%zmm6,%zmm10
+ vporq %zmm10,%zmm9,%zmm9
+ vpsrlq $26,%zmm7,%zmm8
+ vpsrlq $14,%zmm6,%zmm10
+ vpsrlq $40,%zmm6,%zmm6
+ vpandq %zmm5,%zmm9,%zmm9
+ vpandq %zmm5,%zmm7,%zmm7
+
+
+
+
+ vpaddq %zmm2,%zmm9,%zmm2
+ subq $192,%rdx
+ jbe L$tail_avx512
+ jmp L$oop_avx512
+
+.p2align 5
+L$oop_avx512:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vpmuludq %zmm2,%zmm17,%zmm14
+ vpaddq %zmm0,%zmm7,%zmm0
+ vpmuludq %zmm2,%zmm18,%zmm15
+ vpandq %zmm5,%zmm8,%zmm8
+ vpmuludq %zmm2,%zmm23,%zmm11
+ vpandq %zmm5,%zmm10,%zmm10
+ vpmuludq %zmm2,%zmm24,%zmm12
+ vporq %zmm30,%zmm6,%zmm6
+ vpmuludq %zmm2,%zmm16,%zmm13
+ vpaddq %zmm1,%zmm8,%zmm1
+ vpaddq %zmm3,%zmm10,%zmm3
+ vpaddq %zmm4,%zmm6,%zmm4
+
+ vmovdqu64 0(%rsi),%zmm10
+ vmovdqu64 64(%rsi),%zmm6
+ leaq 128(%rsi),%rsi
+ vpmuludq %zmm0,%zmm19,%zmm28
+ vpmuludq %zmm0,%zmm20,%zmm29
+ vpmuludq %zmm0,%zmm16,%zmm25
+ vpmuludq %zmm0,%zmm17,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+
+ vpmuludq %zmm1,%zmm18,%zmm28
+ vpmuludq %zmm1,%zmm19,%zmm29
+ vpmuludq %zmm1,%zmm24,%zmm25
+ vpmuludq %zmm0,%zmm18,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpunpcklqdq %zmm6,%zmm10,%zmm7
+ vpunpckhqdq %zmm6,%zmm10,%zmm6
+
+ vpmuludq %zmm3,%zmm16,%zmm28
+ vpmuludq %zmm3,%zmm17,%zmm29
+ vpmuludq %zmm1,%zmm16,%zmm26
+ vpmuludq %zmm1,%zmm17,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm24,%zmm28
+ vpmuludq %zmm4,%zmm16,%zmm29
+ vpmuludq %zmm3,%zmm22,%zmm25
+ vpmuludq %zmm3,%zmm23,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpmuludq %zmm3,%zmm24,%zmm27
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm21,%zmm25
+ vpmuludq %zmm4,%zmm22,%zmm26
+ vpmuludq %zmm4,%zmm23,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm0
+ vpaddq %zmm26,%zmm12,%zmm1
+ vpaddq %zmm27,%zmm13,%zmm2
+
+
+
+
+ vpsrlq $52,%zmm7,%zmm9
+ vpsllq $12,%zmm6,%zmm10
+
+ vpsrlq $26,%zmm14,%zmm3
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm3,%zmm15,%zmm4
+
+ vporq %zmm10,%zmm9,%zmm9
+
+ vpsrlq $26,%zmm0,%zmm11
+ vpandq %zmm5,%zmm0,%zmm0
+ vpaddq %zmm11,%zmm1,%zmm1
+
+ vpandq %zmm5,%zmm9,%zmm9
+
+ vpsrlq $26,%zmm4,%zmm15
+ vpandq %zmm5,%zmm4,%zmm4
+
+ vpsrlq $26,%zmm1,%zmm12
+ vpandq %zmm5,%zmm1,%zmm1
+ vpaddq %zmm12,%zmm2,%zmm2
+
+ vpaddq %zmm15,%zmm0,%zmm0
+ vpsllq $2,%zmm15,%zmm15
+ vpaddq %zmm15,%zmm0,%zmm0
+
+ vpaddq %zmm9,%zmm2,%zmm2
+ vpsrlq $26,%zmm7,%zmm8
+
+ vpsrlq $26,%zmm2,%zmm13
+ vpandq %zmm5,%zmm2,%zmm2
+ vpaddq %zmm13,%zmm14,%zmm3
+
+ vpsrlq $14,%zmm6,%zmm10
+
+ vpsrlq $26,%zmm0,%zmm11
+ vpandq %zmm5,%zmm0,%zmm0
+ vpaddq %zmm11,%zmm1,%zmm1
+
+ vpsrlq $40,%zmm6,%zmm6
+
+ vpsrlq $26,%zmm3,%zmm14
+ vpandq %zmm5,%zmm3,%zmm3
+ vpaddq %zmm14,%zmm4,%zmm4
+
+ vpandq %zmm5,%zmm7,%zmm7
+
+
+
+
+ subq $128,%rdx
+ ja L$oop_avx512
+
+L$tail_avx512:
+
+
+
+
+
+ vpsrlq $32,%zmm16,%zmm16
+ vpsrlq $32,%zmm17,%zmm17
+ vpsrlq $32,%zmm18,%zmm18
+ vpsrlq $32,%zmm23,%zmm23
+ vpsrlq $32,%zmm24,%zmm24
+ vpsrlq $32,%zmm19,%zmm19
+ vpsrlq $32,%zmm20,%zmm20
+ vpsrlq $32,%zmm21,%zmm21
+ vpsrlq $32,%zmm22,%zmm22
+
+
+
+ leaq (%rsi,%rdx,1),%rsi
+
+
+ vpaddq %zmm0,%zmm7,%zmm0
+
+ vpmuludq %zmm2,%zmm17,%zmm14
+ vpmuludq %zmm2,%zmm18,%zmm15
+ vpmuludq %zmm2,%zmm23,%zmm11
+ vpandq %zmm5,%zmm8,%zmm8
+ vpmuludq %zmm2,%zmm24,%zmm12
+ vpandq %zmm5,%zmm10,%zmm10
+ vpmuludq %zmm2,%zmm16,%zmm13
+ vporq %zmm30,%zmm6,%zmm6
+ vpaddq %zmm1,%zmm8,%zmm1
+ vpaddq %zmm3,%zmm10,%zmm3
+ vpaddq %zmm4,%zmm6,%zmm4
+
+ vmovdqu 0(%rsi),%xmm7
+ vpmuludq %zmm0,%zmm19,%zmm28
+ vpmuludq %zmm0,%zmm20,%zmm29
+ vpmuludq %zmm0,%zmm16,%zmm25
+ vpmuludq %zmm0,%zmm17,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+
+ vmovdqu 16(%rsi),%xmm8
+ vpmuludq %zmm1,%zmm18,%zmm28
+ vpmuludq %zmm1,%zmm19,%zmm29
+ vpmuludq %zmm1,%zmm24,%zmm25
+ vpmuludq %zmm0,%zmm18,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vpmuludq %zmm3,%zmm16,%zmm28
+ vpmuludq %zmm3,%zmm17,%zmm29
+ vpmuludq %zmm1,%zmm16,%zmm26
+ vpmuludq %zmm1,%zmm17,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ vpmuludq %zmm4,%zmm24,%zmm28
+ vpmuludq %zmm4,%zmm16,%zmm29
+ vpmuludq %zmm3,%zmm22,%zmm25
+ vpmuludq %zmm3,%zmm23,%zmm26
+ vpmuludq %zmm3,%zmm24,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm3
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm21,%zmm25
+ vpmuludq %zmm4,%zmm22,%zmm26
+ vpmuludq %zmm4,%zmm23,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm0
+ vpaddq %zmm26,%zmm12,%zmm1
+ vpaddq %zmm27,%zmm13,%zmm2
+
+
+
+
+ movl $1,%eax
+ vpermq $0xb1,%zmm3,%zmm14
+ vpermq $0xb1,%zmm15,%zmm4
+ vpermq $0xb1,%zmm0,%zmm11
+ vpermq $0xb1,%zmm1,%zmm12
+ vpermq $0xb1,%zmm2,%zmm13
+ vpaddq %zmm14,%zmm3,%zmm3
+ vpaddq %zmm15,%zmm4,%zmm4
+ vpaddq %zmm11,%zmm0,%zmm0
+ vpaddq %zmm12,%zmm1,%zmm1
+ vpaddq %zmm13,%zmm2,%zmm2
+
+ kmovw %eax,%k3
+ vpermq $0x2,%zmm3,%zmm14
+ vpermq $0x2,%zmm4,%zmm15
+ vpermq $0x2,%zmm0,%zmm11
+ vpermq $0x2,%zmm1,%zmm12
+ vpermq $0x2,%zmm2,%zmm13
+ vpaddq %zmm14,%zmm3,%zmm3
+ vpaddq %zmm15,%zmm4,%zmm4
+ vpaddq %zmm11,%zmm0,%zmm0
+ vpaddq %zmm12,%zmm1,%zmm1
+ vpaddq %zmm13,%zmm2,%zmm2
+
+ vextracti64x4 $0x1,%zmm3,%ymm14
+ vextracti64x4 $0x1,%zmm4,%ymm15
+ vextracti64x4 $0x1,%zmm0,%ymm11
+ vextracti64x4 $0x1,%zmm1,%ymm12
+ vextracti64x4 $0x1,%zmm2,%ymm13
+ vpaddq %zmm14,%zmm3,%zmm3{%k3}{z}
+ vpaddq %zmm15,%zmm4,%zmm4{%k3}{z}
+ vpaddq %zmm11,%zmm0,%zmm0{%k3}{z}
+ vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
+ vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}
+
+
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm2,%ymm9,%ymm2
+ vpand %ymm5,%ymm8,%ymm8
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ leaq 144(%rsp),%rax
+ addq $64,%rdx
+ jnz L$tail_avx2
+
+ vpsubq %ymm9,%ymm2,%ymm2
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ vzeroall
+ leaq 8(%r11),%rsp
+
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+poly1305_init_base2_44:
+ xorq %rax,%rax
+ movq %rax,0(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+
+L$init_base2_44:
+ leaq poly1305_blocks_vpmadd52(%rip),%r10
+ leaq poly1305_emit_base2_44(%rip),%r11
+
+ movq $0x0ffffffc0fffffff,%rax
+ movq $0x0ffffffc0ffffffc,%rcx
+ andq 0(%rsi),%rax
+ movq $0x00000fffffffffff,%r8
+ andq 8(%rsi),%rcx
+ movq $0x00000fffffffffff,%r9
+ andq %rax,%r8
+ shrdq $44,%rcx,%rax
+ movq %r8,40(%rdi)
+ andq %r9,%rax
+ shrq $24,%rcx
+ movq %rax,48(%rdi)
+ leaq (%rax,%rax,4),%rax
+ movq %rcx,56(%rdi)
+ shlq $2,%rax
+ leaq (%rcx,%rcx,4),%rcx
+ shlq $2,%rcx
+ movq %rax,24(%rdi)
+ movq %rcx,32(%rdi)
+ movq $-1,64(%rdi)
+ movq %r10,0(%rdx)
+ movq %r11,8(%rdx)
+ movl $1,%eax
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+poly1305_blocks_vpmadd52:
+ shrq $4,%rdx
+ jz L$no_data_vpmadd52
+
+ shlq $40,%rcx
+ movq 64(%rdi),%r8
+
+
+
+
+
+
+ movq $3,%rax
+ movq $1,%r10
+ cmpq $4,%rdx
+ cmovaeq %r10,%rax
+ testq %r8,%r8
+ cmovnsq %r10,%rax
+
+ andq %rdx,%rax
+ jz L$blocks_vpmadd52_4x
+
+ subq %rax,%rdx
+ movl $7,%r10d
+ movl $1,%r11d
+ kmovw %r10d,%k7
+ leaq L$2_44_inp_permd(%rip),%r10
+ kmovw %r11d,%k1
+
+ vmovq %rcx,%xmm21
+ vmovdqa64 0(%r10),%ymm19
+ vmovdqa64 32(%r10),%ymm20
+ vpermq $0xcf,%ymm21,%ymm21
+ vmovdqa64 64(%r10),%ymm22
+
+ vmovdqu64 0(%rdi),%ymm16{%k7}{z}
+ vmovdqu64 40(%rdi),%ymm3{%k7}{z}
+ vmovdqu64 32(%rdi),%ymm4{%k7}{z}
+ vmovdqu64 24(%rdi),%ymm5{%k7}{z}
+
+ vmovdqa64 96(%r10),%ymm23
+ vmovdqa64 128(%r10),%ymm24
+
+ jmp L$oop_vpmadd52
+
+.p2align 5
+L$oop_vpmadd52:
+ vmovdqu32 0(%rsi),%xmm18
+ leaq 16(%rsi),%rsi
+
+ vpermd %ymm18,%ymm19,%ymm18
+ vpsrlvq %ymm20,%ymm18,%ymm18
+ vpandq %ymm22,%ymm18,%ymm18
+ vporq %ymm21,%ymm18,%ymm18
+
+ vpaddq %ymm18,%ymm16,%ymm16
+
+ vpermq $0,%ymm16,%ymm0{%k7}{z}
+ vpermq $85,%ymm16,%ymm1{%k7}{z}
+ vpermq $170,%ymm16,%ymm2{%k7}{z}
+
+ vpxord %ymm16,%ymm16,%ymm16
+ vpxord %ymm17,%ymm17,%ymm17
+
+ vpmadd52luq %ymm3,%ymm0,%ymm16
+ vpmadd52huq %ymm3,%ymm0,%ymm17
+
+ vpmadd52luq %ymm4,%ymm1,%ymm16
+ vpmadd52huq %ymm4,%ymm1,%ymm17
+
+ vpmadd52luq %ymm5,%ymm2,%ymm16
+ vpmadd52huq %ymm5,%ymm2,%ymm17
+
+ vpsrlvq %ymm23,%ymm16,%ymm18
+ vpsllvq %ymm24,%ymm17,%ymm17
+ vpandq %ymm22,%ymm16,%ymm16
+
+ vpaddq %ymm18,%ymm17,%ymm17
+
+ vpermq $147,%ymm17,%ymm17
+
+ vpaddq %ymm17,%ymm16,%ymm16
+
+ vpsrlvq %ymm23,%ymm16,%ymm18
+ vpandq %ymm22,%ymm16,%ymm16
+
+ vpermq $147,%ymm18,%ymm18
+
+ vpaddq %ymm18,%ymm16,%ymm16
+
+ vpermq $147,%ymm16,%ymm18{%k1}{z}
+
+ vpaddq %ymm18,%ymm16,%ymm16
+ vpsllq $2,%ymm18,%ymm18
+
+ vpaddq %ymm18,%ymm16,%ymm16
+
+ decq %rax
+ jnz L$oop_vpmadd52
+
+ vmovdqu64 %ymm16,0(%rdi){%k7}
+
+ testq %rdx,%rdx
+ jnz L$blocks_vpmadd52_4x
+
+L$no_data_vpmadd52:
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+poly1305_blocks_vpmadd52_4x:
+ shrq $4,%rdx
+ jz L$no_data_vpmadd52_4x
+
+ shlq $40,%rcx
+ movq 64(%rdi),%r8
+
+L$blocks_vpmadd52_4x:
+ vpbroadcastq %rcx,%ymm31
+
+ vmovdqa64 L$x_mask44(%rip),%ymm28
+ movl $5,%eax
+ vmovdqa64 L$x_mask42(%rip),%ymm29
+ kmovw %eax,%k1
+
+ testq %r8,%r8
+ js L$init_vpmadd52
+
+ vmovq 0(%rdi),%xmm0
+ vmovq 8(%rdi),%xmm1
+ vmovq 16(%rdi),%xmm2
+
+ testq $3,%rdx
+ jnz L$blocks_vpmadd52_2x_do
+
+L$blocks_vpmadd52_4x_do:
+ vpbroadcastq 64(%rdi),%ymm3
+ vpbroadcastq 96(%rdi),%ymm4
+ vpbroadcastq 128(%rdi),%ymm5
+ vpbroadcastq 160(%rdi),%ymm16
+
+L$blocks_vpmadd52_4x_key_loaded:
+ vpsllq $2,%ymm5,%ymm17
+ vpaddq %ymm5,%ymm17,%ymm17
+ vpsllq $2,%ymm17,%ymm17
+
+ testq $7,%rdx
+ jz L$blocks_vpmadd52_8x
+
+ vmovdqu64 0(%rsi),%ymm26
+ vmovdqu64 32(%rsi),%ymm27
+ leaq 64(%rsi),%rsi
+
+ vpunpcklqdq %ymm27,%ymm26,%ymm25
+ vpunpckhqdq %ymm27,%ymm26,%ymm27
+
+
+
+ vpsrlq $24,%ymm27,%ymm26
+ vporq %ymm31,%ymm26,%ymm26
+ vpaddq %ymm26,%ymm2,%ymm2
+ vpandq %ymm28,%ymm25,%ymm24
+ vpsrlq $44,%ymm25,%ymm25
+ vpsllq $20,%ymm27,%ymm27
+ vporq %ymm27,%ymm25,%ymm25
+ vpandq %ymm28,%ymm25,%ymm25
+
+ subq $4,%rdx
+ jz L$tail_vpmadd52_4x
+ jmp L$oop_vpmadd52_4x
+ ud2
+
+.p2align 5
+L$init_vpmadd52:
+ vmovq 24(%rdi),%xmm16
+ vmovq 56(%rdi),%xmm2
+ vmovq 32(%rdi),%xmm17
+ vmovq 40(%rdi),%xmm3
+ vmovq 48(%rdi),%xmm4
+
+ vmovdqa %ymm3,%ymm0
+ vmovdqa %ymm4,%ymm1
+ vmovdqa %ymm2,%ymm5
+
+ movl $2,%eax
+
+L$mul_init_vpmadd52:
+ vpxorq %ymm18,%ymm18,%ymm18
+ vpmadd52luq %ymm2,%ymm16,%ymm18
+ vpxorq %ymm19,%ymm19,%ymm19
+ vpmadd52huq %ymm2,%ymm16,%ymm19
+ vpxorq %ymm20,%ymm20,%ymm20
+ vpmadd52luq %ymm2,%ymm17,%ymm20
+ vpxorq %ymm21,%ymm21,%ymm21
+ vpmadd52huq %ymm2,%ymm17,%ymm21
+ vpxorq %ymm22,%ymm22,%ymm22
+ vpmadd52luq %ymm2,%ymm3,%ymm22
+ vpxorq %ymm23,%ymm23,%ymm23
+ vpmadd52huq %ymm2,%ymm3,%ymm23
+
+ vpmadd52luq %ymm0,%ymm3,%ymm18
+ vpmadd52huq %ymm0,%ymm3,%ymm19
+ vpmadd52luq %ymm0,%ymm4,%ymm20
+ vpmadd52huq %ymm0,%ymm4,%ymm21
+ vpmadd52luq %ymm0,%ymm5,%ymm22
+ vpmadd52huq %ymm0,%ymm5,%ymm23
+
+ vpmadd52luq %ymm1,%ymm17,%ymm18
+ vpmadd52huq %ymm1,%ymm17,%ymm19
+ vpmadd52luq %ymm1,%ymm3,%ymm20
+ vpmadd52huq %ymm1,%ymm3,%ymm21
+ vpmadd52luq %ymm1,%ymm4,%ymm22
+ vpmadd52huq %ymm1,%ymm4,%ymm23
+
+
+
+ vpsrlq $44,%ymm18,%ymm30
+ vpsllq $8,%ymm19,%ymm19
+ vpandq %ymm28,%ymm18,%ymm0
+ vpaddq %ymm30,%ymm19,%ymm19
+
+ vpaddq %ymm19,%ymm20,%ymm20
+
+ vpsrlq $44,%ymm20,%ymm30
+ vpsllq $8,%ymm21,%ymm21
+ vpandq %ymm28,%ymm20,%ymm1
+ vpaddq %ymm30,%ymm21,%ymm21
+
+ vpaddq %ymm21,%ymm22,%ymm22
+
+ vpsrlq $42,%ymm22,%ymm30
+ vpsllq $10,%ymm23,%ymm23
+ vpandq %ymm29,%ymm22,%ymm2
+ vpaddq %ymm30,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm0,%ymm0
+ vpsllq $2,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm0,%ymm0
+
+ vpsrlq $44,%ymm0,%ymm30
+ vpandq %ymm28,%ymm0,%ymm0
+
+ vpaddq %ymm30,%ymm1,%ymm1
+
+ decl %eax
+ jz L$done_init_vpmadd52
+
+ vpunpcklqdq %ymm4,%ymm1,%ymm4
+ vpbroadcastq %xmm1,%xmm1
+ vpunpcklqdq %ymm5,%ymm2,%ymm5
+ vpbroadcastq %xmm2,%xmm2
+ vpunpcklqdq %ymm3,%ymm0,%ymm3
+ vpbroadcastq %xmm0,%xmm0
+
+ vpsllq $2,%ymm4,%ymm16
+ vpsllq $2,%ymm5,%ymm17
+ vpaddq %ymm4,%ymm16,%ymm16
+ vpaddq %ymm5,%ymm17,%ymm17
+ vpsllq $2,%ymm16,%ymm16
+ vpsllq $2,%ymm17,%ymm17
+
+ jmp L$mul_init_vpmadd52
+ ud2
+
+.p2align 5
+L$done_init_vpmadd52:
+ vinserti128 $1,%xmm4,%ymm1,%ymm4
+ vinserti128 $1,%xmm5,%ymm2,%ymm5
+ vinserti128 $1,%xmm3,%ymm0,%ymm3
+
+ vpermq $216,%ymm4,%ymm4
+ vpermq $216,%ymm5,%ymm5
+ vpermq $216,%ymm3,%ymm3
+
+ vpsllq $2,%ymm4,%ymm16
+ vpaddq %ymm4,%ymm16,%ymm16
+ vpsllq $2,%ymm16,%ymm16
+
+ vmovq 0(%rdi),%xmm0
+ vmovq 8(%rdi),%xmm1
+ vmovq 16(%rdi),%xmm2
+
+ testq $3,%rdx
+ jnz L$done_init_vpmadd52_2x
+
+ vmovdqu64 %ymm3,64(%rdi)
+ vpbroadcastq %xmm3,%ymm3
+ vmovdqu64 %ymm4,96(%rdi)
+ vpbroadcastq %xmm4,%ymm4
+ vmovdqu64 %ymm5,128(%rdi)
+ vpbroadcastq %xmm5,%ymm5
+ vmovdqu64 %ymm16,160(%rdi)
+ vpbroadcastq %xmm16,%ymm16
+
+ jmp L$blocks_vpmadd52_4x_key_loaded
+ ud2
+
+.p2align 5
+L$done_init_vpmadd52_2x:
+ vmovdqu64 %ymm3,64(%rdi)
+ vpsrldq $8,%ymm3,%ymm3
+ vmovdqu64 %ymm4,96(%rdi)
+ vpsrldq $8,%ymm4,%ymm4
+ vmovdqu64 %ymm5,128(%rdi)
+ vpsrldq $8,%ymm5,%ymm5
+ vmovdqu64 %ymm16,160(%rdi)
+ vpsrldq $8,%ymm16,%ymm16
+ jmp L$blocks_vpmadd52_2x_key_loaded
+ ud2
+
+.p2align 5
+L$blocks_vpmadd52_2x_do:
+ vmovdqu64 128+8(%rdi),%ymm5{%k1}{z}
+ vmovdqu64 160+8(%rdi),%ymm16{%k1}{z}
+ vmovdqu64 64+8(%rdi),%ymm3{%k1}{z}
+ vmovdqu64 96+8(%rdi),%ymm4{%k1}{z}
+
+L$blocks_vpmadd52_2x_key_loaded:
+ vmovdqu64 0(%rsi),%ymm26
+ vpxorq %ymm27,%ymm27,%ymm27
+ leaq 32(%rsi),%rsi
+
+ vpunpcklqdq %ymm27,%ymm26,%ymm25
+ vpunpckhqdq %ymm27,%ymm26,%ymm27
+
+
+
+ vpsrlq $24,%ymm27,%ymm26
+ vporq %ymm31,%ymm26,%ymm26
+ vpaddq %ymm26,%ymm2,%ymm2
+ vpandq %ymm28,%ymm25,%ymm24
+ vpsrlq $44,%ymm25,%ymm25
+ vpsllq $20,%ymm27,%ymm27
+ vporq %ymm27,%ymm25,%ymm25
+ vpandq %ymm28,%ymm25,%ymm25
+
+ jmp L$tail_vpmadd52_2x
+ ud2
+
+.p2align 5
+L$oop_vpmadd52_4x:
+
+ vpaddq %ymm24,%ymm0,%ymm0
+ vpaddq %ymm25,%ymm1,%ymm1
+
+ vpxorq %ymm18,%ymm18,%ymm18
+ vpmadd52luq %ymm2,%ymm16,%ymm18
+ vpxorq %ymm19,%ymm19,%ymm19
+ vpmadd52huq %ymm2,%ymm16,%ymm19
+ vpxorq %ymm20,%ymm20,%ymm20
+ vpmadd52luq %ymm2,%ymm17,%ymm20
+ vpxorq %ymm21,%ymm21,%ymm21
+ vpmadd52huq %ymm2,%ymm17,%ymm21
+ vpxorq %ymm22,%ymm22,%ymm22
+ vpmadd52luq %ymm2,%ymm3,%ymm22
+ vpxorq %ymm23,%ymm23,%ymm23
+ vpmadd52huq %ymm2,%ymm3,%ymm23
+
+ vmovdqu64 0(%rsi),%ymm26
+ vmovdqu64 32(%rsi),%ymm27
+ leaq 64(%rsi),%rsi
+ vpmadd52luq %ymm0,%ymm3,%ymm18
+ vpmadd52huq %ymm0,%ymm3,%ymm19
+ vpmadd52luq %ymm0,%ymm4,%ymm20
+ vpmadd52huq %ymm0,%ymm4,%ymm21
+ vpmadd52luq %ymm0,%ymm5,%ymm22
+ vpmadd52huq %ymm0,%ymm5,%ymm23
+
+ vpunpcklqdq %ymm27,%ymm26,%ymm25
+ vpunpckhqdq %ymm27,%ymm26,%ymm27
+ vpmadd52luq %ymm1,%ymm17,%ymm18
+ vpmadd52huq %ymm1,%ymm17,%ymm19
+ vpmadd52luq %ymm1,%ymm3,%ymm20
+ vpmadd52huq %ymm1,%ymm3,%ymm21
+ vpmadd52luq %ymm1,%ymm4,%ymm22
+ vpmadd52huq %ymm1,%ymm4,%ymm23
+
+
+
+ vpsrlq $44,%ymm18,%ymm30
+ vpsllq $8,%ymm19,%ymm19
+ vpandq %ymm28,%ymm18,%ymm0
+ vpaddq %ymm30,%ymm19,%ymm19
+
+ vpsrlq $24,%ymm27,%ymm26
+ vporq %ymm31,%ymm26,%ymm26
+ vpaddq %ymm19,%ymm20,%ymm20
+
+ vpsrlq $44,%ymm20,%ymm30
+ vpsllq $8,%ymm21,%ymm21
+ vpandq %ymm28,%ymm20,%ymm1
+ vpaddq %ymm30,%ymm21,%ymm21
+
+ vpandq %ymm28,%ymm25,%ymm24
+ vpsrlq $44,%ymm25,%ymm25
+ vpsllq $20,%ymm27,%ymm27
+ vpaddq %ymm21,%ymm22,%ymm22
+
+ vpsrlq $42,%ymm22,%ymm30
+ vpsllq $10,%ymm23,%ymm23
+ vpandq %ymm29,%ymm22,%ymm2
+ vpaddq %ymm30,%ymm23,%ymm23
+
+ vpaddq %ymm26,%ymm2,%ymm2
+ vpaddq %ymm23,%ymm0,%ymm0
+ vpsllq $2,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm0,%ymm0
+ vporq %ymm27,%ymm25,%ymm25
+ vpandq %ymm28,%ymm25,%ymm25
+
+ vpsrlq $44,%ymm0,%ymm30
+ vpandq %ymm28,%ymm0,%ymm0
+
+ vpaddq %ymm30,%ymm1,%ymm1
+
+ subq $4,%rdx
+ jnz L$oop_vpmadd52_4x
+
+L$tail_vpmadd52_4x:
+ vmovdqu64 128(%rdi),%ymm5
+ vmovdqu64 160(%rdi),%ymm16
+ vmovdqu64 64(%rdi),%ymm3
+ vmovdqu64 96(%rdi),%ymm4
+
+L$tail_vpmadd52_2x:
+ vpsllq $2,%ymm5,%ymm17
+ vpaddq %ymm5,%ymm17,%ymm17
+ vpsllq $2,%ymm17,%ymm17
+
+
+ vpaddq %ymm24,%ymm0,%ymm0
+ vpaddq %ymm25,%ymm1,%ymm1
+
+ vpxorq %ymm18,%ymm18,%ymm18
+ vpmadd52luq %ymm2,%ymm16,%ymm18
+ vpxorq %ymm19,%ymm19,%ymm19
+ vpmadd52huq %ymm2,%ymm16,%ymm19
+ vpxorq %ymm20,%ymm20,%ymm20
+ vpmadd52luq %ymm2,%ymm17,%ymm20
+ vpxorq %ymm21,%ymm21,%ymm21
+ vpmadd52huq %ymm2,%ymm17,%ymm21
+ vpxorq %ymm22,%ymm22,%ymm22
+ vpmadd52luq %ymm2,%ymm3,%ymm22
+ vpxorq %ymm23,%ymm23,%ymm23
+ vpmadd52huq %ymm2,%ymm3,%ymm23
+
+ vpmadd52luq %ymm0,%ymm3,%ymm18
+ vpmadd52huq %ymm0,%ymm3,%ymm19
+ vpmadd52luq %ymm0,%ymm4,%ymm20
+ vpmadd52huq %ymm0,%ymm4,%ymm21
+ vpmadd52luq %ymm0,%ymm5,%ymm22
+ vpmadd52huq %ymm0,%ymm5,%ymm23
+
+ vpmadd52luq %ymm1,%ymm17,%ymm18
+ vpmadd52huq %ymm1,%ymm17,%ymm19
+ vpmadd52luq %ymm1,%ymm3,%ymm20
+ vpmadd52huq %ymm1,%ymm3,%ymm21
+ vpmadd52luq %ymm1,%ymm4,%ymm22
+ vpmadd52huq %ymm1,%ymm4,%ymm23
+
+
+
+
+ movl $1,%eax
+ kmovw %eax,%k1
+ vpsrldq $8,%ymm18,%ymm24
+ vpsrldq $8,%ymm19,%ymm0
+ vpsrldq $8,%ymm20,%ymm25
+ vpsrldq $8,%ymm21,%ymm1
+ vpaddq %ymm24,%ymm18,%ymm18
+ vpaddq %ymm0,%ymm19,%ymm19
+ vpsrldq $8,%ymm22,%ymm26
+ vpsrldq $8,%ymm23,%ymm2
+ vpaddq %ymm25,%ymm20,%ymm20
+ vpaddq %ymm1,%ymm21,%ymm21
+ vpermq $0x2,%ymm18,%ymm24
+ vpermq $0x2,%ymm19,%ymm0
+ vpaddq %ymm26,%ymm22,%ymm22
+ vpaddq %ymm2,%ymm23,%ymm23
+
+ vpermq $0x2,%ymm20,%ymm25
+ vpermq $0x2,%ymm21,%ymm1
+ vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
+ vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
+ vpermq $0x2,%ymm22,%ymm26
+ vpermq $0x2,%ymm23,%ymm2
+ vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
+ vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
+ vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
+ vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
+
+
+
+ vpsrlq $44,%ymm18,%ymm30
+ vpsllq $8,%ymm19,%ymm19
+ vpandq %ymm28,%ymm18,%ymm0
+ vpaddq %ymm30,%ymm19,%ymm19
+
+ vpaddq %ymm19,%ymm20,%ymm20
+
+ vpsrlq $44,%ymm20,%ymm30
+ vpsllq $8,%ymm21,%ymm21
+ vpandq %ymm28,%ymm20,%ymm1
+ vpaddq %ymm30,%ymm21,%ymm21
+
+ vpaddq %ymm21,%ymm22,%ymm22
+
+ vpsrlq $42,%ymm22,%ymm30
+ vpsllq $10,%ymm23,%ymm23
+ vpandq %ymm29,%ymm22,%ymm2
+ vpaddq %ymm30,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm0,%ymm0
+ vpsllq $2,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm0,%ymm0
+
+ vpsrlq $44,%ymm0,%ymm30
+ vpandq %ymm28,%ymm0,%ymm0
+
+ vpaddq %ymm30,%ymm1,%ymm1
+
+
+ subq $2,%rdx
+ ja L$blocks_vpmadd52_4x_do
+
+ vmovq %xmm0,0(%rdi)
+ vmovq %xmm1,8(%rdi)
+ vmovq %xmm2,16(%rdi)
+ vzeroall
+
+L$no_data_vpmadd52_4x:
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+poly1305_blocks_vpmadd52_8x:
+ shrq $4,%rdx
+ jz L$no_data_vpmadd52_8x
+
+ shlq $40,%rcx
+ movq 64(%rdi),%r8
+
+ vmovdqa64 L$x_mask44(%rip),%ymm28
+ vmovdqa64 L$x_mask42(%rip),%ymm29
+
+ testq %r8,%r8
+ js L$init_vpmadd52
+
+ vmovq 0(%rdi),%xmm0
+ vmovq 8(%rdi),%xmm1
+ vmovq 16(%rdi),%xmm2
+
+L$blocks_vpmadd52_8x:
+
+
+
+ vmovdqu64 128(%rdi),%ymm5
+ vmovdqu64 160(%rdi),%ymm16
+ vmovdqu64 64(%rdi),%ymm3
+ vmovdqu64 96(%rdi),%ymm4
+
+ vpsllq $2,%ymm5,%ymm17
+ vpaddq %ymm5,%ymm17,%ymm17
+ vpsllq $2,%ymm17,%ymm17
+
+ vpbroadcastq %xmm5,%ymm8
+ vpbroadcastq %xmm3,%ymm6
+ vpbroadcastq %xmm4,%ymm7
+
+ vpxorq %ymm18,%ymm18,%ymm18
+ vpmadd52luq %ymm8,%ymm16,%ymm18
+ vpxorq %ymm19,%ymm19,%ymm19
+ vpmadd52huq %ymm8,%ymm16,%ymm19
+ vpxorq %ymm20,%ymm20,%ymm20
+ vpmadd52luq %ymm8,%ymm17,%ymm20
+ vpxorq %ymm21,%ymm21,%ymm21
+ vpmadd52huq %ymm8,%ymm17,%ymm21
+ vpxorq %ymm22,%ymm22,%ymm22
+ vpmadd52luq %ymm8,%ymm3,%ymm22
+ vpxorq %ymm23,%ymm23,%ymm23
+ vpmadd52huq %ymm8,%ymm3,%ymm23
+
+ vpmadd52luq %ymm6,%ymm3,%ymm18
+ vpmadd52huq %ymm6,%ymm3,%ymm19
+ vpmadd52luq %ymm6,%ymm4,%ymm20
+ vpmadd52huq %ymm6,%ymm4,%ymm21
+ vpmadd52luq %ymm6,%ymm5,%ymm22
+ vpmadd52huq %ymm6,%ymm5,%ymm23
+
+ vpmadd52luq %ymm7,%ymm17,%ymm18
+ vpmadd52huq %ymm7,%ymm17,%ymm19
+ vpmadd52luq %ymm7,%ymm3,%ymm20
+ vpmadd52huq %ymm7,%ymm3,%ymm21
+ vpmadd52luq %ymm7,%ymm4,%ymm22
+ vpmadd52huq %ymm7,%ymm4,%ymm23
+
+
+
+ vpsrlq $44,%ymm18,%ymm30
+ vpsllq $8,%ymm19,%ymm19
+ vpandq %ymm28,%ymm18,%ymm6
+ vpaddq %ymm30,%ymm19,%ymm19
+
+ vpaddq %ymm19,%ymm20,%ymm20
+
+ vpsrlq $44,%ymm20,%ymm30
+ vpsllq $8,%ymm21,%ymm21
+ vpandq %ymm28,%ymm20,%ymm7
+ vpaddq %ymm30,%ymm21,%ymm21
+
+ vpaddq %ymm21,%ymm22,%ymm22
+
+ vpsrlq $42,%ymm22,%ymm30
+ vpsllq $10,%ymm23,%ymm23
+ vpandq %ymm29,%ymm22,%ymm8
+ vpaddq %ymm30,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm6,%ymm6
+ vpsllq $2,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm6,%ymm6
+
+ vpsrlq $44,%ymm6,%ymm30
+ vpandq %ymm28,%ymm6,%ymm6
+
+ vpaddq %ymm30,%ymm7,%ymm7
+
+
+
+
+
+ vpunpcklqdq %ymm5,%ymm8,%ymm26
+ vpunpckhqdq %ymm5,%ymm8,%ymm5
+ vpunpcklqdq %ymm3,%ymm6,%ymm24
+ vpunpckhqdq %ymm3,%ymm6,%ymm3
+ vpunpcklqdq %ymm4,%ymm7,%ymm25
+ vpunpckhqdq %ymm4,%ymm7,%ymm4
+ vshufi64x2 $0x44,%zmm5,%zmm26,%zmm8
+ vshufi64x2 $0x44,%zmm3,%zmm24,%zmm6
+ vshufi64x2 $0x44,%zmm4,%zmm25,%zmm7
+
+ vmovdqu64 0(%rsi),%zmm26
+ vmovdqu64 64(%rsi),%zmm27
+ leaq 128(%rsi),%rsi
+
+ vpsllq $2,%zmm8,%zmm10
+ vpsllq $2,%zmm7,%zmm9
+ vpaddq %zmm8,%zmm10,%zmm10
+ vpaddq %zmm7,%zmm9,%zmm9
+ vpsllq $2,%zmm10,%zmm10
+ vpsllq $2,%zmm9,%zmm9
+
+ vpbroadcastq %rcx,%zmm31
+ vpbroadcastq %xmm28,%zmm28
+ vpbroadcastq %xmm29,%zmm29
+
+ vpbroadcastq %xmm9,%zmm16
+ vpbroadcastq %xmm10,%zmm17
+ vpbroadcastq %xmm6,%zmm3
+ vpbroadcastq %xmm7,%zmm4
+ vpbroadcastq %xmm8,%zmm5
+
+ vpunpcklqdq %zmm27,%zmm26,%zmm25
+ vpunpckhqdq %zmm27,%zmm26,%zmm27
+
+
+
+ vpsrlq $24,%zmm27,%zmm26
+ vporq %zmm31,%zmm26,%zmm26
+ vpaddq %zmm26,%zmm2,%zmm2
+ vpandq %zmm28,%zmm25,%zmm24
+ vpsrlq $44,%zmm25,%zmm25
+ vpsllq $20,%zmm27,%zmm27
+ vporq %zmm27,%zmm25,%zmm25
+ vpandq %zmm28,%zmm25,%zmm25
+
+ subq $8,%rdx
+ jz L$tail_vpmadd52_8x
+ jmp L$oop_vpmadd52_8x
+
+.p2align 5
+L$oop_vpmadd52_8x:
+
+ vpaddq %zmm24,%zmm0,%zmm0
+ vpaddq %zmm25,%zmm1,%zmm1
+
+ vpxorq %zmm18,%zmm18,%zmm18
+ vpmadd52luq %zmm2,%zmm16,%zmm18
+ vpxorq %zmm19,%zmm19,%zmm19
+ vpmadd52huq %zmm2,%zmm16,%zmm19
+ vpxorq %zmm20,%zmm20,%zmm20
+ vpmadd52luq %zmm2,%zmm17,%zmm20
+ vpxorq %zmm21,%zmm21,%zmm21
+ vpmadd52huq %zmm2,%zmm17,%zmm21
+ vpxorq %zmm22,%zmm22,%zmm22
+ vpmadd52luq %zmm2,%zmm3,%zmm22
+ vpxorq %zmm23,%zmm23,%zmm23
+ vpmadd52huq %zmm2,%zmm3,%zmm23
+
+ vmovdqu64 0(%rsi),%zmm26
+ vmovdqu64 64(%rsi),%zmm27
+ leaq 128(%rsi),%rsi
+ vpmadd52luq %zmm0,%zmm3,%zmm18
+ vpmadd52huq %zmm0,%zmm3,%zmm19
+ vpmadd52luq %zmm0,%zmm4,%zmm20
+ vpmadd52huq %zmm0,%zmm4,%zmm21
+ vpmadd52luq %zmm0,%zmm5,%zmm22
+ vpmadd52huq %zmm0,%zmm5,%zmm23
+
+ vpunpcklqdq %zmm27,%zmm26,%zmm25
+ vpunpckhqdq %zmm27,%zmm26,%zmm27
+ vpmadd52luq %zmm1,%zmm17,%zmm18
+ vpmadd52huq %zmm1,%zmm17,%zmm19
+ vpmadd52luq %zmm1,%zmm3,%zmm20
+ vpmadd52huq %zmm1,%zmm3,%zmm21
+ vpmadd52luq %zmm1,%zmm4,%zmm22
+ vpmadd52huq %zmm1,%zmm4,%zmm23
+
+
+
+ vpsrlq $44,%zmm18,%zmm30
+ vpsllq $8,%zmm19,%zmm19
+ vpandq %zmm28,%zmm18,%zmm0
+ vpaddq %zmm30,%zmm19,%zmm19
+
+ vpsrlq $24,%zmm27,%zmm26
+ vporq %zmm31,%zmm26,%zmm26
+ vpaddq %zmm19,%zmm20,%zmm20
+
+ vpsrlq $44,%zmm20,%zmm30
+ vpsllq $8,%zmm21,%zmm21
+ vpandq %zmm28,%zmm20,%zmm1
+ vpaddq %zmm30,%zmm21,%zmm21
+
+ vpandq %zmm28,%zmm25,%zmm24
+ vpsrlq $44,%zmm25,%zmm25
+ vpsllq $20,%zmm27,%zmm27
+ vpaddq %zmm21,%zmm22,%zmm22
+
+ vpsrlq $42,%zmm22,%zmm30
+ vpsllq $10,%zmm23,%zmm23
+ vpandq %zmm29,%zmm22,%zmm2
+ vpaddq %zmm30,%zmm23,%zmm23
+
+ vpaddq %zmm26,%zmm2,%zmm2
+ vpaddq %zmm23,%zmm0,%zmm0
+ vpsllq $2,%zmm23,%zmm23
+
+ vpaddq %zmm23,%zmm0,%zmm0
+ vporq %zmm27,%zmm25,%zmm25
+ vpandq %zmm28,%zmm25,%zmm25
+
+ vpsrlq $44,%zmm0,%zmm30
+ vpandq %zmm28,%zmm0,%zmm0
+
+ vpaddq %zmm30,%zmm1,%zmm1
+
+ subq $8,%rdx
+ jnz L$oop_vpmadd52_8x
+
+L$tail_vpmadd52_8x:
+
+ vpaddq %zmm24,%zmm0,%zmm0
+ vpaddq %zmm25,%zmm1,%zmm1
+
+ vpxorq %zmm18,%zmm18,%zmm18
+ vpmadd52luq %zmm2,%zmm9,%zmm18
+ vpxorq %zmm19,%zmm19,%zmm19
+ vpmadd52huq %zmm2,%zmm9,%zmm19
+ vpxorq %zmm20,%zmm20,%zmm20
+ vpmadd52luq %zmm2,%zmm10,%zmm20
+ vpxorq %zmm21,%zmm21,%zmm21
+ vpmadd52huq %zmm2,%zmm10,%zmm21
+ vpxorq %zmm22,%zmm22,%zmm22
+ vpmadd52luq %zmm2,%zmm6,%zmm22
+ vpxorq %zmm23,%zmm23,%zmm23
+ vpmadd52huq %zmm2,%zmm6,%zmm23
+
+ vpmadd52luq %zmm0,%zmm6,%zmm18
+ vpmadd52huq %zmm0,%zmm6,%zmm19
+ vpmadd52luq %zmm0,%zmm7,%zmm20
+ vpmadd52huq %zmm0,%zmm7,%zmm21
+ vpmadd52luq %zmm0,%zmm8,%zmm22
+ vpmadd52huq %zmm0,%zmm8,%zmm23
+
+ vpmadd52luq %zmm1,%zmm10,%zmm18
+ vpmadd52huq %zmm1,%zmm10,%zmm19
+ vpmadd52luq %zmm1,%zmm6,%zmm20
+ vpmadd52huq %zmm1,%zmm6,%zmm21
+ vpmadd52luq %zmm1,%zmm7,%zmm22
+ vpmadd52huq %zmm1,%zmm7,%zmm23
+
+
+
+
+ movl $1,%eax
+ kmovw %eax,%k1
+ vpsrldq $8,%zmm18,%zmm24
+ vpsrldq $8,%zmm19,%zmm0
+ vpsrldq $8,%zmm20,%zmm25
+ vpsrldq $8,%zmm21,%zmm1
+ vpaddq %zmm24,%zmm18,%zmm18
+ vpaddq %zmm0,%zmm19,%zmm19
+ vpsrldq $8,%zmm22,%zmm26
+ vpsrldq $8,%zmm23,%zmm2
+ vpaddq %zmm25,%zmm20,%zmm20
+ vpaddq %zmm1,%zmm21,%zmm21
+ vpermq $0x2,%zmm18,%zmm24
+ vpermq $0x2,%zmm19,%zmm0
+ vpaddq %zmm26,%zmm22,%zmm22
+ vpaddq %zmm2,%zmm23,%zmm23
+
+ vpermq $0x2,%zmm20,%zmm25
+ vpermq $0x2,%zmm21,%zmm1
+ vpaddq %zmm24,%zmm18,%zmm18
+ vpaddq %zmm0,%zmm19,%zmm19
+ vpermq $0x2,%zmm22,%zmm26
+ vpermq $0x2,%zmm23,%zmm2
+ vpaddq %zmm25,%zmm20,%zmm20
+ vpaddq %zmm1,%zmm21,%zmm21
+ vextracti64x4 $1,%zmm18,%ymm24
+ vextracti64x4 $1,%zmm19,%ymm0
+ vpaddq %zmm26,%zmm22,%zmm22
+ vpaddq %zmm2,%zmm23,%zmm23
+
+ vextracti64x4 $1,%zmm20,%ymm25
+ vextracti64x4 $1,%zmm21,%ymm1
+ vextracti64x4 $1,%zmm22,%ymm26
+ vextracti64x4 $1,%zmm23,%ymm2
+ vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
+ vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
+ vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
+ vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
+ vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
+ vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
+
+
+
+ vpsrlq $44,%ymm18,%ymm30
+ vpsllq $8,%ymm19,%ymm19
+ vpandq %ymm28,%ymm18,%ymm0
+ vpaddq %ymm30,%ymm19,%ymm19
+
+ vpaddq %ymm19,%ymm20,%ymm20
+
+ vpsrlq $44,%ymm20,%ymm30
+ vpsllq $8,%ymm21,%ymm21
+ vpandq %ymm28,%ymm20,%ymm1
+ vpaddq %ymm30,%ymm21,%ymm21
+
+ vpaddq %ymm21,%ymm22,%ymm22
+
+ vpsrlq $42,%ymm22,%ymm30
+ vpsllq $10,%ymm23,%ymm23
+ vpandq %ymm29,%ymm22,%ymm2
+ vpaddq %ymm30,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm0,%ymm0
+ vpsllq $2,%ymm23,%ymm23
+
+ vpaddq %ymm23,%ymm0,%ymm0
+
+ vpsrlq $44,%ymm0,%ymm30
+ vpandq %ymm28,%ymm0,%ymm0
+
+ vpaddq %ymm30,%ymm1,%ymm1
+
+
+
+ vmovq %xmm0,0(%rdi)
+ vmovq %xmm1,8(%rdi)
+ vmovq %xmm2,16(%rdi)
+ vzeroall
+
+L$no_data_vpmadd52_8x:
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+poly1305_emit_base2_44:
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+
+ movq %r9,%rax
+ shrq $20,%r9
+ shlq $44,%rax
+ movq %r10,%rcx
+ shrq $40,%r10
+ shlq $24,%rcx
+
+ addq %rax,%r8
+ adcq %rcx,%r9
+ adcq $0,%r10
+
+ movq %r8,%rax
+ addq $5,%r8
+ movq %r9,%rcx
+ adcq $0,%r9
+ adcq $0,%r10
+ shrq $2,%r10
+ cmovnzq %r8,%rax
+ cmovnzq %r9,%rcx
+
+ addq 0(%rdx),%rax
+ adcq 8(%rdx),%rcx
+ movq %rax,0(%rsi)
+ movq %rcx,8(%rsi)
+
+ .byte 0xf3,0xc3
+
.p2align 6
L$const:
L$mask24:
@@ -1822,7 +3406,125 @@ L$129:
.long 16777216,0,16777216,0,16777216,0,16777216,0
L$mask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-L$five:
-.long 5,0,5,0,5,0,5,0
+L$permd_avx2:
+.long 2,2,2,3,2,0,2,1
+L$permd_avx512:
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+L$2_44_inp_permd:
+.long 0,1,1,2,2,3,7,7
+L$2_44_inp_shift:
+.quad 0,12,24,64
+L$2_44_mask:
+.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+L$2_44_shift_rgt:
+.quad 44,44,42,64
+L$2_44_shift_lft:
+.quad 8,8,10,64
+
+.p2align 6
+L$x_mask44:
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+L$x_mask42:
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 4
+.globl _xor128_encrypt_n_pad
+
+.p2align 4
+_xor128_encrypt_n_pad:
+ subq %rdx,%rsi
+ subq %rdx,%rdi
+ movq %rcx,%r10
+ shrq $4,%rcx
+ jz L$tail_enc
+ nop
+L$oop_enc_xmm:
+ movdqu (%rsi,%rdx,1),%xmm0
+ pxor (%rdx),%xmm0
+ movdqu %xmm0,(%rdi,%rdx,1)
+ movdqa %xmm0,(%rdx)
+ leaq 16(%rdx),%rdx
+ decq %rcx
+ jnz L$oop_enc_xmm
+
+ andq $15,%r10
+ jz L$done_enc
+
+L$tail_enc:
+ movq $16,%rcx
+ subq %r10,%rcx
+ xorl %eax,%eax
+L$oop_enc_byte:
+ movb (%rsi,%rdx,1),%al
+ xorb (%rdx),%al
+ movb %al,(%rdi,%rdx,1)
+ movb %al,(%rdx)
+ leaq 1(%rdx),%rdx
+ decq %r10
+ jnz L$oop_enc_byte
+
+ xorl %eax,%eax
+L$oop_enc_pad:
+ movb %al,(%rdx)
+ leaq 1(%rdx),%rdx
+ decq %rcx
+ jnz L$oop_enc_pad
+
+L$done_enc:
+ movq %rdx,%rax
+ .byte 0xf3,0xc3
+
+
+.globl _xor128_decrypt_n_pad
+
+.p2align 4
+_xor128_decrypt_n_pad:
+ subq %rdx,%rsi
+ subq %rdx,%rdi
+ movq %rcx,%r10
+ shrq $4,%rcx
+ jz L$tail_dec
+ nop
+L$oop_dec_xmm:
+ movdqu (%rsi,%rdx,1),%xmm0
+ movdqa (%rdx),%xmm1
+ pxor %xmm0,%xmm1
+ movdqu %xmm1,(%rdi,%rdx,1)
+ movdqa %xmm0,(%rdx)
+ leaq 16(%rdx),%rdx
+ decq %rcx
+ jnz L$oop_dec_xmm
+
+ pxor %xmm1,%xmm1
+ andq $15,%r10
+ jz L$done_dec
+
+L$tail_dec:
+ movq $16,%rcx
+ subq %r10,%rcx
+ xorl %eax,%eax
+ xorq %r11,%r11
+L$oop_dec_byte:
+ movb (%rsi,%rdx,1),%r11b
+ movb (%rdx),%al
+ xorb %r11b,%al
+ movb %al,(%rdi,%rdx,1)
+ movb %r11b,(%rdx)
+ leaq 1(%rdx),%rdx
+ decq %r10
+ jnz L$oop_dec_byte
+
+ xorl %eax,%eax
+L$oop_dec_pad:
+ movb %al,(%rdx)
+ leaq 1(%rdx),%rdx
+ decq %rcx
+ jnz L$oop_dec_pad
+
+L$done_dec:
+ movq %rdx,%rax
+ .byte 0xf3,0xc3
+
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s
index 47dce361a6..435976a260 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s
@@ -4,15 +4,23 @@
.globl _rc4_md5_enc
_rc4_md5_enc:
+
cmpq $0,%r9
je L$abort
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $40,%rsp
+
L$body:
movq %rcx,%r11
movq %r9,%r12
@@ -1247,13 +1255,21 @@ L$oop:
movl %ecx,-4(%rdi)
movq 40(%rsp),%r15
+
movq 48(%rsp),%r14
+
movq 56(%rsp),%r13
+
movq 64(%rsp),%r12
+
movq 72(%rsp),%rbp
+
movq 80(%rsp),%rbx
+
leaq 88(%rsp),%rsp
+
L$epilogue:
L$abort:
.byte 0xf3,0xc3
+
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s
index 86ef486662..8bfb48b74a 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s
@@ -8,9 +8,13 @@ _RC4: orq %rsi,%rsi
jne L$entry
.byte 0xf3,0xc3
L$entry:
+
pushq %rbx
+
pushq %r12
+
pushq %r13
+
L$prologue:
movq %rsi,%r11
movq %rdx,%r12
@@ -511,12 +515,17 @@ L$exit:
movl %ecx,-4(%rdi)
movq (%rsp),%r13
+
movq 8(%rsp),%r12
+
movq 16(%rsp),%rbx
+
addq $24,%rsp
+
L$epilogue:
.byte 0xf3,0xc3
+
.globl _RC4_set_key
.p2align 4
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/keccak1600-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/keccak1600-x86_64.s
new file mode 100644
index 0000000000..ec096c5ab0
--- /dev/null
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/keccak1600-x86_64.s
@@ -0,0 +1,492 @@
+.text
+
+
+.p2align 5
+__KeccakF1600:
+ movq 60(%rdi),%rax
+ movq 68(%rdi),%rbx
+ movq 76(%rdi),%rcx
+ movq 84(%rdi),%rdx
+ movq 92(%rdi),%rbp
+ jmp L$oop
+
+.p2align 5
+L$oop:
+ movq -100(%rdi),%r8
+ movq -52(%rdi),%r9
+ movq -4(%rdi),%r10
+ movq 44(%rdi),%r11
+
+ xorq -84(%rdi),%rcx
+ xorq -76(%rdi),%rdx
+ xorq %r8,%rax
+ xorq -92(%rdi),%rbx
+ xorq -44(%rdi),%rcx
+ xorq -60(%rdi),%rax
+ movq %rbp,%r12
+ xorq -68(%rdi),%rbp
+
+ xorq %r10,%rcx
+ xorq -20(%rdi),%rax
+ xorq -36(%rdi),%rdx
+ xorq %r9,%rbx
+ xorq -28(%rdi),%rbp
+
+ xorq 36(%rdi),%rcx
+ xorq 20(%rdi),%rax
+ xorq 4(%rdi),%rdx
+ xorq -12(%rdi),%rbx
+ xorq 12(%rdi),%rbp
+
+ movq %rcx,%r13
+ rolq $1,%rcx
+ xorq %rax,%rcx
+ xorq %r11,%rdx
+
+ rolq $1,%rax
+ xorq %rdx,%rax
+ xorq 28(%rdi),%rbx
+
+ rolq $1,%rdx
+ xorq %rbx,%rdx
+ xorq 52(%rdi),%rbp
+
+ rolq $1,%rbx
+ xorq %rbp,%rbx
+
+ rolq $1,%rbp
+ xorq %r13,%rbp
+ xorq %rcx,%r9
+ xorq %rdx,%r10
+ rolq $44,%r9
+ xorq %rbp,%r11
+ xorq %rax,%r12
+ rolq $43,%r10
+ xorq %rbx,%r8
+ movq %r9,%r13
+ rolq $21,%r11
+ orq %r10,%r9
+ xorq %r8,%r9
+ rolq $14,%r12
+
+ xorq (%r15),%r9
+ leaq 8(%r15),%r15
+
+ movq %r12,%r14
+ andq %r11,%r12
+ movq %r9,-100(%rsi)
+ xorq %r10,%r12
+ notq %r10
+ movq %r12,-84(%rsi)
+
+ orq %r11,%r10
+ movq 76(%rdi),%r12
+ xorq %r13,%r10
+ movq %r10,-92(%rsi)
+
+ andq %r8,%r13
+ movq -28(%rdi),%r9
+ xorq %r14,%r13
+ movq -20(%rdi),%r10
+ movq %r13,-68(%rsi)
+
+ orq %r8,%r14
+ movq -76(%rdi),%r8
+ xorq %r11,%r14
+ movq 28(%rdi),%r11
+ movq %r14,-76(%rsi)
+
+
+ xorq %rbp,%r8
+ xorq %rdx,%r12
+ rolq $28,%r8
+ xorq %rcx,%r11
+ xorq %rax,%r9
+ rolq $61,%r12
+ rolq $45,%r11
+ xorq %rbx,%r10
+ rolq $20,%r9
+ movq %r8,%r13
+ orq %r12,%r8
+ rolq $3,%r10
+
+ xorq %r11,%r8
+ movq %r8,-36(%rsi)
+
+ movq %r9,%r14
+ andq %r13,%r9
+ movq -92(%rdi),%r8
+ xorq %r12,%r9
+ notq %r12
+ movq %r9,-28(%rsi)
+
+ orq %r11,%r12
+ movq -44(%rdi),%r9
+ xorq %r10,%r12
+ movq %r12,-44(%rsi)
+
+ andq %r10,%r11
+ movq 60(%rdi),%r12
+ xorq %r14,%r11
+ movq %r11,-52(%rsi)
+
+ orq %r10,%r14
+ movq 4(%rdi),%r10
+ xorq %r13,%r14
+ movq 52(%rdi),%r11
+ movq %r14,-60(%rsi)
+
+
+ xorq %rbp,%r10
+ xorq %rax,%r11
+ rolq $25,%r10
+ xorq %rdx,%r9
+ rolq $8,%r11
+ xorq %rbx,%r12
+ rolq $6,%r9
+ xorq %rcx,%r8
+ rolq $18,%r12
+ movq %r10,%r13
+ andq %r11,%r10
+ rolq $1,%r8
+
+ notq %r11
+ xorq %r9,%r10
+ movq %r10,-12(%rsi)
+
+ movq %r12,%r14
+ andq %r11,%r12
+ movq -12(%rdi),%r10
+ xorq %r13,%r12
+ movq %r12,-4(%rsi)
+
+ orq %r9,%r13
+ movq 84(%rdi),%r12
+ xorq %r8,%r13
+ movq %r13,-20(%rsi)
+
+ andq %r8,%r9
+ xorq %r14,%r9
+ movq %r9,12(%rsi)
+
+ orq %r8,%r14
+ movq -60(%rdi),%r9
+ xorq %r11,%r14
+ movq 36(%rdi),%r11
+ movq %r14,4(%rsi)
+
+
+ movq -68(%rdi),%r8
+
+ xorq %rcx,%r10
+ xorq %rdx,%r11
+ rolq $10,%r10
+ xorq %rbx,%r9
+ rolq $15,%r11
+ xorq %rbp,%r12
+ rolq $36,%r9
+ xorq %rax,%r8
+ rolq $56,%r12
+ movq %r10,%r13
+ orq %r11,%r10
+ rolq $27,%r8
+
+ notq %r11
+ xorq %r9,%r10
+ movq %r10,28(%rsi)
+
+ movq %r12,%r14
+ orq %r11,%r12
+ xorq %r13,%r12
+ movq %r12,36(%rsi)
+
+ andq %r9,%r13
+ xorq %r8,%r13
+ movq %r13,20(%rsi)
+
+ orq %r8,%r9
+ xorq %r14,%r9
+ movq %r9,52(%rsi)
+
+ andq %r14,%r8
+ xorq %r11,%r8
+ movq %r8,44(%rsi)
+
+
+ xorq -84(%rdi),%rdx
+ xorq -36(%rdi),%rbp
+ rolq $62,%rdx
+ xorq 68(%rdi),%rcx
+ rolq $55,%rbp
+ xorq 12(%rdi),%rax
+ rolq $2,%rcx
+ xorq 20(%rdi),%rbx
+ xchgq %rsi,%rdi
+ rolq $39,%rax
+ rolq $41,%rbx
+ movq %rdx,%r13
+ andq %rbp,%rdx
+ notq %rbp
+ xorq %rcx,%rdx
+ movq %rdx,92(%rdi)
+
+ movq %rax,%r14
+ andq %rbp,%rax
+ xorq %r13,%rax
+ movq %rax,60(%rdi)
+
+ orq %rcx,%r13
+ xorq %rbx,%r13
+ movq %r13,84(%rdi)
+
+ andq %rbx,%rcx
+ xorq %r14,%rcx
+ movq %rcx,76(%rdi)
+
+ orq %r14,%rbx
+ xorq %rbp,%rbx
+ movq %rbx,68(%rdi)
+
+ movq %rdx,%rbp
+ movq %r13,%rdx
+
+ testq $255,%r15
+ jnz L$oop
+
+ leaq -192(%r15),%r15
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+KeccakF1600:
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+
+ leaq 100(%rdi),%rdi
+ subq $200,%rsp
+
+
+ notq -92(%rdi)
+ notq -84(%rdi)
+ notq -36(%rdi)
+ notq -4(%rdi)
+ notq 36(%rdi)
+ notq 60(%rdi)
+
+ leaq iotas(%rip),%r15
+ leaq 100(%rsp),%rsi
+
+ call __KeccakF1600
+
+ notq -92(%rdi)
+ notq -84(%rdi)
+ notq -36(%rdi)
+ notq -4(%rdi)
+ notq 36(%rdi)
+ notq 60(%rdi)
+ leaq -100(%rdi),%rdi
+
+ addq $200,%rsp
+
+
+ popq %r15
+
+ popq %r14
+
+ popq %r13
+
+ popq %r12
+
+ popq %rbp
+
+ popq %rbx
+
+ .byte 0xf3,0xc3
+
+
+.globl _SHA3_absorb
+
+.p2align 5
+_SHA3_absorb:
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+
+ leaq 100(%rdi),%rdi
+ subq $232,%rsp
+
+
+ movq %rsi,%r9
+ leaq 100(%rsp),%rsi
+
+ notq -92(%rdi)
+ notq -84(%rdi)
+ notq -36(%rdi)
+ notq -4(%rdi)
+ notq 36(%rdi)
+ notq 60(%rdi)
+ leaq iotas(%rip),%r15
+
+ movq %rcx,216-100(%rsi)
+
+L$oop_absorb:
+ cmpq %rcx,%rdx
+ jc L$done_absorb
+
+ shrq $3,%rcx
+ leaq -100(%rdi),%r8
+
+L$block_absorb:
+ movq (%r9),%rax
+ leaq 8(%r9),%r9
+ xorq (%r8),%rax
+ leaq 8(%r8),%r8
+ subq $8,%rdx
+ movq %rax,-8(%r8)
+ subq $1,%rcx
+ jnz L$block_absorb
+
+ movq %r9,200-100(%rsi)
+ movq %rdx,208-100(%rsi)
+ call __KeccakF1600
+ movq 200-100(%rsi),%r9
+ movq 208-100(%rsi),%rdx
+ movq 216-100(%rsi),%rcx
+ jmp L$oop_absorb
+
+.p2align 5
+L$done_absorb:
+ movq %rdx,%rax
+
+ notq -92(%rdi)
+ notq -84(%rdi)
+ notq -36(%rdi)
+ notq -4(%rdi)
+ notq 36(%rdi)
+ notq 60(%rdi)
+
+ addq $232,%rsp
+
+
+ popq %r15
+
+ popq %r14
+
+ popq %r13
+
+ popq %r12
+
+ popq %rbp
+
+ popq %rbx
+
+ .byte 0xf3,0xc3
+
+
+.globl _SHA3_squeeze
+
+.p2align 5
+_SHA3_squeeze:
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+
+ shrq $3,%rcx
+ movq %rdi,%r8
+ movq %rsi,%r12
+ movq %rdx,%r13
+ movq %rcx,%r14
+ jmp L$oop_squeeze
+
+.p2align 5
+L$oop_squeeze:
+ cmpq $8,%r13
+ jb L$tail_squeeze
+
+ movq (%r8),%rax
+ leaq 8(%r8),%r8
+ movq %rax,(%r12)
+ leaq 8(%r12),%r12
+ subq $8,%r13
+ jz L$done_squeeze
+
+ subq $1,%rcx
+ jnz L$oop_squeeze
+
+ call KeccakF1600
+ movq %rdi,%r8
+ movq %r14,%rcx
+ jmp L$oop_squeeze
+
+L$tail_squeeze:
+ movq %r8,%rsi
+ movq %r12,%rdi
+ movq %r13,%rcx
+.byte 0xf3,0xa4
+
+L$done_squeeze:
+ popq %r14
+
+ popq %r13
+
+ popq %r12
+
+ .byte 0xf3,0xc3
+
+
+.p2align 8
+.quad 0,0,0,0,0,0,0,0
+
+iotas:
+.quad 0x0000000000000001
+.quad 0x0000000000008082
+.quad 0x800000000000808a
+.quad 0x8000000080008000
+.quad 0x000000000000808b
+.quad 0x0000000080000001
+.quad 0x8000000080008081
+.quad 0x8000000000008009
+.quad 0x000000000000008a
+.quad 0x0000000000000088
+.quad 0x0000000080008009
+.quad 0x000000008000000a
+.quad 0x000000008000808b
+.quad 0x800000000000008b
+.quad 0x8000000000008089
+.quad 0x8000000000008003
+.quad 0x8000000000008002
+.quad 0x8000000000000080
+.quad 0x000000000000800a
+.quad 0x800000008000000a
+.quad 0x8000000080008081
+.quad 0x8000000000008080
+.quad 0x0000000080000001
+.quad 0x8000000080008008
+
+.byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
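The new file keeps six lanes of the state bit-complemented around the permutation (the notq runs before and after the __KeccakF1600 call), which looks like the standard lane-complementing trick for trimming NOTs out of the chi step, and _SHA3_absorb itself is a thin loop around that permutation: XOR one rate-sized block into the state, permute, repeat while a full block remains, and hand back the leftover byte count. A C sketch of that loop; the array shape and names are assumptions, and the permutation is treated as an external routine standing in for __KeccakF1600:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void KeccakF1600(uint64_t A[25]);   /* assumed: plain Keccak-f[1600] */

    static size_t SHA3_absorb_sketch(uint64_t A[25], const unsigned char *inp,
                                     size_t len, size_t r)
    {
        size_t i;

        while (len >= r) {                 /* cmpq %rcx,%rdx / jc L$done_absorb */
            for (i = 0; i < r / 8; i++) {  /* L$block_absorb                    */
                uint64_t lane;
                memcpy(&lane, inp + 8 * i, 8);
                A[i] ^= lane;              /* xorq (%r8),%rax                   */
            }
            inp += r;
            len -= r;
            KeccakF1600(A);
        }
        return len;                        /* movq %rdx,%rax in L$done_absorb   */
    }

_SHA3_squeeze is the mirror image: copy r bytes out of the state, permute once a full rate's worth has been emitted, and copy the final partial block with rep movsb (the .byte 0xf3,0xa4 at L$tail_squeeze).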
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s
index 7026de0e76..b2009fb28f 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s
@@ -6,17 +6,22 @@
.p2align 5
_sha1_multi_block:
+
movq _OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
testl $268435456,%ecx
jnz _avx_shortcut
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
subq $288,%rsp
andq $-256,%rsp
movq %rax,272(%rsp)
+
L$body:
leaq K_XX_XX(%rip),%rbp
leaq 256(%rsp),%rbx
@@ -2546,19 +2551,28 @@ L$oop:
L$done:
movq 272(%rsp),%rax
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$epilogue:
.byte 0xf3,0xc3
+
.p2align 5
sha1_multi_block_shaext:
+
_shaext_shortcut:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
subq $288,%rsp
shll $1,%edx
andq $-256,%rsp
@@ -2914,14 +2928,19 @@ L$oop_shaext:
L$done_shaext:
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$epilogue_shaext:
.byte 0xf3,0xc3
+
.p2align 5
sha1_multi_block_avx:
+
_avx_shortcut:
shrq $32,%rcx
cmpl $2,%edx
@@ -2932,11 +2951,15 @@ _avx_shortcut:
.p2align 5
L$avx:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
subq $288,%rsp
andq $-256,%rsp
movq %rax,272(%rsp)
+
L$body_avx:
leaq K_XX_XX(%rip),%rbp
leaq 256(%rsp),%rbx
@@ -4986,27 +5009,41 @@ L$oop_avx:
L$done_avx:
movq 272(%rsp),%rax
+
vzeroupper
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$epilogue_avx:
.byte 0xf3,0xc3
+
.p2align 5
sha1_multi_block_avx2:
+
_avx2_shortcut:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $576,%rsp
andq $-256,%rsp
movq %rax,544(%rsp)
+
L$body_avx2:
leaq K_XX_XX(%rip),%rbp
shrl $1,%edx
@@ -7193,18 +7230,27 @@ L$oop_avx2:
L$done_avx2:
movq 544(%rsp),%rax
+
vzeroupper
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$epilogue_avx2:
.byte 0xf3,0xc3
+
.p2align 8
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s
index 3e3633911f..02472d0b7d 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s
@@ -5,6 +5,7 @@
.p2align 4
_sha1_block_data_order:
+
movl _OPENSSL_ia32cap_P+0(%rip),%r9d
movl _OPENSSL_ia32cap_P+4(%rip),%r8d
movl _OPENSSL_ia32cap_P+8(%rip),%r10d
@@ -25,17 +26,24 @@ _sha1_block_data_order:
.p2align 4
L$ialu:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
movq %rdi,%r8
subq $72,%rsp
movq %rsi,%r9
andq $-64,%rsp
movq %rdx,%r10
movq %rax,64(%rsp)
+
L$prologue:
movl 0(%r8),%esi
@@ -1230,19 +1238,28 @@ L$loop:
jnz L$loop
movq 64(%rsp),%rsi
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$epilogue:
.byte 0xf3,0xc3
+
.p2align 5
sha1_block_data_order_shaext:
_shaext_shortcut:
+
movdqu (%rdi),%xmm0
movd 16(%rdi),%xmm1
movdqa K_XX_XX+160(%rip),%xmm3
@@ -1404,20 +1421,27 @@ L$oop_shaext:
pshufd $27,%xmm1,%xmm1
movdqu %xmm0,(%rdi)
movd %xmm1,16(%rdi)
+
.byte 0xf3,0xc3
.p2align 4
sha1_block_data_order_ssse3:
_ssse3_shortcut:
- movq %rsp,%rax
+
+ movq %rsp,%r11
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
leaq -64(%rsp),%rsp
- movq %rax,%r14
andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
@@ -1425,7 +1449,7 @@ _ssse3_shortcut:
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r14
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -1437,8 +1461,8 @@ _ssse3_shortcut:
xorl %edx,%edi
andl %edi,%esi
- movdqa 64(%r11),%xmm6
- movdqa -64(%r11),%xmm9
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -1514,7 +1538,7 @@ L$oop_ssse3:
pslld $2,%xmm9
pxor %xmm10,%xmm4
xorl %ebp,%edx
- movdqa -64(%r11),%xmm10
+ movdqa -64(%r14),%xmm10
roll $5,%ecx
addl %edi,%ebx
andl %edx,%esi
@@ -1575,7 +1599,7 @@ L$oop_ssse3:
pslld $2,%xmm10
pxor %xmm8,%xmm5
xorl %eax,%ebp
- movdqa -32(%r11),%xmm8
+ movdqa -32(%r14),%xmm8
roll $5,%edx
addl %edi,%ecx
andl %ebp,%esi
@@ -1636,7 +1660,7 @@ L$oop_ssse3:
pslld $2,%xmm8
pxor %xmm9,%xmm6
xorl %ebx,%eax
- movdqa -32(%r11),%xmm9
+ movdqa -32(%r14),%xmm9
roll $5,%ebp
addl %edi,%edx
andl %eax,%esi
@@ -1697,7 +1721,7 @@ L$oop_ssse3:
pslld $2,%xmm9
pxor %xmm10,%xmm7
xorl %ecx,%ebx
- movdqa -32(%r11),%xmm10
+ movdqa -32(%r14),%xmm10
roll $5,%eax
addl %edi,%ebp
andl %ebx,%esi
@@ -1808,7 +1832,7 @@ L$oop_ssse3:
pxor %xmm3,%xmm2
addl %esi,%eax
xorl %edx,%edi
- movdqa 0(%r11),%xmm10
+ movdqa 0(%r14),%xmm10
rorl $7,%ecx
paddd %xmm1,%xmm9
addl %ebx,%eax
@@ -2043,7 +2067,7 @@ L$oop_ssse3:
pxor %xmm0,%xmm7
roll $5,%ebx
addl %esi,%eax
- movdqa 32(%r11),%xmm9
+ movdqa 32(%r14),%xmm9
xorl %ecx,%edi
paddd %xmm6,%xmm8
xorl %edx,%ecx
@@ -2334,8 +2358,8 @@ L$oop_ssse3:
addl %edx,%ecx
cmpq %r10,%r9
je L$done_ssse3
- movdqa 64(%r11),%xmm6
- movdqa -64(%r11),%xmm9
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -2572,29 +2596,41 @@ L$done_ssse3:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
+ movq -40(%r11),%r14
+
+ movq -32(%r11),%r13
+
+ movq -24(%r11),%r12
+
+ movq -16(%r11),%rbp
+
+ movq -8(%r11),%rbx
+
+ leaq (%r11),%rsp
+
L$epilogue_ssse3:
.byte 0xf3,0xc3
+
.p2align 4
sha1_block_data_order_avx:
_avx_shortcut:
- movq %rsp,%rax
+
+ movq %rsp,%r11
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
leaq -64(%rsp),%rsp
vzeroupper
- movq %rax,%r14
andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
@@ -2602,7 +2638,7 @@ _avx_shortcut:
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r14
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -2614,8 +2650,8 @@ _avx_shortcut:
xorl %edx,%edi
andl %edi,%esi
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
vmovdqu 0(%r9),%xmm0
vmovdqu 16(%r9),%xmm1
vmovdqu 32(%r9),%xmm2
@@ -2740,7 +2776,7 @@ L$oop_avx:
vpxor %xmm10,%xmm5,%xmm5
xorl %eax,%ebp
shldl $5,%edx,%edx
- vmovdqa -32(%r11),%xmm11
+ vmovdqa -32(%r14),%xmm11
addl %edi,%ecx
andl %ebp,%esi
xorl %eax,%ebp
@@ -2953,7 +2989,7 @@ L$oop_avx:
addl %esi,%eax
xorl %edx,%edi
vpaddd %xmm1,%xmm11,%xmm9
- vmovdqa 0(%r11),%xmm11
+ vmovdqa 0(%r14),%xmm11
shrdl $7,%ecx,%ecx
addl %ebx,%eax
vpxor %xmm8,%xmm2,%xmm2
@@ -3172,7 +3208,7 @@ L$oop_avx:
movl %ebx,%edi
xorl %edx,%esi
vpaddd %xmm6,%xmm11,%xmm9
- vmovdqa 32(%r11),%xmm11
+ vmovdqa 32(%r14),%xmm11
shldl $5,%ebx,%ebx
addl %esi,%eax
vpxor %xmm8,%xmm7,%xmm7
@@ -3451,8 +3487,8 @@ L$oop_avx:
addl %edx,%ecx
cmpq %r10,%r9
je L$done_avx
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
vmovdqu 0(%r9),%xmm0
vmovdqu 16(%r9),%xmm1
vmovdqu 32(%r9),%xmm2
@@ -3688,28 +3724,40 @@ L$done_avx:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
+ movq -40(%r11),%r14
+
+ movq -32(%r11),%r13
+
+ movq -24(%r11),%r12
+
+ movq -16(%r11),%rbp
+
+ movq -8(%r11),%rbx
+
+ leaq (%r11),%rsp
+
L$epilogue_avx:
.byte 0xf3,0xc3
+
.p2align 4
sha1_block_data_order_avx2:
_avx2_shortcut:
- movq %rsp,%rax
+
+ movq %rsp,%r11
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
vzeroupper
- movq %rax,%r14
movq %rdi,%r8
movq %rsi,%r9
movq %rdx,%r10
@@ -3719,7 +3767,7 @@ _avx2_shortcut:
leaq 64(%r9),%r13
andq $-128,%rsp
addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r14
movl 0(%r8),%eax
cmpq %r10,%r13
@@ -3728,7 +3776,7 @@ _avx2_shortcut:
movl 8(%r8),%ecx
movl 12(%r8),%edx
movl 16(%r8),%esi
- vmovdqu 64(%r11),%ymm6
+ vmovdqu 64(%r14),%ymm6
vmovdqu (%r9),%xmm0
vmovdqu 16(%r9),%xmm1
@@ -3742,7 +3790,7 @@ _avx2_shortcut:
vpshufb %ymm6,%ymm1,%ymm1
vinserti128 $1,48(%r13),%ymm3,%ymm3
vpshufb %ymm6,%ymm2,%ymm2
- vmovdqu -64(%r11),%ymm11
+ vmovdqu -64(%r14),%ymm11
vpshufb %ymm6,%ymm3,%ymm3
vpaddd %ymm11,%ymm0,%ymm4
@@ -3774,7 +3822,7 @@ _avx2_shortcut:
vpxor %ymm3,%ymm8,%ymm8
vpxor %ymm8,%ymm5,%ymm5
vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r11),%ymm11
+ vmovdqu -32(%r14),%ymm11
vpslldq $12,%ymm5,%ymm10
vpaddd %ymm5,%ymm5,%ymm5
vpsrld $30,%ymm10,%ymm9
@@ -3928,7 +3976,7 @@ L$align32_1:
addl -56(%r13),%ebp
andnl %esi,%ebx,%edi
vpxor %ymm3,%ymm2,%ymm2
- vmovdqu 0(%r11),%ymm11
+ vmovdqu 0(%r14),%ymm11
addl %ecx,%ebp
rorxl $27,%ebx,%r12d
rorxl $2,%ebx,%ecx
@@ -4159,7 +4207,7 @@ L$align32_1:
addl -116(%r13),%eax
leal (%rax,%rbx,1),%eax
vpxor %ymm0,%ymm7,%ymm7
- vmovdqu 32(%r11),%ymm11
+ vmovdqu 32(%r14),%ymm11
rorxl $27,%ebp,%r12d
rorxl $2,%ebp,%ebx
xorl %ecx,%ebp
@@ -4604,7 +4652,7 @@ L$align32_2:
cmpq %r10,%r9
je L$done_avx2
- vmovdqu 64(%r11),%ymm6
+ vmovdqu 64(%r14),%ymm6
cmpq %r10,%rdi
ja L$ast_avx2
@@ -4820,7 +4868,7 @@ L$ast_avx2:
xorl %ebx,%eax
addl %r12d,%esi
xorl %ecx,%eax
- vmovdqu -64(%r11),%ymm11
+ vmovdqu -64(%r14),%ymm11
vpshufb %ymm6,%ymm0,%ymm0
addl 68(%r13),%edx
leal (%rdx,%rax,1),%edx
@@ -5176,7 +5224,7 @@ L$align32_3:
xorl %ebp,%esi
addl %r12d,%edx
vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r11),%ymm11
+ vmovdqu -32(%r14),%ymm11
xorl %ebx,%esi
addl 104(%r13),%ecx
leal (%rcx,%rsi,1),%ecx
@@ -5369,16 +5417,22 @@ L$align32_3:
L$done_avx2:
vzeroupper
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
+ movq -40(%r11),%r14
+
+ movq -32(%r11),%r13
+
+ movq -24(%r11),%r12
+
+ movq -16(%r11),%rbp
+
+ movq -8(%r11),%rbx
+
+ leaq (%r11),%rsp
+
L$epilogue_avx2:
.byte 0xf3,0xc3
+
.p2align 6
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s
index 95e0e774af..bab9a565a2 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s
@@ -6,17 +6,22 @@
.p2align 5
_sha256_multi_block:
+
movq _OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
testl $268435456,%ecx
jnz _avx_shortcut
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
subq $288,%rsp
andq $-256,%rsp
movq %rax,272(%rsp)
+
L$body:
leaq K256+128(%rip),%rbp
leaq 256(%rsp),%rbx
@@ -2615,19 +2620,28 @@ L$oop_16_xx:
L$done:
movq 272(%rsp),%rax
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$epilogue:
.byte 0xf3,0xc3
+
.p2align 5
sha256_multi_block_shaext:
+
_shaext_shortcut:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
subq $288,%rsp
shll $1,%edx
andq $-256,%rsp
@@ -3102,14 +3116,19 @@ L$oop_shaext:
L$done_shaext:
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$epilogue_shaext:
.byte 0xf3,0xc3
+
.p2align 5
sha256_multi_block_avx:
+
_avx_shortcut:
shrq $32,%rcx
cmpl $2,%edx
@@ -3120,11 +3139,15 @@ _avx_shortcut:
.p2align 5
L$avx:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
subq $288,%rsp
andq $-256,%rsp
movq %rax,272(%rsp)
+
L$body_avx:
leaq K256+128(%rip),%rbp
leaq 256(%rsp),%rbx
@@ -5353,27 +5376,41 @@ L$oop_16_xx_avx:
L$done_avx:
movq 272(%rsp),%rax
+
vzeroupper
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$epilogue_avx:
.byte 0xf3,0xc3
+
.p2align 5
sha256_multi_block_avx2:
+
_avx2_shortcut:
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+
subq $576,%rsp
andq $-256,%rsp
movq %rax,544(%rsp)
+
L$body_avx2:
leaq K256+128(%rip),%rbp
leaq 128(%rdi),%rdi
@@ -7738,17 +7775,26 @@ L$oop_16_xx_avx2:
L$done_avx2:
movq 544(%rsp),%rax
+
vzeroupper
movq -48(%rax),%r15
+
movq -40(%rax),%r14
+
movq -32(%rax),%r13
+
movq -24(%rax),%r12
+
movq -16(%rax),%rbp
+
movq -8(%rax),%rbx
+
leaq (%rax),%rsp
+
L$epilogue_avx2:
.byte 0xf3,0xc3
+
.p2align 8
K256:
.long 1116352408,1116352408,1116352408,1116352408
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s
index 05e973612b..e43cdd7040 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s
@@ -5,6 +5,7 @@
.p2align 4
_sha256_block_data_order:
+
leaq _OPENSSL_ia32cap_P(%rip),%r11
movl 0(%r11),%r9d
movl 4(%r11),%r10d
@@ -21,13 +22,20 @@ _sha256_block_data_order:
je L$avx_shortcut
testl $512,%r10d
jnz L$ssse3_shortcut
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
shlq $4,%rdx
subq $64+32,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -35,7 +43,8 @@ _sha256_block_data_order:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,88(%rsp)
+
L$prologue:
movl 0(%rdi),%eax
@@ -1699,17 +1708,26 @@ L$rounds_16_xx:
movl %r11d,28(%rdi)
jb L$loop
- movq 64+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq 88(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue:
.byte 0xf3,0xc3
+
.p2align 6
K256:
@@ -1963,14 +1981,22 @@ L$oop_shaext:
.p2align 6
sha256_block_data_order_ssse3:
+
L$ssse3_shortcut:
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
shlq $4,%rdx
subq $96,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -1978,7 +2004,8 @@ L$ssse3_shortcut:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,88(%rsp)
+
L$prologue_ssse3:
movl 0(%rdi),%eax
@@ -3044,28 +3071,45 @@ L$ssse3_00_47:
movl %r11d,28(%rdi)
jb L$loop_ssse3
- movq 64+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq 88(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_ssse3:
.byte 0xf3,0xc3
+
.p2align 6
sha256_block_data_order_avx:
+
L$avx_shortcut:
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
shlq $4,%rdx
subq $96,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -3073,7 +3117,8 @@ L$avx_shortcut:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,88(%rsp)
+
L$prologue_avx:
vzeroupper
@@ -4100,29 +4145,46 @@ L$avx_00_47:
movl %r11d,28(%rdi)
jb L$loop_avx
- movq 64+24(%rsp),%rsi
+ movq 88(%rsp),%rsi
+
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_avx:
.byte 0xf3,0xc3
+
.p2align 6
sha256_block_data_order_avx2:
+
L$avx2_shortcut:
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
subq $544,%rsp
shlq $4,%rdx
andq $-1024,%rsp
@@ -4131,7 +4193,8 @@ L$avx2_shortcut:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,88(%rsp)
+
L$prologue_avx2:
vzeroupper
@@ -5344,15 +5407,24 @@ L$ower_avx2:
L$done_avx2:
leaq (%rbp),%rsp
- movq 64+24(%rsp),%rsi
+ movq 88(%rsp),%rsi
+
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_avx2:
.byte 0xf3,0xc3
+
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s
index 234616bc3b..51ace9a686 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s
@@ -5,6 +5,7 @@
.p2align 4
_sha512_block_data_order:
+
leaq _OPENSSL_ia32cap_P(%rip),%r11
movl 0(%r11),%r9d
movl 4(%r11),%r10d
@@ -19,13 +20,20 @@ _sha512_block_data_order:
orl %r9d,%r10d
cmpl $1342177792,%r10d
je L$avx_shortcut
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
shlq $4,%rdx
subq $128+32,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -33,7 +41,8 @@ _sha512_block_data_order:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,152(%rsp)
+
L$prologue:
movq 0(%rdi),%rax
@@ -1697,17 +1706,26 @@ L$rounds_16_xx:
movq %r11,56(%rdi)
jb L$loop
- movq 128+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq 152(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue:
.byte 0xf3,0xc3
+
.p2align 6
K512:
@@ -1798,14 +1816,22 @@ K512:
.p2align 6
sha512_block_data_order_xop:
+
L$xop_shortcut:
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
shlq $4,%rdx
subq $160,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -1813,7 +1839,8 @@ L$xop_shortcut:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,152(%rsp)
+
L$prologue_xop:
vzeroupper
@@ -2866,29 +2893,46 @@ L$xop_00_47:
movq %r11,56(%rdi)
jb L$loop_xop
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_xop:
.byte 0xf3,0xc3
+
.p2align 6
sha512_block_data_order_avx:
+
L$avx_shortcut:
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
shlq $4,%rdx
subq $160,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -2896,7 +2940,8 @@ L$avx_shortcut:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,152(%rsp)
+
L$prologue_avx:
vzeroupper
@@ -4013,29 +4058,46 @@ L$avx_00_47:
movq %r11,56(%rdi)
jb L$loop_avx
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_avx:
.byte 0xf3,0xc3
+
.p2align 6
sha512_block_data_order_avx2:
+
L$avx2_shortcut:
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
subq $1312,%rsp
shlq $4,%rdx
andq $-2048,%rsp
@@ -4044,7 +4106,8 @@ L$avx2_shortcut:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,152(%rsp)
+
L$prologue_avx2:
vzeroupper
@@ -5351,15 +5414,24 @@ L$ower_avx2:
L$done_avx2:
leaq (%rbp),%rsp
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue_avx2:
.byte 0xf3,0xc3
+
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s
index 4057ba32ac..2c94c14b93 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s
@@ -4,14 +4,22 @@
.p2align 4
_whirlpool_block:
+
+ movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movq %rsp,%r11
+
subq $128+40,%rsp
andq $-64,%rsp
@@ -19,7 +27,8 @@ _whirlpool_block:
movq %rdi,0(%r10)
movq %rsi,8(%r10)
movq %rdx,16(%r10)
- movq %r11,32(%r10)
+ movq %rax,32(%r10)
+
L$prologue:
movq %r10,%rbx
@@ -579,17 +588,26 @@ L$roundsdone:
jmp L$outerloop
L$alldone:
movq 32(%rbx),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$epilogue:
.byte 0xf3,0xc3
+
.p2align 6
L$table:
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s
index 8f16835f71..05afede678 100644
--- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s
+++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s
@@ -37,10 +37,12 @@ _OPENSSL_rdtsc:
.p2align 4
_OPENSSL_ia32_cpuid:
+
movq %rbx,%r8
+
xorl %eax,%eax
- movl %eax,8(%rdi)
+ movq %rax,8(%rdi)
cpuid
movl %eax,%r11d
@@ -111,6 +113,7 @@ L$intel:
L$nocacheinfo:
movl $1,%eax
cpuid
+ movd %eax,%xmm0
andl $0xbfefffff,%edx
cmpl $0,%r9d
jne L$notintel
@@ -158,28 +161,47 @@ L$generic:
jc L$notknights
andl $0xfff7ffff,%ebx
L$notknights:
+ movd %xmm0,%eax
+ andl $0x0fff0ff0,%eax
+ cmpl $0x00050650,%eax
+ jne L$notskylakex
+ andl $0xfffeffff,%ebx
+
+L$notskylakex:
movl %ebx,8(%rdi)
+ movl %ecx,12(%rdi)
L$no_extended_info:
btl $27,%r9d
jnc L$clear_avx
xorl %ecx,%ecx
.byte 0x0f,0x01,0xd0
+ andl $0xe6,%eax
+ cmpl $0xe6,%eax
+ je L$done
+ andl $0x3fdeffff,8(%rdi)
+
+
+
+
andl $6,%eax
cmpl $6,%eax
je L$done
L$clear_avx:
movl $0xefffe7ff,%eax
andl %eax,%r9d
- andl $0xffffffdf,8(%rdi)
+ movl $0x3fdeffdf,%eax
+ andl %eax,8(%rdi)
L$done:
shlq $32,%r9
movl %r10d,%eax
movq %r8,%rbx
+
orq %r9,%rax
.byte 0xf3,0xc3
+
.globl _OPENSSL_cleanse
.p2align 4
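The added L$notskylakex check re-reads the CPUID(1) signature stashed in %xmm0 a few hunks earlier, masks off the stepping nibble, and, when the part reports family 6 / model 0x55 (the Skylake-X/SP server parts), clears bit 16 of the leaf-7 %ebx flags before they are stored at 8(%rdi), hiding one AVX-512 feature bit from the rest of the library. A small C rendering of just the signature test; the helper is hypothetical, and only the mask and constant come from the assembly:

    #include <stdint.h>

    static int is_skylake_x(uint32_t cpuid1_eax)
    {
        /* 0x0fff0ff0 keeps extended family/model plus family/model and
         * drops the stepping bits, as the andl above does. */
        uint32_t sig = cpuid1_eax & 0x0fff0ff0;

        /* family 6 -> bits 11:8, model 0x55 -> model nibble 5 with
         * extended-model 5, which packs to 0x00050650 */
        return sig == 0x00050650;
    }

For example, a raw signature of 0x00050654 (a commonly reported Skylake-SP stepping) passes the test, while 0x000306c3 (a Haswell desktop part) does not.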
@@ -223,6 +245,18 @@ _CRYPTO_memcmp:
xorq %r10,%r10
cmpq $0,%rdx
je L$no_data
+ cmpq $16,%rdx
+ jne L$oop_cmp
+ movq (%rdi),%r10
+ movq 8(%rdi),%r11
+ movq $1,%rdx
+ xorq (%rsi),%r10
+ xorq 8(%rsi),%r11
+ orq %r11,%r10
+ cmovnzq %rdx,%rax
+ .byte 0xf3,0xc3
+
+.p2align 4
L$oop_cmp:
movb (%rdi),%r10b
leaq 1(%rdi),%rdi
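The inserted block gives CRYPTO_memcmp a fast path for exactly 16-byte inputs (the usual tag length): XOR the two 8-byte halves, OR the differences together, and let cmovnzq collapse "anything differs" into a return value of 1 while staying branch-free. A C-level sketch of that comparison under a hypothetical helper name:

    #include <stdint.h>
    #include <string.h>

    /* returns 0 if the two 16-byte blocks are equal, 1 otherwise,
     * touching every byte regardless of where they differ */
    static int memcmp16_ct(const void *a, const void *b)
    {
        uint64_t a0, a1, b0, b1;

        memcpy(&a0, a, 8);                      /* movq (%rdi),%r10  */
        memcpy(&a1, (const char *)a + 8, 8);    /* movq 8(%rdi),%r11 */
        memcpy(&b0, b, 8);
        memcpy(&b1, (const char *)b + 8, 8);

        a0 ^= b0;                               /* xorq (%rsi),%r10  */
        a1 ^= b1;                               /* xorq 8(%rsi),%r11 */

        return (a0 | a1) != 0;                  /* orq + cmovnzq     */
    }

Inputs of any other length still fall through to the original byte-at-a-time L$oop_cmp loop.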
@@ -346,21 +380,6 @@ L$done2:
subq %rcx,%rax
.byte 0xf3,0xc3
-.globl _OPENSSL_ia32_rdrand
-
-.p2align 4
-_OPENSSL_ia32_rdrand:
- movl $8,%ecx
-L$oop_rdrand:
-.byte 72,15,199,240
- jc L$break_rdrand
- loop L$oop_rdrand
-L$break_rdrand:
- cmpq $0,%rax
- cmoveq %rcx,%rax
- .byte 0xf3,0xc3
-
-
.globl _OPENSSL_ia32_rdrand_bytes
.p2align 4
@@ -394,28 +413,14 @@ L$tail_rdrand_bytes:
movb %r10b,(%rdi)
leaq 1(%rdi),%rdi
incq %rax
- shrq $8,%r8
+ shrq $8,%r10
decq %rsi
jnz L$tail_rdrand_bytes
L$done_rdrand_bytes:
+ xorq %r10,%r10
.byte 0xf3,0xc3
-.globl _OPENSSL_ia32_rdseed
-
-.p2align 4
-_OPENSSL_ia32_rdseed:
- movl $8,%ecx
-L$oop_rdseed:
-.byte 72,15,199,248
- jc L$break_rdseed
- loop L$oop_rdseed
-L$break_rdseed:
- cmpq $0,%rax
- cmoveq %rcx,%rax
- .byte 0xf3,0xc3
-
-
.globl _OPENSSL_ia32_rdseed_bytes
.p2align 4
@@ -449,10 +454,11 @@ L$tail_rdseed_bytes:
movb %r10b,(%rdi)
leaq 1(%rdi),%rdi
incq %rax
- shrq $8,%r8
+ shrq $8,%r10
decq %rsi
jnz L$tail_rdseed_bytes
L$done_rdseed_bytes:
+ xorq %r10,%r10
.byte 0xf3,0xc3
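The last two hunks drop the single-word _OPENSSL_ia32_rdrand/_OPENSSL_ia32_rdseed entry points and touch the tail loops of the *_bytes variants: the shift now operates on %r10, the register the stored byte actually comes from (the old shrq $8,%r8 left %r10 unshifted, so a short tail would have repeated the same low byte), and %r10 is cleared on exit so the unused random bytes do not linger in a register. A sketch of the corrected tail, with the helper name and return convention assumed:

    #include <stddef.h>
    #include <stdint.h>

    /* write the last n (< 8) bytes of a partially consumed random word */
    static size_t store_rand_tail(unsigned char *out, size_t n, uint64_t rnd)
    {
        size_t written = 0;

        while (n--) {
            *out++ = (unsigned char)rnd;   /* movb %r10b,(%rdi) */
            written++;                     /* incq %rax         */
            rnd >>= 8;                     /* shrq $8,%r10      */
        }
        rnd = 0;                           /* xorq %r10,%r10    */
        (void)rnd;
        return written;
    }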