diff options
Diffstat (limited to 'deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto')
31 files changed, 7401 insertions, 575 deletions
diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s index 9a337fb897..72dade4a50 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aes-x86_64.s @@ -332,15 +332,23 @@ L$enc_compact_done: .private_extern _asm_AES_encrypt _asm_AES_encrypt: _AES_encrypt: + + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r10 + leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -350,7 +358,8 @@ _AES_encrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) + L$enc_prologue: movq %rdx,%r15 @@ -377,22 +386,31 @@ L$enc_prologue: movq 16(%rsp),%r9 movq 24(%rsp),%rsi + movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$enc_epilogue: .byte 0xf3,0xc3 + .p2align 4 _x86_64_AES_decrypt: xorl 0(%r15),%eax @@ -779,15 +797,23 @@ L$dec_compact_done: .private_extern _asm_AES_decrypt _asm_AES_decrypt: _AES_decrypt: + + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r10 + leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -797,7 +823,8 @@ _AES_decrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) + L$dec_prologue: movq %rdx,%r15 @@ -826,44 +853,65 @@ L$dec_prologue: movq 16(%rsp),%r9 movq 24(%rsp),%rsi + movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$dec_epilogue: .byte 0xf3,0xc3 + .globl _AES_set_encrypt_key .p2align 4 _AES_set_encrypt_key: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8,%rsp + L$enc_key_prologue: call _x86_64_AES_set_encrypt_key movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + addq $56,%rsp + L$enc_key_epilogue: .byte 0xf3,0xc3 + .p2align 4 _x86_64_AES_set_encrypt_key: movl %esi,%ecx @@ -1106,13 +1154,21 @@ L$exit: .p2align 4 _AES_set_decrypt_key: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdx + L$dec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1280,15 +1336,23 @@ L$permute: xorq %rax,%rax L$abort: movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + addq $56,%rsp + L$dec_key_epilogue: .byte 0xf3,0xc3 + .globl _AES_cbc_encrypt .p2align 4 @@ -1297,25 +1361,32 @@ L$dec_key_epilogue: .private_extern _asm_AES_cbc_encrypt _asm_AES_cbc_encrypt: _AES_cbc_encrypt: + cmpq $0,%rdx je L$cbc_epilogue pushfq + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$cbc_prologue: cld movl %r9d,%r9d leaq L$AES_Te(%rip),%r14 + leaq L$AES_Td(%rip),%r10 cmpq $0,%r9 - jne L$cbc_picked_te - leaq L$AES_Td(%rip),%r14 -L$cbc_picked_te: + cmoveq %r10,%r14 movl _OPENSSL_ia32cap_P(%rip),%r10d cmpq $512,%rdx @@ -1352,7 +1423,9 @@ L$cbc_te_ok: xchgq %rsp,%r15 + movq %r15,16(%rsp) + L$cbc_fast_body: movq %rdi,24(%rsp) movq %rsi,32(%rsp) @@ -1734,18 +1807,28 @@ L$cbc_slow_dec_partial: .p2align 4 L$cbc_exit: movq 16(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp + L$cbc_popfq: popfq + L$cbc_epilogue: .byte 0xf3,0xc3 + .p2align 6 L$AES_Te: .long 0xa56363c6,0xa56363c6 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s index 75ce16175c..8f97b853a7 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-mb-x86_64.s @@ -6,6 +6,7 @@ .p2align 5 _aesni_multi_cbc_encrypt: + cmpl $2,%edx jb L$enc_non_avx movl _OPENSSL_ia32cap_P+4(%rip),%ecx @@ -15,11 +16,17 @@ _aesni_multi_cbc_encrypt: .p2align 4 L$enc_non_avx: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 @@ -27,10 +34,12 @@ L$enc_non_avx: + subq $48,%rsp andq $-64,%rsp movq %rax,16(%rsp) + L$enc4x_body: movdqu (%rsi),%xmm12 leaq 120(%rsi),%rsi @@ -239,6 +248,7 @@ L$enc4x_tail: jnz L$oop_enc4x movq 16(%rsp),%rax + movl 24(%rsp),%edx @@ -256,20 +266,29 @@ L$enc4x_tail: L$enc4x_done: movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$enc4x_epilogue: .byte 0xf3,0xc3 + .globl _aesni_multi_cbc_decrypt .p2align 5 _aesni_multi_cbc_decrypt: + cmpl $2,%edx jb L$dec_non_avx movl _OPENSSL_ia32cap_P+4(%rip),%ecx @@ -279,11 +298,17 @@ _aesni_multi_cbc_decrypt: .p2align 4 L$dec_non_avx: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 @@ -291,10 +316,12 @@ L$dec_non_avx: + subq $48,%rsp andq $-64,%rsp movq %rax,16(%rsp) + L$dec4x_body: movdqu (%rsi),%xmm12 leaq 120(%rsi),%rsi @@ -503,6 +530,7 @@ L$dec4x_tail: jnz L$oop_dec4x movq 16(%rsp),%rax + movl 24(%rsp),%edx leaq 160(%rdi),%rdi @@ -511,25 +539,40 @@ L$dec4x_tail: L$dec4x_done: movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$dec4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 aesni_multi_cbc_encrypt_avx: + _avx_cbc_enc_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 @@ -539,10 +582,12 @@ _avx_cbc_enc_shortcut: + subq $192,%rsp andq $-128,%rsp movq %rax,16(%rsp) + L$enc8x_body: vzeroupper vmovdqu (%rsi),%xmm15 @@ -944,29 +989,45 @@ L$enc8x_tail: + L$enc8x_done: vzeroupper movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$enc8x_epilogue: .byte 0xf3,0xc3 + .p2align 5 aesni_multi_cbc_decrypt_avx: + _avx_cbc_dec_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 @@ -977,11 +1038,13 @@ _avx_cbc_dec_shortcut: + subq $256,%rsp andq $-256,%rsp subq $192,%rsp movq %rax,16(%rsp) + L$dec8x_body: vzeroupper vmovdqu (%rsi),%xmm15 @@ -1421,15 +1484,24 @@ L$dec8x_tail: + L$dec8x_done: vzeroupper movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$dec8x_epilogue: .byte 0xf3,0xc3 + diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s index b14cf7691a..aed8f3a345 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha1-x86_64.s @@ -21,18 +21,26 @@ _aesni_cbc_sha1_enc: .p2align 5 aesni_cbc_sha1_enc_ssse3: + movq 8(%rsp),%r10 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -104(%rsp),%rsp + movq %rdi,%r12 movq %rsi,%r13 movq %rdx,%r14 @@ -1362,31 +1370,48 @@ L$aesenclast5: movl %ebp,16(%r9) movups %xmm2,(%r8) leaq 104(%rsp),%rsi + movq 0(%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp + L$epilogue_ssse3: .byte 0xf3,0xc3 + .p2align 5 aesni_cbc_sha1_enc_avx: + movq 8(%rsp),%r10 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -104(%rsp),%rsp + vzeroall movq %rdi,%r12 movq %rsi,%r13 @@ -2660,16 +2685,25 @@ L$vaesenclast10: vmovups %xmm12,(%r8) vzeroall leaq 104(%rsp),%rsi + movq 0(%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 6 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s index 08025a0bae..28cf0768ca 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-sha256-x86_64.s @@ -77,15 +77,23 @@ K256: .p2align 6 aesni_cbc_sha256_enc_xop: + L$xop_shortcut: movq 8(%rsp),%r10 + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + subq $128,%rsp andq $-64,%rsp @@ -101,7 +109,8 @@ L$xop_shortcut: movq %r8,64+32(%rsp) movq %r9,64+40(%rsp) movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) + movq %rax,120(%rsp) + L$prologue_xop: vzeroall @@ -1207,31 +1216,48 @@ L$xop_00_47: jb L$loop_xop movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi + movq 120(%rsp),%rsi + vmovdqu %xmm8,(%r8) vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_xop: .byte 0xf3,0xc3 + .p2align 6 aesni_cbc_sha256_enc_avx: + L$avx_shortcut: movq 8(%rsp),%r10 + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + subq $128,%rsp andq $-64,%rsp @@ -1247,7 +1273,8 @@ L$avx_shortcut: movq %r8,64+32(%rsp) movq %r9,64+40(%rsp) movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) + movq %rax,120(%rsp) + L$prologue_avx: vzeroall @@ -2384,31 +2411,48 @@ L$avx_00_47: jb L$loop_avx movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi + movq 120(%rsp),%rsi + vmovdqu %xmm8,(%r8) vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 6 aesni_cbc_sha256_enc_avx2: + L$avx2_shortcut: movq 8(%rsp),%r10 + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + subq $576,%rsp andq $-1024,%rsp addq $448,%rsp @@ -2425,7 +2469,8 @@ L$avx2_shortcut: movq %r8,64+32(%rsp) movq %r9,64+40(%rsp) movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) + movq %rax,120(%rsp) + L$prologue_avx2: vzeroall @@ -3987,20 +4032,29 @@ L$ower_avx2: L$done_avx2: leaq (%rbp),%rsp movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi + movq 120(%rsp),%rsi + vmovdqu %xmm8,(%r8) vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_avx2: .byte 0xf3,0xc3 + .p2align 5 aesni_cbc_sha256_enc_shaext: movq 8(%rsp),%r10 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s index 2c741239ef..8d76a18b85 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/aesni-x86_64.s @@ -995,6 +995,7 @@ L$oop_enc1_6: .p2align 4 _aesni_ctr32_encrypt_blocks: + cmpq $1,%rdx jne L$ctr32_bulk @@ -1024,11 +1025,12 @@ L$oop_enc1_7: .p2align 4 L$ctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 + pushq %rbp + subq $128,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp @@ -1037,7 +1039,7 @@ L$ctr32_bulk: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1053,8 +1055,8 @@ L$ctr32_bulk: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1063,25 +1065,25 @@ L$ctr32_bulk: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d movl _OPENSSL_ia32cap_P+4(%rip),%r10d - xorl %r11d,%r9d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1105,7 +1107,7 @@ L$ctr32_bulk: L$ctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp L$ctr32_loop6 @@ -1116,32 +1118,32 @@ L$ctr32_loop6: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 .byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1202,7 +1204,7 @@ L$ctr32_loop8: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1215,7 +1217,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1229,7 +1231,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1243,7 +1245,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1257,7 +1259,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1271,7 +1273,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1285,7 +1287,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1300,7 +1302,7 @@ L$ctr32_loop8: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1535,7 +1537,7 @@ L$ctr32_loop3: L$ctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 @@ -1559,20 +1561,25 @@ L$ctr32_done: pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + L$ctr32_epilogue: .byte 0xf3,0xc3 + .globl _aesni_xts_encrypt .p2align 4 _aesni_xts_encrypt: - leaq (%rsp),%rax + + leaq (%rsp),%r11 + pushq %rbp + subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1588,7 +1595,7 @@ L$oop_enc1_8: jnz L$oop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1644,9 +1651,9 @@ L$oop_enc1_8: jc L$xts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq L$xts_magic(%rip),%r8 jmp L$xts_enc_grandloop @@ -1671,7 +1678,7 @@ L$xts_enc_grandloop: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1680,7 +1687,7 @@ L$xts_enc_grandloop: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1695,7 +1702,7 @@ L$xts_enc_grandloop: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_enc_loop6 @@ -1727,7 +1734,7 @@ L$xts_enc_loop6: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -1795,10 +1802,10 @@ L$xts_enc_loop6: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -1825,7 +1832,7 @@ L$xts_enc_loop6: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax L$xts_enc_short: @@ -1981,7 +1988,7 @@ L$xts_enc_steal: jnz L$xts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2024,20 +2031,25 @@ L$xts_enc_ret: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + L$xts_enc_epilogue: .byte 0xf3,0xc3 + .globl _aesni_xts_decrypt .p2align 4 _aesni_xts_decrypt: - leaq (%rsp),%rax + + leaq (%rsp),%r11 + pushq %rbp + subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2059,7 +2071,7 @@ L$oop_enc1_11: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2115,9 +2127,9 @@ L$oop_enc1_11: jc L$xts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq L$xts_magic(%rip),%r8 jmp L$xts_dec_grandloop @@ -2142,7 +2154,7 @@ L$xts_dec_grandloop: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2151,7 +2163,7 @@ L$xts_dec_grandloop: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,222,208 @@ -2166,7 +2178,7 @@ L$xts_dec_grandloop: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_dec_loop6 @@ -2198,7 +2210,7 @@ L$xts_dec_loop6: psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2266,10 +2278,10 @@ L$xts_dec_loop6: .byte 102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2296,7 +2308,7 @@ L$xts_dec_loop6: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax L$xts_dec_short: @@ -2453,7 +2465,7 @@ L$xts_dec_done: jz L$xts_dec_ret L$xts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2483,7 +2495,7 @@ L$xts_dec_steal: jnz L$xts_dec_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2526,21 +2538,30 @@ L$xts_dec_ret: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + L$xts_dec_epilogue: .byte 0xf3,0xc3 + .globl _aesni_ocb_encrypt .p2align 5 _aesni_ocb_encrypt: + leaq (%rsp),%rax pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx movq 8+8(%rax),%rbp @@ -2716,16 +2737,26 @@ L$ocb_enc_done: pxor %xmm13,%xmm13 pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 40(%rsp),%rax + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + L$ocb_enc_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ocb_encrypt6: pxor %xmm9,%xmm15 @@ -2935,12 +2966,18 @@ L$ocb_enc_loop1: .p2align 5 _aesni_ocb_decrypt: + leaq (%rsp),%rax pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx movq 8+8(%rax),%rbp @@ -3138,16 +3175,26 @@ L$ocb_dec_done: pxor %xmm13,%xmm13 pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 40(%rsp),%rax + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + L$ocb_dec_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ocb_decrypt6: pxor %xmm9,%xmm15 @@ -3345,6 +3392,7 @@ L$ocb_dec_loop1: .p2align 4 _aesni_cbc_encrypt: + testq %rdx,%rdx jz L$cbc_ret @@ -3437,11 +3485,13 @@ L$oop_dec1_16: jmp L$cbc_ret .p2align 4 L$cbc_decrypt_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 + pushq %rbp + subq $16,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp + movq %rcx,%rbp movups (%r8),%xmm10 movl %r10d,%eax cmpq $0x50,%rdx @@ -3481,7 +3531,7 @@ L$cbc_dec_loop8_enter: pxor %xmm0,%xmm3 movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 - xorq %r11,%r11 + movq $-1,%rbp cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 @@ -3497,10 +3547,10 @@ L$cbc_dec_loop8_enter: .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 + adcq $0,%rbp + andq $128,%rbp .byte 102,68,15,56,222,201 - addq %rdi,%r11 + addq %rdi,%rbp movups 48-112(%rcx),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 @@ -3638,18 +3688,18 @@ L$cbc_dec_done: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 .byte 102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups %xmm2,(%rsi) @@ -3768,7 +3818,7 @@ L$cbc_dec_loop6_enter: pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu %xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3921,16 +3971,21 @@ L$cbc_dec_tail_partial: L$cbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + + leaq (%r11),%rsp + L$cbc_ret: .byte 0xf3,0xc3 + .globl _aesni_set_decrypt_key .p2align 4 _aesni_set_decrypt_key: + .byte 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -3963,7 +4018,9 @@ L$dec_key_inverse: pxor %xmm0,%xmm0 L$dec_key_ret: addq $8,%rsp + .byte 0xf3,0xc3 + L$SEH_end_set_decrypt_key: .globl _aesni_set_encrypt_key @@ -3971,7 +4028,9 @@ L$SEH_end_set_decrypt_key: .p2align 4 _aesni_set_encrypt_key: __aesni_set_encrypt_key: + .byte 0x48,0x83,0xEC,0x08 + movq $-1,%rax testq %rdi,%rdi jz L$enc_key_ret @@ -4264,7 +4323,9 @@ L$enc_key_ret: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp + .byte 0xf3,0xc3 + L$SEH_end_set_encrypt_key: .p2align 4 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s index da5d1b1122..13920e2ace 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/aes/bsaes-x86_64.s @@ -1067,6 +1067,7 @@ L$key_loop: .p2align 4 _bsaes_cbc_encrypt: + cmpl $0,%r9d jne _asm_AES_cbc_encrypt cmpq $128,%rdx @@ -1075,13 +1076,21 @@ _bsaes_cbc_encrypt: movq %rsp,%rax L$cbc_dec_prologue: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -72(%rsp),%rsp + movq %rsp,%rbp + movl 240(%rcx),%eax movq %rdi,%r12 movq %rsi,%r13 @@ -1300,33 +1309,50 @@ L$cbc_dec_bzero: cmpq %rax,%rbp ja L$cbc_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbx + + movq -8(%rax),%rbp + + leaq (%rax),%rsp + L$cbc_dec_epilogue: .byte 0xf3,0xc3 + .globl _bsaes_ctr32_encrypt_blocks .p2align 4 _bsaes_ctr32_encrypt_blocks: + movq %rsp,%rax L$ctr_enc_prologue: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -72(%rsp),%rsp + movq %rsp,%rbp + movdqu (%r8),%xmm0 movl 240(%rcx),%eax movq %rdi,%r12 @@ -1500,32 +1526,49 @@ L$ctr_enc_bzero: cmpq %rax,%rbp ja L$ctr_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbx + + movq -8(%rax),%rbp + + leaq (%rax),%rsp + L$ctr_enc_epilogue: .byte 0xf3,0xc3 + .globl _bsaes_xts_encrypt .p2align 4 _bsaes_xts_encrypt: + movq %rsp,%rax L$xts_enc_prologue: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -72(%rsp),%rsp + movq %rsp,%rbp + movq %rdi,%r12 movq %rsi,%r13 movq %rdx,%r14 @@ -1951,32 +1994,48 @@ L$xts_enc_bzero: cmpq %rax,%rbp ja L$xts_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbx + + movq -8(%rax),%rbp + + leaq (%rax),%rsp + L$xts_enc_epilogue: .byte 0xf3,0xc3 + .globl _bsaes_xts_decrypt .p2align 4 _bsaes_xts_decrypt: + movq %rsp,%rax L$xts_dec_prologue: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -72(%rsp),%rsp + movq %rsp,%rbp movq %rdi,%r12 movq %rsi,%r13 @@ -2429,19 +2488,27 @@ L$xts_dec_bzero: cmpq %rax,%rbp ja L$xts_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbx + + movq -8(%rax),%rbp + + leaq (%rax),%rsp + L$xts_dec_epilogue: .byte 0xf3,0xc3 + .p2align 6 _bsaes_const: L$M0ISR: diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s index 785a35ac91..73aa8b7373 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-avx2.s @@ -4,15 +4,24 @@ .p2align 6 _rsaz_1024_sqr_avx2: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + vzeroupper movq %rax,%rbp + movq %rdx,%r13 subq $832,%rsp movq %r13,%r15 @@ -625,28 +634,46 @@ L$OOP_REDUCE_1024: vzeroall movq %rbp,%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$sqr_1024_epilogue: .byte 0xf3,0xc3 + .globl _rsaz_1024_mul_avx2 .p2align 6 _rsaz_1024_mul_avx2: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rax,%rbp + vzeroall movq %rdx,%r13 subq $64,%rsp @@ -1162,16 +1189,25 @@ L$oop_mul_1024: vzeroupper movq %rbp,%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$mul_1024_epilogue: .byte 0xf3,0xc3 + .globl _rsaz_1024_red2norm_avx2 .p2align 5 @@ -1555,8 +1591,10 @@ L$oop_scatter_1024: .p2align 5 _rsaz_1024_gather5_avx2: + vzeroupper movq %rsp,%r11 + leaq -256(%rsp),%rsp andq $-32,%rsp leaq L$inc(%rip),%r10 @@ -1665,8 +1703,11 @@ L$oop_gather_1024: vmovdqu %ymm0,(%rdi) vzeroupper leaq (%r11),%rsp + .byte 0xf3,0xc3 +L$SEH_end_rsaz_1024_gather5: + .globl _rsaz_avx2_eligible diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s index 7f4a01109e..eab5b54b2c 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/rsaz-x86_64.s @@ -6,14 +6,22 @@ .p2align 5 _rsaz_512_sqr: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $128+24,%rsp + L$sqr_body: movq %rdx,%rbp movq (%rsi),%rdx @@ -658,28 +666,45 @@ L$oop_sqrx: L$sqr_tail: leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$sqr_epilogue: .byte 0xf3,0xc3 + .globl _rsaz_512_mul .p2align 5 _rsaz_512_mul: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $128+24,%rsp + L$mul_body: .byte 102,72,15,110,199 .byte 102,72,15,110,201 @@ -741,28 +766,45 @@ L$mul_tail: call __rsaz_512_subtract leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$mul_epilogue: .byte 0xf3,0xc3 + .globl _rsaz_512_mul_gather4 .p2align 5 _rsaz_512_mul_gather4: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $152,%rsp + L$mul_gather4_body: movd %r9d,%xmm8 movdqa L$inc+16(%rip),%xmm1 @@ -1151,29 +1193,46 @@ L$mul_gather_tail: call __rsaz_512_subtract leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$mul_gather4_epilogue: .byte 0xf3,0xc3 + .globl _rsaz_512_mul_scatter4 .p2align 5 _rsaz_512_mul_scatter4: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movl %r9d,%r9d subq $128+24,%rsp + L$mul_scatter4_body: leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 @@ -1248,28 +1307,45 @@ L$mul_scatter_tail: movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$mul_scatter4_epilogue: .byte 0xf3,0xc3 + .globl _rsaz_512_mul_by_one .p2align 5 _rsaz_512_mul_by_one: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $128+24,%rsp + L$mul_by_one_body: movl _OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp @@ -1312,17 +1388,26 @@ L$by_one_tail: movq %r15,56(%rdi) leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$mul_by_one_epilogue: .byte 0xf3,0xc3 + .p2align 5 __rsaz_512_reduce: movq %r8,%rbx diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s index af1ffdd59b..4137d5990a 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-gf2m.s @@ -3,7 +3,9 @@ .p2align 4 _mul_1x1: + subq $128+8,%rsp + movq $-1,%r9 leaq (%rax,%rax,1),%rsi shrq $3,%r9 @@ -193,16 +195,20 @@ _mul_1x1: xorq %rdi,%rdx addq $128+8,%rsp + .byte 0xf3,0xc3 L$end_mul_1x1: + .globl _bn_GF2m_mul_2x2 .p2align 4 _bn_GF2m_mul_2x2: - movq _OPENSSL_ia32cap_P(%rip),%rax - btq $33,%rax + + movq %rsp,%rax + movq _OPENSSL_ia32cap_P(%rip),%r10 + btq $33,%r10 jnc L$vanilla_mul_2x2 .byte 102,72,15,110,198 @@ -230,11 +236,17 @@ _bn_GF2m_mul_2x2: .p2align 4 L$vanilla_mul_2x2: leaq -136(%rsp),%rsp + movq %r14,80(%rsp) + movq %r13,88(%rsp) + movq %r12,96(%rsp) + movq %rbp,104(%rsp) + movq %rbx,112(%rsp) + L$body_mul_2x2: movq %rdi,32(%rsp) movq %rsi,40(%rsp) @@ -279,13 +291,21 @@ L$body_mul_2x2: movq %rax,8(%rbp) movq 80(%rsp),%r14 + movq 88(%rsp),%r13 + movq 96(%rsp),%r12 + movq 104(%rsp),%rbp + movq 112(%rsp),%rbx + leaq 136(%rsp),%rsp + +L$epilogue_mul_2x2: .byte 0xf3,0xc3 L$end_mul_2x2: + .byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s index dd43da0d86..5abe3696c4 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont.s @@ -6,8 +6,10 @@ .p2align 4 _bn_mul_mont: + movl %r9d,%r9d movq %rsp,%rax + testl $3,%r9d jnz L$mul_enter cmpl $8,%r9d @@ -22,12 +24,18 @@ _bn_mul_mont: .p2align 4 L$mul_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + negq %r9 movq %rsp,%r11 leaq -16(%rsp,%r9,8),%r10 @@ -59,6 +67,7 @@ L$mul_page_walk: L$mul_page_walk_done: movq %rax,8(%rsp,%r9,8) + L$mul_body: movq %rdx,%r12 movq (%r8),%r8 @@ -226,33 +235,50 @@ L$copy: jnz L$copy movq 8(%rsp,%r9,8),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 4 bn_mul4x_mont: + movl %r9d,%r9d movq %rsp,%rax + L$mul4x_enter: andl $0x80100,%r11d cmpl $0x80100,%r11d je L$mulx4x_enter pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + negq %r9 movq %rsp,%r11 leaq -32(%rsp,%r9,8),%r10 @@ -275,6 +301,7 @@ L$mul4x_page_walk: L$mul4x_page_walk_done: movq %rax,8(%rsp,%r9,8) + L$mul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 @@ -642,14 +669,22 @@ L$copy4x: decq %r15 jnz L$copy4x movq 8(%rsp,%r9,8),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mul4x_epilogue: .byte 0xf3,0xc3 @@ -657,16 +692,25 @@ L$mul4x_epilogue: + .p2align 5 bn_sqr8x_mont: + movq %rsp,%rax + L$sqr8x_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$sqr8x_prologue: movl %r9d,%r10d @@ -722,6 +766,7 @@ L$sqr8x_page_walk_done: movq %r8,32(%rsp) movq %rax,40(%rsp) + L$sqr8x_body: .byte 102,72,15,110,209 @@ -787,6 +832,7 @@ L$sqr8x_sub: pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi + jmp L$sqr8x_cond_copy .p2align 5 @@ -816,26 +862,42 @@ L$sqr8x_cond_copy: movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$sqr8x_epilogue: .byte 0xf3,0xc3 + .p2align 5 bn_mulx4x_mont: + movq %rsp,%rax + L$mulx4x_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$mulx4x_prologue: shll $3,%r9d @@ -881,6 +943,7 @@ L$mulx4x_page_walk_done: movq %r8,24(%rsp) movq %rdi,32(%rsp) movq %rax,40(%rsp) + movq %r9,48(%rsp) jmp L$mulx4x_body @@ -1125,6 +1188,7 @@ L$mulx4x_sub: pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi + jmp L$mulx4x_cond_copy .p2align 5 @@ -1154,14 +1218,22 @@ L$mulx4x_cond_copy: movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mulx4x_epilogue: .byte 0xf3,0xc3 + .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s index f415b8d80c..9cb256094b 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/bn/x86_64-mont5.s @@ -6,8 +6,10 @@ .p2align 6 _bn_mul_mont_gather5: + movl %r9d,%r9d movq %rsp,%rax + testl $7,%r9d jnz L$mul_enter movl _OPENSSL_ia32cap_P+8(%rip),%r11d @@ -17,12 +19,18 @@ _bn_mul_mont_gather5: L$mul_enter: movd 8(%rsp),%xmm5 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + negq %r9 movq %rsp,%r11 leaq -280(%rsp,%r9,8),%r10 @@ -54,6 +62,7 @@ L$mul_page_walk_done: leaq L$inc(%rip),%r10 movq %rax,8(%rsp,%r9,8) + L$mul_body: leaq 128(%rdx),%r12 @@ -411,33 +420,50 @@ L$copy: jnz L$copy movq 8(%rsp,%r9,8),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 5 bn_mul4x_mont_gather5: + .byte 0x67 movq %rsp,%rax + L$mul4x_enter: andl $0x80108,%r11d cmpl $0x80108,%r11d je L$mulx4x_enter pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$mul4x_prologue: .byte 0x67 @@ -493,25 +519,35 @@ L$mul4x_page_walk_done: negq %r9 movq %rax,40(%rsp) + L$mul4x_body: call mul4x_internal movq 40(%rsp),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mul4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 mul4x_internal: shlq $5,%r9 @@ -1040,17 +1076,25 @@ L$inner4x: .p2align 5 _bn_power5: + movq %rsp,%rax + movl _OPENSSL_ia32cap_P+8(%rip),%r11d andl $0x80108,%r11d cmpl $0x80108,%r11d je L$powerx5_enter pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$power5_prologue: shll $3,%r9d @@ -1115,6 +1159,7 @@ L$pwr_page_walk_done: movq %r8,32(%rsp) movq %rax,40(%rsp) + L$power5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 @@ -1141,18 +1186,27 @@ L$power5_body: call mul4x_internal movq 40(%rsp),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$power5_epilogue: .byte 0xf3,0xc3 + .globl _bn_sqr8x_internal .private_extern _bn_sqr8x_internal @@ -2001,14 +2055,22 @@ _bn_from_montgomery: .p2align 5 bn_from_mont8x: + .byte 0x67 movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$from_prologue: shll $3,%r9d @@ -2073,6 +2135,7 @@ L$from_page_walk_done: movq %r8,32(%rsp) movq %rax,40(%rsp) + L$from_body: movq %r9,%r11 leaq 48(%rsp),%rax @@ -2114,7 +2177,6 @@ L$mul_by_1: pxor %xmm0,%xmm0 leaq 48(%rsp),%rax - movq 40(%rsp),%rsi jmp L$from_mont_zero .p2align 5 @@ -2124,11 +2186,12 @@ L$from_mont_nox: pxor %xmm0,%xmm0 leaq 48(%rsp),%rax - movq 40(%rsp),%rsi jmp L$from_mont_zero .p2align 5 L$from_mont_zero: + movq 40(%rsp),%rsi + movdqa %xmm0,0(%rax) movdqa %xmm0,16(%rax) movdqa %xmm0,32(%rax) @@ -2139,26 +2202,42 @@ L$from_mont_zero: movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$from_epilogue: .byte 0xf3,0xc3 + .p2align 5 bn_mulx4x_mont_gather5: + movq %rsp,%rax + L$mulx4x_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$mulx4x_prologue: shll $3,%r9d @@ -2224,24 +2303,34 @@ L$mulx4x_page_walk_done: movq %r8,32(%rsp) movq %rax,40(%rsp) + L$mulx4x_body: call mulx4x_internal movq 40(%rsp),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mulx4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 mulx4x_internal: movq %r9,8(%rsp) @@ -2666,14 +2755,22 @@ L$mulx4x_inner: .p2align 5 bn_powerx5: + movq %rsp,%rax + L$powerx5_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$powerx5_prologue: shll $3,%r9d @@ -2745,6 +2842,7 @@ L$pwrx_page_walk_done: .byte 102,72,15,110,226 movq %r8,32(%rsp) movq %rax,40(%rsp) + L$powerx5_body: call __bn_sqrx8x_internal @@ -2767,19 +2865,28 @@ L$powerx5_body: call mulx4x_internal movq 40(%rsp),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$powerx5_epilogue: .byte 0xf3,0xc3 + .globl _bn_sqrx8x_internal .private_extern _bn_sqrx8x_internal diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h index ecb27f8a3a..b6263000db 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/buildinf.h @@ -1,38 +1,47 @@ -/* auto-generated by util/mkbuildinf.pl for crypto/cversion.c */ -#define CFLAGS cflags /* - * Generate CFLAGS as an array of individual characters. This is a + * WARNING: do not edit! + * Generated by util/mkbuildinf.pl + * + * Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#define PLATFORM "platform: darwin64-x86_64-cc" +#define DATE "built on: Thu Nov 22 19:33:06 2018 UTC" + +/* + * Generate compiler_flags as an array of individual characters. This is a * workaround for the situation where CFLAGS gets too long for a C90 string * literal */ -static const char cflags[] = { - 'c','o','m','p','i','l','e','r',':',' ','c','c',' ','-','D','D', - 'S','O','_','D','L','F','C','N',' ','-','D','H','A','V','E','_', - 'D','L','F','C','N','_','H',' ','-','D','N','D','E','B','U','G', - ' ','-','D','O','P','E','N','S','S','L','_','T','H','R','E','A', - 'D','S',' ','-','D','O','P','E','N','S','S','L','_','N','O','_', - 'D','Y','N','A','M','I','C','_','E','N','G','I','N','E',' ','-', - 'D','O','P','E','N','S','S','L','_','P','I','C',' ','-','D','O', - 'P','E','N','S','S','L','_','I','A','3','2','_','S','S','E','2', - ' ','-','D','O','P','E','N','S','S','L','_','B','N','_','A','S', - 'M','_','M','O','N','T',' ','-','D','O','P','E','N','S','S','L', - '_','B','N','_','A','S','M','_','M','O','N','T','5',' ','-','D', - 'O','P','E','N','S','S','L','_','B','N','_','A','S','M','_','G', - 'F','2','m',' ','-','D','S','H','A','1','_','A','S','M',' ','-', - 'D','S','H','A','2','5','6','_','A','S','M',' ','-','D','S','H', - 'A','5','1','2','_','A','S','M',' ','-','D','R','C','4','_','A', - 'S','M',' ','-','D','M','D','5','_','A','S','M',' ','-','D','A', - 'E','S','_','A','S','M',' ','-','D','V','P','A','E','S','_','A', - 'S','M',' ','-','D','B','S','A','E','S','_','A','S','M',' ','-', - 'D','G','H','A','S','H','_','A','S','M',' ','-','D','E','C','P', - '_','N','I','S','T','Z','2','5','6','_','A','S','M',' ','-','D', - 'P','A','D','L','O','C','K','_','A','S','M',' ','-','D','P','O', - 'L','Y','1','3','0','5','_','A','S','M',' ','-','D','O','P','E', - 'N','S','S','L','D','I','R','=','"','\\','"','/','u','s','r','/', - 'l','o','c','a','l','/','s','s','l','\\','"','"',' ','-','D','E', - 'N','G','I','N','E','S','D','I','R','=','"','\\','"','/','u','s', - 'r','/','l','o','c','a','l','/','l','i','b','/','e','n','g','i', - 'n','e','s','-','1','.','1','\\','"','"',' ','\0' +static const char compiler_flags[] = { + 'c','o','m','p','i','l','e','r',':',' ','g','c','c',' ','-','f', + 'P','I','C',' ','-','a','r','c','h',' ','x','8','6','_','6','4', + ' ','-','W','a',',','-','-','n','o','e','x','e','c','s','t','a', + 'c','k',' ','-','O','3',' ','-','W','a','l','l',' ','-','D','L', + '_','E','N','D','I','A','N',' ','-','D','O','P','E','N','S','S', + 'L','_','P','I','C',' ','-','D','O','P','E','N','S','S','L','_', + 'C','P','U','I','D','_','O','B','J',' ','-','D','O','P','E','N', + 'S','S','L','_','I','A','3','2','_','S','S','E','2',' ','-','D', + 'O','P','E','N','S','S','L','_','B','N','_','A','S','M','_','M', + 'O','N','T',' ','-','D','O','P','E','N','S','S','L','_','B','N', + '_','A','S','M','_','M','O','N','T','5',' ','-','D','O','P','E', + 'N','S','S','L','_','B','N','_','A','S','M','_','G','F','2','m', + ' ','-','D','S','H','A','1','_','A','S','M',' ','-','D','S','H', + 'A','2','5','6','_','A','S','M',' ','-','D','S','H','A','5','1', + '2','_','A','S','M',' ','-','D','K','E','C','C','A','K','1','6', + '0','0','_','A','S','M',' ','-','D','R','C','4','_','A','S','M', + ' ','-','D','M','D','5','_','A','S','M',' ','-','D','A','E','S', + '_','A','S','M',' ','-','D','V','P','A','E','S','_','A','S','M', + ' ','-','D','B','S','A','E','S','_','A','S','M',' ','-','D','G', + 'H','A','S','H','_','A','S','M',' ','-','D','E','C','P','_','N', + 'I','S','T','Z','2','5','6','_','A','S','M',' ','-','D','X','2', + '5','5','1','9','_','A','S','M',' ','-','D','P','A','D','L','O', + 'C','K','_','A','S','M',' ','-','D','P','O','L','Y','1','3','0', + '5','_','A','S','M',' ','-','D','_','R','E','E','N','T','R','A', + 'N','T',' ','-','D','N','D','E','B','U','G','\0' }; -#define PLATFORM "platform: darwin64-x86_64-cc" -#define DATE "built on: Tue Nov 20 09:37:39 2018" diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s index 35a3ea550a..2ae924deec 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/camellia/cmll-x86_64.s @@ -17,11 +17,17 @@ _Camellia_EncryptBlock: .p2align 4 L$enc_rounds: _Camellia_EncryptBlock_Rounds: + pushq %rbx + pushq %rbp + pushq %r13 + pushq %r14 + pushq %r15 + L$enc_prologue: @@ -53,16 +59,23 @@ L$enc_prologue: movl %r11d,12(%r13) movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%rbp + movq 32(%rsp),%rbx + leaq 40(%rsp),%rsp + L$enc_epilogue: .byte 0xf3,0xc3 + .p2align 4 _x86_64_Camellia_encrypt: xorl 0(%r14),%r9d @@ -286,11 +299,17 @@ _Camellia_DecryptBlock: .p2align 4 L$dec_rounds: _Camellia_DecryptBlock_Rounds: + pushq %rbx + pushq %rbp + pushq %r13 + pushq %r14 + pushq %r15 + L$dec_prologue: @@ -322,16 +341,23 @@ L$dec_prologue: movl %r11d,12(%r13) movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%rbp + movq 32(%rsp),%rbx + leaq 40(%rsp),%rsp + L$dec_epilogue: .byte 0xf3,0xc3 + .p2align 4 _x86_64_Camellia_decrypt: xorl 0(%r14),%r9d @@ -542,11 +568,17 @@ L$ddone: .p2align 4 _Camellia_Ekeygen: + pushq %rbx + pushq %rbp + pushq %r13 + pushq %r14 + pushq %r15 + L$key_prologue: movl %edi,%r15d @@ -1074,14 +1106,21 @@ L$2nd256: movl $4,%eax L$done: movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%rbp + movq 32(%rsp),%rbx + leaq 40(%rsp),%rsp + L$key_epilogue: .byte 0xf3,0xc3 + .p2align 6 L$Camellia_SIGMA: .long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858 @@ -1605,17 +1644,25 @@ L$Camellia_SBOX: .p2align 4 _Camellia_cbc_encrypt: + cmpq $0,%rdx je L$cbc_abort pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$cbc_prologue: movq %rsp,%rbp + subq $64,%rsp andq $-64,%rsp @@ -1637,6 +1684,7 @@ L$cbc_prologue: movq %r8,40(%rsp) movq %rbp,48(%rsp) + L$cbc_body: leaq L$Camellia_SBOX(%rip),%rbp @@ -1824,15 +1872,24 @@ L$cbc_dec_popf: .p2align 4 L$cbc_done: movq 48(%rsp),%rcx + movq 0(%rcx),%r15 + movq 8(%rcx),%r14 + movq 16(%rcx),%r13 + movq 24(%rcx),%r12 + movq 32(%rcx),%rbp + movq 40(%rcx),%rbx + leaq 48(%rcx),%rsp + L$cbc_abort: .byte 0xf3,0xc3 + .byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54,95,54,52,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s index afd47bdf68..edb6c28e4b 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/chacha/chacha-x86_64.s @@ -19,6 +19,17 @@ L$rot16: .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd L$rot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +L$twoy: +.long 2,0,0,0, 2,0,0,0 +.p2align 6 +L$zeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +L$fourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +L$incz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +L$sixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 L$sigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 @@ -26,20 +37,33 @@ L$sigma: .p2align 6 _ChaCha20_ctr32: + cmpq $0,%rdx je L$no_data movq _OPENSSL_ia32cap_P+4(%rip),%r10 + btq $48,%r10 + jc L$ChaCha20_avx512 + testq %r10,%r10 + js L$ChaCha20_avx512vl testl $512,%r10d jnz L$ChaCha20_ssse3 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $64+24,%rsp +L$ctr32_body: + movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 @@ -276,34 +300,41 @@ L$oop_tail: jnz L$oop_tail L$done: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 64+24+48(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$no_data: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_ssse3: + L$ChaCha20_ssse3: + movq %rsp,%r9 + testl $2048,%r10d jnz L$ChaCha20_4xop cmpq $128,%rdx + je L$ChaCha20_128 ja L$ChaCha20_4x L$do_sse3_after_all: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $64+24,%rsp + subq $64+8,%rsp movdqa L$sigma(%rip),%xmm0 movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 @@ -315,7 +346,7 @@ L$do_sse3_after_all: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - movl $10,%ebp + movq $10,%r8 jmp L$oop_ssse3 .p2align 5 @@ -325,7 +356,7 @@ L$oop_outer_ssse3: movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 - movl $10,%ebp + movq $10,%r8 movdqa %xmm3,48(%rsp) jmp L$oop_ssse3 @@ -374,7 +405,7 @@ L$oop_ssse3: pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 - decl %ebp + decq %r8 jnz L$oop_ssse3 paddd 0(%rsp),%xmm0 paddd 16(%rsp),%xmm1 @@ -411,31 +442,187 @@ L$tail_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - xorq %rbx,%rbx + xorq %r8,%r8 L$oop_tail_ssse3: - movzbl (%rsi,%rbx,1),%eax - movzbl (%rsp,%rbx,1),%ecx - leaq 1(%rbx),%rbx + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 xorl %ecx,%eax - movb %al,-1(%rdi,%rbx,1) + movb %al,-1(%rdi,%r8,1) decq %rdx jnz L$oop_tail_ssse3 L$done_ssse3: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq (%r9),%rsp + +L$ssse3_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +ChaCha20_128: + +L$ChaCha20_128: + movq %rsp,%r9 + + subq $64+8,%rsp + movdqa L$sigma(%rip),%xmm8 + movdqu (%rcx),%xmm9 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa L$one(%rip),%xmm1 + movdqa L$rot16(%rip),%xmm6 + movdqa L$rot24(%rip),%xmm7 + + movdqa %xmm8,%xmm10 + movdqa %xmm8,0(%rsp) + movdqa %xmm9,%xmm11 + movdqa %xmm9,16(%rsp) + movdqa %xmm2,%xmm0 + movdqa %xmm2,32(%rsp) + paddd %xmm3,%xmm1 + movdqa %xmm3,48(%rsp) + movq $10,%r8 + jmp L$oop_128 + +.p2align 5 +L$oop_128: + paddd %xmm9,%xmm8 + pxor %xmm8,%xmm3 + paddd %xmm11,%xmm10 + pxor %xmm10,%xmm1 +.byte 102,15,56,0,222 +.byte 102,15,56,0,206 + paddd %xmm3,%xmm2 + paddd %xmm1,%xmm0 + pxor %xmm2,%xmm9 + pxor %xmm0,%xmm11 + movdqa %xmm9,%xmm4 + psrld $20,%xmm9 + movdqa %xmm11,%xmm5 + pslld $12,%xmm4 + psrld $20,%xmm11 + por %xmm4,%xmm9 + pslld $12,%xmm5 + por %xmm5,%xmm11 + paddd %xmm9,%xmm8 + pxor %xmm8,%xmm3 + paddd %xmm11,%xmm10 + pxor %xmm10,%xmm1 +.byte 102,15,56,0,223 +.byte 102,15,56,0,207 + paddd %xmm3,%xmm2 + paddd %xmm1,%xmm0 + pxor %xmm2,%xmm9 + pxor %xmm0,%xmm11 + movdqa %xmm9,%xmm4 + psrld $25,%xmm9 + movdqa %xmm11,%xmm5 + pslld $7,%xmm4 + psrld $25,%xmm11 + por %xmm4,%xmm9 + pslld $7,%xmm5 + por %xmm5,%xmm11 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm9,%xmm9 + pshufd $147,%xmm3,%xmm3 + pshufd $78,%xmm0,%xmm0 + pshufd $57,%xmm11,%xmm11 + pshufd $147,%xmm1,%xmm1 + paddd %xmm9,%xmm8 + pxor %xmm8,%xmm3 + paddd %xmm11,%xmm10 + pxor %xmm10,%xmm1 +.byte 102,15,56,0,222 +.byte 102,15,56,0,206 + paddd %xmm3,%xmm2 + paddd %xmm1,%xmm0 + pxor %xmm2,%xmm9 + pxor %xmm0,%xmm11 + movdqa %xmm9,%xmm4 + psrld $20,%xmm9 + movdqa %xmm11,%xmm5 + pslld $12,%xmm4 + psrld $20,%xmm11 + por %xmm4,%xmm9 + pslld $12,%xmm5 + por %xmm5,%xmm11 + paddd %xmm9,%xmm8 + pxor %xmm8,%xmm3 + paddd %xmm11,%xmm10 + pxor %xmm10,%xmm1 +.byte 102,15,56,0,223 +.byte 102,15,56,0,207 + paddd %xmm3,%xmm2 + paddd %xmm1,%xmm0 + pxor %xmm2,%xmm9 + pxor %xmm0,%xmm11 + movdqa %xmm9,%xmm4 + psrld $25,%xmm9 + movdqa %xmm11,%xmm5 + pslld $7,%xmm4 + psrld $25,%xmm11 + por %xmm4,%xmm9 + pslld $7,%xmm5 + por %xmm5,%xmm11 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm9,%xmm9 + pshufd $57,%xmm3,%xmm3 + pshufd $78,%xmm0,%xmm0 + pshufd $147,%xmm11,%xmm11 + pshufd $57,%xmm1,%xmm1 + decq %r8 + jnz L$oop_128 + paddd 0(%rsp),%xmm8 + paddd 16(%rsp),%xmm9 + paddd 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + paddd L$one(%rip),%xmm1 + paddd 0(%rsp),%xmm10 + paddd 16(%rsp),%xmm11 + paddd 32(%rsp),%xmm0 + paddd 48(%rsp),%xmm1 + + movdqu 0(%rsi),%xmm4 + movdqu 16(%rsi),%xmm5 + pxor %xmm4,%xmm8 + movdqu 32(%rsi),%xmm4 + pxor %xmm5,%xmm9 + movdqu 48(%rsi),%xmm5 + pxor %xmm4,%xmm2 + movdqu 64(%rsi),%xmm4 + pxor %xmm5,%xmm3 + movdqu 80(%rsi),%xmm5 + pxor %xmm4,%xmm10 + movdqu 96(%rsi),%xmm4 + pxor %xmm5,%xmm11 + movdqu 112(%rsi),%xmm5 + pxor %xmm4,%xmm0 + pxor %xmm5,%xmm1 + + movdqu %xmm8,0(%rdi) + movdqu %xmm9,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + movdqu %xmm10,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm0,96(%rdi) + movdqu %xmm1,112(%rdi) + leaq (%r9),%rsp + +L$128_epilogue: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_4x: + L$ChaCha20_4x: + movq %rsp,%r9 + movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -448,8 +635,7 @@ L$ChaCha20_4x: je L$do_sse3_after_all L$proceed4x: - leaq -120(%rsp),%r11 - subq $0x148+0,%rsp + subq $0x140+8,%rsp movdqa L$sigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 @@ -976,15 +1162,20 @@ L$oop_tail4x: jnz L$oop_tail4x L$done4x: - addq $0x148+0,%rsp + leaq (%r9),%rsp + +L$4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_4xop: + L$ChaCha20_4xop: - leaq -120(%rsp),%r11 - subq $0x148+0,%rsp + movq %rsp,%r9 + + subq $0x140+8,%rsp vzeroupper vmovdqa L$sigma(%rip),%xmm11 @@ -1386,18 +1577,22 @@ L$oop_tail4xop: L$done4xop: vzeroupper - addq $0x148+0,%rsp + leaq (%r9),%rsp + +L$4xop_epilogue: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_8x: + L$ChaCha20_8x: - movq %rsp,%r10 + movq %rsp,%r9 + subq $0x280+8,%rsp andq $-32,%rsp vzeroupper - movq %r10,640(%rsp) @@ -1988,6 +2183,1240 @@ L$oop_tail8x: L$done8x: vzeroall - movq 640(%rsp),%rsp + leaq (%r9),%rsp + +L$8x_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +ChaCha20_avx512: + +L$ChaCha20_avx512: + movq %rsp,%r9 + + cmpq $512,%rdx + ja L$ChaCha20_16x + + subq $64+8,%rsp + vbroadcasti32x4 L$sigma(%rip),%zmm0 + vbroadcasti32x4 (%rcx),%zmm1 + vbroadcasti32x4 16(%rcx),%zmm2 + vbroadcasti32x4 (%r8),%zmm3 + + vmovdqa32 %zmm0,%zmm16 + vmovdqa32 %zmm1,%zmm17 + vmovdqa32 %zmm2,%zmm18 + vpaddd L$zeroz(%rip),%zmm3,%zmm3 + vmovdqa32 L$fourz(%rip),%zmm20 + movq $10,%r8 + vmovdqa32 %zmm3,%zmm19 + jmp L$oop_avx512 + +.p2align 4 +L$oop_outer_avx512: + vmovdqa32 %zmm16,%zmm0 + vmovdqa32 %zmm17,%zmm1 + vmovdqa32 %zmm18,%zmm2 + vpaddd %zmm20,%zmm19,%zmm3 + movq $10,%r8 + vmovdqa32 %zmm3,%zmm19 + jmp L$oop_avx512 + +.p2align 5 +L$oop_avx512: + vpaddd %zmm1,%zmm0,%zmm0 + vpxord %zmm0,%zmm3,%zmm3 + vprold $16,%zmm3,%zmm3 + vpaddd %zmm3,%zmm2,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vprold $12,%zmm1,%zmm1 + vpaddd %zmm1,%zmm0,%zmm0 + vpxord %zmm0,%zmm3,%zmm3 + vprold $8,%zmm3,%zmm3 + vpaddd %zmm3,%zmm2,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vprold $7,%zmm1,%zmm1 + vpshufd $78,%zmm2,%zmm2 + vpshufd $57,%zmm1,%zmm1 + vpshufd $147,%zmm3,%zmm3 + vpaddd %zmm1,%zmm0,%zmm0 + vpxord %zmm0,%zmm3,%zmm3 + vprold $16,%zmm3,%zmm3 + vpaddd %zmm3,%zmm2,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vprold $12,%zmm1,%zmm1 + vpaddd %zmm1,%zmm0,%zmm0 + vpxord %zmm0,%zmm3,%zmm3 + vprold $8,%zmm3,%zmm3 + vpaddd %zmm3,%zmm2,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vprold $7,%zmm1,%zmm1 + vpshufd $78,%zmm2,%zmm2 + vpshufd $147,%zmm1,%zmm1 + vpshufd $57,%zmm3,%zmm3 + decq %r8 + jnz L$oop_avx512 + vpaddd %zmm16,%zmm0,%zmm0 + vpaddd %zmm17,%zmm1,%zmm1 + vpaddd %zmm18,%zmm2,%zmm2 + vpaddd %zmm19,%zmm3,%zmm3 + + subq $64,%rdx + jb L$tail64_avx512 + + vpxor 0(%rsi),%xmm0,%xmm4 + vpxor 16(%rsi),%xmm1,%xmm5 + vpxor 32(%rsi),%xmm2,%xmm6 + vpxor 48(%rsi),%xmm3,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jz L$done_avx512 + + vextracti32x4 $1,%zmm0,%xmm4 + vextracti32x4 $1,%zmm1,%xmm5 + vextracti32x4 $1,%zmm2,%xmm6 + vextracti32x4 $1,%zmm3,%xmm7 + + subq $64,%rdx + jb L$tail_avx512 + + vpxor 0(%rsi),%xmm4,%xmm4 + vpxor 16(%rsi),%xmm5,%xmm5 + vpxor 32(%rsi),%xmm6,%xmm6 + vpxor 48(%rsi),%xmm7,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jz L$done_avx512 + + vextracti32x4 $2,%zmm0,%xmm4 + vextracti32x4 $2,%zmm1,%xmm5 + vextracti32x4 $2,%zmm2,%xmm6 + vextracti32x4 $2,%zmm3,%xmm7 + + subq $64,%rdx + jb L$tail_avx512 + + vpxor 0(%rsi),%xmm4,%xmm4 + vpxor 16(%rsi),%xmm5,%xmm5 + vpxor 32(%rsi),%xmm6,%xmm6 + vpxor 48(%rsi),%xmm7,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jz L$done_avx512 + + vextracti32x4 $3,%zmm0,%xmm4 + vextracti32x4 $3,%zmm1,%xmm5 + vextracti32x4 $3,%zmm2,%xmm6 + vextracti32x4 $3,%zmm3,%xmm7 + + subq $64,%rdx + jb L$tail_avx512 + + vpxor 0(%rsi),%xmm4,%xmm4 + vpxor 16(%rsi),%xmm5,%xmm5 + vpxor 32(%rsi),%xmm6,%xmm6 + vpxor 48(%rsi),%xmm7,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jnz L$oop_outer_avx512 + + jmp L$done_avx512 + +.p2align 4 +L$tail64_avx512: + vmovdqa %xmm0,0(%rsp) + vmovdqa %xmm1,16(%rsp) + vmovdqa %xmm2,32(%rsp) + vmovdqa %xmm3,48(%rsp) + addq $64,%rdx + jmp L$oop_tail_avx512 + +.p2align 4 +L$tail_avx512: + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + vmovdqa %xmm7,48(%rsp) + addq $64,%rdx + +L$oop_tail_avx512: + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 + xorl %ecx,%eax + movb %al,-1(%rdi,%r8,1) + decq %rdx + jnz L$oop_tail_avx512 + + vmovdqu32 %zmm16,0(%rsp) + +L$done_avx512: + vzeroall + leaq (%r9),%rsp + +L$avx512_epilogue: .byte 0xf3,0xc3 + + +.p2align 5 +ChaCha20_avx512vl: + +L$ChaCha20_avx512vl: + movq %rsp,%r9 + + cmpq $128,%rdx + ja L$ChaCha20_8xvl + + subq $64+8,%rsp + vbroadcasti128 L$sigma(%rip),%ymm0 + vbroadcasti128 (%rcx),%ymm1 + vbroadcasti128 16(%rcx),%ymm2 + vbroadcasti128 (%r8),%ymm3 + + vmovdqa32 %ymm0,%ymm16 + vmovdqa32 %ymm1,%ymm17 + vmovdqa32 %ymm2,%ymm18 + vpaddd L$zeroz(%rip),%ymm3,%ymm3 + vmovdqa32 L$twoy(%rip),%ymm20 + movq $10,%r8 + vmovdqa32 %ymm3,%ymm19 + jmp L$oop_avx512vl + +.p2align 4 +L$oop_outer_avx512vl: + vmovdqa32 %ymm18,%ymm2 + vpaddd %ymm20,%ymm19,%ymm3 + movq $10,%r8 + vmovdqa32 %ymm3,%ymm19 + jmp L$oop_avx512vl + +.p2align 5 +L$oop_avx512vl: + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + vpshufd $78,%ymm2,%ymm2 + vpshufd $57,%ymm1,%ymm1 + vpshufd $147,%ymm3,%ymm3 + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + vpshufd $78,%ymm2,%ymm2 + vpshufd $147,%ymm1,%ymm1 + vpshufd $57,%ymm3,%ymm3 + decq %r8 + jnz L$oop_avx512vl + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + + subq $64,%rdx + jb L$tail64_avx512vl + + vpxor 0(%rsi),%xmm0,%xmm4 + vpxor 16(%rsi),%xmm1,%xmm5 + vpxor 32(%rsi),%xmm2,%xmm6 + vpxor 48(%rsi),%xmm3,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jz L$done_avx512vl + + vextracti128 $1,%ymm0,%xmm4 + vextracti128 $1,%ymm1,%xmm5 + vextracti128 $1,%ymm2,%xmm6 + vextracti128 $1,%ymm3,%xmm7 + + subq $64,%rdx + jb L$tail_avx512vl + + vpxor 0(%rsi),%xmm4,%xmm4 + vpxor 16(%rsi),%xmm5,%xmm5 + vpxor 32(%rsi),%xmm6,%xmm6 + vpxor 48(%rsi),%xmm7,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + vmovdqa32 %ymm16,%ymm0 + vmovdqa32 %ymm17,%ymm1 + jnz L$oop_outer_avx512vl + + jmp L$done_avx512vl + +.p2align 4 +L$tail64_avx512vl: + vmovdqa %xmm0,0(%rsp) + vmovdqa %xmm1,16(%rsp) + vmovdqa %xmm2,32(%rsp) + vmovdqa %xmm3,48(%rsp) + addq $64,%rdx + jmp L$oop_tail_avx512vl + +.p2align 4 +L$tail_avx512vl: + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + vmovdqa %xmm7,48(%rsp) + addq $64,%rdx + +L$oop_tail_avx512vl: + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 + xorl %ecx,%eax + movb %al,-1(%rdi,%r8,1) + decq %rdx + jnz L$oop_tail_avx512vl + + vmovdqu32 %ymm16,0(%rsp) + vmovdqu32 %ymm16,32(%rsp) + +L$done_avx512vl: + vzeroall + leaq (%r9),%rsp + +L$avx512vl_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +ChaCha20_16x: + +L$ChaCha20_16x: + movq %rsp,%r9 + + subq $64+8,%rsp + andq $-64,%rsp + vzeroupper + + leaq L$sigma(%rip),%r10 + vbroadcasti32x4 (%r10),%zmm3 + vbroadcasti32x4 (%rcx),%zmm7 + vbroadcasti32x4 16(%rcx),%zmm11 + vbroadcasti32x4 (%r8),%zmm15 + + vpshufd $0x00,%zmm3,%zmm0 + vpshufd $0x55,%zmm3,%zmm1 + vpshufd $0xaa,%zmm3,%zmm2 + vpshufd $0xff,%zmm3,%zmm3 + vmovdqa64 %zmm0,%zmm16 + vmovdqa64 %zmm1,%zmm17 + vmovdqa64 %zmm2,%zmm18 + vmovdqa64 %zmm3,%zmm19 + + vpshufd $0x00,%zmm7,%zmm4 + vpshufd $0x55,%zmm7,%zmm5 + vpshufd $0xaa,%zmm7,%zmm6 + vpshufd $0xff,%zmm7,%zmm7 + vmovdqa64 %zmm4,%zmm20 + vmovdqa64 %zmm5,%zmm21 + vmovdqa64 %zmm6,%zmm22 + vmovdqa64 %zmm7,%zmm23 + + vpshufd $0x00,%zmm11,%zmm8 + vpshufd $0x55,%zmm11,%zmm9 + vpshufd $0xaa,%zmm11,%zmm10 + vpshufd $0xff,%zmm11,%zmm11 + vmovdqa64 %zmm8,%zmm24 + vmovdqa64 %zmm9,%zmm25 + vmovdqa64 %zmm10,%zmm26 + vmovdqa64 %zmm11,%zmm27 + + vpshufd $0x00,%zmm15,%zmm12 + vpshufd $0x55,%zmm15,%zmm13 + vpshufd $0xaa,%zmm15,%zmm14 + vpshufd $0xff,%zmm15,%zmm15 + vpaddd L$incz(%rip),%zmm12,%zmm12 + vmovdqa64 %zmm12,%zmm28 + vmovdqa64 %zmm13,%zmm29 + vmovdqa64 %zmm14,%zmm30 + vmovdqa64 %zmm15,%zmm31 + + movl $10,%eax + jmp L$oop16x + +.p2align 5 +L$oop_outer16x: + vpbroadcastd 0(%r10),%zmm0 + vpbroadcastd 4(%r10),%zmm1 + vpbroadcastd 8(%r10),%zmm2 + vpbroadcastd 12(%r10),%zmm3 + vpaddd L$sixteen(%rip),%zmm28,%zmm28 + vmovdqa64 %zmm20,%zmm4 + vmovdqa64 %zmm21,%zmm5 + vmovdqa64 %zmm22,%zmm6 + vmovdqa64 %zmm23,%zmm7 + vmovdqa64 %zmm24,%zmm8 + vmovdqa64 %zmm25,%zmm9 + vmovdqa64 %zmm26,%zmm10 + vmovdqa64 %zmm27,%zmm11 + vmovdqa64 %zmm28,%zmm12 + vmovdqa64 %zmm29,%zmm13 + vmovdqa64 %zmm30,%zmm14 + vmovdqa64 %zmm31,%zmm15 + + vmovdqa64 %zmm0,%zmm16 + vmovdqa64 %zmm1,%zmm17 + vmovdqa64 %zmm2,%zmm18 + vmovdqa64 %zmm3,%zmm19 + + movl $10,%eax + jmp L$oop16x + +.p2align 5 +L$oop16x: + vpaddd %zmm4,%zmm0,%zmm0 + vpaddd %zmm5,%zmm1,%zmm1 + vpaddd %zmm6,%zmm2,%zmm2 + vpaddd %zmm7,%zmm3,%zmm3 + vpxord %zmm0,%zmm12,%zmm12 + vpxord %zmm1,%zmm13,%zmm13 + vpxord %zmm2,%zmm14,%zmm14 + vpxord %zmm3,%zmm15,%zmm15 + vprold $16,%zmm12,%zmm12 + vprold $16,%zmm13,%zmm13 + vprold $16,%zmm14,%zmm14 + vprold $16,%zmm15,%zmm15 + vpaddd %zmm12,%zmm8,%zmm8 + vpaddd %zmm13,%zmm9,%zmm9 + vpaddd %zmm14,%zmm10,%zmm10 + vpaddd %zmm15,%zmm11,%zmm11 + vpxord %zmm8,%zmm4,%zmm4 + vpxord %zmm9,%zmm5,%zmm5 + vpxord %zmm10,%zmm6,%zmm6 + vpxord %zmm11,%zmm7,%zmm7 + vprold $12,%zmm4,%zmm4 + vprold $12,%zmm5,%zmm5 + vprold $12,%zmm6,%zmm6 + vprold $12,%zmm7,%zmm7 + vpaddd %zmm4,%zmm0,%zmm0 + vpaddd %zmm5,%zmm1,%zmm1 + vpaddd %zmm6,%zmm2,%zmm2 + vpaddd %zmm7,%zmm3,%zmm3 + vpxord %zmm0,%zmm12,%zmm12 + vpxord %zmm1,%zmm13,%zmm13 + vpxord %zmm2,%zmm14,%zmm14 + vpxord %zmm3,%zmm15,%zmm15 + vprold $8,%zmm12,%zmm12 + vprold $8,%zmm13,%zmm13 + vprold $8,%zmm14,%zmm14 + vprold $8,%zmm15,%zmm15 + vpaddd %zmm12,%zmm8,%zmm8 + vpaddd %zmm13,%zmm9,%zmm9 + vpaddd %zmm14,%zmm10,%zmm10 + vpaddd %zmm15,%zmm11,%zmm11 + vpxord %zmm8,%zmm4,%zmm4 + vpxord %zmm9,%zmm5,%zmm5 + vpxord %zmm10,%zmm6,%zmm6 + vpxord %zmm11,%zmm7,%zmm7 + vprold $7,%zmm4,%zmm4 + vprold $7,%zmm5,%zmm5 + vprold $7,%zmm6,%zmm6 + vprold $7,%zmm7,%zmm7 + vpaddd %zmm5,%zmm0,%zmm0 + vpaddd %zmm6,%zmm1,%zmm1 + vpaddd %zmm7,%zmm2,%zmm2 + vpaddd %zmm4,%zmm3,%zmm3 + vpxord %zmm0,%zmm15,%zmm15 + vpxord %zmm1,%zmm12,%zmm12 + vpxord %zmm2,%zmm13,%zmm13 + vpxord %zmm3,%zmm14,%zmm14 + vprold $16,%zmm15,%zmm15 + vprold $16,%zmm12,%zmm12 + vprold $16,%zmm13,%zmm13 + vprold $16,%zmm14,%zmm14 + vpaddd %zmm15,%zmm10,%zmm10 + vpaddd %zmm12,%zmm11,%zmm11 + vpaddd %zmm13,%zmm8,%zmm8 + vpaddd %zmm14,%zmm9,%zmm9 + vpxord %zmm10,%zmm5,%zmm5 + vpxord %zmm11,%zmm6,%zmm6 + vpxord %zmm8,%zmm7,%zmm7 + vpxord %zmm9,%zmm4,%zmm4 + vprold $12,%zmm5,%zmm5 + vprold $12,%zmm6,%zmm6 + vprold $12,%zmm7,%zmm7 + vprold $12,%zmm4,%zmm4 + vpaddd %zmm5,%zmm0,%zmm0 + vpaddd %zmm6,%zmm1,%zmm1 + vpaddd %zmm7,%zmm2,%zmm2 + vpaddd %zmm4,%zmm3,%zmm3 + vpxord %zmm0,%zmm15,%zmm15 + vpxord %zmm1,%zmm12,%zmm12 + vpxord %zmm2,%zmm13,%zmm13 + vpxord %zmm3,%zmm14,%zmm14 + vprold $8,%zmm15,%zmm15 + vprold $8,%zmm12,%zmm12 + vprold $8,%zmm13,%zmm13 + vprold $8,%zmm14,%zmm14 + vpaddd %zmm15,%zmm10,%zmm10 + vpaddd %zmm12,%zmm11,%zmm11 + vpaddd %zmm13,%zmm8,%zmm8 + vpaddd %zmm14,%zmm9,%zmm9 + vpxord %zmm10,%zmm5,%zmm5 + vpxord %zmm11,%zmm6,%zmm6 + vpxord %zmm8,%zmm7,%zmm7 + vpxord %zmm9,%zmm4,%zmm4 + vprold $7,%zmm5,%zmm5 + vprold $7,%zmm6,%zmm6 + vprold $7,%zmm7,%zmm7 + vprold $7,%zmm4,%zmm4 + decl %eax + jnz L$oop16x + + vpaddd %zmm16,%zmm0,%zmm0 + vpaddd %zmm17,%zmm1,%zmm1 + vpaddd %zmm18,%zmm2,%zmm2 + vpaddd %zmm19,%zmm3,%zmm3 + + vpunpckldq %zmm1,%zmm0,%zmm18 + vpunpckldq %zmm3,%zmm2,%zmm19 + vpunpckhdq %zmm1,%zmm0,%zmm0 + vpunpckhdq %zmm3,%zmm2,%zmm2 + vpunpcklqdq %zmm19,%zmm18,%zmm1 + vpunpckhqdq %zmm19,%zmm18,%zmm18 + vpunpcklqdq %zmm2,%zmm0,%zmm3 + vpunpckhqdq %zmm2,%zmm0,%zmm0 + vpaddd %zmm20,%zmm4,%zmm4 + vpaddd %zmm21,%zmm5,%zmm5 + vpaddd %zmm22,%zmm6,%zmm6 + vpaddd %zmm23,%zmm7,%zmm7 + + vpunpckldq %zmm5,%zmm4,%zmm2 + vpunpckldq %zmm7,%zmm6,%zmm19 + vpunpckhdq %zmm5,%zmm4,%zmm4 + vpunpckhdq %zmm7,%zmm6,%zmm6 + vpunpcklqdq %zmm19,%zmm2,%zmm5 + vpunpckhqdq %zmm19,%zmm2,%zmm2 + vpunpcklqdq %zmm6,%zmm4,%zmm7 + vpunpckhqdq %zmm6,%zmm4,%zmm4 + vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19 + vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5 + vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1 + vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2 + vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18 + vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7 + vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3 + vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4 + vpaddd %zmm24,%zmm8,%zmm8 + vpaddd %zmm25,%zmm9,%zmm9 + vpaddd %zmm26,%zmm10,%zmm10 + vpaddd %zmm27,%zmm11,%zmm11 + + vpunpckldq %zmm9,%zmm8,%zmm6 + vpunpckldq %zmm11,%zmm10,%zmm0 + vpunpckhdq %zmm9,%zmm8,%zmm8 + vpunpckhdq %zmm11,%zmm10,%zmm10 + vpunpcklqdq %zmm0,%zmm6,%zmm9 + vpunpckhqdq %zmm0,%zmm6,%zmm6 + vpunpcklqdq %zmm10,%zmm8,%zmm11 + vpunpckhqdq %zmm10,%zmm8,%zmm8 + vpaddd %zmm28,%zmm12,%zmm12 + vpaddd %zmm29,%zmm13,%zmm13 + vpaddd %zmm30,%zmm14,%zmm14 + vpaddd %zmm31,%zmm15,%zmm15 + + vpunpckldq %zmm13,%zmm12,%zmm10 + vpunpckldq %zmm15,%zmm14,%zmm0 + vpunpckhdq %zmm13,%zmm12,%zmm12 + vpunpckhdq %zmm15,%zmm14,%zmm14 + vpunpcklqdq %zmm0,%zmm10,%zmm13 + vpunpckhqdq %zmm0,%zmm10,%zmm10 + vpunpcklqdq %zmm14,%zmm12,%zmm15 + vpunpckhqdq %zmm14,%zmm12,%zmm12 + vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0 + vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13 + vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9 + vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10 + vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6 + vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15 + vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11 + vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12 + vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16 + vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19 + vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0 + vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13 + vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17 + vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1 + vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9 + vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10 + vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14 + vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18 + vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6 + vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15 + vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8 + vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3 + vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11 + vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12 + cmpq $1024,%rdx + jb L$tail16x + + vpxord 0(%rsi),%zmm16,%zmm16 + vpxord 64(%rsi),%zmm17,%zmm17 + vpxord 128(%rsi),%zmm14,%zmm14 + vpxord 192(%rsi),%zmm8,%zmm8 + vmovdqu32 %zmm16,0(%rdi) + vmovdqu32 %zmm17,64(%rdi) + vmovdqu32 %zmm14,128(%rdi) + vmovdqu32 %zmm8,192(%rdi) + + vpxord 256(%rsi),%zmm19,%zmm19 + vpxord 320(%rsi),%zmm1,%zmm1 + vpxord 384(%rsi),%zmm18,%zmm18 + vpxord 448(%rsi),%zmm3,%zmm3 + vmovdqu32 %zmm19,256(%rdi) + vmovdqu32 %zmm1,320(%rdi) + vmovdqu32 %zmm18,384(%rdi) + vmovdqu32 %zmm3,448(%rdi) + + vpxord 512(%rsi),%zmm0,%zmm0 + vpxord 576(%rsi),%zmm9,%zmm9 + vpxord 640(%rsi),%zmm6,%zmm6 + vpxord 704(%rsi),%zmm11,%zmm11 + vmovdqu32 %zmm0,512(%rdi) + vmovdqu32 %zmm9,576(%rdi) + vmovdqu32 %zmm6,640(%rdi) + vmovdqu32 %zmm11,704(%rdi) + + vpxord 768(%rsi),%zmm13,%zmm13 + vpxord 832(%rsi),%zmm10,%zmm10 + vpxord 896(%rsi),%zmm15,%zmm15 + vpxord 960(%rsi),%zmm12,%zmm12 + leaq 1024(%rsi),%rsi + vmovdqu32 %zmm13,768(%rdi) + vmovdqu32 %zmm10,832(%rdi) + vmovdqu32 %zmm15,896(%rdi) + vmovdqu32 %zmm12,960(%rdi) + leaq 1024(%rdi),%rdi + + subq $1024,%rdx + jnz L$oop_outer16x + + jmp L$done16x + +.p2align 5 +L$tail16x: + xorq %r10,%r10 + subq %rsi,%rdi + cmpq $64,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm16,%zmm16 + vmovdqu32 %zmm16,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm17,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $128,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm17,%zmm17 + vmovdqu32 %zmm17,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm14,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $192,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm14,%zmm14 + vmovdqu32 %zmm14,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm8,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $256,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm8,%zmm8 + vmovdqu32 %zmm8,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm19,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $320,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm19,%zmm19 + vmovdqu32 %zmm19,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm1,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $384,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm1,%zmm1 + vmovdqu32 %zmm1,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm18,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $448,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm18,%zmm18 + vmovdqu32 %zmm18,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm3,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $512,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm3,%zmm3 + vmovdqu32 %zmm3,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm0,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $576,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm0,%zmm0 + vmovdqu32 %zmm0,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm9,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $640,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm9,%zmm9 + vmovdqu32 %zmm9,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm6,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $704,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm6,%zmm6 + vmovdqu32 %zmm6,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm11,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $768,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm11,%zmm11 + vmovdqu32 %zmm11,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm13,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $832,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm13,%zmm13 + vmovdqu32 %zmm13,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm10,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $896,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm10,%zmm10 + vmovdqu32 %zmm10,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm15,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $960,%rdx + jb L$ess_than_64_16x + vpxord (%rsi),%zmm15,%zmm15 + vmovdqu32 %zmm15,(%rdi,%rsi,1) + je L$done16x + vmovdqa32 %zmm12,%zmm16 + leaq 64(%rsi),%rsi + +L$ess_than_64_16x: + vmovdqa32 %zmm16,0(%rsp) + leaq (%rdi,%rsi,1),%rdi + andq $63,%rdx + +L$oop_tail16x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail16x + + vpxord %zmm16,%zmm16,%zmm16 + vmovdqa32 %zmm16,0(%rsp) + +L$done16x: + vzeroall + leaq (%r9),%rsp + +L$16x_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +ChaCha20_8xvl: + +L$ChaCha20_8xvl: + movq %rsp,%r9 + + subq $64+8,%rsp + andq $-64,%rsp + vzeroupper + + leaq L$sigma(%rip),%r10 + vbroadcasti128 (%r10),%ymm3 + vbroadcasti128 (%rcx),%ymm7 + vbroadcasti128 16(%rcx),%ymm11 + vbroadcasti128 (%r8),%ymm15 + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vpshufd $0xaa,%ymm3,%ymm2 + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpshufd $0xaa,%ymm7,%ymm6 + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vpshufd $0xaa,%ymm11,%ymm10 + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vpshufd $0xaa,%ymm15,%ymm14 + vpshufd $0xff,%ymm15,%ymm15 + vpaddd L$incy(%rip),%ymm12,%ymm12 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + + movl $10,%eax + jmp L$oop8xvl + +.p2align 5 +L$oop_outer8xvl: + + + vpbroadcastd 8(%r10),%ymm2 + vpbroadcastd 12(%r10),%ymm3 + vpaddd L$eight(%rip),%ymm28,%ymm28 + vmovdqa64 %ymm20,%ymm4 + vmovdqa64 %ymm21,%ymm5 + vmovdqa64 %ymm22,%ymm6 + vmovdqa64 %ymm23,%ymm7 + vmovdqa64 %ymm24,%ymm8 + vmovdqa64 %ymm25,%ymm9 + vmovdqa64 %ymm26,%ymm10 + vmovdqa64 %ymm27,%ymm11 + vmovdqa64 %ymm28,%ymm12 + vmovdqa64 %ymm29,%ymm13 + vmovdqa64 %ymm30,%ymm14 + vmovdqa64 %ymm31,%ymm15 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + + movl $10,%eax + jmp L$oop8xvl + +.p2align 5 +L$oop8xvl: + vpaddd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm7,%ymm3,%ymm3 + vpxor %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm3,%ymm15,%ymm15 + vprold $16,%ymm12,%ymm12 + vprold $16,%ymm13,%ymm13 + vprold $16,%ymm14,%ymm14 + vprold $16,%ymm15,%ymm15 + vpaddd %ymm12,%ymm8,%ymm8 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm11,%ymm7,%ymm7 + vprold $12,%ymm4,%ymm4 + vprold $12,%ymm5,%ymm5 + vprold $12,%ymm6,%ymm6 + vprold $12,%ymm7,%ymm7 + vpaddd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm7,%ymm3,%ymm3 + vpxor %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm3,%ymm15,%ymm15 + vprold $8,%ymm12,%ymm12 + vprold $8,%ymm13,%ymm13 + vprold $8,%ymm14,%ymm14 + vprold $8,%ymm15,%ymm15 + vpaddd %ymm12,%ymm8,%ymm8 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm11,%ymm7,%ymm7 + vprold $7,%ymm4,%ymm4 + vprold $7,%ymm5,%ymm5 + vprold $7,%ymm6,%ymm6 + vprold $7,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpaddd %ymm6,%ymm1,%ymm1 + vpaddd %ymm7,%ymm2,%ymm2 + vpaddd %ymm4,%ymm3,%ymm3 + vpxor %ymm0,%ymm15,%ymm15 + vpxor %ymm1,%ymm12,%ymm12 + vpxor %ymm2,%ymm13,%ymm13 + vpxor %ymm3,%ymm14,%ymm14 + vprold $16,%ymm15,%ymm15 + vprold $16,%ymm12,%ymm12 + vprold $16,%ymm13,%ymm13 + vprold $16,%ymm14,%ymm14 + vpaddd %ymm15,%ymm10,%ymm10 + vpaddd %ymm12,%ymm11,%ymm11 + vpaddd %ymm13,%ymm8,%ymm8 + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm10,%ymm5,%ymm5 + vpxor %ymm11,%ymm6,%ymm6 + vpxor %ymm8,%ymm7,%ymm7 + vpxor %ymm9,%ymm4,%ymm4 + vprold $12,%ymm5,%ymm5 + vprold $12,%ymm6,%ymm6 + vprold $12,%ymm7,%ymm7 + vprold $12,%ymm4,%ymm4 + vpaddd %ymm5,%ymm0,%ymm0 + vpaddd %ymm6,%ymm1,%ymm1 + vpaddd %ymm7,%ymm2,%ymm2 + vpaddd %ymm4,%ymm3,%ymm3 + vpxor %ymm0,%ymm15,%ymm15 + vpxor %ymm1,%ymm12,%ymm12 + vpxor %ymm2,%ymm13,%ymm13 + vpxor %ymm3,%ymm14,%ymm14 + vprold $8,%ymm15,%ymm15 + vprold $8,%ymm12,%ymm12 + vprold $8,%ymm13,%ymm13 + vprold $8,%ymm14,%ymm14 + vpaddd %ymm15,%ymm10,%ymm10 + vpaddd %ymm12,%ymm11,%ymm11 + vpaddd %ymm13,%ymm8,%ymm8 + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm10,%ymm5,%ymm5 + vpxor %ymm11,%ymm6,%ymm6 + vpxor %ymm8,%ymm7,%ymm7 + vpxor %ymm9,%ymm4,%ymm4 + vprold $7,%ymm5,%ymm5 + vprold $7,%ymm6,%ymm6 + vprold $7,%ymm7,%ymm7 + vprold $7,%ymm4,%ymm4 + decl %eax + jnz L$oop8xvl + + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm18 + vpunpckldq %ymm3,%ymm2,%ymm19 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm19,%ymm18,%ymm1 + vpunpckhqdq %ymm19,%ymm18,%ymm18 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm2 + vpunpckldq %ymm7,%ymm6,%ymm19 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm19,%ymm2,%ymm5 + vpunpckhqdq %ymm19,%ymm2,%ymm2 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vshufi32x4 $0,%ymm5,%ymm1,%ymm19 + vshufi32x4 $3,%ymm5,%ymm1,%ymm5 + vshufi32x4 $0,%ymm2,%ymm18,%ymm1 + vshufi32x4 $3,%ymm2,%ymm18,%ymm2 + vshufi32x4 $0,%ymm7,%ymm3,%ymm18 + vshufi32x4 $3,%ymm7,%ymm3,%ymm7 + vshufi32x4 $0,%ymm4,%ymm0,%ymm3 + vshufi32x4 $3,%ymm4,%ymm0,%ymm4 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm6 + vpunpckldq %ymm11,%ymm10,%ymm0 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm0,%ymm6,%ymm9 + vpunpckhqdq %ymm0,%ymm6,%ymm6 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + vpunpckldq %ymm13,%ymm12,%ymm10 + vpunpckldq %ymm15,%ymm14,%ymm0 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm0,%ymm10,%ymm13 + vpunpckhqdq %ymm0,%ymm10,%ymm10 + vpunpcklqdq %ymm14,%ymm12,%ymm15 + vpunpckhqdq %ymm14,%ymm12,%ymm12 + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + vperm2i128 $0x20,%ymm10,%ymm6,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm6,%ymm10 + vperm2i128 $0x20,%ymm15,%ymm11,%ymm6 + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + vperm2i128 $0x20,%ymm12,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + cmpq $512,%rdx + jb L$tail8xvl + + movl $0x80,%eax + vpxord 0(%rsi),%ymm19,%ymm19 + vpxor 32(%rsi),%ymm0,%ymm0 + vpxor 64(%rsi),%ymm5,%ymm5 + vpxor 96(%rsi),%ymm13,%ymm13 + leaq (%rsi,%rax,1),%rsi + vmovdqu32 %ymm19,0(%rdi) + vmovdqu %ymm0,32(%rdi) + vmovdqu %ymm5,64(%rdi) + vmovdqu %ymm13,96(%rdi) + leaq (%rdi,%rax,1),%rdi + + vpxor 0(%rsi),%ymm1,%ymm1 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm2,%ymm2 + vpxor 96(%rsi),%ymm10,%ymm10 + leaq (%rsi,%rax,1),%rsi + vmovdqu %ymm1,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm2,64(%rdi) + vmovdqu %ymm10,96(%rdi) + leaq (%rdi,%rax,1),%rdi + + vpxord 0(%rsi),%ymm18,%ymm18 + vpxor 32(%rsi),%ymm6,%ymm6 + vpxor 64(%rsi),%ymm7,%ymm7 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq (%rsi,%rax,1),%rsi + vmovdqu32 %ymm18,0(%rdi) + vmovdqu %ymm6,32(%rdi) + vmovdqu %ymm7,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq (%rdi,%rax,1),%rdi + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vpxor 64(%rsi),%ymm4,%ymm4 + vpxor 96(%rsi),%ymm12,%ymm12 + leaq (%rsi,%rax,1),%rsi + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vmovdqu %ymm12,96(%rdi) + leaq (%rdi,%rax,1),%rdi + + vpbroadcastd 0(%r10),%ymm0 + vpbroadcastd 4(%r10),%ymm1 + + subq $512,%rdx + jnz L$oop_outer8xvl + + jmp L$done8xvl + +.p2align 5 +L$tail8xvl: + vmovdqa64 %ymm19,%ymm8 + xorq %r10,%r10 + subq %rsi,%rdi + cmpq $64,%rdx + jb L$ess_than_64_8xvl + vpxor 0(%rsi),%ymm8,%ymm8 + vpxor 32(%rsi),%ymm0,%ymm0 + vmovdqu %ymm8,0(%rdi,%rsi,1) + vmovdqu %ymm0,32(%rdi,%rsi,1) + je L$done8xvl + vmovdqa %ymm5,%ymm8 + vmovdqa %ymm13,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $128,%rdx + jb L$ess_than_64_8xvl + vpxor 0(%rsi),%ymm5,%ymm5 + vpxor 32(%rsi),%ymm13,%ymm13 + vmovdqu %ymm5,0(%rdi,%rsi,1) + vmovdqu %ymm13,32(%rdi,%rsi,1) + je L$done8xvl + vmovdqa %ymm1,%ymm8 + vmovdqa %ymm9,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $192,%rdx + jb L$ess_than_64_8xvl + vpxor 0(%rsi),%ymm1,%ymm1 + vpxor 32(%rsi),%ymm9,%ymm9 + vmovdqu %ymm1,0(%rdi,%rsi,1) + vmovdqu %ymm9,32(%rdi,%rsi,1) + je L$done8xvl + vmovdqa %ymm2,%ymm8 + vmovdqa %ymm10,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $256,%rdx + jb L$ess_than_64_8xvl + vpxor 0(%rsi),%ymm2,%ymm2 + vpxor 32(%rsi),%ymm10,%ymm10 + vmovdqu %ymm2,0(%rdi,%rsi,1) + vmovdqu %ymm10,32(%rdi,%rsi,1) + je L$done8xvl + vmovdqa32 %ymm18,%ymm8 + vmovdqa %ymm6,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $320,%rdx + jb L$ess_than_64_8xvl + vpxord 0(%rsi),%ymm18,%ymm18 + vpxor 32(%rsi),%ymm6,%ymm6 + vmovdqu32 %ymm18,0(%rdi,%rsi,1) + vmovdqu %ymm6,32(%rdi,%rsi,1) + je L$done8xvl + vmovdqa %ymm7,%ymm8 + vmovdqa %ymm15,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $384,%rdx + jb L$ess_than_64_8xvl + vpxor 0(%rsi),%ymm7,%ymm7 + vpxor 32(%rsi),%ymm15,%ymm15 + vmovdqu %ymm7,0(%rdi,%rsi,1) + vmovdqu %ymm15,32(%rdi,%rsi,1) + je L$done8xvl + vmovdqa %ymm3,%ymm8 + vmovdqa %ymm11,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $448,%rdx + jb L$ess_than_64_8xvl + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi,%rsi,1) + vmovdqu %ymm11,32(%rdi,%rsi,1) + je L$done8xvl + vmovdqa %ymm4,%ymm8 + vmovdqa %ymm12,%ymm0 + leaq 64(%rsi),%rsi + +L$ess_than_64_8xvl: + vmovdqa %ymm8,0(%rsp) + vmovdqa %ymm0,32(%rsp) + leaq (%rdi,%rsi,1),%rdi + andq $63,%rdx + +L$oop_tail8xvl: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail8xvl + + vpxor %ymm8,%ymm8,%ymm8 + vmovdqa %ymm8,0(%rsp) + vmovdqa %ymm8,32(%rsp) + +L$done8xvl: + vzeroall + leaq (%r9),%rsp + +L$8xvl_epilogue: + .byte 0xf3,0xc3 + + diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s index 77102c6a41..302649aacc 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/ecp_nistz256-x86_64.s @@ -2393,13 +2393,23 @@ L$Three: L$ONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +L$ord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +L$ordK: +.quad 0xccd1c8aaee00bc4f + .globl _ecp_nistz256_mul_by_2 .p2align 6 _ecp_nistz256_mul_by_2: + pushq %r12 + pushq %r13 +L$mul_by_2_body: + movq 0(%rsi),%r8 xorq %r13,%r13 movq 8(%rsi),%r9 @@ -2431,20 +2441,30 @@ _ecp_nistz256_mul_by_2: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$mul_by_2_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_div_by_2 .p2align 5 _ecp_nistz256_div_by_2: + pushq %r12 + pushq %r13 +L$div_by_2_body: + movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 @@ -2491,20 +2511,30 @@ _ecp_nistz256_div_by_2: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$div_by_2_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_mul_by_3 .p2align 5 _ecp_nistz256_mul_by_3: + pushq %r12 + pushq %r13 +L$mul_by_3_body: + movq 0(%rsi),%r8 xorq %r13,%r13 movq 8(%rsi),%r9 @@ -2557,20 +2587,30 @@ _ecp_nistz256_mul_by_3: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$mul_by_3_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_add .p2align 5 _ecp_nistz256_add: + pushq %r12 + pushq %r13 +L$add_body: + movq 0(%rsi),%r8 xorq %r13,%r13 movq 8(%rsi),%r9 @@ -2603,20 +2643,30 @@ _ecp_nistz256_add: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$add_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_sub .p2align 5 _ecp_nistz256_sub: + pushq %r12 + pushq %r13 +L$sub_body: + movq 0(%rsi),%r8 xorq %r13,%r13 movq 8(%rsi),%r9 @@ -2649,20 +2699,30 @@ _ecp_nistz256_sub: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$sub_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_neg .p2align 5 _ecp_nistz256_neg: + pushq %r12 + pushq %r13 +L$neg_body: + xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 @@ -2695,14 +2755,1085 @@ _ecp_nistz256_neg: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$neg_epilogue: .byte 0xf3,0xc3 + + + +.globl _ecp_nistz256_ord_mul_mont + +.p2align 5 +_ecp_nistz256_ord_mul_mont: + + movl $0x80100,%ecx + andl _OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je L$ecp_nistz256_ord_mul_montx + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq L$ord(%rip),%r14 + movq L$ordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mul_epilogue: + .byte 0xf3,0xc3 + + + + + + + + + +.globl _ecp_nistz256_ord_sqr_mont + +.p2align 5 +_ecp_nistz256_ord_sqr_mont: + + movl $0x80100,%ecx + andl _OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je L$ecp_nistz256_ord_sqr_montx + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq L$ord(%rip),%rsi + movq %rdx,%rbx + jmp L$oop_ord_sqr + +.p2align 5 +L$oop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax +.byte 102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz L$oop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqr_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +ecp_nistz256_ord_mul_montx: + +L$ecp_nistz256_ord_mul_montx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq L$ord-128(%rip),%r14 + movq L$ordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mulx_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +ecp_nistz256_ord_sqr_montx: + +L$ecp_nistz256_ord_sqr_montx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq L$ord(%rip),%rsi + jmp L$oop_ord_sqrx + +.p2align 5 +L$oop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz L$oop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqrx_epilogue: + .byte 0xf3,0xc3 + + + + + + .globl _ecp_nistz256_to_mont .p2align 5 @@ -2723,15 +3854,23 @@ _ecp_nistz256_to_mont: .p2align 5 _ecp_nistz256_mul_mont: + movl $0x80100,%ecx andl _OPENSSL_ia32cap_P+8(%rip),%ecx L$mul_mont: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +L$mul_body: cmpl $0x80100,%ecx je L$mul_montx movq %rdx,%rbx @@ -2756,16 +3895,26 @@ L$mul_montx: call __ecp_nistz256_mul_montx L$mul_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ecp_nistz256_mul_montq: @@ -2992,14 +4141,22 @@ __ecp_nistz256_mul_montq: .p2align 5 _ecp_nistz256_sqr_mont: + movl $0x80100,%ecx andl _OPENSSL_ia32cap_P+8(%rip),%ecx pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +L$sqr_body: cmpl $0x80100,%ecx je L$sqr_montx movq 0(%rsi),%rax @@ -3020,16 +4177,26 @@ L$sqr_montx: call __ecp_nistz256_sqr_montx L$sqr_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqr_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ecp_nistz256_sqr_montq: movq %rax,%r13 @@ -3494,9 +4661,13 @@ __ecp_nistz256_sqr_montx: .p2align 5 _ecp_nistz256_from_mont: + pushq %r12 + pushq %r13 +L$from_body: + movq 0(%rsi),%rax movq L$poly+24(%rip),%r13 movq 8(%rsi),%r9 @@ -3576,12 +4747,18 @@ _ecp_nistz256_from_mont: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$from_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_scatter_w5 .p2align 5 @@ -3664,6 +4841,7 @@ L$select_loop_sse_w5: movdqu %xmm6,64(%rdi) movdqu %xmm7,80(%rdi) .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_gather_w5: @@ -3734,6 +4912,7 @@ L$select_loop_sse_w7: movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_gather_w7: @@ -3794,6 +4973,7 @@ L$select_loop_avx2_w5: vmovdqu %ymm4,64(%rdi) vzeroupper .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_avx2_gather_w5: @@ -3871,6 +5051,7 @@ L$select_loop_avx2_w7: vmovdqu %ymm3,32(%rdi) vzeroupper .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_avx2_gather_w7: .p2align 5 @@ -3997,18 +5178,27 @@ __ecp_nistz256_mul_by_2q: .p2align 5 _ecp_nistz256_point_double: + movl $0x80100,%ecx andl _OPENSSL_ia32cap_P+8(%rip),%ecx cmpl $0x80100,%ecx je L$point_doublex pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp +L$point_doubleq_body: + L$point_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx @@ -4190,31 +5380,51 @@ L$point_double_shortcutq: .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doubleq_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_point_add .p2align 5 _ecp_nistz256_point_add: + movl $0x80100,%ecx andl _OPENSSL_ia32cap_P+8(%rip),%ecx cmpl $0x80100,%ecx je L$point_addx pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp +L$point_addq_body: + movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -4590,31 +5800,51 @@ L$add_proceedq: movdqu %xmm3,48(%rdi) L$add_doneq: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addq_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_point_add_affine .p2align 5 _ecp_nistz256_point_add_affine: + movl $0x80100,%ecx andl _OPENSSL_ia32cap_P+8(%rip),%ecx cmpl $0x80100,%ecx je L$point_add_affinex pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp +L$add_affineq_body: + movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 @@ -4896,16 +6126,27 @@ _ecp_nistz256_point_add_affine: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affineq_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ecp_nistz256_add_tox: xorq %r11,%r11 @@ -5035,15 +6276,24 @@ __ecp_nistz256_mul_by_2x: .p2align 5 ecp_nistz256_point_doublex: + L$point_doublex: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp +L$point_doublex_body: + L$point_double_shortcutx: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx @@ -5225,27 +6475,47 @@ L$point_double_shortcutx: .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromx - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doublex_epilogue: .byte 0xf3,0xc3 + .p2align 5 ecp_nistz256_point_addx: + L$point_addx: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp +L$point_addx_body: + movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -5621,27 +6891,47 @@ L$add_proceedx: movdqu %xmm3,48(%rdi) L$add_donex: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addx_epilogue: .byte 0xf3,0xc3 + .p2align 5 ecp_nistz256_point_add_affinex: + L$point_add_affinex: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp +L$add_affinex_body: + movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 @@ -5923,12 +7213,23 @@ L$point_add_affinex: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affinex_epilogue: .byte 0xf3,0xc3 + diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/x25519-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/x25519-x86_64.s new file mode 100644 index 0000000000..cdb602d4cc --- /dev/null +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/ec/x25519-x86_64.s @@ -0,0 +1,760 @@ +.text + +.globl _x25519_fe51_mul + +.p2align 5 +_x25519_fe51_mul: + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +L$fe51_mul_body: + + movq 0(%rsi),%rax + movq 0(%rdx),%r11 + movq 8(%rdx),%r12 + movq 16(%rdx),%r13 + movq 24(%rdx),%rbp + movq 32(%rdx),%r14 + + movq %rdi,32(%rsp) + movq %rax,%rdi + mulq %r11 + movq %r11,0(%rsp) + movq %rax,%rbx + movq %rdi,%rax + movq %rdx,%rcx + mulq %r12 + movq %r12,8(%rsp) + movq %rax,%r8 + movq %rdi,%rax + leaq (%r14,%r14,8),%r15 + movq %rdx,%r9 + mulq %r13 + movq %r13,16(%rsp) + movq %rax,%r10 + movq %rdi,%rax + leaq (%r14,%r15,2),%rdi + movq %rdx,%r11 + mulq %rbp + movq %rax,%r12 + movq 0(%rsi),%rax + movq %rdx,%r13 + mulq %r14 + movq %rax,%r14 + movq 8(%rsi),%rax + movq %rdx,%r15 + + mulq %rdi + addq %rax,%rbx + movq 16(%rsi),%rax + adcq %rdx,%rcx + mulq %rdi + addq %rax,%r8 + movq 24(%rsi),%rax + adcq %rdx,%r9 + mulq %rdi + addq %rax,%r10 + movq 32(%rsi),%rax + adcq %rdx,%r11 + mulq %rdi + imulq $19,%rbp,%rdi + addq %rax,%r12 + movq 8(%rsi),%rax + adcq %rdx,%r13 + mulq %rbp + movq 16(%rsp),%rbp + addq %rax,%r14 + movq 16(%rsi),%rax + adcq %rdx,%r15 + + mulq %rdi + addq %rax,%rbx + movq 24(%rsi),%rax + adcq %rdx,%rcx + mulq %rdi + addq %rax,%r8 + movq 32(%rsi),%rax + adcq %rdx,%r9 + mulq %rdi + imulq $19,%rbp,%rdi + addq %rax,%r10 + movq 8(%rsi),%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r12 + movq 16(%rsi),%rax + adcq %rdx,%r13 + mulq %rbp + movq 8(%rsp),%rbp + addq %rax,%r14 + movq 24(%rsi),%rax + adcq %rdx,%r15 + + mulq %rdi + addq %rax,%rbx + movq 32(%rsi),%rax + adcq %rdx,%rcx + mulq %rdi + addq %rax,%r8 + movq 8(%rsi),%rax + adcq %rdx,%r9 + mulq %rbp + imulq $19,%rbp,%rdi + addq %rax,%r10 + movq 16(%rsi),%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r12 + movq 24(%rsi),%rax + adcq %rdx,%r13 + mulq %rbp + movq 0(%rsp),%rbp + addq %rax,%r14 + movq 32(%rsi),%rax + adcq %rdx,%r15 + + mulq %rdi + addq %rax,%rbx + movq 8(%rsi),%rax + adcq %rdx,%rcx + mulq %rbp + addq %rax,%r8 + movq 16(%rsi),%rax + adcq %rdx,%r9 + mulq %rbp + addq %rax,%r10 + movq 24(%rsi),%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r12 + movq 32(%rsi),%rax + adcq %rdx,%r13 + mulq %rbp + addq %rax,%r14 + adcq %rdx,%r15 + + movq 32(%rsp),%rdi + jmp L$reduce51 +L$fe51_mul_epilogue: + + + +.globl _x25519_fe51_sqr + +.p2align 5 +_x25519_fe51_sqr: + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +L$fe51_sqr_body: + + movq 0(%rsi),%rax + movq 16(%rsi),%r15 + movq 32(%rsi),%rbp + + movq %rdi,32(%rsp) + leaq (%rax,%rax,1),%r14 + mulq %rax + movq %rax,%rbx + movq 8(%rsi),%rax + movq %rdx,%rcx + mulq %r14 + movq %rax,%r8 + movq %r15,%rax + movq %r15,0(%rsp) + movq %rdx,%r9 + mulq %r14 + movq %rax,%r10 + movq 24(%rsi),%rax + movq %rdx,%r11 + imulq $19,%rbp,%rdi + mulq %r14 + movq %rax,%r12 + movq %rbp,%rax + movq %rdx,%r13 + mulq %r14 + movq %rax,%r14 + movq %rbp,%rax + movq %rdx,%r15 + + mulq %rdi + addq %rax,%r12 + movq 8(%rsi),%rax + adcq %rdx,%r13 + + movq 24(%rsi),%rsi + leaq (%rax,%rax,1),%rbp + mulq %rax + addq %rax,%r10 + movq 0(%rsp),%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + mulq %rsi + addq %rax,%r14 + movq %rbp,%rax + adcq %rdx,%r15 + imulq $19,%rsi,%rbp + mulq %rdi + addq %rax,%rbx + leaq (%rsi,%rsi,1),%rax + adcq %rdx,%rcx + + mulq %rdi + addq %rax,%r10 + movq %rsi,%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r8 + movq 0(%rsp),%rax + adcq %rdx,%r9 + + leaq (%rax,%rax,1),%rsi + mulq %rax + addq %rax,%r14 + movq %rbp,%rax + adcq %rdx,%r15 + mulq %rsi + addq %rax,%rbx + movq %rsi,%rax + adcq %rdx,%rcx + mulq %rdi + addq %rax,%r8 + adcq %rdx,%r9 + + movq 32(%rsp),%rdi + jmp L$reduce51 + +.p2align 5 +L$reduce51: + movq $0x7ffffffffffff,%rbp + + movq %r10,%rdx + shrq $51,%r10 + shlq $13,%r11 + andq %rbp,%rdx + orq %r10,%r11 + addq %r11,%r12 + adcq $0,%r13 + + movq %rbx,%rax + shrq $51,%rbx + shlq $13,%rcx + andq %rbp,%rax + orq %rbx,%rcx + addq %rcx,%r8 + adcq $0,%r9 + + movq %r12,%rbx + shrq $51,%r12 + shlq $13,%r13 + andq %rbp,%rbx + orq %r12,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq %r8,%rcx + shrq $51,%r8 + shlq $13,%r9 + andq %rbp,%rcx + orq %r8,%r9 + addq %r9,%rdx + + movq %r14,%r10 + shrq $51,%r14 + shlq $13,%r15 + andq %rbp,%r10 + orq %r14,%r15 + + leaq (%r15,%r15,8),%r14 + leaq (%r15,%r14,2),%r15 + addq %r15,%rax + + movq %rdx,%r8 + andq %rbp,%rdx + shrq $51,%r8 + addq %r8,%rbx + + movq %rax,%r9 + andq %rbp,%rax + shrq $51,%r9 + addq %r9,%rcx + + movq %rax,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,16(%rdi) + movq %rbx,24(%rdi) + movq %r10,32(%rdi) + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +L$fe51_sqr_epilogue: + .byte 0xf3,0xc3 + + + +.globl _x25519_fe51_mul121666 + +.p2align 5 +_x25519_fe51_mul121666: + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +L$fe51_mul121666_body: + movl $121666,%eax + + mulq 0(%rsi) + movq %rax,%rbx + movl $121666,%eax + movq %rdx,%rcx + mulq 8(%rsi) + movq %rax,%r8 + movl $121666,%eax + movq %rdx,%r9 + mulq 16(%rsi) + movq %rax,%r10 + movl $121666,%eax + movq %rdx,%r11 + mulq 24(%rsi) + movq %rax,%r12 + movl $121666,%eax + movq %rdx,%r13 + mulq 32(%rsi) + movq %rax,%r14 + movq %rdx,%r15 + + jmp L$reduce51 +L$fe51_mul121666_epilogue: + + + +.globl _x25519_fe64_eligible + +.p2align 5 +_x25519_fe64_eligible: + movl _OPENSSL_ia32cap_P+8(%rip),%ecx + xorl %eax,%eax + andl $0x80100,%ecx + cmpl $0x80100,%ecx + cmovel %ecx,%eax + .byte 0xf3,0xc3 + + +.globl _x25519_fe64_mul + +.p2align 5 +_x25519_fe64_mul: + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + + leaq -16(%rsp),%rsp + +L$fe64_mul_body: + + movq %rdx,%rax + movq 0(%rdx),%rbp + movq 0(%rsi),%rdx + movq 8(%rax),%rcx + movq 16(%rax),%r14 + movq 24(%rax),%r15 + + mulxq %rbp,%r8,%rax + xorl %edi,%edi + mulxq %rcx,%r9,%rbx + adcxq %rax,%r9 + mulxq %r14,%r10,%rax + adcxq %rbx,%r10 + mulxq %r15,%r11,%r12 + movq 8(%rsi),%rdx + adcxq %rax,%r11 + movq %r14,(%rsp) + adcxq %rdi,%r12 + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r9 + adcxq %rbx,%r10 + mulxq %rcx,%rax,%rbx + adoxq %rax,%r10 + adcxq %rbx,%r11 + mulxq %r14,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %r15,%rax,%r13 + movq 16(%rsi),%rdx + adoxq %rax,%r12 + adcxq %rdi,%r13 + adoxq %rdi,%r13 + + mulxq %rbp,%rax,%rbx + adcxq %rax,%r10 + adoxq %rbx,%r11 + mulxq %rcx,%rax,%rbx + adcxq %rax,%r11 + adoxq %rbx,%r12 + mulxq %r14,%rax,%rbx + adcxq %rax,%r12 + adoxq %rbx,%r13 + mulxq %r15,%rax,%r14 + movq 24(%rsi),%rdx + adcxq %rax,%r13 + adoxq %rdi,%r14 + adcxq %rdi,%r14 + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %rcx,%rax,%rbx + adoxq %rax,%r12 + adcxq %rbx,%r13 + mulxq (%rsp),%rax,%rbx + adoxq %rax,%r13 + adcxq %rbx,%r14 + mulxq %r15,%rax,%r15 + movl $38,%edx + adoxq %rax,%r14 + adcxq %rdi,%r15 + adoxq %rdi,%r15 + + jmp L$reduce64 +L$fe64_mul_epilogue: + + + +.globl _x25519_fe64_sqr + +.p2align 5 +_x25519_fe64_sqr: + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + + leaq -16(%rsp),%rsp + +L$fe64_sqr_body: + + movq 0(%rsi),%rdx + movq 8(%rsi),%rcx + movq 16(%rsi),%rbp + movq 24(%rsi),%rsi + + + mulxq %rdx,%r8,%r15 + mulxq %rcx,%r9,%rax + xorl %edi,%edi + mulxq %rbp,%r10,%rbx + adcxq %rax,%r10 + mulxq %rsi,%r11,%r12 + movq %rcx,%rdx + adcxq %rbx,%r11 + adcxq %rdi,%r12 + + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %rsi,%rax,%r13 + movq %rbp,%rdx + adoxq %rax,%r12 + adcxq %rdi,%r13 + + + mulxq %rsi,%rax,%r14 + movq %rcx,%rdx + adoxq %rax,%r13 + adcxq %rdi,%r14 + adoxq %rdi,%r14 + + adcxq %r9,%r9 + adoxq %r15,%r9 + adcxq %r10,%r10 + mulxq %rdx,%rax,%rbx + movq %rbp,%rdx + adcxq %r11,%r11 + adoxq %rax,%r10 + adcxq %r12,%r12 + adoxq %rbx,%r11 + mulxq %rdx,%rax,%rbx + movq %rsi,%rdx + adcxq %r13,%r13 + adoxq %rax,%r12 + adcxq %r14,%r14 + adoxq %rbx,%r13 + mulxq %rdx,%rax,%r15 + movl $38,%edx + adoxq %rax,%r14 + adcxq %rdi,%r15 + adoxq %rdi,%r15 + jmp L$reduce64 + +.p2align 5 +L$reduce64: + mulxq %r12,%rax,%rbx + adcxq %rax,%r8 + adoxq %rbx,%r9 + mulxq %r13,%rax,%rbx + adcxq %rax,%r9 + adoxq %rbx,%r10 + mulxq %r14,%rax,%rbx + adcxq %rax,%r10 + adoxq %rbx,%r11 + mulxq %r15,%rax,%r12 + adcxq %rax,%r11 + adoxq %rdi,%r12 + adcxq %rdi,%r12 + + movq 16(%rsp),%rdi + imulq %rdx,%r12 + + addq %r12,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r8,0(%rdi) + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +L$fe64_sqr_epilogue: + .byte 0xf3,0xc3 + + + +.globl _x25519_fe64_mul121666 + +.p2align 5 +_x25519_fe64_mul121666: +L$fe64_mul121666_body: + movl $121666,%edx + mulxq 0(%rsi),%r8,%rcx + mulxq 8(%rsi),%r9,%rax + addq %rcx,%r9 + mulxq 16(%rsi),%r10,%rcx + adcq %rax,%r10 + mulxq 24(%rsi),%r11,%rax + adcq %rcx,%r11 + adcq $0,%rax + + imulq $38,%rax,%rax + + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r8,0(%rdi) + +L$fe64_mul121666_epilogue: + .byte 0xf3,0xc3 + + +.globl _x25519_fe64_add + +.p2align 5 +_x25519_fe64_add: +L$fe64_add_body: + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + movq %r9,8(%rdi) + adcq $0,%r11 + movq %r10,16(%rdi) + sbbq %rax,%rax + movq %r11,24(%rdi) + andq $38,%rax + + addq %rax,%r8 + movq %r8,0(%rdi) + +L$fe64_add_epilogue: + .byte 0xf3,0xc3 + + +.globl _x25519_fe64_sub + +.p2align 5 +_x25519_fe64_sub: +L$fe64_sub_body: + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + sbbq 24(%rdx),%r11 + + sbbq %rax,%rax + andq $38,%rax + + subq %rax,%r8 + sbbq $0,%r9 + sbbq $0,%r10 + movq %r9,8(%rdi) + sbbq $0,%r11 + movq %r10,16(%rdi) + sbbq %rax,%rax + movq %r11,24(%rdi) + andq $38,%rax + + subq %rax,%r8 + movq %r8,0(%rdi) + +L$fe64_sub_epilogue: + .byte 0xf3,0xc3 + + +.globl _x25519_fe64_tobytes + +.p2align 5 +_x25519_fe64_tobytes: +L$fe64_to_body: + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + + leaq (%r11,%r11,1),%rax + sarq $63,%r11 + shrq $1,%rax + andq $19,%r11 + addq $19,%r11 + + addq %r11,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rax + + leaq (%rax,%rax,1),%r11 + sarq $63,%rax + shrq $1,%r11 + notq %rax + andq $19,%rax + + subq %rax,%r8 + sbbq $0,%r9 + sbbq $0,%r10 + sbbq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + +L$fe64_to_epilogue: + .byte 0xf3,0xc3 + +.byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h index 64f9685dd5..83576dc732 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/include/internal/dso_conf.h @@ -1,7 +1,7 @@ /* WARNING: do not edit! */ /* Generated by Makefile from crypto/include/internal/dso_conf.h.in */ /* - * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the OpenSSL license (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -12,5 +12,8 @@ #ifndef HEADER_DSO_CONF_H # define HEADER_DSO_CONF_H +# define DSO_DLFCN +# define HAVE_DLFCN_H # define DSO_EXTENSION ".dylib" + #endif diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s index f385ea2a3f..285c32ca88 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/md5/md5-x86_64.s @@ -4,11 +4,17 @@ .globl _md5_block_asm_data_order _md5_block_asm_data_order: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r14 + pushq %r15 + L$prologue: @@ -655,11 +661,18 @@ L$end: movl %edx,12(%rbp) movq (%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r12 + movq 24(%rsp),%rbx + movq 32(%rsp),%rbp + addq $40,%rsp + L$epilogue: .byte 0xf3,0xc3 + diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s index f01a002363..d5920186ce 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/aesni-gcm-x86_64.s @@ -31,23 +31,6 @@ L$resume_ctr32: vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 - - - - - - - - - - - - - - - - - xorq %r12,%r12 cmpq %r14,%r15 @@ -332,20 +315,25 @@ L$6x_done: .p2align 5 _aesni_gcm_decrypt: - xorq %r10,%r10 - - + xorq %r10,%r10 cmpq $0x60,%rdx jb L$gcm_dec_abort leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + vzeroupper vmovdqu (%r8),%xmm1 @@ -374,15 +362,7 @@ L$dec_no_key_aliasing: vmovdqu 80(%rdi),%xmm7 leaq (%rdi),%r14 vmovdqu 64(%rdi),%xmm4 - - - - - - - leaq -192(%rdi,%rdx,1),%r15 - vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %r10,%r10 @@ -415,17 +395,25 @@ L$dec_no_key_aliasing: vzeroupper movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$gcm_dec_abort: movq %r10,%rax .byte 0xf3,0xc3 + .p2align 5 _aesni_ctr32_6x: vmovdqu 0-128(%rcx),%xmm4 @@ -520,21 +508,25 @@ L$handle_ctr32_2: .p2align 5 _aesni_gcm_encrypt: - xorq %r10,%r10 - - - + xorq %r10,%r10 cmpq $288,%rdx jb L$gcm_enc_abort leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + vzeroupper vmovdqu (%r8),%xmm1 @@ -558,16 +550,7 @@ _aesni_gcm_encrypt: L$enc_no_key_aliasing: leaq (%rsi),%r14 - - - - - - - - leaq -192(%rsi,%rdx,1),%r15 - shrq $4,%rdx call _aesni_ctr32_6x @@ -769,16 +752,24 @@ L$enc_no_key_aliasing: vzeroupper movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$gcm_enc_abort: movq %r10,%rax .byte 0xf3,0xc3 + .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s index 502af78349..d182d45cfb 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/modes/ghash-x86_64.s @@ -5,9 +5,21 @@ .p2align 4 _gcm_gmult_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $280,%rsp + L$gmult_prologue: movzbq 15(%rdi),%r8 @@ -84,22 +96,35 @@ L$break1: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$gmult_epilogue: .byte 0xf3,0xc3 + .globl _gcm_ghash_4bit .p2align 4 _gcm_ghash_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp + L$ghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -644,17 +669,26 @@ L$outer_loop: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq 0(%rsi),%rsp + L$ghash_epilogue: .byte 0xf3,0xc3 + .globl _gcm_init_clmul .p2align 4 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s index c68f5a6fbe..cbc8c80816 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/poly1305/poly1305-x86_64.s @@ -31,6 +31,11 @@ _poly1305_init: leaq poly1305_blocks_avx2(%rip),%rax btq $37,%r9 cmovcq %rax,%r10 + movq $2149646336,%rax + shrq $32,%r9 + andq %rax,%r9 + cmpq %rax,%r9 + je L$init_base2_44 movq $0x0ffffffc0fffffff,%rax movq $0x0ffffffc0ffffffc,%rcx andq 0(%rsi),%rax @@ -47,16 +52,23 @@ L$no_key: .p2align 5 _poly1305_blocks: + L$blocks: shrq $4,%rdx jz L$no_data pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$blocks_body: movq %rdx,%r15 @@ -127,18 +139,26 @@ L$oop: movq %rbp,16(%rdi) movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%r12 + movq 32(%rsp),%rbp + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + L$no_data: L$blocks_epilogue: .byte 0xf3,0xc3 + .p2align 5 _poly1305_emit: L$emit: @@ -371,6 +391,7 @@ __poly1305_init_avx: .p2align 5 poly1305_blocks_avx: + movl 20(%rdi),%r8d cmpq $128,%rdx jae L$blocks_avx @@ -390,11 +411,17 @@ L$blocks_avx: jz L$even_avx pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$blocks_avx_body: movq %rdx,%r15 @@ -497,24 +524,39 @@ L$store_base2_26_avx: .p2align 4 L$done_avx: movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%r12 + movq 32(%rsp),%rbp + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + L$no_data_avx: L$blocks_avx_epilogue: .byte 0xf3,0xc3 + .p2align 5 L$base2_64_avx: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$base2_64_avx_body: movq %rdx,%r15 @@ -574,18 +616,27 @@ L$proceed_avx: movq %r15,%rdx movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%r12 + movq 32(%rsp),%rbp + movq 40(%rsp),%rbx + leaq 48(%rsp),%rax leaq 48(%rsp),%rsp + L$base2_64_avx_epilogue: jmp L$do_avx + .p2align 5 L$even_avx: + vmovd 0(%rdi),%xmm0 vmovd 4(%rdi),%xmm1 vmovd 8(%rdi),%xmm2 @@ -594,6 +645,7 @@ L$even_avx: L$do_avx: leaq -88(%rsp),%r11 + subq $0x178,%rsp subq $64,%rdx leaq -32(%rsi),%rax @@ -1153,11 +1205,13 @@ L$short_tail_avx: vmovd %xmm13,-100(%rdi) vmovd %xmm14,-96(%rdi) leaq 88(%r11),%rsp + vzeroupper .byte 0xf3,0xc3 + .p2align 5 poly1305_emit_avx: cmpl $0,20(%rdi) @@ -1214,6 +1268,7 @@ poly1305_emit_avx: .p2align 5 poly1305_blocks_avx2: + movl 20(%rdi),%r8d cmpq $128,%rdx jae L$blocks_avx2 @@ -1233,11 +1288,17 @@ L$blocks_avx2: jz L$even_avx2 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$blocks_avx2_body: movq %rdx,%r15 @@ -1346,24 +1407,39 @@ L$store_base2_26_avx2: .p2align 4 L$done_avx2: movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%r12 + movq 32(%rsp),%rbp + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + L$no_data_avx2: L$blocks_avx2_epilogue: .byte 0xf3,0xc3 + .p2align 5 L$base2_64_avx2: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$base2_64_avx2_body: movq %rdx,%r15 @@ -1426,20 +1502,32 @@ L$init_avx2: L$proceed_avx2: movq %r15,%rdx + movl _OPENSSL_ia32cap_P+8(%rip),%r10d + movl $3221291008,%r11d movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%r12 + movq 32(%rsp),%rbp + movq 40(%rsp),%rbx + leaq 48(%rsp),%rax leaq 48(%rsp),%rsp + L$base2_64_avx2_epilogue: jmp L$do_avx2 + .p2align 5 L$even_avx2: + + movl _OPENSSL_ia32cap_P+8(%rip),%r10d vmovd 0(%rdi),%xmm0 vmovd 4(%rdi),%xmm1 vmovd 8(%rdi),%xmm2 @@ -1447,10 +1535,18 @@ L$even_avx2: vmovd 16(%rdi),%xmm4 L$do_avx2: + cmpq $512,%rdx + jb L$skip_avx512 + andl %r11d,%r10d + testl $65536,%r10d + jnz L$blocks_avx512 +L$skip_avx512: leaq -8(%rsp),%r11 + subq $0x128,%rsp - leaq 48+64(%rdi),%rdi leaq L$const(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm7 vmovdqu -64(%rdi),%xmm9 @@ -1460,36 +1556,28 @@ L$do_avx2: vmovdqu -16(%rdi),%xmm11 vmovdqu 0(%rdi),%xmm12 vmovdqu 16(%rdi),%xmm13 + leaq 144(%rsp),%rax vmovdqu 32(%rdi),%xmm14 - vpermq $0x15,%ymm9,%ymm9 + vpermd %ymm9,%ymm7,%ymm9 vmovdqu 48(%rdi),%xmm15 - vpermq $0x15,%ymm10,%ymm10 - vpshufd $0xc8,%ymm9,%ymm9 + vpermd %ymm10,%ymm7,%ymm10 vmovdqu 64(%rdi),%xmm5 - vpermq $0x15,%ymm6,%ymm6 - vpshufd $0xc8,%ymm10,%ymm10 + vpermd %ymm6,%ymm7,%ymm6 vmovdqa %ymm9,0(%rsp) - vpermq $0x15,%ymm11,%ymm11 - vpshufd $0xc8,%ymm6,%ymm6 - vmovdqa %ymm10,32(%rsp) - vpermq $0x15,%ymm12,%ymm12 - vpshufd $0xc8,%ymm11,%ymm11 - vmovdqa %ymm6,64(%rsp) - vpermq $0x15,%ymm13,%ymm13 - vpshufd $0xc8,%ymm12,%ymm12 - vmovdqa %ymm11,96(%rsp) - vpermq $0x15,%ymm14,%ymm14 - vpshufd $0xc8,%ymm13,%ymm13 - vmovdqa %ymm12,128(%rsp) - vpermq $0x15,%ymm15,%ymm15 - vpshufd $0xc8,%ymm14,%ymm14 - vmovdqa %ymm13,160(%rsp) - vpermq $0x15,%ymm5,%ymm5 - vpshufd $0xc8,%ymm15,%ymm15 - vmovdqa %ymm14,192(%rsp) - vpshufd $0xc8,%ymm5,%ymm5 - vmovdqa %ymm15,224(%rsp) - vmovdqa %ymm5,256(%rsp) + vpermd %ymm11,%ymm7,%ymm11 + vmovdqa %ymm10,32-144(%rax) + vpermd %ymm12,%ymm7,%ymm12 + vmovdqa %ymm6,64-144(%rax) + vpermd %ymm13,%ymm7,%ymm13 + vmovdqa %ymm11,96-144(%rax) + vpermd %ymm14,%ymm7,%ymm14 + vmovdqa %ymm12,128-144(%rax) + vpermd %ymm15,%ymm7,%ymm15 + vmovdqa %ymm13,160-144(%rax) + vpermd %ymm5,%ymm7,%ymm5 + vmovdqa %ymm14,192-144(%rax) + vmovdqa %ymm15,224-144(%rax) + vmovdqa %ymm5,256-144(%rax) vmovdqa 64(%rcx),%ymm5 @@ -1516,7 +1604,6 @@ L$do_avx2: vpand %ymm5,%ymm10,%ymm10 vpor 32(%rcx),%ymm6,%ymm6 - leaq 144(%rsp),%rax vpaddq %ymm2,%ymm9,%ymm2 subq $64,%rdx jz L$tail_avx2 @@ -1811,9 +1898,1506 @@ L$tail_avx2: vmovd %xmm3,-100(%rdi) vmovd %xmm4,-96(%rdi) leaq 8(%r11),%rsp + vzeroupper .byte 0xf3,0xc3 + + +.p2align 5 +poly1305_blocks_avx512: + +L$blocks_avx512: + movl $15,%eax + kmovw %eax,%k2 + leaq -8(%rsp),%r11 + + subq $0x128,%rsp + leaq L$const(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm9 + + + vmovdqu -64(%rdi),%xmm11 + andq $-512,%rsp + vmovdqu -48(%rdi),%xmm12 + movq $0x20,%rax + vmovdqu -32(%rdi),%xmm7 + vmovdqu -16(%rdi),%xmm13 + vmovdqu 0(%rdi),%xmm8 + vmovdqu 16(%rdi),%xmm14 + vmovdqu 32(%rdi),%xmm10 + vmovdqu 48(%rdi),%xmm15 + vmovdqu 64(%rdi),%xmm6 + vpermd %zmm11,%zmm9,%zmm16 + vpbroadcastq 64(%rcx),%zmm5 + vpermd %zmm12,%zmm9,%zmm17 + vpermd %zmm7,%zmm9,%zmm21 + vpermd %zmm13,%zmm9,%zmm18 + vmovdqa64 %zmm16,0(%rsp){%k2} + vpsrlq $32,%zmm16,%zmm7 + vpermd %zmm8,%zmm9,%zmm22 + vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2} + vpsrlq $32,%zmm17,%zmm8 + vpermd %zmm14,%zmm9,%zmm19 + vmovdqa64 %zmm21,64(%rsp){%k2} + vpermd %zmm10,%zmm9,%zmm23 + vpermd %zmm15,%zmm9,%zmm20 + vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2} + vpermd %zmm6,%zmm9,%zmm24 + vmovdqa64 %zmm22,128(%rsp){%k2} + vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2} + vmovdqa64 %zmm23,192(%rsp){%k2} + vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2} + vmovdqa64 %zmm24,256(%rsp){%k2} + + + + + + + + + + + vpmuludq %zmm7,%zmm16,%zmm11 + vpmuludq %zmm7,%zmm17,%zmm12 + vpmuludq %zmm7,%zmm18,%zmm13 + vpmuludq %zmm7,%zmm19,%zmm14 + vpmuludq %zmm7,%zmm20,%zmm15 + vpsrlq $32,%zmm18,%zmm9 + + vpmuludq %zmm8,%zmm24,%zmm25 + vpmuludq %zmm8,%zmm16,%zmm26 + vpmuludq %zmm8,%zmm17,%zmm27 + vpmuludq %zmm8,%zmm18,%zmm28 + vpmuludq %zmm8,%zmm19,%zmm29 + vpsrlq $32,%zmm19,%zmm10 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + + vpmuludq %zmm9,%zmm23,%zmm25 + vpmuludq %zmm9,%zmm24,%zmm26 + vpmuludq %zmm9,%zmm17,%zmm28 + vpmuludq %zmm9,%zmm18,%zmm29 + vpmuludq %zmm9,%zmm16,%zmm27 + vpsrlq $32,%zmm20,%zmm6 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm10,%zmm22,%zmm25 + vpmuludq %zmm10,%zmm16,%zmm28 + vpmuludq %zmm10,%zmm17,%zmm29 + vpmuludq %zmm10,%zmm23,%zmm26 + vpmuludq %zmm10,%zmm24,%zmm27 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm6,%zmm24,%zmm28 + vpmuludq %zmm6,%zmm16,%zmm29 + vpmuludq %zmm6,%zmm21,%zmm25 + vpmuludq %zmm6,%zmm22,%zmm26 + vpmuludq %zmm6,%zmm23,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + + + vmovdqu64 0(%rsi),%zmm10 + vmovdqu64 64(%rsi),%zmm6 + leaq 128(%rsi),%rsi + + + + + vpsrlq $26,%zmm14,%zmm28 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm28,%zmm15,%zmm15 + + vpsrlq $26,%zmm11,%zmm25 + vpandq %zmm5,%zmm11,%zmm11 + vpaddq %zmm25,%zmm12,%zmm12 + + vpsrlq $26,%zmm15,%zmm29 + vpandq %zmm5,%zmm15,%zmm15 + + vpsrlq $26,%zmm12,%zmm26 + vpandq %zmm5,%zmm12,%zmm12 + vpaddq %zmm26,%zmm13,%zmm13 + + vpaddq %zmm29,%zmm11,%zmm11 + vpsllq $2,%zmm29,%zmm29 + vpaddq %zmm29,%zmm11,%zmm11 + + vpsrlq $26,%zmm13,%zmm27 + vpandq %zmm5,%zmm13,%zmm13 + vpaddq %zmm27,%zmm14,%zmm14 + + vpsrlq $26,%zmm11,%zmm25 + vpandq %zmm5,%zmm11,%zmm11 + vpaddq %zmm25,%zmm12,%zmm12 + + vpsrlq $26,%zmm14,%zmm28 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm28,%zmm15,%zmm15 + + + + + + vpunpcklqdq %zmm6,%zmm10,%zmm7 + vpunpckhqdq %zmm6,%zmm10,%zmm6 + + + + + + + vmovdqa32 128(%rcx),%zmm25 + movl $0x7777,%eax + kmovw %eax,%k1 + + vpermd %zmm16,%zmm25,%zmm16 + vpermd %zmm17,%zmm25,%zmm17 + vpermd %zmm18,%zmm25,%zmm18 + vpermd %zmm19,%zmm25,%zmm19 + vpermd %zmm20,%zmm25,%zmm20 + + vpermd %zmm11,%zmm25,%zmm16{%k1} + vpermd %zmm12,%zmm25,%zmm17{%k1} + vpermd %zmm13,%zmm25,%zmm18{%k1} + vpermd %zmm14,%zmm25,%zmm19{%k1} + vpermd %zmm15,%zmm25,%zmm20{%k1} + + vpslld $2,%zmm17,%zmm21 + vpslld $2,%zmm18,%zmm22 + vpslld $2,%zmm19,%zmm23 + vpslld $2,%zmm20,%zmm24 + vpaddd %zmm17,%zmm21,%zmm21 + vpaddd %zmm18,%zmm22,%zmm22 + vpaddd %zmm19,%zmm23,%zmm23 + vpaddd %zmm20,%zmm24,%zmm24 + + vpbroadcastq 32(%rcx),%zmm30 + + vpsrlq $52,%zmm7,%zmm9 + vpsllq $12,%zmm6,%zmm10 + vporq %zmm10,%zmm9,%zmm9 + vpsrlq $26,%zmm7,%zmm8 + vpsrlq $14,%zmm6,%zmm10 + vpsrlq $40,%zmm6,%zmm6 + vpandq %zmm5,%zmm9,%zmm9 + vpandq %zmm5,%zmm7,%zmm7 + + + + + vpaddq %zmm2,%zmm9,%zmm2 + subq $192,%rdx + jbe L$tail_avx512 + jmp L$oop_avx512 + +.p2align 5 +L$oop_avx512: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + vpmuludq %zmm2,%zmm17,%zmm14 + vpaddq %zmm0,%zmm7,%zmm0 + vpmuludq %zmm2,%zmm18,%zmm15 + vpandq %zmm5,%zmm8,%zmm8 + vpmuludq %zmm2,%zmm23,%zmm11 + vpandq %zmm5,%zmm10,%zmm10 + vpmuludq %zmm2,%zmm24,%zmm12 + vporq %zmm30,%zmm6,%zmm6 + vpmuludq %zmm2,%zmm16,%zmm13 + vpaddq %zmm1,%zmm8,%zmm1 + vpaddq %zmm3,%zmm10,%zmm3 + vpaddq %zmm4,%zmm6,%zmm4 + + vmovdqu64 0(%rsi),%zmm10 + vmovdqu64 64(%rsi),%zmm6 + leaq 128(%rsi),%rsi + vpmuludq %zmm0,%zmm19,%zmm28 + vpmuludq %zmm0,%zmm20,%zmm29 + vpmuludq %zmm0,%zmm16,%zmm25 + vpmuludq %zmm0,%zmm17,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + + vpmuludq %zmm1,%zmm18,%zmm28 + vpmuludq %zmm1,%zmm19,%zmm29 + vpmuludq %zmm1,%zmm24,%zmm25 + vpmuludq %zmm0,%zmm18,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm27,%zmm13,%zmm13 + + vpunpcklqdq %zmm6,%zmm10,%zmm7 + vpunpckhqdq %zmm6,%zmm10,%zmm6 + + vpmuludq %zmm3,%zmm16,%zmm28 + vpmuludq %zmm3,%zmm17,%zmm29 + vpmuludq %zmm1,%zmm16,%zmm26 + vpmuludq %zmm1,%zmm17,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm24,%zmm28 + vpmuludq %zmm4,%zmm16,%zmm29 + vpmuludq %zmm3,%zmm22,%zmm25 + vpmuludq %zmm3,%zmm23,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpmuludq %zmm3,%zmm24,%zmm27 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm21,%zmm25 + vpmuludq %zmm4,%zmm22,%zmm26 + vpmuludq %zmm4,%zmm23,%zmm27 + vpaddq %zmm25,%zmm11,%zmm0 + vpaddq %zmm26,%zmm12,%zmm1 + vpaddq %zmm27,%zmm13,%zmm2 + + + + + vpsrlq $52,%zmm7,%zmm9 + vpsllq $12,%zmm6,%zmm10 + + vpsrlq $26,%zmm14,%zmm3 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm3,%zmm15,%zmm4 + + vporq %zmm10,%zmm9,%zmm9 + + vpsrlq $26,%zmm0,%zmm11 + vpandq %zmm5,%zmm0,%zmm0 + vpaddq %zmm11,%zmm1,%zmm1 + + vpandq %zmm5,%zmm9,%zmm9 + + vpsrlq $26,%zmm4,%zmm15 + vpandq %zmm5,%zmm4,%zmm4 + + vpsrlq $26,%zmm1,%zmm12 + vpandq %zmm5,%zmm1,%zmm1 + vpaddq %zmm12,%zmm2,%zmm2 + + vpaddq %zmm15,%zmm0,%zmm0 + vpsllq $2,%zmm15,%zmm15 + vpaddq %zmm15,%zmm0,%zmm0 + + vpaddq %zmm9,%zmm2,%zmm2 + vpsrlq $26,%zmm7,%zmm8 + + vpsrlq $26,%zmm2,%zmm13 + vpandq %zmm5,%zmm2,%zmm2 + vpaddq %zmm13,%zmm14,%zmm3 + + vpsrlq $14,%zmm6,%zmm10 + + vpsrlq $26,%zmm0,%zmm11 + vpandq %zmm5,%zmm0,%zmm0 + vpaddq %zmm11,%zmm1,%zmm1 + + vpsrlq $40,%zmm6,%zmm6 + + vpsrlq $26,%zmm3,%zmm14 + vpandq %zmm5,%zmm3,%zmm3 + vpaddq %zmm14,%zmm4,%zmm4 + + vpandq %zmm5,%zmm7,%zmm7 + + + + + subq $128,%rdx + ja L$oop_avx512 + +L$tail_avx512: + + + + + + vpsrlq $32,%zmm16,%zmm16 + vpsrlq $32,%zmm17,%zmm17 + vpsrlq $32,%zmm18,%zmm18 + vpsrlq $32,%zmm23,%zmm23 + vpsrlq $32,%zmm24,%zmm24 + vpsrlq $32,%zmm19,%zmm19 + vpsrlq $32,%zmm20,%zmm20 + vpsrlq $32,%zmm21,%zmm21 + vpsrlq $32,%zmm22,%zmm22 + + + + leaq (%rsi,%rdx,1),%rsi + + + vpaddq %zmm0,%zmm7,%zmm0 + + vpmuludq %zmm2,%zmm17,%zmm14 + vpmuludq %zmm2,%zmm18,%zmm15 + vpmuludq %zmm2,%zmm23,%zmm11 + vpandq %zmm5,%zmm8,%zmm8 + vpmuludq %zmm2,%zmm24,%zmm12 + vpandq %zmm5,%zmm10,%zmm10 + vpmuludq %zmm2,%zmm16,%zmm13 + vporq %zmm30,%zmm6,%zmm6 + vpaddq %zmm1,%zmm8,%zmm1 + vpaddq %zmm3,%zmm10,%zmm3 + vpaddq %zmm4,%zmm6,%zmm4 + + vmovdqu 0(%rsi),%xmm7 + vpmuludq %zmm0,%zmm19,%zmm28 + vpmuludq %zmm0,%zmm20,%zmm29 + vpmuludq %zmm0,%zmm16,%zmm25 + vpmuludq %zmm0,%zmm17,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + + vmovdqu 16(%rsi),%xmm8 + vpmuludq %zmm1,%zmm18,%zmm28 + vpmuludq %zmm1,%zmm19,%zmm29 + vpmuludq %zmm1,%zmm24,%zmm25 + vpmuludq %zmm0,%zmm18,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm27,%zmm13,%zmm13 + + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vpmuludq %zmm3,%zmm16,%zmm28 + vpmuludq %zmm3,%zmm17,%zmm29 + vpmuludq %zmm1,%zmm16,%zmm26 + vpmuludq %zmm1,%zmm17,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + vpmuludq %zmm4,%zmm24,%zmm28 + vpmuludq %zmm4,%zmm16,%zmm29 + vpmuludq %zmm3,%zmm22,%zmm25 + vpmuludq %zmm3,%zmm23,%zmm26 + vpmuludq %zmm3,%zmm24,%zmm27 + vpaddq %zmm28,%zmm14,%zmm3 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm21,%zmm25 + vpmuludq %zmm4,%zmm22,%zmm26 + vpmuludq %zmm4,%zmm23,%zmm27 + vpaddq %zmm25,%zmm11,%zmm0 + vpaddq %zmm26,%zmm12,%zmm1 + vpaddq %zmm27,%zmm13,%zmm2 + + + + + movl $1,%eax + vpermq $0xb1,%zmm3,%zmm14 + vpermq $0xb1,%zmm15,%zmm4 + vpermq $0xb1,%zmm0,%zmm11 + vpermq $0xb1,%zmm1,%zmm12 + vpermq $0xb1,%zmm2,%zmm13 + vpaddq %zmm14,%zmm3,%zmm3 + vpaddq %zmm15,%zmm4,%zmm4 + vpaddq %zmm11,%zmm0,%zmm0 + vpaddq %zmm12,%zmm1,%zmm1 + vpaddq %zmm13,%zmm2,%zmm2 + + kmovw %eax,%k3 + vpermq $0x2,%zmm3,%zmm14 + vpermq $0x2,%zmm4,%zmm15 + vpermq $0x2,%zmm0,%zmm11 + vpermq $0x2,%zmm1,%zmm12 + vpermq $0x2,%zmm2,%zmm13 + vpaddq %zmm14,%zmm3,%zmm3 + vpaddq %zmm15,%zmm4,%zmm4 + vpaddq %zmm11,%zmm0,%zmm0 + vpaddq %zmm12,%zmm1,%zmm1 + vpaddq %zmm13,%zmm2,%zmm2 + + vextracti64x4 $0x1,%zmm3,%ymm14 + vextracti64x4 $0x1,%zmm4,%ymm15 + vextracti64x4 $0x1,%zmm0,%ymm11 + vextracti64x4 $0x1,%zmm1,%ymm12 + vextracti64x4 $0x1,%zmm2,%ymm13 + vpaddq %zmm14,%zmm3,%zmm3{%k3}{z} + vpaddq %zmm15,%zmm4,%zmm4{%k3}{z} + vpaddq %zmm11,%zmm0,%zmm0{%k3}{z} + vpaddq %zmm12,%zmm1,%zmm1{%k3}{z} + vpaddq %zmm13,%zmm2,%zmm2{%k3}{z} + + + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm2,%ymm9,%ymm2 + vpand %ymm5,%ymm8,%ymm8 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + vpaddq %ymm14,%ymm4,%ymm4 + + leaq 144(%rsp),%rax + addq $64,%rdx + jnz L$tail_avx2 + + vpsubq %ymm9,%ymm2,%ymm2 + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + vzeroall + leaq 8(%r11),%rsp + + .byte 0xf3,0xc3 + + + +.p2align 5 +poly1305_init_base2_44: + xorq %rax,%rax + movq %rax,0(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + +L$init_base2_44: + leaq poly1305_blocks_vpmadd52(%rip),%r10 + leaq poly1305_emit_base2_44(%rip),%r11 + + movq $0x0ffffffc0fffffff,%rax + movq $0x0ffffffc0ffffffc,%rcx + andq 0(%rsi),%rax + movq $0x00000fffffffffff,%r8 + andq 8(%rsi),%rcx + movq $0x00000fffffffffff,%r9 + andq %rax,%r8 + shrdq $44,%rcx,%rax + movq %r8,40(%rdi) + andq %r9,%rax + shrq $24,%rcx + movq %rax,48(%rdi) + leaq (%rax,%rax,4),%rax + movq %rcx,56(%rdi) + shlq $2,%rax + leaq (%rcx,%rcx,4),%rcx + shlq $2,%rcx + movq %rax,24(%rdi) + movq %rcx,32(%rdi) + movq $-1,64(%rdi) + movq %r10,0(%rdx) + movq %r11,8(%rdx) + movl $1,%eax + .byte 0xf3,0xc3 + + +.p2align 5 +poly1305_blocks_vpmadd52: + shrq $4,%rdx + jz L$no_data_vpmadd52 + + shlq $40,%rcx + movq 64(%rdi),%r8 + + + + + + + movq $3,%rax + movq $1,%r10 + cmpq $4,%rdx + cmovaeq %r10,%rax + testq %r8,%r8 + cmovnsq %r10,%rax + + andq %rdx,%rax + jz L$blocks_vpmadd52_4x + + subq %rax,%rdx + movl $7,%r10d + movl $1,%r11d + kmovw %r10d,%k7 + leaq L$2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq %rcx,%xmm21 + vmovdqa64 0(%r10),%ymm19 + vmovdqa64 32(%r10),%ymm20 + vpermq $0xcf,%ymm21,%ymm21 + vmovdqa64 64(%r10),%ymm22 + + vmovdqu64 0(%rdi),%ymm16{%k7}{z} + vmovdqu64 40(%rdi),%ymm3{%k7}{z} + vmovdqu64 32(%rdi),%ymm4{%k7}{z} + vmovdqu64 24(%rdi),%ymm5{%k7}{z} + + vmovdqa64 96(%r10),%ymm23 + vmovdqa64 128(%r10),%ymm24 + + jmp L$oop_vpmadd52 + +.p2align 5 +L$oop_vpmadd52: + vmovdqu32 0(%rsi),%xmm18 + leaq 16(%rsi),%rsi + + vpermd %ymm18,%ymm19,%ymm18 + vpsrlvq %ymm20,%ymm18,%ymm18 + vpandq %ymm22,%ymm18,%ymm18 + vporq %ymm21,%ymm18,%ymm18 + + vpaddq %ymm18,%ymm16,%ymm16 + + vpermq $0,%ymm16,%ymm0{%k7}{z} + vpermq $85,%ymm16,%ymm1{%k7}{z} + vpermq $170,%ymm16,%ymm2{%k7}{z} + + vpxord %ymm16,%ymm16,%ymm16 + vpxord %ymm17,%ymm17,%ymm17 + + vpmadd52luq %ymm3,%ymm0,%ymm16 + vpmadd52huq %ymm3,%ymm0,%ymm17 + + vpmadd52luq %ymm4,%ymm1,%ymm16 + vpmadd52huq %ymm4,%ymm1,%ymm17 + + vpmadd52luq %ymm5,%ymm2,%ymm16 + vpmadd52huq %ymm5,%ymm2,%ymm17 + + vpsrlvq %ymm23,%ymm16,%ymm18 + vpsllvq %ymm24,%ymm17,%ymm17 + vpandq %ymm22,%ymm16,%ymm16 + + vpaddq %ymm18,%ymm17,%ymm17 + + vpermq $147,%ymm17,%ymm17 + + vpaddq %ymm17,%ymm16,%ymm16 + + vpsrlvq %ymm23,%ymm16,%ymm18 + vpandq %ymm22,%ymm16,%ymm16 + + vpermq $147,%ymm18,%ymm18 + + vpaddq %ymm18,%ymm16,%ymm16 + + vpermq $147,%ymm16,%ymm18{%k1}{z} + + vpaddq %ymm18,%ymm16,%ymm16 + vpsllq $2,%ymm18,%ymm18 + + vpaddq %ymm18,%ymm16,%ymm16 + + decq %rax + jnz L$oop_vpmadd52 + + vmovdqu64 %ymm16,0(%rdi){%k7} + + testq %rdx,%rdx + jnz L$blocks_vpmadd52_4x + +L$no_data_vpmadd52: + .byte 0xf3,0xc3 + + +.p2align 5 +poly1305_blocks_vpmadd52_4x: + shrq $4,%rdx + jz L$no_data_vpmadd52_4x + + shlq $40,%rcx + movq 64(%rdi),%r8 + +L$blocks_vpmadd52_4x: + vpbroadcastq %rcx,%ymm31 + + vmovdqa64 L$x_mask44(%rip),%ymm28 + movl $5,%eax + vmovdqa64 L$x_mask42(%rip),%ymm29 + kmovw %eax,%k1 + + testq %r8,%r8 + js L$init_vpmadd52 + + vmovq 0(%rdi),%xmm0 + vmovq 8(%rdi),%xmm1 + vmovq 16(%rdi),%xmm2 + + testq $3,%rdx + jnz L$blocks_vpmadd52_2x_do + +L$blocks_vpmadd52_4x_do: + vpbroadcastq 64(%rdi),%ymm3 + vpbroadcastq 96(%rdi),%ymm4 + vpbroadcastq 128(%rdi),%ymm5 + vpbroadcastq 160(%rdi),%ymm16 + +L$blocks_vpmadd52_4x_key_loaded: + vpsllq $2,%ymm5,%ymm17 + vpaddq %ymm5,%ymm17,%ymm17 + vpsllq $2,%ymm17,%ymm17 + + testq $7,%rdx + jz L$blocks_vpmadd52_8x + + vmovdqu64 0(%rsi),%ymm26 + vmovdqu64 32(%rsi),%ymm27 + leaq 64(%rsi),%rsi + + vpunpcklqdq %ymm27,%ymm26,%ymm25 + vpunpckhqdq %ymm27,%ymm26,%ymm27 + + + + vpsrlq $24,%ymm27,%ymm26 + vporq %ymm31,%ymm26,%ymm26 + vpaddq %ymm26,%ymm2,%ymm2 + vpandq %ymm28,%ymm25,%ymm24 + vpsrlq $44,%ymm25,%ymm25 + vpsllq $20,%ymm27,%ymm27 + vporq %ymm27,%ymm25,%ymm25 + vpandq %ymm28,%ymm25,%ymm25 + + subq $4,%rdx + jz L$tail_vpmadd52_4x + jmp L$oop_vpmadd52_4x + ud2 + +.p2align 5 +L$init_vpmadd52: + vmovq 24(%rdi),%xmm16 + vmovq 56(%rdi),%xmm2 + vmovq 32(%rdi),%xmm17 + vmovq 40(%rdi),%xmm3 + vmovq 48(%rdi),%xmm4 + + vmovdqa %ymm3,%ymm0 + vmovdqa %ymm4,%ymm1 + vmovdqa %ymm2,%ymm5 + + movl $2,%eax + +L$mul_init_vpmadd52: + vpxorq %ymm18,%ymm18,%ymm18 + vpmadd52luq %ymm2,%ymm16,%ymm18 + vpxorq %ymm19,%ymm19,%ymm19 + vpmadd52huq %ymm2,%ymm16,%ymm19 + vpxorq %ymm20,%ymm20,%ymm20 + vpmadd52luq %ymm2,%ymm17,%ymm20 + vpxorq %ymm21,%ymm21,%ymm21 + vpmadd52huq %ymm2,%ymm17,%ymm21 + vpxorq %ymm22,%ymm22,%ymm22 + vpmadd52luq %ymm2,%ymm3,%ymm22 + vpxorq %ymm23,%ymm23,%ymm23 + vpmadd52huq %ymm2,%ymm3,%ymm23 + + vpmadd52luq %ymm0,%ymm3,%ymm18 + vpmadd52huq %ymm0,%ymm3,%ymm19 + vpmadd52luq %ymm0,%ymm4,%ymm20 + vpmadd52huq %ymm0,%ymm4,%ymm21 + vpmadd52luq %ymm0,%ymm5,%ymm22 + vpmadd52huq %ymm0,%ymm5,%ymm23 + + vpmadd52luq %ymm1,%ymm17,%ymm18 + vpmadd52huq %ymm1,%ymm17,%ymm19 + vpmadd52luq %ymm1,%ymm3,%ymm20 + vpmadd52huq %ymm1,%ymm3,%ymm21 + vpmadd52luq %ymm1,%ymm4,%ymm22 + vpmadd52huq %ymm1,%ymm4,%ymm23 + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm0 + vpaddq %ymm30,%ymm19,%ymm19 + + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm1 + vpaddq %ymm30,%ymm21,%ymm21 + + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm2 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + + vpsrlq $44,%ymm0,%ymm30 + vpandq %ymm28,%ymm0,%ymm0 + + vpaddq %ymm30,%ymm1,%ymm1 + + decl %eax + jz L$done_init_vpmadd52 + + vpunpcklqdq %ymm4,%ymm1,%ymm4 + vpbroadcastq %xmm1,%xmm1 + vpunpcklqdq %ymm5,%ymm2,%ymm5 + vpbroadcastq %xmm2,%xmm2 + vpunpcklqdq %ymm3,%ymm0,%ymm3 + vpbroadcastq %xmm0,%xmm0 + + vpsllq $2,%ymm4,%ymm16 + vpsllq $2,%ymm5,%ymm17 + vpaddq %ymm4,%ymm16,%ymm16 + vpaddq %ymm5,%ymm17,%ymm17 + vpsllq $2,%ymm16,%ymm16 + vpsllq $2,%ymm17,%ymm17 + + jmp L$mul_init_vpmadd52 + ud2 + +.p2align 5 +L$done_init_vpmadd52: + vinserti128 $1,%xmm4,%ymm1,%ymm4 + vinserti128 $1,%xmm5,%ymm2,%ymm5 + vinserti128 $1,%xmm3,%ymm0,%ymm3 + + vpermq $216,%ymm4,%ymm4 + vpermq $216,%ymm5,%ymm5 + vpermq $216,%ymm3,%ymm3 + + vpsllq $2,%ymm4,%ymm16 + vpaddq %ymm4,%ymm16,%ymm16 + vpsllq $2,%ymm16,%ymm16 + + vmovq 0(%rdi),%xmm0 + vmovq 8(%rdi),%xmm1 + vmovq 16(%rdi),%xmm2 + + testq $3,%rdx + jnz L$done_init_vpmadd52_2x + + vmovdqu64 %ymm3,64(%rdi) + vpbroadcastq %xmm3,%ymm3 + vmovdqu64 %ymm4,96(%rdi) + vpbroadcastq %xmm4,%ymm4 + vmovdqu64 %ymm5,128(%rdi) + vpbroadcastq %xmm5,%ymm5 + vmovdqu64 %ymm16,160(%rdi) + vpbroadcastq %xmm16,%ymm16 + + jmp L$blocks_vpmadd52_4x_key_loaded + ud2 + +.p2align 5 +L$done_init_vpmadd52_2x: + vmovdqu64 %ymm3,64(%rdi) + vpsrldq $8,%ymm3,%ymm3 + vmovdqu64 %ymm4,96(%rdi) + vpsrldq $8,%ymm4,%ymm4 + vmovdqu64 %ymm5,128(%rdi) + vpsrldq $8,%ymm5,%ymm5 + vmovdqu64 %ymm16,160(%rdi) + vpsrldq $8,%ymm16,%ymm16 + jmp L$blocks_vpmadd52_2x_key_loaded + ud2 + +.p2align 5 +L$blocks_vpmadd52_2x_do: + vmovdqu64 128+8(%rdi),%ymm5{%k1}{z} + vmovdqu64 160+8(%rdi),%ymm16{%k1}{z} + vmovdqu64 64+8(%rdi),%ymm3{%k1}{z} + vmovdqu64 96+8(%rdi),%ymm4{%k1}{z} + +L$blocks_vpmadd52_2x_key_loaded: + vmovdqu64 0(%rsi),%ymm26 + vpxorq %ymm27,%ymm27,%ymm27 + leaq 32(%rsi),%rsi + + vpunpcklqdq %ymm27,%ymm26,%ymm25 + vpunpckhqdq %ymm27,%ymm26,%ymm27 + + + + vpsrlq $24,%ymm27,%ymm26 + vporq %ymm31,%ymm26,%ymm26 + vpaddq %ymm26,%ymm2,%ymm2 + vpandq %ymm28,%ymm25,%ymm24 + vpsrlq $44,%ymm25,%ymm25 + vpsllq $20,%ymm27,%ymm27 + vporq %ymm27,%ymm25,%ymm25 + vpandq %ymm28,%ymm25,%ymm25 + + jmp L$tail_vpmadd52_2x + ud2 + +.p2align 5 +L$oop_vpmadd52_4x: + + vpaddq %ymm24,%ymm0,%ymm0 + vpaddq %ymm25,%ymm1,%ymm1 + + vpxorq %ymm18,%ymm18,%ymm18 + vpmadd52luq %ymm2,%ymm16,%ymm18 + vpxorq %ymm19,%ymm19,%ymm19 + vpmadd52huq %ymm2,%ymm16,%ymm19 + vpxorq %ymm20,%ymm20,%ymm20 + vpmadd52luq %ymm2,%ymm17,%ymm20 + vpxorq %ymm21,%ymm21,%ymm21 + vpmadd52huq %ymm2,%ymm17,%ymm21 + vpxorq %ymm22,%ymm22,%ymm22 + vpmadd52luq %ymm2,%ymm3,%ymm22 + vpxorq %ymm23,%ymm23,%ymm23 + vpmadd52huq %ymm2,%ymm3,%ymm23 + + vmovdqu64 0(%rsi),%ymm26 + vmovdqu64 32(%rsi),%ymm27 + leaq 64(%rsi),%rsi + vpmadd52luq %ymm0,%ymm3,%ymm18 + vpmadd52huq %ymm0,%ymm3,%ymm19 + vpmadd52luq %ymm0,%ymm4,%ymm20 + vpmadd52huq %ymm0,%ymm4,%ymm21 + vpmadd52luq %ymm0,%ymm5,%ymm22 + vpmadd52huq %ymm0,%ymm5,%ymm23 + + vpunpcklqdq %ymm27,%ymm26,%ymm25 + vpunpckhqdq %ymm27,%ymm26,%ymm27 + vpmadd52luq %ymm1,%ymm17,%ymm18 + vpmadd52huq %ymm1,%ymm17,%ymm19 + vpmadd52luq %ymm1,%ymm3,%ymm20 + vpmadd52huq %ymm1,%ymm3,%ymm21 + vpmadd52luq %ymm1,%ymm4,%ymm22 + vpmadd52huq %ymm1,%ymm4,%ymm23 + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm0 + vpaddq %ymm30,%ymm19,%ymm19 + + vpsrlq $24,%ymm27,%ymm26 + vporq %ymm31,%ymm26,%ymm26 + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm1 + vpaddq %ymm30,%ymm21,%ymm21 + + vpandq %ymm28,%ymm25,%ymm24 + vpsrlq $44,%ymm25,%ymm25 + vpsllq $20,%ymm27,%ymm27 + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm2 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm26,%ymm2,%ymm2 + vpaddq %ymm23,%ymm0,%ymm0 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + vporq %ymm27,%ymm25,%ymm25 + vpandq %ymm28,%ymm25,%ymm25 + + vpsrlq $44,%ymm0,%ymm30 + vpandq %ymm28,%ymm0,%ymm0 + + vpaddq %ymm30,%ymm1,%ymm1 + + subq $4,%rdx + jnz L$oop_vpmadd52_4x + +L$tail_vpmadd52_4x: + vmovdqu64 128(%rdi),%ymm5 + vmovdqu64 160(%rdi),%ymm16 + vmovdqu64 64(%rdi),%ymm3 + vmovdqu64 96(%rdi),%ymm4 + +L$tail_vpmadd52_2x: + vpsllq $2,%ymm5,%ymm17 + vpaddq %ymm5,%ymm17,%ymm17 + vpsllq $2,%ymm17,%ymm17 + + + vpaddq %ymm24,%ymm0,%ymm0 + vpaddq %ymm25,%ymm1,%ymm1 + + vpxorq %ymm18,%ymm18,%ymm18 + vpmadd52luq %ymm2,%ymm16,%ymm18 + vpxorq %ymm19,%ymm19,%ymm19 + vpmadd52huq %ymm2,%ymm16,%ymm19 + vpxorq %ymm20,%ymm20,%ymm20 + vpmadd52luq %ymm2,%ymm17,%ymm20 + vpxorq %ymm21,%ymm21,%ymm21 + vpmadd52huq %ymm2,%ymm17,%ymm21 + vpxorq %ymm22,%ymm22,%ymm22 + vpmadd52luq %ymm2,%ymm3,%ymm22 + vpxorq %ymm23,%ymm23,%ymm23 + vpmadd52huq %ymm2,%ymm3,%ymm23 + + vpmadd52luq %ymm0,%ymm3,%ymm18 + vpmadd52huq %ymm0,%ymm3,%ymm19 + vpmadd52luq %ymm0,%ymm4,%ymm20 + vpmadd52huq %ymm0,%ymm4,%ymm21 + vpmadd52luq %ymm0,%ymm5,%ymm22 + vpmadd52huq %ymm0,%ymm5,%ymm23 + + vpmadd52luq %ymm1,%ymm17,%ymm18 + vpmadd52huq %ymm1,%ymm17,%ymm19 + vpmadd52luq %ymm1,%ymm3,%ymm20 + vpmadd52huq %ymm1,%ymm3,%ymm21 + vpmadd52luq %ymm1,%ymm4,%ymm22 + vpmadd52huq %ymm1,%ymm4,%ymm23 + + + + + movl $1,%eax + kmovw %eax,%k1 + vpsrldq $8,%ymm18,%ymm24 + vpsrldq $8,%ymm19,%ymm0 + vpsrldq $8,%ymm20,%ymm25 + vpsrldq $8,%ymm21,%ymm1 + vpaddq %ymm24,%ymm18,%ymm18 + vpaddq %ymm0,%ymm19,%ymm19 + vpsrldq $8,%ymm22,%ymm26 + vpsrldq $8,%ymm23,%ymm2 + vpaddq %ymm25,%ymm20,%ymm20 + vpaddq %ymm1,%ymm21,%ymm21 + vpermq $0x2,%ymm18,%ymm24 + vpermq $0x2,%ymm19,%ymm0 + vpaddq %ymm26,%ymm22,%ymm22 + vpaddq %ymm2,%ymm23,%ymm23 + + vpermq $0x2,%ymm20,%ymm25 + vpermq $0x2,%ymm21,%ymm1 + vpaddq %ymm24,%ymm18,%ymm18{%k1}{z} + vpaddq %ymm0,%ymm19,%ymm19{%k1}{z} + vpermq $0x2,%ymm22,%ymm26 + vpermq $0x2,%ymm23,%ymm2 + vpaddq %ymm25,%ymm20,%ymm20{%k1}{z} + vpaddq %ymm1,%ymm21,%ymm21{%k1}{z} + vpaddq %ymm26,%ymm22,%ymm22{%k1}{z} + vpaddq %ymm2,%ymm23,%ymm23{%k1}{z} + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm0 + vpaddq %ymm30,%ymm19,%ymm19 + + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm1 + vpaddq %ymm30,%ymm21,%ymm21 + + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm2 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + + vpsrlq $44,%ymm0,%ymm30 + vpandq %ymm28,%ymm0,%ymm0 + + vpaddq %ymm30,%ymm1,%ymm1 + + + subq $2,%rdx + ja L$blocks_vpmadd52_4x_do + + vmovq %xmm0,0(%rdi) + vmovq %xmm1,8(%rdi) + vmovq %xmm2,16(%rdi) + vzeroall + +L$no_data_vpmadd52_4x: + .byte 0xf3,0xc3 + + +.p2align 5 +poly1305_blocks_vpmadd52_8x: + shrq $4,%rdx + jz L$no_data_vpmadd52_8x + + shlq $40,%rcx + movq 64(%rdi),%r8 + + vmovdqa64 L$x_mask44(%rip),%ymm28 + vmovdqa64 L$x_mask42(%rip),%ymm29 + + testq %r8,%r8 + js L$init_vpmadd52 + + vmovq 0(%rdi),%xmm0 + vmovq 8(%rdi),%xmm1 + vmovq 16(%rdi),%xmm2 + +L$blocks_vpmadd52_8x: + + + + vmovdqu64 128(%rdi),%ymm5 + vmovdqu64 160(%rdi),%ymm16 + vmovdqu64 64(%rdi),%ymm3 + vmovdqu64 96(%rdi),%ymm4 + + vpsllq $2,%ymm5,%ymm17 + vpaddq %ymm5,%ymm17,%ymm17 + vpsllq $2,%ymm17,%ymm17 + + vpbroadcastq %xmm5,%ymm8 + vpbroadcastq %xmm3,%ymm6 + vpbroadcastq %xmm4,%ymm7 + + vpxorq %ymm18,%ymm18,%ymm18 + vpmadd52luq %ymm8,%ymm16,%ymm18 + vpxorq %ymm19,%ymm19,%ymm19 + vpmadd52huq %ymm8,%ymm16,%ymm19 + vpxorq %ymm20,%ymm20,%ymm20 + vpmadd52luq %ymm8,%ymm17,%ymm20 + vpxorq %ymm21,%ymm21,%ymm21 + vpmadd52huq %ymm8,%ymm17,%ymm21 + vpxorq %ymm22,%ymm22,%ymm22 + vpmadd52luq %ymm8,%ymm3,%ymm22 + vpxorq %ymm23,%ymm23,%ymm23 + vpmadd52huq %ymm8,%ymm3,%ymm23 + + vpmadd52luq %ymm6,%ymm3,%ymm18 + vpmadd52huq %ymm6,%ymm3,%ymm19 + vpmadd52luq %ymm6,%ymm4,%ymm20 + vpmadd52huq %ymm6,%ymm4,%ymm21 + vpmadd52luq %ymm6,%ymm5,%ymm22 + vpmadd52huq %ymm6,%ymm5,%ymm23 + + vpmadd52luq %ymm7,%ymm17,%ymm18 + vpmadd52huq %ymm7,%ymm17,%ymm19 + vpmadd52luq %ymm7,%ymm3,%ymm20 + vpmadd52huq %ymm7,%ymm3,%ymm21 + vpmadd52luq %ymm7,%ymm4,%ymm22 + vpmadd52huq %ymm7,%ymm4,%ymm23 + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm6 + vpaddq %ymm30,%ymm19,%ymm19 + + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm7 + vpaddq %ymm30,%ymm21,%ymm21 + + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm8 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm6,%ymm6 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm6,%ymm6 + + vpsrlq $44,%ymm6,%ymm30 + vpandq %ymm28,%ymm6,%ymm6 + + vpaddq %ymm30,%ymm7,%ymm7 + + + + + + vpunpcklqdq %ymm5,%ymm8,%ymm26 + vpunpckhqdq %ymm5,%ymm8,%ymm5 + vpunpcklqdq %ymm3,%ymm6,%ymm24 + vpunpckhqdq %ymm3,%ymm6,%ymm3 + vpunpcklqdq %ymm4,%ymm7,%ymm25 + vpunpckhqdq %ymm4,%ymm7,%ymm4 + vshufi64x2 $0x44,%zmm5,%zmm26,%zmm8 + vshufi64x2 $0x44,%zmm3,%zmm24,%zmm6 + vshufi64x2 $0x44,%zmm4,%zmm25,%zmm7 + + vmovdqu64 0(%rsi),%zmm26 + vmovdqu64 64(%rsi),%zmm27 + leaq 128(%rsi),%rsi + + vpsllq $2,%zmm8,%zmm10 + vpsllq $2,%zmm7,%zmm9 + vpaddq %zmm8,%zmm10,%zmm10 + vpaddq %zmm7,%zmm9,%zmm9 + vpsllq $2,%zmm10,%zmm10 + vpsllq $2,%zmm9,%zmm9 + + vpbroadcastq %rcx,%zmm31 + vpbroadcastq %xmm28,%zmm28 + vpbroadcastq %xmm29,%zmm29 + + vpbroadcastq %xmm9,%zmm16 + vpbroadcastq %xmm10,%zmm17 + vpbroadcastq %xmm6,%zmm3 + vpbroadcastq %xmm7,%zmm4 + vpbroadcastq %xmm8,%zmm5 + + vpunpcklqdq %zmm27,%zmm26,%zmm25 + vpunpckhqdq %zmm27,%zmm26,%zmm27 + + + + vpsrlq $24,%zmm27,%zmm26 + vporq %zmm31,%zmm26,%zmm26 + vpaddq %zmm26,%zmm2,%zmm2 + vpandq %zmm28,%zmm25,%zmm24 + vpsrlq $44,%zmm25,%zmm25 + vpsllq $20,%zmm27,%zmm27 + vporq %zmm27,%zmm25,%zmm25 + vpandq %zmm28,%zmm25,%zmm25 + + subq $8,%rdx + jz L$tail_vpmadd52_8x + jmp L$oop_vpmadd52_8x + +.p2align 5 +L$oop_vpmadd52_8x: + + vpaddq %zmm24,%zmm0,%zmm0 + vpaddq %zmm25,%zmm1,%zmm1 + + vpxorq %zmm18,%zmm18,%zmm18 + vpmadd52luq %zmm2,%zmm16,%zmm18 + vpxorq %zmm19,%zmm19,%zmm19 + vpmadd52huq %zmm2,%zmm16,%zmm19 + vpxorq %zmm20,%zmm20,%zmm20 + vpmadd52luq %zmm2,%zmm17,%zmm20 + vpxorq %zmm21,%zmm21,%zmm21 + vpmadd52huq %zmm2,%zmm17,%zmm21 + vpxorq %zmm22,%zmm22,%zmm22 + vpmadd52luq %zmm2,%zmm3,%zmm22 + vpxorq %zmm23,%zmm23,%zmm23 + vpmadd52huq %zmm2,%zmm3,%zmm23 + + vmovdqu64 0(%rsi),%zmm26 + vmovdqu64 64(%rsi),%zmm27 + leaq 128(%rsi),%rsi + vpmadd52luq %zmm0,%zmm3,%zmm18 + vpmadd52huq %zmm0,%zmm3,%zmm19 + vpmadd52luq %zmm0,%zmm4,%zmm20 + vpmadd52huq %zmm0,%zmm4,%zmm21 + vpmadd52luq %zmm0,%zmm5,%zmm22 + vpmadd52huq %zmm0,%zmm5,%zmm23 + + vpunpcklqdq %zmm27,%zmm26,%zmm25 + vpunpckhqdq %zmm27,%zmm26,%zmm27 + vpmadd52luq %zmm1,%zmm17,%zmm18 + vpmadd52huq %zmm1,%zmm17,%zmm19 + vpmadd52luq %zmm1,%zmm3,%zmm20 + vpmadd52huq %zmm1,%zmm3,%zmm21 + vpmadd52luq %zmm1,%zmm4,%zmm22 + vpmadd52huq %zmm1,%zmm4,%zmm23 + + + + vpsrlq $44,%zmm18,%zmm30 + vpsllq $8,%zmm19,%zmm19 + vpandq %zmm28,%zmm18,%zmm0 + vpaddq %zmm30,%zmm19,%zmm19 + + vpsrlq $24,%zmm27,%zmm26 + vporq %zmm31,%zmm26,%zmm26 + vpaddq %zmm19,%zmm20,%zmm20 + + vpsrlq $44,%zmm20,%zmm30 + vpsllq $8,%zmm21,%zmm21 + vpandq %zmm28,%zmm20,%zmm1 + vpaddq %zmm30,%zmm21,%zmm21 + + vpandq %zmm28,%zmm25,%zmm24 + vpsrlq $44,%zmm25,%zmm25 + vpsllq $20,%zmm27,%zmm27 + vpaddq %zmm21,%zmm22,%zmm22 + + vpsrlq $42,%zmm22,%zmm30 + vpsllq $10,%zmm23,%zmm23 + vpandq %zmm29,%zmm22,%zmm2 + vpaddq %zmm30,%zmm23,%zmm23 + + vpaddq %zmm26,%zmm2,%zmm2 + vpaddq %zmm23,%zmm0,%zmm0 + vpsllq $2,%zmm23,%zmm23 + + vpaddq %zmm23,%zmm0,%zmm0 + vporq %zmm27,%zmm25,%zmm25 + vpandq %zmm28,%zmm25,%zmm25 + + vpsrlq $44,%zmm0,%zmm30 + vpandq %zmm28,%zmm0,%zmm0 + + vpaddq %zmm30,%zmm1,%zmm1 + + subq $8,%rdx + jnz L$oop_vpmadd52_8x + +L$tail_vpmadd52_8x: + + vpaddq %zmm24,%zmm0,%zmm0 + vpaddq %zmm25,%zmm1,%zmm1 + + vpxorq %zmm18,%zmm18,%zmm18 + vpmadd52luq %zmm2,%zmm9,%zmm18 + vpxorq %zmm19,%zmm19,%zmm19 + vpmadd52huq %zmm2,%zmm9,%zmm19 + vpxorq %zmm20,%zmm20,%zmm20 + vpmadd52luq %zmm2,%zmm10,%zmm20 + vpxorq %zmm21,%zmm21,%zmm21 + vpmadd52huq %zmm2,%zmm10,%zmm21 + vpxorq %zmm22,%zmm22,%zmm22 + vpmadd52luq %zmm2,%zmm6,%zmm22 + vpxorq %zmm23,%zmm23,%zmm23 + vpmadd52huq %zmm2,%zmm6,%zmm23 + + vpmadd52luq %zmm0,%zmm6,%zmm18 + vpmadd52huq %zmm0,%zmm6,%zmm19 + vpmadd52luq %zmm0,%zmm7,%zmm20 + vpmadd52huq %zmm0,%zmm7,%zmm21 + vpmadd52luq %zmm0,%zmm8,%zmm22 + vpmadd52huq %zmm0,%zmm8,%zmm23 + + vpmadd52luq %zmm1,%zmm10,%zmm18 + vpmadd52huq %zmm1,%zmm10,%zmm19 + vpmadd52luq %zmm1,%zmm6,%zmm20 + vpmadd52huq %zmm1,%zmm6,%zmm21 + vpmadd52luq %zmm1,%zmm7,%zmm22 + vpmadd52huq %zmm1,%zmm7,%zmm23 + + + + + movl $1,%eax + kmovw %eax,%k1 + vpsrldq $8,%zmm18,%zmm24 + vpsrldq $8,%zmm19,%zmm0 + vpsrldq $8,%zmm20,%zmm25 + vpsrldq $8,%zmm21,%zmm1 + vpaddq %zmm24,%zmm18,%zmm18 + vpaddq %zmm0,%zmm19,%zmm19 + vpsrldq $8,%zmm22,%zmm26 + vpsrldq $8,%zmm23,%zmm2 + vpaddq %zmm25,%zmm20,%zmm20 + vpaddq %zmm1,%zmm21,%zmm21 + vpermq $0x2,%zmm18,%zmm24 + vpermq $0x2,%zmm19,%zmm0 + vpaddq %zmm26,%zmm22,%zmm22 + vpaddq %zmm2,%zmm23,%zmm23 + + vpermq $0x2,%zmm20,%zmm25 + vpermq $0x2,%zmm21,%zmm1 + vpaddq %zmm24,%zmm18,%zmm18 + vpaddq %zmm0,%zmm19,%zmm19 + vpermq $0x2,%zmm22,%zmm26 + vpermq $0x2,%zmm23,%zmm2 + vpaddq %zmm25,%zmm20,%zmm20 + vpaddq %zmm1,%zmm21,%zmm21 + vextracti64x4 $1,%zmm18,%ymm24 + vextracti64x4 $1,%zmm19,%ymm0 + vpaddq %zmm26,%zmm22,%zmm22 + vpaddq %zmm2,%zmm23,%zmm23 + + vextracti64x4 $1,%zmm20,%ymm25 + vextracti64x4 $1,%zmm21,%ymm1 + vextracti64x4 $1,%zmm22,%ymm26 + vextracti64x4 $1,%zmm23,%ymm2 + vpaddq %ymm24,%ymm18,%ymm18{%k1}{z} + vpaddq %ymm0,%ymm19,%ymm19{%k1}{z} + vpaddq %ymm25,%ymm20,%ymm20{%k1}{z} + vpaddq %ymm1,%ymm21,%ymm21{%k1}{z} + vpaddq %ymm26,%ymm22,%ymm22{%k1}{z} + vpaddq %ymm2,%ymm23,%ymm23{%k1}{z} + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm0 + vpaddq %ymm30,%ymm19,%ymm19 + + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm1 + vpaddq %ymm30,%ymm21,%ymm21 + + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm2 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + + vpsrlq $44,%ymm0,%ymm30 + vpandq %ymm28,%ymm0,%ymm0 + + vpaddq %ymm30,%ymm1,%ymm1 + + + + vmovq %xmm0,0(%rdi) + vmovq %xmm1,8(%rdi) + vmovq %xmm2,16(%rdi) + vzeroall + +L$no_data_vpmadd52_8x: + .byte 0xf3,0xc3 + + +.p2align 5 +poly1305_emit_base2_44: + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + + movq %r9,%rax + shrq $20,%r9 + shlq $44,%rax + movq %r10,%rcx + shrq $40,%r10 + shlq $24,%rcx + + addq %rax,%r8 + adcq %rcx,%r9 + adcq $0,%r10 + + movq %r8,%rax + addq $5,%r8 + movq %r9,%rcx + adcq $0,%r9 + adcq $0,%r10 + shrq $2,%r10 + cmovnzq %r8,%rax + cmovnzq %r9,%rcx + + addq 0(%rdx),%rax + adcq 8(%rdx),%rcx + movq %rax,0(%rsi) + movq %rcx,8(%rsi) + + .byte 0xf3,0xc3 + .p2align 6 L$const: L$mask24: @@ -1822,7 +3406,125 @@ L$129: .long 16777216,0,16777216,0,16777216,0,16777216,0 L$mask26: .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -L$five: -.long 5,0,5,0,5,0,5,0 +L$permd_avx2: +.long 2,2,2,3,2,0,2,1 +L$permd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +L$2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +L$2_44_inp_shift: +.quad 0,12,24,64 +L$2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +L$2_44_shift_rgt: +.quad 44,44,42,64 +L$2_44_shift_lft: +.quad 8,8,10,64 + +.p2align 6 +L$x_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +L$x_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 +.globl _xor128_encrypt_n_pad + +.p2align 4 +_xor128_encrypt_n_pad: + subq %rdx,%rsi + subq %rdx,%rdi + movq %rcx,%r10 + shrq $4,%rcx + jz L$tail_enc + nop +L$oop_enc_xmm: + movdqu (%rsi,%rdx,1),%xmm0 + pxor (%rdx),%xmm0 + movdqu %xmm0,(%rdi,%rdx,1) + movdqa %xmm0,(%rdx) + leaq 16(%rdx),%rdx + decq %rcx + jnz L$oop_enc_xmm + + andq $15,%r10 + jz L$done_enc + +L$tail_enc: + movq $16,%rcx + subq %r10,%rcx + xorl %eax,%eax +L$oop_enc_byte: + movb (%rsi,%rdx,1),%al + xorb (%rdx),%al + movb %al,(%rdi,%rdx,1) + movb %al,(%rdx) + leaq 1(%rdx),%rdx + decq %r10 + jnz L$oop_enc_byte + + xorl %eax,%eax +L$oop_enc_pad: + movb %al,(%rdx) + leaq 1(%rdx),%rdx + decq %rcx + jnz L$oop_enc_pad + +L$done_enc: + movq %rdx,%rax + .byte 0xf3,0xc3 + + +.globl _xor128_decrypt_n_pad + +.p2align 4 +_xor128_decrypt_n_pad: + subq %rdx,%rsi + subq %rdx,%rdi + movq %rcx,%r10 + shrq $4,%rcx + jz L$tail_dec + nop +L$oop_dec_xmm: + movdqu (%rsi,%rdx,1),%xmm0 + movdqa (%rdx),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,(%rdi,%rdx,1) + movdqa %xmm0,(%rdx) + leaq 16(%rdx),%rdx + decq %rcx + jnz L$oop_dec_xmm + + pxor %xmm1,%xmm1 + andq $15,%r10 + jz L$done_dec + +L$tail_dec: + movq $16,%rcx + subq %r10,%rcx + xorl %eax,%eax + xorq %r11,%r11 +L$oop_dec_byte: + movb (%rsi,%rdx,1),%r11b + movb (%rdx),%al + xorb %r11b,%al + movb %al,(%rdi,%rdx,1) + movb %r11b,(%rdx) + leaq 1(%rdx),%rdx + decq %r10 + jnz L$oop_dec_byte + + xorl %eax,%eax +L$oop_dec_pad: + movb %al,(%rdx) + leaq 1(%rdx),%rdx + decq %rcx + jnz L$oop_dec_pad + +L$done_dec: + movq %rdx,%rax + .byte 0xf3,0xc3 + diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s index 47dce361a6..435976a260 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-md5-x86_64.s @@ -4,15 +4,23 @@ .globl _rc4_md5_enc _rc4_md5_enc: + cmpq $0,%r9 je L$abort pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $40,%rsp + L$body: movq %rcx,%r11 movq %r9,%r12 @@ -1247,13 +1255,21 @@ L$oop: movl %ecx,-4(%rdi) movq 40(%rsp),%r15 + movq 48(%rsp),%r14 + movq 56(%rsp),%r13 + movq 64(%rsp),%r12 + movq 72(%rsp),%rbp + movq 80(%rsp),%rbx + leaq 88(%rsp),%rsp + L$epilogue: L$abort: .byte 0xf3,0xc3 + diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s index 86ef486662..8bfb48b74a 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/rc4/rc4-x86_64.s @@ -8,9 +8,13 @@ _RC4: orq %rsi,%rsi jne L$entry .byte 0xf3,0xc3 L$entry: + pushq %rbx + pushq %r12 + pushq %r13 + L$prologue: movq %rsi,%r11 movq %rdx,%r12 @@ -511,12 +515,17 @@ L$exit: movl %ecx,-4(%rdi) movq (%rsp),%r13 + movq 8(%rsp),%r12 + movq 16(%rsp),%rbx + addq $24,%rsp + L$epilogue: .byte 0xf3,0xc3 + .globl _RC4_set_key .p2align 4 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/keccak1600-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/keccak1600-x86_64.s new file mode 100644 index 0000000000..ec096c5ab0 --- /dev/null +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/keccak1600-x86_64.s @@ -0,0 +1,492 @@ +.text + + +.p2align 5 +__KeccakF1600: + movq 60(%rdi),%rax + movq 68(%rdi),%rbx + movq 76(%rdi),%rcx + movq 84(%rdi),%rdx + movq 92(%rdi),%rbp + jmp L$oop + +.p2align 5 +L$oop: + movq -100(%rdi),%r8 + movq -52(%rdi),%r9 + movq -4(%rdi),%r10 + movq 44(%rdi),%r11 + + xorq -84(%rdi),%rcx + xorq -76(%rdi),%rdx + xorq %r8,%rax + xorq -92(%rdi),%rbx + xorq -44(%rdi),%rcx + xorq -60(%rdi),%rax + movq %rbp,%r12 + xorq -68(%rdi),%rbp + + xorq %r10,%rcx + xorq -20(%rdi),%rax + xorq -36(%rdi),%rdx + xorq %r9,%rbx + xorq -28(%rdi),%rbp + + xorq 36(%rdi),%rcx + xorq 20(%rdi),%rax + xorq 4(%rdi),%rdx + xorq -12(%rdi),%rbx + xorq 12(%rdi),%rbp + + movq %rcx,%r13 + rolq $1,%rcx + xorq %rax,%rcx + xorq %r11,%rdx + + rolq $1,%rax + xorq %rdx,%rax + xorq 28(%rdi),%rbx + + rolq $1,%rdx + xorq %rbx,%rdx + xorq 52(%rdi),%rbp + + rolq $1,%rbx + xorq %rbp,%rbx + + rolq $1,%rbp + xorq %r13,%rbp + xorq %rcx,%r9 + xorq %rdx,%r10 + rolq $44,%r9 + xorq %rbp,%r11 + xorq %rax,%r12 + rolq $43,%r10 + xorq %rbx,%r8 + movq %r9,%r13 + rolq $21,%r11 + orq %r10,%r9 + xorq %r8,%r9 + rolq $14,%r12 + + xorq (%r15),%r9 + leaq 8(%r15),%r15 + + movq %r12,%r14 + andq %r11,%r12 + movq %r9,-100(%rsi) + xorq %r10,%r12 + notq %r10 + movq %r12,-84(%rsi) + + orq %r11,%r10 + movq 76(%rdi),%r12 + xorq %r13,%r10 + movq %r10,-92(%rsi) + + andq %r8,%r13 + movq -28(%rdi),%r9 + xorq %r14,%r13 + movq -20(%rdi),%r10 + movq %r13,-68(%rsi) + + orq %r8,%r14 + movq -76(%rdi),%r8 + xorq %r11,%r14 + movq 28(%rdi),%r11 + movq %r14,-76(%rsi) + + + xorq %rbp,%r8 + xorq %rdx,%r12 + rolq $28,%r8 + xorq %rcx,%r11 + xorq %rax,%r9 + rolq $61,%r12 + rolq $45,%r11 + xorq %rbx,%r10 + rolq $20,%r9 + movq %r8,%r13 + orq %r12,%r8 + rolq $3,%r10 + + xorq %r11,%r8 + movq %r8,-36(%rsi) + + movq %r9,%r14 + andq %r13,%r9 + movq -92(%rdi),%r8 + xorq %r12,%r9 + notq %r12 + movq %r9,-28(%rsi) + + orq %r11,%r12 + movq -44(%rdi),%r9 + xorq %r10,%r12 + movq %r12,-44(%rsi) + + andq %r10,%r11 + movq 60(%rdi),%r12 + xorq %r14,%r11 + movq %r11,-52(%rsi) + + orq %r10,%r14 + movq 4(%rdi),%r10 + xorq %r13,%r14 + movq 52(%rdi),%r11 + movq %r14,-60(%rsi) + + + xorq %rbp,%r10 + xorq %rax,%r11 + rolq $25,%r10 + xorq %rdx,%r9 + rolq $8,%r11 + xorq %rbx,%r12 + rolq $6,%r9 + xorq %rcx,%r8 + rolq $18,%r12 + movq %r10,%r13 + andq %r11,%r10 + rolq $1,%r8 + + notq %r11 + xorq %r9,%r10 + movq %r10,-12(%rsi) + + movq %r12,%r14 + andq %r11,%r12 + movq -12(%rdi),%r10 + xorq %r13,%r12 + movq %r12,-4(%rsi) + + orq %r9,%r13 + movq 84(%rdi),%r12 + xorq %r8,%r13 + movq %r13,-20(%rsi) + + andq %r8,%r9 + xorq %r14,%r9 + movq %r9,12(%rsi) + + orq %r8,%r14 + movq -60(%rdi),%r9 + xorq %r11,%r14 + movq 36(%rdi),%r11 + movq %r14,4(%rsi) + + + movq -68(%rdi),%r8 + + xorq %rcx,%r10 + xorq %rdx,%r11 + rolq $10,%r10 + xorq %rbx,%r9 + rolq $15,%r11 + xorq %rbp,%r12 + rolq $36,%r9 + xorq %rax,%r8 + rolq $56,%r12 + movq %r10,%r13 + orq %r11,%r10 + rolq $27,%r8 + + notq %r11 + xorq %r9,%r10 + movq %r10,28(%rsi) + + movq %r12,%r14 + orq %r11,%r12 + xorq %r13,%r12 + movq %r12,36(%rsi) + + andq %r9,%r13 + xorq %r8,%r13 + movq %r13,20(%rsi) + + orq %r8,%r9 + xorq %r14,%r9 + movq %r9,52(%rsi) + + andq %r14,%r8 + xorq %r11,%r8 + movq %r8,44(%rsi) + + + xorq -84(%rdi),%rdx + xorq -36(%rdi),%rbp + rolq $62,%rdx + xorq 68(%rdi),%rcx + rolq $55,%rbp + xorq 12(%rdi),%rax + rolq $2,%rcx + xorq 20(%rdi),%rbx + xchgq %rsi,%rdi + rolq $39,%rax + rolq $41,%rbx + movq %rdx,%r13 + andq %rbp,%rdx + notq %rbp + xorq %rcx,%rdx + movq %rdx,92(%rdi) + + movq %rax,%r14 + andq %rbp,%rax + xorq %r13,%rax + movq %rax,60(%rdi) + + orq %rcx,%r13 + xorq %rbx,%r13 + movq %r13,84(%rdi) + + andq %rbx,%rcx + xorq %r14,%rcx + movq %rcx,76(%rdi) + + orq %r14,%rbx + xorq %rbp,%rbx + movq %rbx,68(%rdi) + + movq %rdx,%rbp + movq %r13,%rdx + + testq $255,%r15 + jnz L$oop + + leaq -192(%r15),%r15 + .byte 0xf3,0xc3 + + + +.p2align 5 +KeccakF1600: + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + leaq 100(%rdi),%rdi + subq $200,%rsp + + + notq -92(%rdi) + notq -84(%rdi) + notq -36(%rdi) + notq -4(%rdi) + notq 36(%rdi) + notq 60(%rdi) + + leaq iotas(%rip),%r15 + leaq 100(%rsp),%rsi + + call __KeccakF1600 + + notq -92(%rdi) + notq -84(%rdi) + notq -36(%rdi) + notq -4(%rdi) + notq 36(%rdi) + notq 60(%rdi) + leaq -100(%rdi),%rdi + + addq $200,%rsp + + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbp + + popq %rbx + + .byte 0xf3,0xc3 + + +.globl _SHA3_absorb + +.p2align 5 +_SHA3_absorb: + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + leaq 100(%rdi),%rdi + subq $232,%rsp + + + movq %rsi,%r9 + leaq 100(%rsp),%rsi + + notq -92(%rdi) + notq -84(%rdi) + notq -36(%rdi) + notq -4(%rdi) + notq 36(%rdi) + notq 60(%rdi) + leaq iotas(%rip),%r15 + + movq %rcx,216-100(%rsi) + +L$oop_absorb: + cmpq %rcx,%rdx + jc L$done_absorb + + shrq $3,%rcx + leaq -100(%rdi),%r8 + +L$block_absorb: + movq (%r9),%rax + leaq 8(%r9),%r9 + xorq (%r8),%rax + leaq 8(%r8),%r8 + subq $8,%rdx + movq %rax,-8(%r8) + subq $1,%rcx + jnz L$block_absorb + + movq %r9,200-100(%rsi) + movq %rdx,208-100(%rsi) + call __KeccakF1600 + movq 200-100(%rsi),%r9 + movq 208-100(%rsi),%rdx + movq 216-100(%rsi),%rcx + jmp L$oop_absorb + +.p2align 5 +L$done_absorb: + movq %rdx,%rax + + notq -92(%rdi) + notq -84(%rdi) + notq -36(%rdi) + notq -4(%rdi) + notq 36(%rdi) + notq 60(%rdi) + + addq $232,%rsp + + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbp + + popq %rbx + + .byte 0xf3,0xc3 + + +.globl _SHA3_squeeze + +.p2align 5 +_SHA3_squeeze: + + pushq %r12 + + pushq %r13 + + pushq %r14 + + + shrq $3,%rcx + movq %rdi,%r8 + movq %rsi,%r12 + movq %rdx,%r13 + movq %rcx,%r14 + jmp L$oop_squeeze + +.p2align 5 +L$oop_squeeze: + cmpq $8,%r13 + jb L$tail_squeeze + + movq (%r8),%rax + leaq 8(%r8),%r8 + movq %rax,(%r12) + leaq 8(%r12),%r12 + subq $8,%r13 + jz L$done_squeeze + + subq $1,%rcx + jnz L$oop_squeeze + + call KeccakF1600 + movq %rdi,%r8 + movq %r14,%rcx + jmp L$oop_squeeze + +L$tail_squeeze: + movq %r8,%rsi + movq %r12,%rdi + movq %r13,%rcx +.byte 0xf3,0xa4 + +L$done_squeeze: + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + + +.p2align 8 +.quad 0,0,0,0,0,0,0,0 + +iotas: +.quad 0x0000000000000001 +.quad 0x0000000000008082 +.quad 0x800000000000808a +.quad 0x8000000080008000 +.quad 0x000000000000808b +.quad 0x0000000080000001 +.quad 0x8000000080008081 +.quad 0x8000000000008009 +.quad 0x000000000000008a +.quad 0x0000000000000088 +.quad 0x0000000080008009 +.quad 0x000000008000000a +.quad 0x000000008000808b +.quad 0x800000000000008b +.quad 0x8000000000008089 +.quad 0x8000000000008003 +.quad 0x8000000000008002 +.quad 0x8000000000000080 +.quad 0x000000000000800a +.quad 0x800000008000000a +.quad 0x8000000080008081 +.quad 0x8000000000008080 +.quad 0x0000000080000001 +.quad 0x8000000080008008 + +.byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s index 7026de0e76..b2009fb28f 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-mb-x86_64.s @@ -6,17 +6,22 @@ .p2align 5 _sha1_multi_block: + movq _OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut testl $268435456,%ecx jnz _avx_shortcut movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) + L$body: leaq K_XX_XX(%rip),%rbp leaq 256(%rsp),%rbx @@ -2546,19 +2551,28 @@ L$oop: L$done: movq 272(%rsp),%rax + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 5 sha1_multi_block_shaext: + _shaext_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp shll $1,%edx andq $-256,%rsp @@ -2914,14 +2928,19 @@ L$oop_shaext: L$done_shaext: movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$epilogue_shaext: .byte 0xf3,0xc3 + .p2align 5 sha1_multi_block_avx: + _avx_shortcut: shrq $32,%rcx cmpl $2,%edx @@ -2932,11 +2951,15 @@ _avx_shortcut: .p2align 5 L$avx: movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) + L$body_avx: leaq K_XX_XX(%rip),%rbp leaq 256(%rsp),%rbx @@ -4986,27 +5009,41 @@ L$oop_avx: L$done_avx: movq 272(%rsp),%rax + vzeroupper movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 5 sha1_multi_block_avx2: + _avx2_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576,%rsp andq $-256,%rsp movq %rax,544(%rsp) + L$body_avx2: leaq K_XX_XX(%rip),%rbp shrl $1,%edx @@ -7193,18 +7230,27 @@ L$oop_avx2: L$done_avx2: movq 544(%rsp),%rax + vzeroupper movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$epilogue_avx2: .byte 0xf3,0xc3 + .p2align 8 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s index 3e3633911f..02472d0b7d 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha1-x86_64.s @@ -5,6 +5,7 @@ .p2align 4 _sha1_block_data_order: + movl _OPENSSL_ia32cap_P+0(%rip),%r9d movl _OPENSSL_ia32cap_P+4(%rip),%r8d movl _OPENSSL_ia32cap_P+8(%rip),%r10d @@ -25,17 +26,24 @@ _sha1_block_data_order: .p2align 4 L$ialu: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 movq %rax,64(%rsp) + L$prologue: movl 0(%r8),%esi @@ -1230,19 +1238,28 @@ L$loop: jnz L$loop movq 64(%rsp),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 5 sha1_block_data_order_shaext: _shaext_shortcut: + movdqu (%rdi),%xmm0 movd 16(%rdi),%xmm1 movdqa K_XX_XX+160(%rip),%xmm3 @@ -1404,20 +1421,27 @@ L$oop_shaext: pshufd $27,%xmm1,%xmm1 movdqu %xmm0,(%rdi) movd %xmm1,16(%rdi) + .byte 0xf3,0xc3 .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: - movq %rsp,%rax + + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -1425,7 +1449,7 @@ _ssse3_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1437,8 +1461,8 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -1514,7 +1538,7 @@ L$oop_ssse3: pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx - movdqa -64(%r11),%xmm10 + movdqa -64(%r14),%xmm10 roll $5,%ecx addl %edi,%ebx andl %edx,%esi @@ -1575,7 +1599,7 @@ L$oop_ssse3: pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp - movdqa -32(%r11),%xmm8 + movdqa -32(%r14),%xmm8 roll $5,%edx addl %edi,%ecx andl %ebp,%esi @@ -1636,7 +1660,7 @@ L$oop_ssse3: pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax - movdqa -32(%r11),%xmm9 + movdqa -32(%r14),%xmm9 roll $5,%ebp addl %edi,%edx andl %eax,%esi @@ -1697,7 +1721,7 @@ L$oop_ssse3: pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx - movdqa -32(%r11),%xmm10 + movdqa -32(%r14),%xmm10 roll $5,%eax addl %edi,%ebp andl %ebx,%esi @@ -1808,7 +1832,7 @@ L$oop_ssse3: pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 0(%r11),%xmm10 + movdqa 0(%r14),%xmm10 rorl $7,%ecx paddd %xmm1,%xmm9 addl %ebx,%eax @@ -2043,7 +2067,7 @@ L$oop_ssse3: pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - movdqa 32(%r11),%xmm9 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi paddd %xmm6,%xmm8 xorl %edx,%ecx @@ -2334,8 +2358,8 @@ L$oop_ssse3: addl %edx,%ecx cmpq %r10,%r9 je L$done_ssse3 - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2572,29 +2596,41 @@ L$done_ssse3: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + L$epilogue_ssse3: .byte 0xf3,0xc3 + .p2align 4 sha1_block_data_order_avx: _avx_shortcut: - movq %rsp,%rax + + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp vzeroupper - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -2602,7 +2638,7 @@ _avx_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -2614,8 +2650,8 @@ _avx_shortcut: xorl %edx,%edi andl %edi,%esi - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -2740,7 +2776,7 @@ L$oop_avx: vpxor %xmm10,%xmm5,%xmm5 xorl %eax,%ebp shldl $5,%edx,%edx - vmovdqa -32(%r11),%xmm11 + vmovdqa -32(%r14),%xmm11 addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp @@ -2953,7 +2989,7 @@ L$oop_avx: addl %esi,%eax xorl %edx,%edi vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r11),%xmm11 + vmovdqa 0(%r14),%xmm11 shrdl $7,%ecx,%ecx addl %ebx,%eax vpxor %xmm8,%xmm2,%xmm2 @@ -3172,7 +3208,7 @@ L$oop_avx: movl %ebx,%edi xorl %edx,%esi vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r11),%xmm11 + vmovdqa 32(%r14),%xmm11 shldl $5,%ebx,%ebx addl %esi,%eax vpxor %xmm8,%xmm7,%xmm7 @@ -3451,8 +3487,8 @@ L$oop_avx: addl %edx,%ecx cmpq %r10,%r9 je L$done_avx - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -3688,28 +3724,40 @@ L$done_avx: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 4 sha1_block_data_order_avx2: _avx2_shortcut: - movq %rsp,%rax + + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + vzeroupper - movq %rax,%r14 movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 @@ -3719,7 +3767,7 @@ _avx2_shortcut: leaq 64(%r9),%r13 andq $-128,%rsp addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax cmpq %r10,%r13 @@ -3728,7 +3776,7 @@ _avx2_shortcut: movl 8(%r8),%ecx movl 12(%r8),%edx movl 16(%r8),%esi - vmovdqu 64(%r11),%ymm6 + vmovdqu 64(%r14),%ymm6 vmovdqu (%r9),%xmm0 vmovdqu 16(%r9),%xmm1 @@ -3742,7 +3790,7 @@ _avx2_shortcut: vpshufb %ymm6,%ymm1,%ymm1 vinserti128 $1,48(%r13),%ymm3,%ymm3 vpshufb %ymm6,%ymm2,%ymm2 - vmovdqu -64(%r11),%ymm11 + vmovdqu -64(%r14),%ymm11 vpshufb %ymm6,%ymm3,%ymm3 vpaddd %ymm11,%ymm0,%ymm4 @@ -3774,7 +3822,7 @@ _avx2_shortcut: vpxor %ymm3,%ymm8,%ymm8 vpxor %ymm8,%ymm5,%ymm5 vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r11),%ymm11 + vmovdqu -32(%r14),%ymm11 vpslldq $12,%ymm5,%ymm10 vpaddd %ymm5,%ymm5,%ymm5 vpsrld $30,%ymm10,%ymm9 @@ -3928,7 +3976,7 @@ L$align32_1: addl -56(%r13),%ebp andnl %esi,%ebx,%edi vpxor %ymm3,%ymm2,%ymm2 - vmovdqu 0(%r11),%ymm11 + vmovdqu 0(%r14),%ymm11 addl %ecx,%ebp rorxl $27,%ebx,%r12d rorxl $2,%ebx,%ecx @@ -4159,7 +4207,7 @@ L$align32_1: addl -116(%r13),%eax leal (%rax,%rbx,1),%eax vpxor %ymm0,%ymm7,%ymm7 - vmovdqu 32(%r11),%ymm11 + vmovdqu 32(%r14),%ymm11 rorxl $27,%ebp,%r12d rorxl $2,%ebp,%ebx xorl %ecx,%ebp @@ -4604,7 +4652,7 @@ L$align32_2: cmpq %r10,%r9 je L$done_avx2 - vmovdqu 64(%r11),%ymm6 + vmovdqu 64(%r14),%ymm6 cmpq %r10,%rdi ja L$ast_avx2 @@ -4820,7 +4868,7 @@ L$ast_avx2: xorl %ebx,%eax addl %r12d,%esi xorl %ecx,%eax - vmovdqu -64(%r11),%ymm11 + vmovdqu -64(%r14),%ymm11 vpshufb %ymm6,%ymm0,%ymm0 addl 68(%r13),%edx leal (%rdx,%rax,1),%edx @@ -5176,7 +5224,7 @@ L$align32_3: xorl %ebp,%esi addl %r12d,%edx vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r11),%ymm11 + vmovdqu -32(%r14),%ymm11 xorl %ebx,%esi addl 104(%r13),%ecx leal (%rcx,%rsi,1),%ecx @@ -5369,16 +5417,22 @@ L$align32_3: L$done_avx2: vzeroupper - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + L$epilogue_avx2: .byte 0xf3,0xc3 + .p2align 6 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s index 95e0e774af..bab9a565a2 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-mb-x86_64.s @@ -6,17 +6,22 @@ .p2align 5 _sha256_multi_block: + movq _OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut testl $268435456,%ecx jnz _avx_shortcut movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) + L$body: leaq K256+128(%rip),%rbp leaq 256(%rsp),%rbx @@ -2615,19 +2620,28 @@ L$oop_16_xx: L$done: movq 272(%rsp),%rax + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 5 sha256_multi_block_shaext: + _shaext_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp shll $1,%edx andq $-256,%rsp @@ -3102,14 +3116,19 @@ L$oop_shaext: L$done_shaext: movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$epilogue_shaext: .byte 0xf3,0xc3 + .p2align 5 sha256_multi_block_avx: + _avx_shortcut: shrq $32,%rcx cmpl $2,%edx @@ -3120,11 +3139,15 @@ _avx_shortcut: .p2align 5 L$avx: movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) + L$body_avx: leaq K256+128(%rip),%rbp leaq 256(%rsp),%rbx @@ -5353,27 +5376,41 @@ L$oop_16_xx_avx: L$done_avx: movq 272(%rsp),%rax + vzeroupper movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 5 sha256_multi_block_avx2: + _avx2_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576,%rsp andq $-256,%rsp movq %rax,544(%rsp) + L$body_avx2: leaq K256+128(%rip),%rbp leaq 128(%rdi),%rdi @@ -7738,17 +7775,26 @@ L$oop_16_xx_avx2: L$done_avx2: movq 544(%rsp),%rax + vzeroupper movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp + L$epilogue_avx2: .byte 0xf3,0xc3 + .p2align 8 K256: .long 1116352408,1116352408,1116352408,1116352408 diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s index 05e973612b..e43cdd7040 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha256-x86_64.s @@ -5,6 +5,7 @@ .p2align 4 _sha256_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -21,13 +22,20 @@ _sha256_block_data_order: je L$avx_shortcut testl $512,%r10d jnz L$ssse3_shortcut + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -35,7 +43,8 @@ _sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue: movl 0(%rdi),%eax @@ -1699,17 +1708,26 @@ L$rounds_16_xx: movl %r11d,28(%rdi) jb L$loop - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 6 K256: @@ -1963,14 +1981,22 @@ L$oop_shaext: .p2align 6 sha256_block_data_order_ssse3: + L$ssse3_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1978,7 +2004,8 @@ L$ssse3_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue_ssse3: movl 0(%rdi),%eax @@ -3044,28 +3071,45 @@ L$ssse3_00_47: movl %r11d,28(%rdi) jb L$loop_ssse3 - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_ssse3: .byte 0xf3,0xc3 + .p2align 6 sha256_block_data_order_avx: + L$avx_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -3073,7 +3117,8 @@ L$avx_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue_avx: vzeroupper @@ -4100,29 +4145,46 @@ L$avx_00_47: movl %r11d,28(%rdi) jb L$loop_avx - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi + vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 6 sha256_block_data_order_avx2: + L$avx2_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + subq $544,%rsp shlq $4,%rdx andq $-1024,%rsp @@ -4131,7 +4193,8 @@ L$avx2_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue_avx2: vzeroupper @@ -5344,15 +5407,24 @@ L$ower_avx2: L$done_avx2: leaq (%rbp),%rsp - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi + vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_avx2: .byte 0xf3,0xc3 + diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s index 234616bc3b..51ace9a686 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/sha/sha512-x86_64.s @@ -5,6 +5,7 @@ .p2align 4 _sha512_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -19,13 +20,20 @@ _sha512_block_data_order: orl %r9d,%r10d cmpl $1342177792,%r10d je L$avx_shortcut + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -33,7 +41,8 @@ _sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,152(%rsp) + L$prologue: movq 0(%rdi),%rax @@ -1697,17 +1706,26 @@ L$rounds_16_xx: movq %r11,56(%rdi) jb L$loop - movq 128+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq 152(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 6 K512: @@ -1798,14 +1816,22 @@ K512: .p2align 6 sha512_block_data_order_xop: + L$xop_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -1813,7 +1839,8 @@ L$xop_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,152(%rsp) + L$prologue_xop: vzeroupper @@ -2866,29 +2893,46 @@ L$xop_00_47: movq %r11,56(%rdi) jb L$loop_xop - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi + vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_xop: .byte 0xf3,0xc3 + .p2align 6 sha512_block_data_order_avx: + L$avx_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2896,7 +2940,8 @@ L$avx_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,152(%rsp) + L$prologue_avx: vzeroupper @@ -4013,29 +4058,46 @@ L$avx_00_47: movq %r11,56(%rdi) jb L$loop_avx - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi + vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 6 sha512_block_data_order_avx2: + L$avx2_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + subq $1312,%rsp shlq $4,%rdx andq $-2048,%rsp @@ -4044,7 +4106,8 @@ L$avx2_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,152(%rsp) + L$prologue_avx2: vzeroupper @@ -5351,15 +5414,24 @@ L$ower_avx2: L$done_avx2: leaq (%rbp),%rsp - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi + vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue_avx2: .byte 0xf3,0xc3 + diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s index 4057ba32ac..2c94c14b93 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/whrlpool/wp-x86_64.s @@ -4,14 +4,22 @@ .p2align 4 _whirlpool_block: + + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movq %rsp,%r11 + subq $128+40,%rsp andq $-64,%rsp @@ -19,7 +27,8 @@ _whirlpool_block: movq %rdi,0(%r10) movq %rsi,8(%r10) movq %rdx,16(%r10) - movq %r11,32(%r10) + movq %rax,32(%r10) + L$prologue: movq %r10,%rbx @@ -579,17 +588,26 @@ L$roundsdone: jmp L$outerloop L$alldone: movq 32(%rbx),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 6 L$table: diff --git a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s index 8f16835f71..05afede678 100644 --- a/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s +++ b/deps/openssl/config/archs/darwin64-x86_64-cc/asm/crypto/x86_64cpuid.s @@ -37,10 +37,12 @@ _OPENSSL_rdtsc: .p2align 4 _OPENSSL_ia32_cpuid: + movq %rbx,%r8 + xorl %eax,%eax - movl %eax,8(%rdi) + movq %rax,8(%rdi) cpuid movl %eax,%r11d @@ -111,6 +113,7 @@ L$intel: L$nocacheinfo: movl $1,%eax cpuid + movd %eax,%xmm0 andl $0xbfefffff,%edx cmpl $0,%r9d jne L$notintel @@ -158,28 +161,47 @@ L$generic: jc L$notknights andl $0xfff7ffff,%ebx L$notknights: + movd %xmm0,%eax + andl $0x0fff0ff0,%eax + cmpl $0x00050650,%eax + jne L$notskylakex + andl $0xfffeffff,%ebx + +L$notskylakex: movl %ebx,8(%rdi) + movl %ecx,12(%rdi) L$no_extended_info: btl $27,%r9d jnc L$clear_avx xorl %ecx,%ecx .byte 0x0f,0x01,0xd0 + andl $0xe6,%eax + cmpl $0xe6,%eax + je L$done + andl $0x3fdeffff,8(%rdi) + + + + andl $6,%eax cmpl $6,%eax je L$done L$clear_avx: movl $0xefffe7ff,%eax andl %eax,%r9d - andl $0xffffffdf,8(%rdi) + movl $0x3fdeffdf,%eax + andl %eax,8(%rdi) L$done: shlq $32,%r9 movl %r10d,%eax movq %r8,%rbx + orq %r9,%rax .byte 0xf3,0xc3 + .globl _OPENSSL_cleanse .p2align 4 @@ -223,6 +245,18 @@ _CRYPTO_memcmp: xorq %r10,%r10 cmpq $0,%rdx je L$no_data + cmpq $16,%rdx + jne L$oop_cmp + movq (%rdi),%r10 + movq 8(%rdi),%r11 + movq $1,%rdx + xorq (%rsi),%r10 + xorq 8(%rsi),%r11 + orq %r11,%r10 + cmovnzq %rdx,%rax + .byte 0xf3,0xc3 + +.p2align 4 L$oop_cmp: movb (%rdi),%r10b leaq 1(%rdi),%rdi @@ -346,21 +380,6 @@ L$done2: subq %rcx,%rax .byte 0xf3,0xc3 -.globl _OPENSSL_ia32_rdrand - -.p2align 4 -_OPENSSL_ia32_rdrand: - movl $8,%ecx -L$oop_rdrand: -.byte 72,15,199,240 - jc L$break_rdrand - loop L$oop_rdrand -L$break_rdrand: - cmpq $0,%rax - cmoveq %rcx,%rax - .byte 0xf3,0xc3 - - .globl _OPENSSL_ia32_rdrand_bytes .p2align 4 @@ -394,28 +413,14 @@ L$tail_rdrand_bytes: movb %r10b,(%rdi) leaq 1(%rdi),%rdi incq %rax - shrq $8,%r8 + shrq $8,%r10 decq %rsi jnz L$tail_rdrand_bytes L$done_rdrand_bytes: + xorq %r10,%r10 .byte 0xf3,0xc3 -.globl _OPENSSL_ia32_rdseed - -.p2align 4 -_OPENSSL_ia32_rdseed: - movl $8,%ecx -L$oop_rdseed: -.byte 72,15,199,248 - jc L$break_rdseed - loop L$oop_rdseed -L$break_rdseed: - cmpq $0,%rax - cmoveq %rcx,%rax - .byte 0xf3,0xc3 - - .globl _OPENSSL_ia32_rdseed_bytes .p2align 4 @@ -449,10 +454,11 @@ L$tail_rdseed_bytes: movb %r10b,(%rdi) leaq 1(%rdi),%rdi incq %rax - shrq $8,%r8 + shrq $8,%r10 decq %rsi jnz L$tail_rdseed_bytes L$done_rdseed_bytes: + xorq %r10,%r10 .byte 0xf3,0xc3 |