Diffstat (limited to 'deps/openssl/config/archs/BSD-x86_64/asm/crypto')
31 files changed, 7729 insertions, 575 deletions
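The changes below regenerate the perlasm output with DWARF call-frame information: each routine is bracketed by .cfi_startproc/.cfi_endproc, callee-saved registers get .cfi_offset/.cfi_restore pairs, and the stack-realigning functions describe their frame with .cfi_def_cfa* and .cfi_escape directives so an unwinder can walk through them even after %rsp has been realigned. For orientation only — this sketch is not taken from the patch, and example_fn is a hypothetical name — the annotation pattern in GNU as looks like:

.text
.globl	example_fn
.type	example_fn,@function
.align	16
example_fn:
.cfi_startproc
	movq	%rsp,%rax		# keep the incoming stack pointer
.cfi_def_cfa_register	%rax		# CFA now computed from %rax, so the
					# andq below cannot confuse the unwinder
	pushq	%rbx
.cfi_offset	%rbx,-16		# %rbx saved at CFA-16
	pushq	%rbp
.cfi_offset	%rbp,-24		# %rbp saved at CFA-24
	andq	$-64,%rsp		# realign; %rsp no longer tracks the CFA
	subq	$32,%rsp
	movq	%rax,24(%rsp)		# spill the original %rsp for the epilogue
.cfi_escape	0x0f,0x05,0x77,0x18,0x06,0x23,0x08
					# DW_CFA_def_cfa_expression, 5 bytes:
					# DW_OP_breg7(rsp)+24, DW_OP_deref,
					# DW_OP_plus_uconst 8
					# i.e. CFA = *(rsp+24) + 8

	# ... body ...

	movq	24(%rsp),%rsi
.cfi_def_cfa	%rsi,8			# back to a simple register+offset rule
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
	.byte	0xf3,0xc3		# repz ret, as in the generated files
.cfi_endproc
.size	example_fn,.-example_fn

The .cfi_escape is the key piece: once andq $-64,%rsp has run, the CFA can no longer be expressed as register+offset, so the expression instead reloads the saved stack pointer from memory.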
diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aes-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aes-x86_64.s index 488ae6d781..4bc117304f 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aes-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aes-x86_64.s @@ -332,15 +332,23 @@ _x86_64_AES_encrypt_compact: .hidden asm_AES_encrypt asm_AES_encrypt: AES_encrypt: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -350,7 +358,8 @@ AES_encrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x18,0x06,0x23,0x08 .Lenc_prologue: movq %rdx,%r15 @@ -377,20 +386,29 @@ AES_encrypt: movq 16(%rsp),%r9 movq 24(%rsp),%rsi +.cfi_def_cfa %rsi,8 movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lenc_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size AES_encrypt,.-AES_encrypt .type _x86_64_AES_decrypt,@function .align 16 @@ -779,15 +797,23 @@ _x86_64_AES_decrypt_compact: .hidden asm_AES_decrypt asm_AES_decrypt: AES_decrypt: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -797,7 +823,8 @@ AES_decrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x18,0x06,0x23,0x08 .Ldec_prologue: movq %rdx,%r15 @@ -826,41 +853,68 @@ AES_decrypt: movq 16(%rsp),%r9 movq 24(%rsp),%rsi +.cfi_def_cfa %rsi,8 movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Ldec_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size AES_decrypt,.-AES_decrypt .globl AES_set_encrypt_key .type AES_set_encrypt_key,@function .align 16 AES_set_encrypt_key: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $8,%rsp +.cfi_adjust_cfa_offset 8 .Lenc_key_prologue: call _x86_64_AES_set_encrypt_key movq 40(%rsp),%rbp +.cfi_restore %rbp 
movq 48(%rsp),%rbx +.cfi_restore %rbx addq $56,%rsp +.cfi_adjust_cfa_offset -56 .Lenc_key_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size AES_set_encrypt_key,.-AES_set_encrypt_key .type _x86_64_AES_set_encrypt_key,@function @@ -1106,13 +1160,27 @@ _x86_64_AES_set_encrypt_key: .type AES_set_decrypt_key,@function .align 16 AES_set_decrypt_key: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 pushq %rdx +.cfi_adjust_cfa_offset 8 .Ldec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1280,14 +1348,22 @@ AES_set_decrypt_key: xorq %rax,%rax .Labort: movq 8(%rsp),%r15 +.cfi_restore %r15 movq 16(%rsp),%r14 +.cfi_restore %r14 movq 24(%rsp),%r13 +.cfi_restore %r13 movq 32(%rsp),%r12 +.cfi_restore %r12 movq 40(%rsp),%rbp +.cfi_restore %rbp movq 48(%rsp),%rbx +.cfi_restore %rbx addq $56,%rsp +.cfi_adjust_cfa_offset -56 .Ldec_key_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size AES_set_decrypt_key,.-AES_set_decrypt_key .globl AES_cbc_encrypt .type AES_cbc_encrypt,@function @@ -1297,25 +1373,39 @@ AES_set_decrypt_key: .hidden asm_AES_cbc_encrypt asm_AES_cbc_encrypt: AES_cbc_encrypt: +.cfi_startproc cmpq $0,%rdx je .Lcbc_epilogue pushfq +.cfi_adjust_cfa_offset 8 +.cfi_offset 49,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-32 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-40 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-48 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-56 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-64 .Lcbc_prologue: cld movl %r9d,%r9d leaq .LAES_Te(%rip),%r14 + leaq .LAES_Td(%rip),%r10 cmpq $0,%r9 - jne .Lcbc_picked_te - leaq .LAES_Td(%rip),%r14 -.Lcbc_picked_te: + cmoveq %r10,%r14 movl OPENSSL_ia32cap_P(%rip),%r10d cmpq $512,%rdx @@ -1351,8 +1441,10 @@ AES_cbc_encrypt: .Lcbc_te_ok: xchgq %rsp,%r15 +.cfi_def_cfa_register %r15 movq %r15,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 .Lcbc_fast_body: movq %rdi,24(%rsp) movq %rsi,32(%rsp) @@ -1734,17 +1826,28 @@ AES_cbc_encrypt: .align 16 .Lcbc_exit: movq 16(%rsp),%rsi +.cfi_def_cfa %rsi,64 movq (%rsi),%r15 +.cfi_restore %r15 movq 8(%rsi),%r14 +.cfi_restore %r14 movq 16(%rsi),%r13 +.cfi_restore %r13 movq 24(%rsi),%r12 +.cfi_restore %r12 movq 32(%rsi),%rbp +.cfi_restore %rbp movq 40(%rsi),%rbx +.cfi_restore %rbx leaq 48(%rsi),%rsp +.cfi_def_cfa %rsp,16 .Lcbc_popfq: popfq +.cfi_adjust_cfa_offset -8 +.cfi_restore 49 .Lcbc_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size AES_cbc_encrypt,.-AES_cbc_encrypt .align 64 .LAES_Te: diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-mb-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-mb-x86_64.s index 3dcd55d3f5..f2b5662b9c 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-mb-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-mb-x86_64.s @@ -6,6 +6,7 @@ .type aesni_multi_cbc_encrypt,@function .align 32 aesni_multi_cbc_encrypt: +.cfi_startproc cmpl $2,%edx jb .Lenc_non_avx movl OPENSSL_ia32cap_P+4(%rip),%ecx @@ -15,12 +16,19 @@ aesni_multi_cbc_encrypt: .align 16 .Lenc_non_avx: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq 
%r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 @@ -30,6 +38,7 @@ aesni_multi_cbc_encrypt: subq $48,%rsp andq $-64,%rsp movq %rax,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 .Lenc4x_body: movdqu (%rsi),%xmm12 @@ -239,6 +248,7 @@ aesni_multi_cbc_encrypt: jnz .Loop_enc4x movq 16(%rsp),%rax +.cfi_def_cfa %rax,8 movl 24(%rsp),%edx @@ -256,20 +266,29 @@ aesni_multi_cbc_encrypt: .Lenc4x_done: movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lenc4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt .globl aesni_multi_cbc_decrypt .type aesni_multi_cbc_decrypt,@function .align 32 aesni_multi_cbc_decrypt: +.cfi_startproc cmpl $2,%edx jb .Ldec_non_avx movl OPENSSL_ia32cap_P+4(%rip),%ecx @@ -279,12 +298,19 @@ aesni_multi_cbc_decrypt: .align 16 .Ldec_non_avx: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 @@ -294,6 +320,7 @@ aesni_multi_cbc_decrypt: subq $48,%rsp andq $-64,%rsp movq %rax,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 .Ldec4x_body: movdqu (%rsi),%xmm12 @@ -503,6 +530,7 @@ aesni_multi_cbc_decrypt: jnz .Loop_dec4x movq 16(%rsp),%rax +.cfi_def_cfa %rax,8 movl 24(%rsp),%edx leaq 160(%rdi),%rdi @@ -511,26 +539,42 @@ aesni_multi_cbc_decrypt: .Ldec4x_done: movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Ldec4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt .type aesni_multi_cbc_encrypt_avx,@function .align 32 aesni_multi_cbc_encrypt_avx: +.cfi_startproc _avx_cbc_enc_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 @@ -542,6 +586,7 @@ _avx_cbc_enc_shortcut: subq $192,%rsp andq $-128,%rsp movq %rax,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 .Lenc8x_body: vzeroupper @@ -939,6 +984,7 @@ _avx_cbc_enc_shortcut: jnz .Loop_enc8x movq 16(%rsp),%rax +.cfi_def_cfa %rax,8 @@ -947,27 +993,43 @@ _avx_cbc_enc_shortcut: .Lenc8x_done: vzeroupper movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lenc8x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx .type aesni_multi_cbc_decrypt_avx,@function .align 32 aesni_multi_cbc_decrypt_avx: +.cfi_startproc _avx_cbc_dec_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 
+.cfi_offset %r15,-56 @@ -981,6 +1043,7 @@ _avx_cbc_dec_shortcut: andq $-256,%rsp subq $192,%rsp movq %rax,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 .Ldec8x_body: vzeroupper @@ -1416,6 +1479,7 @@ _avx_cbc_dec_shortcut: jnz .Loop_dec8x movq 16(%rsp),%rax +.cfi_def_cfa %rax,8 @@ -1424,12 +1488,20 @@ _avx_cbc_dec_shortcut: .Ldec8x_done: vzeroupper movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Ldec8x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-sha1-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-sha1-x86_64.s index ca193ddb9e..4d2dfe4489 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-sha1-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-sha1-x86_64.s @@ -21,16 +21,30 @@ aesni_cbc_sha1_enc: .type aesni_cbc_sha1_enc_ssse3,@function .align 32 aesni_cbc_sha1_enc_ssse3: +.cfi_startproc movq 8(%rsp),%r10 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 leaq -104(%rsp),%rsp +.cfi_adjust_cfa_offset 104 movq %rdi,%r12 @@ -1362,29 +1376,52 @@ aesni_cbc_sha1_enc_ssse3: movl %ebp,16(%r9) movups %xmm2,(%r8) leaq 104(%rsp),%rsi +.cfi_def_cfa %rsi,56 movq 0(%rsi),%r15 +.cfi_restore %r15 movq 8(%rsi),%r14 +.cfi_restore %r14 movq 16(%rsi),%r13 +.cfi_restore %r13 movq 24(%rsi),%r12 +.cfi_restore %r12 movq 32(%rsi),%rbp +.cfi_restore %rbp movq 40(%rsi),%rbx +.cfi_restore %rbx leaq 48(%rsi),%rsp +.cfi_def_cfa %rsp,8 .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 .type aesni_cbc_sha1_enc_avx,@function .align 32 aesni_cbc_sha1_enc_avx: +.cfi_startproc movq 8(%rsp),%r10 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 leaq -104(%rsp),%rsp +.cfi_adjust_cfa_offset 104 vzeroall @@ -2660,15 +2697,24 @@ aesni_cbc_sha1_enc_avx: vmovups %xmm12,(%r8) vzeroall leaq 104(%rsp),%rsi +.cfi_def_cfa %rsi,56 movq 0(%rsi),%r15 +.cfi_restore %r15 movq 8(%rsi),%r14 +.cfi_restore %r14 movq 16(%rsi),%r13 +.cfi_restore %r13 movq 24(%rsi),%r12 +.cfi_restore %r12 movq 32(%rsi),%rbp +.cfi_restore %rbp movq 40(%rsi),%rbx +.cfi_restore %rbx leaq 48(%rsi),%rsp +.cfi_def_cfa %rsp,8 .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx .align 64 K_XX_XX: diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-sha256-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-sha256-x86_64.s index 427a1c7d12..5a47b3ee51 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-sha256-x86_64.s +++ 
b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-sha256-x86_64.s @@ -77,15 +77,23 @@ K256: .type aesni_cbc_sha256_enc_xop,@function .align 64 aesni_cbc_sha256_enc_xop: +.cfi_startproc .Lxop_shortcut: movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 - movq %rsp,%r11 +.cfi_offset %r15,-56 subq $128,%rsp andq $-64,%rsp @@ -101,7 +109,8 @@ aesni_cbc_sha256_enc_xop: movq %r8,64+32(%rsp) movq %r9,64+40(%rsp) movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 .Lprologue_xop: vzeroall @@ -1207,31 +1216,48 @@ aesni_cbc_sha256_enc_xop: jb .Lloop_xop movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 vmovdqu %xmm8,(%r8) vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_xop: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop .type aesni_cbc_sha256_enc_avx,@function .align 64 aesni_cbc_sha256_enc_avx: +.cfi_startproc .Lavx_shortcut: movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 - movq %rsp,%r11 +.cfi_offset %r15,-56 subq $128,%rsp andq $-64,%rsp @@ -1247,7 +1273,8 @@ aesni_cbc_sha256_enc_avx: movq %r8,64+32(%rsp) movq %r9,64+40(%rsp) movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 .Lprologue_avx: vzeroall @@ -2384,31 +2411,48 @@ aesni_cbc_sha256_enc_avx: jb .Lloop_avx movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 vmovdqu %xmm8,(%r8) vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx .type aesni_cbc_sha256_enc_avx2,@function .align 64 aesni_cbc_sha256_enc_avx2: +.cfi_startproc .Lavx2_shortcut: movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 - movq %rsp,%r11 +.cfi_offset %r15,-56 subq $576,%rsp andq $-1024,%rsp addq $448,%rsp @@ -2425,7 +2469,8 @@ aesni_cbc_sha256_enc_avx2: movq %r8,64+32(%rsp) movq %r9,64+40(%rsp) movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 .Lprologue_avx2: vzeroall @@ -3987,18 +4032,27 @@ 
aesni_cbc_sha256_enc_avx2: .Ldone_avx2: leaq (%rbp),%rsp movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 vmovdqu %xmm8,(%r8) vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2 .type aesni_cbc_sha256_enc_shaext,@function .align 32 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-x86_64.s index e18f87c4e6..5b2a68e758 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/aesni-x86_64.s @@ -995,6 +995,7 @@ aesni_ccm64_decrypt_blocks: .type aesni_ctr32_encrypt_blocks,@function .align 16 aesni_ctr32_encrypt_blocks: +.cfi_startproc cmpq $1,%rdx jne .Lctr32_bulk @@ -1024,11 +1025,12 @@ aesni_ctr32_encrypt_blocks: .align 16 .Lctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $128,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp @@ -1037,7 +1039,7 @@ aesni_ctr32_encrypt_blocks: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1053,8 +1055,8 @@ aesni_ctr32_encrypt_blocks: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1063,25 +1065,25 @@ aesni_ctr32_encrypt_blocks: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d movl OPENSSL_ia32cap_P+4(%rip),%r10d - xorl %r11d,%r9d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1105,7 +1107,7 @@ aesni_ctr32_encrypt_blocks: .Lctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp .Lctr32_loop6 @@ -1116,32 +1118,32 @@ aesni_ctr32_encrypt_blocks: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 .byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 
0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1202,7 +1204,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1215,7 +1217,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1229,7 +1231,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1243,7 +1245,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1257,7 +1259,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1271,7 +1273,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1285,7 +1287,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1300,7 +1302,7 @@ aesni_ctr32_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1535,7 +1537,7 @@ aesni_ctr32_encrypt_blocks: .Lctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 @@ -1559,20 +1561,25 @@ aesni_ctr32_encrypt_blocks: pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lctr32_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks .globl aesni_xts_encrypt .type aesni_xts_encrypt,@function .align 16 aesni_xts_encrypt: - leaq (%rsp),%rax +.cfi_startproc + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1588,7 +1595,7 @@ aesni_xts_encrypt: jnz .Loop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1644,9 +1651,9 @@ aesni_xts_encrypt: jc .Lxts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop @@ -1671,7 +1678,7 @@ aesni_xts_encrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1680,7 +1687,7 @@ aesni_xts_encrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1695,7 +1702,7 @@ aesni_xts_encrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) 
pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_enc_loop6 @@ -1727,7 +1734,7 @@ aesni_xts_encrypt: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -1795,10 +1802,10 @@ aesni_xts_encrypt: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -1825,7 +1832,7 @@ aesni_xts_encrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_enc_short: @@ -1981,7 +1988,7 @@ aesni_xts_encrypt: jnz .Lxts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2024,20 +2031,25 @@ aesni_xts_encrypt: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lxts_enc_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_xts_encrypt,.-aesni_xts_encrypt .globl aesni_xts_decrypt .type aesni_xts_decrypt,@function .align 16 aesni_xts_decrypt: - leaq (%rsp),%rax +.cfi_startproc + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2059,7 +2071,7 @@ aesni_xts_decrypt: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2115,9 +2127,9 @@ aesni_xts_decrypt: jc .Lxts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop @@ -2142,7 +2154,7 @@ aesni_xts_decrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2151,7 +2163,7 @@ aesni_xts_decrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,222,208 @@ -2166,7 +2178,7 @@ aesni_xts_decrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_dec_loop6 @@ -2198,7 +2210,7 @@ aesni_xts_decrypt: psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2266,10 +2278,10 @@ aesni_xts_decrypt: .byte 102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2296,7 +2308,7 @@ aesni_xts_decrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_dec_short: @@ -2453,7 +2465,7 @@ aesni_xts_decrypt: jz .Lxts_dec_ret .Lxts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2483,7 +2495,7 @@ aesni_xts_decrypt: jnz .Lxts_dec_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2526,21 +2538,35 @@ aesni_xts_decrypt: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor 
%xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lxts_dec_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_xts_decrypt,.-aesni_xts_decrypt .globl aesni_ocb_encrypt .type aesni_ocb_encrypt,@function .align 32 aesni_ocb_encrypt: +.cfi_startproc leaq (%rsp),%rax pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 movq 8(%rax),%rbx movq 8+8(%rax),%rbp @@ -2716,13 +2742,23 @@ aesni_ocb_encrypt: pxor %xmm13,%xmm13 pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 40(%rsp),%rax +.cfi_def_cfa %rax,8 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Locb_enc_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_ocb_encrypt,.-aesni_ocb_encrypt .type __ocb_encrypt6,@function @@ -2935,12 +2971,23 @@ __ocb_encrypt1: .type aesni_ocb_decrypt,@function .align 32 aesni_ocb_decrypt: +.cfi_startproc leaq (%rsp),%rax pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 movq 8(%rax),%rbx movq 8+8(%rax),%rbp @@ -3138,13 +3185,23 @@ aesni_ocb_decrypt: pxor %xmm13,%xmm13 pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 40(%rsp),%rax +.cfi_def_cfa %rax,8 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Locb_dec_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_ocb_decrypt,.-aesni_ocb_decrypt .type __ocb_decrypt6,@function @@ -3345,6 +3402,7 @@ __ocb_decrypt1: .type aesni_cbc_encrypt,@function .align 16 aesni_cbc_encrypt: +.cfi_startproc testq %rdx,%rdx jz .Lcbc_ret @@ -3437,11 +3495,13 @@ aesni_cbc_encrypt: jmp .Lcbc_ret .align 16 .Lcbc_decrypt_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $16,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp + movq %rcx,%rbp movups (%r8),%xmm10 movl %r10d,%eax cmpq $0x50,%rdx @@ -3481,7 +3541,7 @@ aesni_cbc_encrypt: pxor %xmm0,%xmm3 movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 - xorq %r11,%r11 + movq $-1,%rbp cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 @@ -3497,10 +3557,10 @@ aesni_cbc_encrypt: .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 + adcq $0,%rbp + andq $128,%rbp .byte 102,68,15,56,222,201 - addq %rdi,%r11 + addq %rdi,%rbp movups 48-112(%rcx),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 @@ -3638,18 +3698,18 @@ aesni_cbc_encrypt: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 
32(%rbp),%xmm13 .byte 102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups %xmm2,(%rsi) @@ -3768,7 +3828,7 @@ aesni_cbc_encrypt: pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu %xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3921,16 +3981,21 @@ aesni_cbc_encrypt: .Lcbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lcbc_ret: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_encrypt,.-aesni_cbc_encrypt .globl aesni_set_decrypt_key .type aesni_set_decrypt_key,@function .align 16 aesni_set_decrypt_key: +.cfi_startproc .byte 0x48,0x83,0xEC,0x08 +.cfi_adjust_cfa_offset 8 call __aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -3963,7 +4028,9 @@ aesni_set_decrypt_key: pxor %xmm0,%xmm0 .Ldec_key_ret: addq $8,%rsp +.cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 +.cfi_endproc .LSEH_end_set_decrypt_key: .size aesni_set_decrypt_key,.-aesni_set_decrypt_key .globl aesni_set_encrypt_key @@ -3971,7 +4038,9 @@ aesni_set_decrypt_key: .align 16 aesni_set_encrypt_key: __aesni_set_encrypt_key: +.cfi_startproc .byte 0x48,0x83,0xEC,0x08 +.cfi_adjust_cfa_offset 8 movq $-1,%rax testq %rdi,%rdi jz .Lenc_key_ret @@ -4264,7 +4333,9 @@ __aesni_set_encrypt_key: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp +.cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 +.cfi_endproc .LSEH_end_set_encrypt_key: .align 16 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/bsaes-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/bsaes-x86_64.s index c76c5a8afb..f7451dfe52 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/bsaes-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/aes/bsaes-x86_64.s @@ -1067,6 +1067,7 @@ _bsaes_key_convert: .type bsaes_cbc_encrypt,@function .align 16 bsaes_cbc_encrypt: +.cfi_startproc cmpl $0,%r9d jne asm_AES_cbc_encrypt cmpq $128,%rdx @@ -1075,13 +1076,27 @@ bsaes_cbc_encrypt: movq %rsp,%rax .Lcbc_dec_prologue: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 leaq -72(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 movq %rsp,%rbp +.cfi_def_cfa_register %rbp movl 240(%rcx),%eax movq %rdi,%r12 movq %rsi,%r13 @@ -1300,33 +1315,56 @@ bsaes_cbc_encrypt: cmpq %rax,%rbp ja .Lcbc_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax +.cfi_def_cfa %rax,8 + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbx +.cfi_restore %rbx + movq -8(%rax),%rbp +.cfi_restore %rbp + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lcbc_dec_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt .globl bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,@function .align 16 
bsaes_ctr32_encrypt_blocks: +.cfi_startproc movq %rsp,%rax .Lctr_enc_prologue: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 leaq -72(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 movq %rsp,%rbp +.cfi_def_cfa_register %rbp movdqu (%r8),%xmm0 movl 240(%rcx),%eax movq %rdi,%r12 @@ -1500,32 +1538,55 @@ bsaes_ctr32_encrypt_blocks: cmpq %rax,%rbp ja .Lctr_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax +.cfi_def_cfa %rax,8 + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbx +.cfi_restore %rbx + movq -8(%rax),%rbp +.cfi_restore %rbp + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lctr_enc_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks .globl bsaes_xts_encrypt .type bsaes_xts_encrypt,@function .align 16 bsaes_xts_encrypt: +.cfi_startproc movq %rsp,%rax .Lxts_enc_prologue: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 leaq -72(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 movq %rsp,%rbp +.cfi_def_cfa_register %rbp movq %rdi,%r12 movq %rsi,%r13 movq %rdx,%r14 @@ -1951,32 +2012,54 @@ bsaes_xts_encrypt: cmpq %rax,%rbp ja .Lxts_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax +.cfi_def_cfa %rax,8 + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbx +.cfi_restore %rbx + movq -8(%rax),%rbp +.cfi_restore %rbp + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lxts_enc_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bsaes_xts_encrypt,.-bsaes_xts_encrypt .globl bsaes_xts_decrypt .type bsaes_xts_decrypt,@function .align 16 bsaes_xts_decrypt: +.cfi_startproc movq %rsp,%rax .Lxts_dec_prologue: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 leaq -72(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 movq %rsp,%rbp movq %rdi,%r12 movq %rsi,%r13 @@ -2429,17 +2512,25 @@ bsaes_xts_decrypt: cmpq %rax,%rbp ja .Lxts_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax +.cfi_def_cfa %rax,8 + movq -48(%rax),%r15 +.cfi_restore %r15 
+ movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbx +.cfi_restore %rbx + movq -8(%rax),%rbp +.cfi_restore %rbp + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lxts_dec_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bsaes_xts_decrypt,.-bsaes_xts_decrypt .type _bsaes_const,@object .align 64 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/rsaz-avx2.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/rsaz-avx2.s index ee619092c9..61b400749b 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/rsaz-avx2.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/rsaz-avx2.s @@ -4,15 +4,24 @@ .type rsaz_1024_sqr_avx2,@function .align 64 rsaz_1024_sqr_avx2: +.cfi_startproc leaq (%rsp),%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 vzeroupper movq %rax,%rbp +.cfi_def_cfa_register %rbp movq %rdx,%r13 subq $832,%rsp movq %r13,%r15 @@ -625,28 +634,46 @@ rsaz_1024_sqr_avx2: vzeroall movq %rbp,%rax +.cfi_def_cfa_register %rax movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lsqr_1024_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 .globl rsaz_1024_mul_avx2 .type rsaz_1024_mul_avx2,@function .align 64 rsaz_1024_mul_avx2: +.cfi_startproc leaq (%rsp),%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 movq %rax,%rbp +.cfi_def_cfa_register %rbp vzeroall movq %rdx,%r13 subq $64,%rsp @@ -1162,15 +1189,24 @@ rsaz_1024_mul_avx2: vzeroupper movq %rbp,%rax +.cfi_def_cfa_register %rax movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lmul_1024_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 .globl rsaz_1024_red2norm_avx2 .type rsaz_1024_red2norm_avx2,@function @@ -1555,8 +1591,10 @@ rsaz_1024_scatter5_avx2: .type rsaz_1024_gather5_avx2,@function .align 32 rsaz_1024_gather5_avx2: +.cfi_startproc vzeroupper movq %rsp,%r11 +.cfi_def_cfa_register %r11 leaq -256(%rsp),%rsp andq $-32,%rsp leaq .Linc(%rip),%r10 @@ -1665,7 +1703,10 @@ rsaz_1024_gather5_avx2: vmovdqu %ymm0,(%rdi) vzeroupper leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_rsaz_1024_gather5: .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 .globl rsaz_avx2_eligible diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/rsaz-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/rsaz-x86_64.s index 795cebe1d7..f8e4a80588 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/rsaz-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/rsaz-x86_64.s @@ -6,14 +6,28 @@ .type rsaz_512_sqr,@function .align 32 rsaz_512_sqr: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $128+24,%rsp +.cfi_adjust_cfa_offset 128+24 .Lsqr_body: movq %rdx,%rbp movq (%rsi),%rdx @@ -658,28 +672,51 @@ rsaz_512_sqr: .Lsqr_tail: leaq 128+24+48(%rsp),%rax +.cfi_def_cfa %rax,8 movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lsqr_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_512_sqr,.-rsaz_512_sqr .globl rsaz_512_mul .type rsaz_512_mul,@function .align 32 rsaz_512_mul: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $128+24,%rsp +.cfi_adjust_cfa_offset 128+24 .Lmul_body: .byte 102,72,15,110,199 .byte 102,72,15,110,201 @@ -741,28 +778,51 @@ rsaz_512_mul: call __rsaz_512_subtract leaq 128+24+48(%rsp),%rax +.cfi_def_cfa %rax,8 movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_512_mul,.-rsaz_512_mul .globl rsaz_512_mul_gather4 .type rsaz_512_mul_gather4,@function .align 32 rsaz_512_mul_gather4: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $152,%rsp +.cfi_adjust_cfa_offset 152 .Lmul_gather4_body: movd %r9d,%xmm8 movdqa .Linc+16(%rip),%xmm1 @@ -1151,29 +1211,52 @@ rsaz_512_mul_gather4: call __rsaz_512_subtract leaq 128+24+48(%rsp),%rax +.cfi_def_cfa %rax,8 movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lmul_gather4_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 .globl rsaz_512_mul_scatter4 .type rsaz_512_mul_scatter4,@function .align 32 rsaz_512_mul_scatter4: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 movl %r9d,%r9d subq $128+24,%rsp +.cfi_adjust_cfa_offset 128+24 .Lmul_scatter4_body: leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 @@ -1248,28 
+1331,51 @@ rsaz_512_mul_scatter4: movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax +.cfi_def_cfa %rax,8 movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lmul_scatter4_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 .globl rsaz_512_mul_by_one .type rsaz_512_mul_by_one,@function .align 32 rsaz_512_mul_by_one: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $128+24,%rsp +.cfi_adjust_cfa_offset 128+24 .Lmul_by_one_body: movl OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp @@ -1312,15 +1418,24 @@ rsaz_512_mul_by_one: movq %r15,56(%rdi) leaq 128+24+48(%rsp),%rax +.cfi_def_cfa %rax,8 movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lmul_by_one_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one .type __rsaz_512_reduce,@function .align 32 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-gf2m.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-gf2m.s index a0b78a0565..0846c4441e 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-gf2m.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-gf2m.s @@ -3,7 +3,9 @@ .type _mul_1x1,@function .align 16 _mul_1x1: +.cfi_startproc subq $128+8,%rsp +.cfi_adjust_cfa_offset 128+8 movq $-1,%r9 leaq (%rax,%rax,1),%rsi shrq $3,%r9 @@ -193,16 +195,20 @@ _mul_1x1: xorq %rdi,%rdx addq $128+8,%rsp +.cfi_adjust_cfa_offset -128-8 .byte 0xf3,0xc3 .Lend_mul_1x1: +.cfi_endproc .size _mul_1x1,.-_mul_1x1 .globl bn_GF2m_mul_2x2 .type bn_GF2m_mul_2x2,@function .align 16 bn_GF2m_mul_2x2: - movq OPENSSL_ia32cap_P(%rip),%rax - btq $33,%rax +.cfi_startproc + movq %rsp,%rax + movq OPENSSL_ia32cap_P(%rip),%r10 + btq $33,%r10 jnc .Lvanilla_mul_2x2 .byte 102,72,15,110,198 @@ -230,11 +236,17 @@ bn_GF2m_mul_2x2: .align 16 .Lvanilla_mul_2x2: leaq -136(%rsp),%rsp +.cfi_adjust_cfa_offset 8*17 movq %r14,80(%rsp) +.cfi_rel_offset %r14,8*10 movq %r13,88(%rsp) +.cfi_rel_offset %r13,8*11 movq %r12,96(%rsp) +.cfi_rel_offset %r12,8*12 movq %rbp,104(%rsp) +.cfi_rel_offset %rbp,8*13 movq %rbx,112(%rsp) +.cfi_rel_offset %rbx,8*14 .Lbody_mul_2x2: movq %rdi,32(%rsp) movq %rsi,40(%rsp) @@ -279,13 +291,21 @@ bn_GF2m_mul_2x2: movq %rax,8(%rbp) movq 80(%rsp),%r14 +.cfi_restore %r14 movq 88(%rsp),%r13 +.cfi_restore %r13 movq 96(%rsp),%r12 +.cfi_restore %r12 movq 104(%rsp),%rbp +.cfi_restore %rbp movq 112(%rsp),%rbx +.cfi_restore %rbx leaq 136(%rsp),%rsp +.cfi_adjust_cfa_offset -8*17 +.Lepilogue_mul_2x2: .byte 0xf3,0xc3 .Lend_mul_2x2: +.cfi_endproc .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 .byte 
71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-mont.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-mont.s index 3a78cd8440..414be6aff5 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-mont.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-mont.s @@ -6,8 +6,10 @@ .type bn_mul_mont,@function .align 16 bn_mul_mont: +.cfi_startproc movl %r9d,%r9d movq %rsp,%rax +.cfi_def_cfa_register %rax testl $3,%r9d jnz .Lmul_enter cmpl $8,%r9d @@ -22,11 +24,17 @@ bn_mul_mont: .align 16 .Lmul_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 negq %r9 movq %rsp,%r11 @@ -59,6 +67,7 @@ bn_mul_mont: .Lmul_page_walk_done: movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: movq %rdx,%r12 movq (%r8),%r8 @@ -226,32 +235,49 @@ bn_mul_mont: jnz .Lcopy movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul_mont,.-bn_mul_mont .type bn_mul4x_mont,@function .align 16 bn_mul4x_mont: +.cfi_startproc movl %r9d,%r9d movq %rsp,%rax +.cfi_def_cfa_register %rax .Lmul4x_enter: andl $0x80100,%r11d cmpl $0x80100,%r11d je .Lmulx4x_enter pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 negq %r9 movq %rsp,%r11 @@ -275,6 +301,7 @@ bn_mul4x_mont: .Lmul4x_page_walk_done: movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 @@ -642,16 +669,25 @@ bn_mul4x_mont: decq %r15 jnz .Lcopy4x movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi, 8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont @@ -659,14 +695,22 @@ bn_mul4x_mont: .type bn_sqr8x_mont,@function .align 32 bn_sqr8x_mont: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax .Lsqr8x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 .Lsqr8x_prologue: movl %r9d,%r10d @@ -722,6 +766,7 @@ bn_sqr8x_mont: movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lsqr8x_body: .byte 102,72,15,110,209 @@ -787,6 +832,7 @@ bn_sqr8x_mont: pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 @@ -816,26 +862,42 @@ bn_sqr8x_mont: movq $1,%rax 
movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lsqr8x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont .type bn_mulx4x_mont,@function .align 32 bn_mulx4x_mont: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax .Lmulx4x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 .Lmulx4x_prologue: shll $3,%r9d @@ -881,6 +943,7 @@ bn_mulx4x_mont: movq %r8,24(%rsp) movq %rdi,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 movq %r9,48(%rsp) jmp .Lmulx4x_body @@ -1125,6 +1188,7 @@ bn_mulx4x_mont: pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 jmp .Lmulx4x_cond_copy .align 32 @@ -1154,14 +1218,22 @@ bn_mulx4x_mont: movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmulx4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-mont5.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-mont5.s index 0dd53512f9..c6d752a245 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-mont5.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/bn/x86_64-mont5.s @@ -6,8 +6,10 @@ .type bn_mul_mont_gather5,@function .align 64 bn_mul_mont_gather5: +.cfi_startproc movl %r9d,%r9d movq %rsp,%rax +.cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter movl OPENSSL_ia32cap_P+8(%rip),%r11d @@ -17,11 +19,17 @@ bn_mul_mont_gather5: .Lmul_enter: movd 8(%rsp),%xmm5 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 negq %r9 movq %rsp,%r11 @@ -54,6 +62,7 @@ bn_mul_mont_gather5: leaq .Linc(%rip),%r10 movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: leaq 128(%rdx),%r12 @@ -411,33 +420,50 @@ bn_mul_mont_gather5: jnz .Lcopy movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 .type bn_mul4x_mont_gather5,@function .align 32 bn_mul4x_mont_gather5: +.cfi_startproc .byte 0x67 movq %rsp,%rax +.cfi_def_cfa_register %rax .Lmul4x_enter: andl $0x80108,%r11d cmpl $0x80108,%r11d je .Lmulx4x_enter pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 
+.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 .Lmul4x_prologue: .byte 0x67 @@ -493,22 +519,32 @@ bn_mul4x_mont_gather5: negq %r9 movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lmul4x_body: call mul4x_internal movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,@function @@ -1040,17 +1076,25 @@ mul4x_internal: .type bn_power5,@function .align 32 bn_power5: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax movl OPENSSL_ia32cap_P+8(%rip),%r11d andl $0x80108,%r11d cmpl $0x80108,%r11d je .Lpowerx5_enter pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 .Lpower5_prologue: shll $3,%r9d @@ -1115,6 +1159,7 @@ bn_power5: movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lpower5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 @@ -1141,16 +1186,25 @@ bn_power5: call mul4x_internal movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lpower5_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_power5,.-bn_power5 .globl bn_sqr8x_internal @@ -2001,14 +2055,22 @@ bn_from_montgomery: .type bn_from_mont8x,@function .align 32 bn_from_mont8x: +.cfi_startproc .byte 0x67 movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 .Lfrom_prologue: shll $3,%r9d @@ -2073,6 +2135,7 @@ bn_from_mont8x: movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lfrom_body: movq %r9,%r11 leaq 48(%rsp),%rax @@ -2114,7 +2177,6 @@ bn_from_mont8x: pxor %xmm0,%xmm0 leaq 48(%rsp),%rax - movq 40(%rsp),%rsi jmp .Lfrom_mont_zero .align 32 @@ -2124,11 +2186,12 @@ bn_from_mont8x: pxor %xmm0,%xmm0 leaq 48(%rsp),%rax - movq 40(%rsp),%rsi jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_zero: + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movdqa %xmm0,0(%rax) movdqa %xmm0,16(%rax) movdqa %xmm0,32(%rax) @@ -2139,26 +2202,42 @@ bn_from_mont8x: movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lfrom_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x .type bn_mulx4x_mont_gather5,@function .align 32 bn_mulx4x_mont_gather5: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax .Lmulx4x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 
pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 .Lmulx4x_prologue: shll $3,%r9d @@ -2224,21 +2303,31 @@ bn_mulx4x_mont_gather5: movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lmulx4x_body: call mulx4x_internal movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmulx4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 .type mulx4x_internal,@function @@ -2666,14 +2755,22 @@ mulx4x_internal: .type bn_powerx5,@function .align 32 bn_powerx5: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax .Lpowerx5_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 .Lpowerx5_prologue: shll $3,%r9d @@ -2745,6 +2842,7 @@ bn_powerx5: .byte 102,72,15,110,226 movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lpowerx5_body: call __bn_sqrx8x_internal @@ -2767,17 +2865,26 @@ bn_powerx5: call mulx4x_internal movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lpowerx5_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_powerx5,.-bn_powerx5 .globl bn_sqrx8x_internal diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/buildinf.h b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/buildinf.h index 260e0eaac9..208078f6cb 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/buildinf.h +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/buildinf.h @@ -1,38 +1,48 @@ -/* auto-generated by util/mkbuildinf.pl for crypto/cversion.c */ -#define CFLAGS cflags /* - * Generate CFLAGS as an array of individual characters. This is a + * WARNING: do not edit! + * Generated by util/mkbuildinf.pl + * + * Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#define PLATFORM "platform: BSD-x86_64" +#define DATE "built on: Thu Nov 22 19:32:47 2018 UTC" + +/* + * Generate compiler_flags as an array of individual characters. 
This is a * workaround for the situation where CFLAGS gets too long for a C90 string * literal */ -static const char cflags[] = { - 'c','o','m','p','i','l','e','r',':',' ','c','c',' ','-','D','D', - 'S','O','_','D','L','F','C','N',' ','-','D','H','A','V','E','_', - 'D','L','F','C','N','_','H',' ','-','D','N','D','E','B','U','G', - ' ','-','D','O','P','E','N','S','S','L','_','T','H','R','E','A', - 'D','S',' ','-','D','O','P','E','N','S','S','L','_','N','O','_', - 'D','Y','N','A','M','I','C','_','E','N','G','I','N','E',' ','-', - 'D','O','P','E','N','S','S','L','_','P','I','C',' ','-','D','O', - 'P','E','N','S','S','L','_','I','A','3','2','_','S','S','E','2', - ' ','-','D','O','P','E','N','S','S','L','_','B','N','_','A','S', - 'M','_','M','O','N','T',' ','-','D','O','P','E','N','S','S','L', - '_','B','N','_','A','S','M','_','M','O','N','T','5',' ','-','D', - 'O','P','E','N','S','S','L','_','B','N','_','A','S','M','_','G', - 'F','2','m',' ','-','D','S','H','A','1','_','A','S','M',' ','-', - 'D','S','H','A','2','5','6','_','A','S','M',' ','-','D','S','H', - 'A','5','1','2','_','A','S','M',' ','-','D','R','C','4','_','A', - 'S','M',' ','-','D','M','D','5','_','A','S','M',' ','-','D','A', - 'E','S','_','A','S','M',' ','-','D','V','P','A','E','S','_','A', - 'S','M',' ','-','D','B','S','A','E','S','_','A','S','M',' ','-', - 'D','G','H','A','S','H','_','A','S','M',' ','-','D','E','C','P', - '_','N','I','S','T','Z','2','5','6','_','A','S','M',' ','-','D', - 'P','A','D','L','O','C','K','_','A','S','M',' ','-','D','P','O', - 'L','Y','1','3','0','5','_','A','S','M',' ','-','D','O','P','E', - 'N','S','S','L','D','I','R','=','"','\\','"','/','u','s','r','/', - 'l','o','c','a','l','/','s','s','l','\\','"','"',' ','-','D','E', - 'N','G','I','N','E','S','D','I','R','=','"','\\','"','/','u','s', - 'r','/','l','o','c','a','l','/','l','i','b','/','e','n','g','i', - 'n','e','s','-','1','.','1','\\','"','"',' ','\0' +static const char compiler_flags[] = { + 'c','o','m','p','i','l','e','r',':',' ','g','c','c',' ','-','f', + 'P','I','C',' ','-','p','t','h','r','e','a','d',' ','-','W','a', + ',','-','-','n','o','e','x','e','c','s','t','a','c','k',' ','-', + 'W','a','l','l',' ','-','O','3',' ','-','D','L','_','E','N','D', + 'I','A','N',' ','-','D','O','P','E','N','S','S','L','_','P','I', + 'C',' ','-','D','O','P','E','N','S','S','L','_','C','P','U','I', + 'D','_','O','B','J',' ','-','D','O','P','E','N','S','S','L','_', + 'I','A','3','2','_','S','S','E','2',' ','-','D','O','P','E','N', + 'S','S','L','_','B','N','_','A','S','M','_','M','O','N','T',' ', + '-','D','O','P','E','N','S','S','L','_','B','N','_','A','S','M', + '_','M','O','N','T','5',' ','-','D','O','P','E','N','S','S','L', + '_','B','N','_','A','S','M','_','G','F','2','m',' ','-','D','S', + 'H','A','1','_','A','S','M',' ','-','D','S','H','A','2','5','6', + '_','A','S','M',' ','-','D','S','H','A','5','1','2','_','A','S', + 'M',' ','-','D','K','E','C','C','A','K','1','6','0','0','_','A', + 'S','M',' ','-','D','R','C','4','_','A','S','M',' ','-','D','M', + 'D','5','_','A','S','M',' ','-','D','A','E','S','_','A','S','M', + ' ','-','D','V','P','A','E','S','_','A','S','M',' ','-','D','B', + 'S','A','E','S','_','A','S','M',' ','-','D','G','H','A','S','H', + '_','A','S','M',' ','-','D','E','C','P','_','N','I','S','T','Z', + '2','5','6','_','A','S','M',' ','-','D','X','2','5','5','1','9', + '_','A','S','M',' ','-','D','P','A','D','L','O','C','K','_','A', + 'S','M',' ','-','D','P','O','L','Y','1','3','0','5','_','A','S', + 'M',' 
','-','D','_','T','H','R','E','A','D','_','S','A','F','E', + ' ','-','D','_','R','E','E','N','T','R','A','N','T',' ','-','D', + 'N','D','E','B','U','G','\0' }; -#define PLATFORM "platform: BSD-x86_64" -#define DATE "built on: Tue Nov 20 09:37:29 2018" diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/camellia/cmll-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/camellia/cmll-x86_64.s index 1dead91b17..405566b01c 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/camellia/cmll-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/camellia/cmll-x86_64.s @@ -17,11 +17,22 @@ Camellia_EncryptBlock: .align 16 .Lenc_rounds: Camellia_EncryptBlock_Rounds: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-32 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-40 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-48 .Lenc_prologue: @@ -53,13 +64,20 @@ Camellia_EncryptBlock_Rounds: movl %r11d,12(%r13) movq 0(%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r13 +.cfi_restore %r13 movq 24(%rsp),%rbp +.cfi_restore %rbp movq 32(%rsp),%rbx +.cfi_restore %rbx leaq 40(%rsp),%rsp +.cfi_adjust_cfa_offset -40 .Lenc_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds .type _x86_64_Camellia_encrypt,@function @@ -286,11 +304,22 @@ Camellia_DecryptBlock: .align 16 .Ldec_rounds: Camellia_DecryptBlock_Rounds: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-32 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-40 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-48 .Ldec_prologue: @@ -322,13 +351,20 @@ Camellia_DecryptBlock_Rounds: movl %r11d,12(%r13) movq 0(%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r13 +.cfi_restore %r13 movq 24(%rsp),%rbp +.cfi_restore %rbp movq 32(%rsp),%rbx +.cfi_restore %rbx leaq 40(%rsp),%rsp +.cfi_adjust_cfa_offset -40 .Ldec_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds .type _x86_64_Camellia_decrypt,@function @@ -542,11 +578,22 @@ _x86_64_Camellia_decrypt: .type Camellia_Ekeygen,@function .align 16 Camellia_Ekeygen: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-32 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-40 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-48 .Lkey_prologue: movl %edi,%r15d @@ -1074,13 +1121,20 @@ Camellia_Ekeygen: movl $4,%eax .Ldone: movq 0(%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r13 +.cfi_restore %r13 movq 24(%rsp),%rbp +.cfi_restore %rbp movq 32(%rsp),%rbx +.cfi_restore %rbx leaq 40(%rsp),%rsp +.cfi_adjust_cfa_offset -40 .Lkey_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size Camellia_Ekeygen,.-Camellia_Ekeygen .align 64 .LCamellia_SIGMA: @@ -1605,17 +1659,31 @@ Camellia_Ekeygen: .type Camellia_cbc_encrypt,@function .align 16 Camellia_cbc_encrypt: +.cfi_startproc cmpq $0,%rdx je .Lcbc_abort pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 .Lcbc_prologue: movq %rsp,%rbp +.cfi_def_cfa_register %rbp subq $64,%rsp andq $-64,%rsp @@ -1636,6 +1704,7 @@ Camellia_cbc_encrypt: movq %r8,40(%rsp) movq %rbp,48(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x30,0x06,0x23,0x38 .Lcbc_body: leaq .LCamellia_SBOX(%rip),%rbp @@ -1824,15 +1893,24 @@ Camellia_cbc_encrypt: .align 16 .Lcbc_done: movq 48(%rsp),%rcx +.cfi_def_cfa %rcx,56 movq 0(%rcx),%r15 +.cfi_restore %r15 movq 8(%rcx),%r14 +.cfi_restore %r14 movq 16(%rcx),%r13 +.cfi_restore %r13 movq 24(%rcx),%r12 +.cfi_restore %r12 movq 32(%rcx),%rbp +.cfi_restore %rbp movq 40(%rcx),%rbx +.cfi_restore %rbx leaq 48(%rcx),%rsp +.cfi_def_cfa %rsp,8 .Lcbc_abort: .byte 0xf3,0xc3 +.cfi_endproc .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt .byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54,95,54,52,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/chacha/chacha-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/chacha/chacha-x86_64.s index a9fed05fd7..1812bc84b1 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/chacha/chacha-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/chacha/chacha-x86_64.s @@ -19,6 +19,17 @@ .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd .Lrot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Ltwoy: +.long 2,0,0,0, 2,0,0,0 +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .Lsigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 @@ -26,19 +37,38 @@ .type ChaCha20_ctr32,@function .align 64 ChaCha20_ctr32: +.cfi_startproc cmpq $0,%rdx je .Lno_data movq OPENSSL_ia32cap_P+4(%rip),%r10 + btq $48,%r10 + jc .LChaCha20_avx512 + testq %r10,%r10 + js .LChaCha20_avx512vl testl $512,%r10d jnz .LChaCha20_ssse3 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $64+24,%rsp +.cfi_adjust_cfa_offset 64+24 +.Lctr32_body: movdqu (%rcx),%xmm1 @@ -276,34 +306,41 @@ ChaCha20_ctr32: jnz .Loop_tail .Ldone: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 64+24+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lno_data: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_ctr32,.-ChaCha20_ctr32 .type ChaCha20_ssse3,@function .align 32 ChaCha20_ssse3: +.cfi_startproc .LChaCha20_ssse3: + movq %rsp,%r9 +.cfi_def_cfa_register 
%r9 testl $2048,%r10d jnz .LChaCha20_4xop cmpq $128,%rdx + je .LChaCha20_128 ja .LChaCha20_4x .Ldo_sse3_after_all: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $64+24,%rsp + subq $64+8,%rsp movdqa .Lsigma(%rip),%xmm0 movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 @@ -315,7 +352,7 @@ ChaCha20_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - movl $10,%ebp + movq $10,%r8 jmp .Loop_ssse3 .align 32 @@ -325,7 +362,7 @@ ChaCha20_ssse3: movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 - movl $10,%ebp + movq $10,%r8 movdqa %xmm3,48(%rsp) jmp .Loop_ssse3 @@ -374,7 +411,7 @@ ChaCha20_ssse3: pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 - decl %ebp + decq %r8 jnz .Loop_ssse3 paddd 0(%rsp),%xmm0 paddd 16(%rsp),%xmm1 @@ -411,31 +448,187 @@ ChaCha20_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - xorq %rbx,%rbx + xorq %r8,%r8 .Loop_tail_ssse3: - movzbl (%rsi,%rbx,1),%eax - movzbl (%rsp,%rbx,1),%ecx - leaq 1(%rbx),%rbx + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 xorl %ecx,%eax - movb %al,-1(%rdi,%rbx,1) + movb %al,-1(%rdi,%r8,1) decq %rdx jnz .Loop_tail_ssse3 .Ldone_ssse3: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.Lssse3_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_ssse3,.-ChaCha20_ssse3 +.type ChaCha20_128,@function +.align 32 +ChaCha20_128: +.cfi_startproc +.LChaCha20_128: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + subq $64+8,%rsp + movdqa .Lsigma(%rip),%xmm8 + movdqu (%rcx),%xmm9 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa .Lone(%rip),%xmm1 + movdqa .Lrot16(%rip),%xmm6 + movdqa .Lrot24(%rip),%xmm7 + + movdqa %xmm8,%xmm10 + movdqa %xmm8,0(%rsp) + movdqa %xmm9,%xmm11 + movdqa %xmm9,16(%rsp) + movdqa %xmm2,%xmm0 + movdqa %xmm2,32(%rsp) + paddd %xmm3,%xmm1 + movdqa %xmm3,48(%rsp) + movq $10,%r8 + jmp .Loop_128 + +.align 32 +.Loop_128: + paddd %xmm9,%xmm8 + pxor %xmm8,%xmm3 + paddd %xmm11,%xmm10 + pxor %xmm10,%xmm1 +.byte 102,15,56,0,222 +.byte 102,15,56,0,206 + paddd %xmm3,%xmm2 + paddd %xmm1,%xmm0 + pxor %xmm2,%xmm9 + pxor %xmm0,%xmm11 + movdqa %xmm9,%xmm4 + psrld $20,%xmm9 + movdqa %xmm11,%xmm5 + pslld $12,%xmm4 + psrld $20,%xmm11 + por %xmm4,%xmm9 + pslld $12,%xmm5 + por %xmm5,%xmm11 + paddd %xmm9,%xmm8 + pxor %xmm8,%xmm3 + paddd %xmm11,%xmm10 + pxor %xmm10,%xmm1 +.byte 102,15,56,0,223 +.byte 102,15,56,0,207 + paddd %xmm3,%xmm2 + paddd %xmm1,%xmm0 + pxor %xmm2,%xmm9 + pxor %xmm0,%xmm11 + movdqa %xmm9,%xmm4 + psrld $25,%xmm9 + movdqa %xmm11,%xmm5 + pslld $7,%xmm4 + psrld $25,%xmm11 + por %xmm4,%xmm9 + pslld $7,%xmm5 + por %xmm5,%xmm11 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm9,%xmm9 + pshufd $147,%xmm3,%xmm3 + pshufd $78,%xmm0,%xmm0 + pshufd $57,%xmm11,%xmm11 + pshufd $147,%xmm1,%xmm1 + paddd %xmm9,%xmm8 + pxor %xmm8,%xmm3 + paddd %xmm11,%xmm10 + pxor %xmm10,%xmm1 +.byte 102,15,56,0,222 +.byte 102,15,56,0,206 + paddd %xmm3,%xmm2 + paddd %xmm1,%xmm0 + pxor %xmm2,%xmm9 + pxor %xmm0,%xmm11 + movdqa %xmm9,%xmm4 + psrld $20,%xmm9 + movdqa %xmm11,%xmm5 + pslld $12,%xmm4 + psrld $20,%xmm11 + por %xmm4,%xmm9 + pslld $12,%xmm5 + por %xmm5,%xmm11 + paddd %xmm9,%xmm8 + pxor %xmm8,%xmm3 + paddd %xmm11,%xmm10 + pxor %xmm10,%xmm1 +.byte 102,15,56,0,223 +.byte 102,15,56,0,207 + paddd %xmm3,%xmm2 + paddd %xmm1,%xmm0 + pxor %xmm2,%xmm9 + pxor %xmm0,%xmm11 + movdqa %xmm9,%xmm4 + psrld $25,%xmm9 + movdqa %xmm11,%xmm5 + pslld $7,%xmm4 + psrld 
$25,%xmm11 + por %xmm4,%xmm9 + pslld $7,%xmm5 + por %xmm5,%xmm11 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm9,%xmm9 + pshufd $57,%xmm3,%xmm3 + pshufd $78,%xmm0,%xmm0 + pshufd $147,%xmm11,%xmm11 + pshufd $57,%xmm1,%xmm1 + decq %r8 + jnz .Loop_128 + paddd 0(%rsp),%xmm8 + paddd 16(%rsp),%xmm9 + paddd 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + paddd .Lone(%rip),%xmm1 + paddd 0(%rsp),%xmm10 + paddd 16(%rsp),%xmm11 + paddd 32(%rsp),%xmm0 + paddd 48(%rsp),%xmm1 + + movdqu 0(%rsi),%xmm4 + movdqu 16(%rsi),%xmm5 + pxor %xmm4,%xmm8 + movdqu 32(%rsi),%xmm4 + pxor %xmm5,%xmm9 + movdqu 48(%rsi),%xmm5 + pxor %xmm4,%xmm2 + movdqu 64(%rsi),%xmm4 + pxor %xmm5,%xmm3 + movdqu 80(%rsi),%xmm5 + pxor %xmm4,%xmm10 + movdqu 96(%rsi),%xmm4 + pxor %xmm5,%xmm11 + movdqu 112(%rsi),%xmm5 + pxor %xmm4,%xmm0 + pxor %xmm5,%xmm1 + + movdqu %xmm8,0(%rdi) + movdqu %xmm9,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + movdqu %xmm10,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm0,96(%rdi) + movdqu %xmm1,112(%rdi) + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L128_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_128,.-ChaCha20_128 .type ChaCha20_4x,@function .align 32 ChaCha20_4x: +.cfi_startproc .LChaCha20_4x: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -448,8 +641,7 @@ ChaCha20_4x: je .Ldo_sse3_after_all .Lproceed4x: - leaq -120(%rsp),%r11 - subq $0x148+0,%rsp + subq $0x140+8,%rsp movdqa .Lsigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 @@ -976,15 +1168,20 @@ ChaCha20_4x: jnz .Loop_tail4x .Ldone4x: - addq $0x148+0,%rsp + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_4x,.-ChaCha20_4x .type ChaCha20_4xop,@function .align 32 ChaCha20_4xop: +.cfi_startproc .LChaCha20_4xop: - leaq -120(%rsp),%r11 - subq $0x148+0,%rsp + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + subq $0x140+8,%rsp vzeroupper vmovdqa .Lsigma(%rip),%xmm11 @@ -1386,18 +1583,22 @@ ChaCha20_4xop: .Ldone4xop: vzeroupper - addq $0x148+0,%rsp + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L4xop_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_4xop,.-ChaCha20_4xop .type ChaCha20_8x,@function .align 32 ChaCha20_8x: +.cfi_startproc .LChaCha20_8x: - movq %rsp,%r10 + movq %rsp,%r9 +.cfi_def_cfa_register %r9 subq $0x280+8,%rsp andq $-32,%rsp vzeroupper - movq %r10,640(%rsp) @@ -1988,6 +2189,1240 @@ ChaCha20_8x: .Ldone8x: vzeroall - movq 640(%rsp),%rsp + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L8x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_8x,.-ChaCha20_8x +.type ChaCha20_avx512,@function +.align 32 +ChaCha20_avx512: +.cfi_startproc +.LChaCha20_avx512: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + cmpq $512,%rdx + ja .LChaCha20_16x + + subq $64+8,%rsp + vbroadcasti32x4 .Lsigma(%rip),%zmm0 + vbroadcasti32x4 (%rcx),%zmm1 + vbroadcasti32x4 16(%rcx),%zmm2 + vbroadcasti32x4 (%r8),%zmm3 + + vmovdqa32 %zmm0,%zmm16 + vmovdqa32 %zmm1,%zmm17 + vmovdqa32 %zmm2,%zmm18 + vpaddd .Lzeroz(%rip),%zmm3,%zmm3 + vmovdqa32 .Lfourz(%rip),%zmm20 + movq $10,%r8 + vmovdqa32 %zmm3,%zmm19 + jmp .Loop_avx512 + +.align 16 +.Loop_outer_avx512: + vmovdqa32 %zmm16,%zmm0 + vmovdqa32 %zmm17,%zmm1 + vmovdqa32 %zmm18,%zmm2 + vpaddd %zmm20,%zmm19,%zmm3 + movq $10,%r8 + vmovdqa32 %zmm3,%zmm19 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + vpaddd %zmm1,%zmm0,%zmm0 + vpxord %zmm0,%zmm3,%zmm3 + vprold $16,%zmm3,%zmm3 + vpaddd %zmm3,%zmm2,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vprold $12,%zmm1,%zmm1 + vpaddd %zmm1,%zmm0,%zmm0 + vpxord %zmm0,%zmm3,%zmm3 + vprold 
$8,%zmm3,%zmm3 + vpaddd %zmm3,%zmm2,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vprold $7,%zmm1,%zmm1 + vpshufd $78,%zmm2,%zmm2 + vpshufd $57,%zmm1,%zmm1 + vpshufd $147,%zmm3,%zmm3 + vpaddd %zmm1,%zmm0,%zmm0 + vpxord %zmm0,%zmm3,%zmm3 + vprold $16,%zmm3,%zmm3 + vpaddd %zmm3,%zmm2,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vprold $12,%zmm1,%zmm1 + vpaddd %zmm1,%zmm0,%zmm0 + vpxord %zmm0,%zmm3,%zmm3 + vprold $8,%zmm3,%zmm3 + vpaddd %zmm3,%zmm2,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vprold $7,%zmm1,%zmm1 + vpshufd $78,%zmm2,%zmm2 + vpshufd $147,%zmm1,%zmm1 + vpshufd $57,%zmm3,%zmm3 + decq %r8 + jnz .Loop_avx512 + vpaddd %zmm16,%zmm0,%zmm0 + vpaddd %zmm17,%zmm1,%zmm1 + vpaddd %zmm18,%zmm2,%zmm2 + vpaddd %zmm19,%zmm3,%zmm3 + + subq $64,%rdx + jb .Ltail64_avx512 + + vpxor 0(%rsi),%xmm0,%xmm4 + vpxor 16(%rsi),%xmm1,%xmm5 + vpxor 32(%rsi),%xmm2,%xmm6 + vpxor 48(%rsi),%xmm3,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jz .Ldone_avx512 + + vextracti32x4 $1,%zmm0,%xmm4 + vextracti32x4 $1,%zmm1,%xmm5 + vextracti32x4 $1,%zmm2,%xmm6 + vextracti32x4 $1,%zmm3,%xmm7 + + subq $64,%rdx + jb .Ltail_avx512 + + vpxor 0(%rsi),%xmm4,%xmm4 + vpxor 16(%rsi),%xmm5,%xmm5 + vpxor 32(%rsi),%xmm6,%xmm6 + vpxor 48(%rsi),%xmm7,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jz .Ldone_avx512 + + vextracti32x4 $2,%zmm0,%xmm4 + vextracti32x4 $2,%zmm1,%xmm5 + vextracti32x4 $2,%zmm2,%xmm6 + vextracti32x4 $2,%zmm3,%xmm7 + + subq $64,%rdx + jb .Ltail_avx512 + + vpxor 0(%rsi),%xmm4,%xmm4 + vpxor 16(%rsi),%xmm5,%xmm5 + vpxor 32(%rsi),%xmm6,%xmm6 + vpxor 48(%rsi),%xmm7,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jz .Ldone_avx512 + + vextracti32x4 $3,%zmm0,%xmm4 + vextracti32x4 $3,%zmm1,%xmm5 + vextracti32x4 $3,%zmm2,%xmm6 + vextracti32x4 $3,%zmm3,%xmm7 + + subq $64,%rdx + jb .Ltail_avx512 + + vpxor 0(%rsi),%xmm4,%xmm4 + vpxor 16(%rsi),%xmm5,%xmm5 + vpxor 32(%rsi),%xmm6,%xmm6 + vpxor 48(%rsi),%xmm7,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jnz .Loop_outer_avx512 + + jmp .Ldone_avx512 + +.align 16 +.Ltail64_avx512: + vmovdqa %xmm0,0(%rsp) + vmovdqa %xmm1,16(%rsp) + vmovdqa %xmm2,32(%rsp) + vmovdqa %xmm3,48(%rsp) + addq $64,%rdx + jmp .Loop_tail_avx512 + +.align 16 +.Ltail_avx512: + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + vmovdqa %xmm7,48(%rsp) + addq $64,%rdx + +.Loop_tail_avx512: + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 + xorl %ecx,%eax + movb %al,-1(%rdi,%r8,1) + decq %rdx + jnz .Loop_tail_avx512 + + vmovdqu32 %zmm16,0(%rsp) + +.Ldone_avx512: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.Lavx512_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_avx512,.-ChaCha20_avx512 +.type ChaCha20_avx512vl,@function +.align 32 +ChaCha20_avx512vl: +.cfi_startproc +.LChaCha20_avx512vl: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + cmpq $128,%rdx + ja .LChaCha20_8xvl + + subq $64+8,%rsp + vbroadcasti128 .Lsigma(%rip),%ymm0 + vbroadcasti128 (%rcx),%ymm1 + vbroadcasti128 16(%rcx),%ymm2 + vbroadcasti128 (%r8),%ymm3 + + vmovdqa32 %ymm0,%ymm16 + vmovdqa32 %ymm1,%ymm17 + vmovdqa32 %ymm2,%ymm18 + vpaddd .Lzeroz(%rip),%ymm3,%ymm3 + vmovdqa32 .Ltwoy(%rip),%ymm20 + movq 
$10,%r8 + vmovdqa32 %ymm3,%ymm19 + jmp .Loop_avx512vl + +.align 16 +.Loop_outer_avx512vl: + vmovdqa32 %ymm18,%ymm2 + vpaddd %ymm20,%ymm19,%ymm3 + movq $10,%r8 + vmovdqa32 %ymm3,%ymm19 + jmp .Loop_avx512vl + +.align 32 +.Loop_avx512vl: + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + vpshufd $78,%ymm2,%ymm2 + vpshufd $57,%ymm1,%ymm1 + vpshufd $147,%ymm3,%ymm3 + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + vpshufd $78,%ymm2,%ymm2 + vpshufd $147,%ymm1,%ymm1 + vpshufd $57,%ymm3,%ymm3 + decq %r8 + jnz .Loop_avx512vl + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + + subq $64,%rdx + jb .Ltail64_avx512vl + + vpxor 0(%rsi),%xmm0,%xmm4 + vpxor 16(%rsi),%xmm1,%xmm5 + vpxor 32(%rsi),%xmm2,%xmm6 + vpxor 48(%rsi),%xmm3,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + jz .Ldone_avx512vl + + vextracti128 $1,%ymm0,%xmm4 + vextracti128 $1,%ymm1,%xmm5 + vextracti128 $1,%ymm2,%xmm6 + vextracti128 $1,%ymm3,%xmm7 + + subq $64,%rdx + jb .Ltail_avx512vl + + vpxor 0(%rsi),%xmm4,%xmm4 + vpxor 16(%rsi),%xmm5,%xmm5 + vpxor 32(%rsi),%xmm6,%xmm6 + vpxor 48(%rsi),%xmm7,%xmm7 + leaq 64(%rsi),%rsi + + vmovdqu %xmm4,0(%rdi) + vmovdqu %xmm5,16(%rdi) + vmovdqu %xmm6,32(%rdi) + vmovdqu %xmm7,48(%rdi) + leaq 64(%rdi),%rdi + + vmovdqa32 %ymm16,%ymm0 + vmovdqa32 %ymm17,%ymm1 + jnz .Loop_outer_avx512vl + + jmp .Ldone_avx512vl + +.align 16 +.Ltail64_avx512vl: + vmovdqa %xmm0,0(%rsp) + vmovdqa %xmm1,16(%rsp) + vmovdqa %xmm2,32(%rsp) + vmovdqa %xmm3,48(%rsp) + addq $64,%rdx + jmp .Loop_tail_avx512vl + +.align 16 +.Ltail_avx512vl: + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + vmovdqa %xmm7,48(%rsp) + addq $64,%rdx + +.Loop_tail_avx512vl: + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 + xorl %ecx,%eax + movb %al,-1(%rdi,%r8,1) + decq %rdx + jnz .Loop_tail_avx512vl + + vmovdqu32 %ymm16,0(%rsp) + vmovdqu32 %ymm16,32(%rsp) + +.Ldone_avx512vl: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.Lavx512vl_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_avx512vl,.-ChaCha20_avx512vl +.type ChaCha20_16x,@function +.align 32 +ChaCha20_16x: +.cfi_startproc +.LChaCha20_16x: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + subq $64+8,%rsp + andq $-64,%rsp + vzeroupper + + leaq .Lsigma(%rip),%r10 + vbroadcasti32x4 (%r10),%zmm3 + vbroadcasti32x4 (%rcx),%zmm7 + vbroadcasti32x4 16(%rcx),%zmm11 + vbroadcasti32x4 (%r8),%zmm15 + + vpshufd $0x00,%zmm3,%zmm0 + vpshufd $0x55,%zmm3,%zmm1 + vpshufd $0xaa,%zmm3,%zmm2 + vpshufd $0xff,%zmm3,%zmm3 + vmovdqa64 %zmm0,%zmm16 + vmovdqa64 %zmm1,%zmm17 + vmovdqa64 %zmm2,%zmm18 + vmovdqa64 %zmm3,%zmm19 + + vpshufd $0x00,%zmm7,%zmm4 + vpshufd $0x55,%zmm7,%zmm5 + vpshufd $0xaa,%zmm7,%zmm6 + vpshufd $0xff,%zmm7,%zmm7 + vmovdqa64 %zmm4,%zmm20 + vmovdqa64 %zmm5,%zmm21 + vmovdqa64 %zmm6,%zmm22 + vmovdqa64 %zmm7,%zmm23 + + vpshufd $0x00,%zmm11,%zmm8 + vpshufd 
$0x55,%zmm11,%zmm9 + vpshufd $0xaa,%zmm11,%zmm10 + vpshufd $0xff,%zmm11,%zmm11 + vmovdqa64 %zmm8,%zmm24 + vmovdqa64 %zmm9,%zmm25 + vmovdqa64 %zmm10,%zmm26 + vmovdqa64 %zmm11,%zmm27 + + vpshufd $0x00,%zmm15,%zmm12 + vpshufd $0x55,%zmm15,%zmm13 + vpshufd $0xaa,%zmm15,%zmm14 + vpshufd $0xff,%zmm15,%zmm15 + vpaddd .Lincz(%rip),%zmm12,%zmm12 + vmovdqa64 %zmm12,%zmm28 + vmovdqa64 %zmm13,%zmm29 + vmovdqa64 %zmm14,%zmm30 + vmovdqa64 %zmm15,%zmm31 + + movl $10,%eax + jmp .Loop16x + +.align 32 +.Loop_outer16x: + vpbroadcastd 0(%r10),%zmm0 + vpbroadcastd 4(%r10),%zmm1 + vpbroadcastd 8(%r10),%zmm2 + vpbroadcastd 12(%r10),%zmm3 + vpaddd .Lsixteen(%rip),%zmm28,%zmm28 + vmovdqa64 %zmm20,%zmm4 + vmovdqa64 %zmm21,%zmm5 + vmovdqa64 %zmm22,%zmm6 + vmovdqa64 %zmm23,%zmm7 + vmovdqa64 %zmm24,%zmm8 + vmovdqa64 %zmm25,%zmm9 + vmovdqa64 %zmm26,%zmm10 + vmovdqa64 %zmm27,%zmm11 + vmovdqa64 %zmm28,%zmm12 + vmovdqa64 %zmm29,%zmm13 + vmovdqa64 %zmm30,%zmm14 + vmovdqa64 %zmm31,%zmm15 + + vmovdqa64 %zmm0,%zmm16 + vmovdqa64 %zmm1,%zmm17 + vmovdqa64 %zmm2,%zmm18 + vmovdqa64 %zmm3,%zmm19 + + movl $10,%eax + jmp .Loop16x + +.align 32 +.Loop16x: + vpaddd %zmm4,%zmm0,%zmm0 + vpaddd %zmm5,%zmm1,%zmm1 + vpaddd %zmm6,%zmm2,%zmm2 + vpaddd %zmm7,%zmm3,%zmm3 + vpxord %zmm0,%zmm12,%zmm12 + vpxord %zmm1,%zmm13,%zmm13 + vpxord %zmm2,%zmm14,%zmm14 + vpxord %zmm3,%zmm15,%zmm15 + vprold $16,%zmm12,%zmm12 + vprold $16,%zmm13,%zmm13 + vprold $16,%zmm14,%zmm14 + vprold $16,%zmm15,%zmm15 + vpaddd %zmm12,%zmm8,%zmm8 + vpaddd %zmm13,%zmm9,%zmm9 + vpaddd %zmm14,%zmm10,%zmm10 + vpaddd %zmm15,%zmm11,%zmm11 + vpxord %zmm8,%zmm4,%zmm4 + vpxord %zmm9,%zmm5,%zmm5 + vpxord %zmm10,%zmm6,%zmm6 + vpxord %zmm11,%zmm7,%zmm7 + vprold $12,%zmm4,%zmm4 + vprold $12,%zmm5,%zmm5 + vprold $12,%zmm6,%zmm6 + vprold $12,%zmm7,%zmm7 + vpaddd %zmm4,%zmm0,%zmm0 + vpaddd %zmm5,%zmm1,%zmm1 + vpaddd %zmm6,%zmm2,%zmm2 + vpaddd %zmm7,%zmm3,%zmm3 + vpxord %zmm0,%zmm12,%zmm12 + vpxord %zmm1,%zmm13,%zmm13 + vpxord %zmm2,%zmm14,%zmm14 + vpxord %zmm3,%zmm15,%zmm15 + vprold $8,%zmm12,%zmm12 + vprold $8,%zmm13,%zmm13 + vprold $8,%zmm14,%zmm14 + vprold $8,%zmm15,%zmm15 + vpaddd %zmm12,%zmm8,%zmm8 + vpaddd %zmm13,%zmm9,%zmm9 + vpaddd %zmm14,%zmm10,%zmm10 + vpaddd %zmm15,%zmm11,%zmm11 + vpxord %zmm8,%zmm4,%zmm4 + vpxord %zmm9,%zmm5,%zmm5 + vpxord %zmm10,%zmm6,%zmm6 + vpxord %zmm11,%zmm7,%zmm7 + vprold $7,%zmm4,%zmm4 + vprold $7,%zmm5,%zmm5 + vprold $7,%zmm6,%zmm6 + vprold $7,%zmm7,%zmm7 + vpaddd %zmm5,%zmm0,%zmm0 + vpaddd %zmm6,%zmm1,%zmm1 + vpaddd %zmm7,%zmm2,%zmm2 + vpaddd %zmm4,%zmm3,%zmm3 + vpxord %zmm0,%zmm15,%zmm15 + vpxord %zmm1,%zmm12,%zmm12 + vpxord %zmm2,%zmm13,%zmm13 + vpxord %zmm3,%zmm14,%zmm14 + vprold $16,%zmm15,%zmm15 + vprold $16,%zmm12,%zmm12 + vprold $16,%zmm13,%zmm13 + vprold $16,%zmm14,%zmm14 + vpaddd %zmm15,%zmm10,%zmm10 + vpaddd %zmm12,%zmm11,%zmm11 + vpaddd %zmm13,%zmm8,%zmm8 + vpaddd %zmm14,%zmm9,%zmm9 + vpxord %zmm10,%zmm5,%zmm5 + vpxord %zmm11,%zmm6,%zmm6 + vpxord %zmm8,%zmm7,%zmm7 + vpxord %zmm9,%zmm4,%zmm4 + vprold $12,%zmm5,%zmm5 + vprold $12,%zmm6,%zmm6 + vprold $12,%zmm7,%zmm7 + vprold $12,%zmm4,%zmm4 + vpaddd %zmm5,%zmm0,%zmm0 + vpaddd %zmm6,%zmm1,%zmm1 + vpaddd %zmm7,%zmm2,%zmm2 + vpaddd %zmm4,%zmm3,%zmm3 + vpxord %zmm0,%zmm15,%zmm15 + vpxord %zmm1,%zmm12,%zmm12 + vpxord %zmm2,%zmm13,%zmm13 + vpxord %zmm3,%zmm14,%zmm14 + vprold $8,%zmm15,%zmm15 + vprold $8,%zmm12,%zmm12 + vprold $8,%zmm13,%zmm13 + vprold $8,%zmm14,%zmm14 + vpaddd %zmm15,%zmm10,%zmm10 + vpaddd %zmm12,%zmm11,%zmm11 + vpaddd %zmm13,%zmm8,%zmm8 + vpaddd %zmm14,%zmm9,%zmm9 + vpxord 
%zmm10,%zmm5,%zmm5 + vpxord %zmm11,%zmm6,%zmm6 + vpxord %zmm8,%zmm7,%zmm7 + vpxord %zmm9,%zmm4,%zmm4 + vprold $7,%zmm5,%zmm5 + vprold $7,%zmm6,%zmm6 + vprold $7,%zmm7,%zmm7 + vprold $7,%zmm4,%zmm4 + decl %eax + jnz .Loop16x + + vpaddd %zmm16,%zmm0,%zmm0 + vpaddd %zmm17,%zmm1,%zmm1 + vpaddd %zmm18,%zmm2,%zmm2 + vpaddd %zmm19,%zmm3,%zmm3 + + vpunpckldq %zmm1,%zmm0,%zmm18 + vpunpckldq %zmm3,%zmm2,%zmm19 + vpunpckhdq %zmm1,%zmm0,%zmm0 + vpunpckhdq %zmm3,%zmm2,%zmm2 + vpunpcklqdq %zmm19,%zmm18,%zmm1 + vpunpckhqdq %zmm19,%zmm18,%zmm18 + vpunpcklqdq %zmm2,%zmm0,%zmm3 + vpunpckhqdq %zmm2,%zmm0,%zmm0 + vpaddd %zmm20,%zmm4,%zmm4 + vpaddd %zmm21,%zmm5,%zmm5 + vpaddd %zmm22,%zmm6,%zmm6 + vpaddd %zmm23,%zmm7,%zmm7 + + vpunpckldq %zmm5,%zmm4,%zmm2 + vpunpckldq %zmm7,%zmm6,%zmm19 + vpunpckhdq %zmm5,%zmm4,%zmm4 + vpunpckhdq %zmm7,%zmm6,%zmm6 + vpunpcklqdq %zmm19,%zmm2,%zmm5 + vpunpckhqdq %zmm19,%zmm2,%zmm2 + vpunpcklqdq %zmm6,%zmm4,%zmm7 + vpunpckhqdq %zmm6,%zmm4,%zmm4 + vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19 + vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5 + vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1 + vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2 + vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18 + vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7 + vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3 + vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4 + vpaddd %zmm24,%zmm8,%zmm8 + vpaddd %zmm25,%zmm9,%zmm9 + vpaddd %zmm26,%zmm10,%zmm10 + vpaddd %zmm27,%zmm11,%zmm11 + + vpunpckldq %zmm9,%zmm8,%zmm6 + vpunpckldq %zmm11,%zmm10,%zmm0 + vpunpckhdq %zmm9,%zmm8,%zmm8 + vpunpckhdq %zmm11,%zmm10,%zmm10 + vpunpcklqdq %zmm0,%zmm6,%zmm9 + vpunpckhqdq %zmm0,%zmm6,%zmm6 + vpunpcklqdq %zmm10,%zmm8,%zmm11 + vpunpckhqdq %zmm10,%zmm8,%zmm8 + vpaddd %zmm28,%zmm12,%zmm12 + vpaddd %zmm29,%zmm13,%zmm13 + vpaddd %zmm30,%zmm14,%zmm14 + vpaddd %zmm31,%zmm15,%zmm15 + + vpunpckldq %zmm13,%zmm12,%zmm10 + vpunpckldq %zmm15,%zmm14,%zmm0 + vpunpckhdq %zmm13,%zmm12,%zmm12 + vpunpckhdq %zmm15,%zmm14,%zmm14 + vpunpcklqdq %zmm0,%zmm10,%zmm13 + vpunpckhqdq %zmm0,%zmm10,%zmm10 + vpunpcklqdq %zmm14,%zmm12,%zmm15 + vpunpckhqdq %zmm14,%zmm12,%zmm12 + vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0 + vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13 + vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9 + vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10 + vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6 + vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15 + vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11 + vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12 + vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16 + vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19 + vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0 + vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13 + vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17 + vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1 + vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9 + vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10 + vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14 + vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18 + vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6 + vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15 + vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8 + vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3 + vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11 + vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12 + cmpq $1024,%rdx + jb .Ltail16x + + vpxord 0(%rsi),%zmm16,%zmm16 + vpxord 64(%rsi),%zmm17,%zmm17 + vpxord 128(%rsi),%zmm14,%zmm14 + vpxord 192(%rsi),%zmm8,%zmm8 + vmovdqu32 %zmm16,0(%rdi) + vmovdqu32 %zmm17,64(%rdi) + vmovdqu32 %zmm14,128(%rdi) + vmovdqu32 %zmm8,192(%rdi) + + vpxord 256(%rsi),%zmm19,%zmm19 + vpxord 320(%rsi),%zmm1,%zmm1 + vpxord 384(%rsi),%zmm18,%zmm18 + vpxord 448(%rsi),%zmm3,%zmm3 + vmovdqu32 %zmm19,256(%rdi) + vmovdqu32 %zmm1,320(%rdi) + vmovdqu32 %zmm18,384(%rdi) + vmovdqu32 %zmm3,448(%rdi) + + vpxord 512(%rsi),%zmm0,%zmm0 + vpxord 576(%rsi),%zmm9,%zmm9 + 
vpxord 640(%rsi),%zmm6,%zmm6 + vpxord 704(%rsi),%zmm11,%zmm11 + vmovdqu32 %zmm0,512(%rdi) + vmovdqu32 %zmm9,576(%rdi) + vmovdqu32 %zmm6,640(%rdi) + vmovdqu32 %zmm11,704(%rdi) + + vpxord 768(%rsi),%zmm13,%zmm13 + vpxord 832(%rsi),%zmm10,%zmm10 + vpxord 896(%rsi),%zmm15,%zmm15 + vpxord 960(%rsi),%zmm12,%zmm12 + leaq 1024(%rsi),%rsi + vmovdqu32 %zmm13,768(%rdi) + vmovdqu32 %zmm10,832(%rdi) + vmovdqu32 %zmm15,896(%rdi) + vmovdqu32 %zmm12,960(%rdi) + leaq 1024(%rdi),%rdi + + subq $1024,%rdx + jnz .Loop_outer16x + + jmp .Ldone16x + +.align 32 +.Ltail16x: + xorq %r10,%r10 + subq %rsi,%rdi + cmpq $64,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm16,%zmm16 + vmovdqu32 %zmm16,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm17,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $128,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm17,%zmm17 + vmovdqu32 %zmm17,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm14,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $192,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm14,%zmm14 + vmovdqu32 %zmm14,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm8,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $256,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm8,%zmm8 + vmovdqu32 %zmm8,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm19,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $320,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm19,%zmm19 + vmovdqu32 %zmm19,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm1,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $384,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm1,%zmm1 + vmovdqu32 %zmm1,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm18,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $448,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm18,%zmm18 + vmovdqu32 %zmm18,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm3,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $512,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm3,%zmm3 + vmovdqu32 %zmm3,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm0,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $576,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm0,%zmm0 + vmovdqu32 %zmm0,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm9,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $640,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm9,%zmm9 + vmovdqu32 %zmm9,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm6,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $704,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm6,%zmm6 + vmovdqu32 %zmm6,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm11,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $768,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm11,%zmm11 + vmovdqu32 %zmm11,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm13,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $832,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm13,%zmm13 + vmovdqu32 %zmm13,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm10,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $896,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm10,%zmm10 + vmovdqu32 %zmm10,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm15,%zmm16 + leaq 64(%rsi),%rsi + + cmpq $960,%rdx + jb .Less_than_64_16x + vpxord (%rsi),%zmm15,%zmm15 + vmovdqu32 %zmm15,(%rdi,%rsi,1) + je .Ldone16x + vmovdqa32 %zmm12,%zmm16 + leaq 64(%rsi),%rsi + +.Less_than_64_16x: + vmovdqa32 %zmm16,0(%rsp) + leaq (%rdi,%rsi,1),%rdi + andq $63,%rdx + +.Loop_tail16x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail16x + + vpxord %zmm16,%zmm16,%zmm16 + vmovdqa32 %zmm16,0(%rsp) + +.Ldone16x: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L16x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_16x,.-ChaCha20_16x +.type 
ChaCha20_8xvl,@function +.align 32 +ChaCha20_8xvl: +.cfi_startproc +.LChaCha20_8xvl: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + subq $64+8,%rsp + andq $-64,%rsp + vzeroupper + + leaq .Lsigma(%rip),%r10 + vbroadcasti128 (%r10),%ymm3 + vbroadcasti128 (%rcx),%ymm7 + vbroadcasti128 16(%rcx),%ymm11 + vbroadcasti128 (%r8),%ymm15 + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vpshufd $0xaa,%ymm3,%ymm2 + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpshufd $0xaa,%ymm7,%ymm6 + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vpshufd $0xaa,%ymm11,%ymm10 + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vpshufd $0xaa,%ymm15,%ymm14 + vpshufd $0xff,%ymm15,%ymm15 + vpaddd .Lincy(%rip),%ymm12,%ymm12 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + + movl $10,%eax + jmp .Loop8xvl + +.align 32 +.Loop_outer8xvl: + + + vpbroadcastd 8(%r10),%ymm2 + vpbroadcastd 12(%r10),%ymm3 + vpaddd .Leight(%rip),%ymm28,%ymm28 + vmovdqa64 %ymm20,%ymm4 + vmovdqa64 %ymm21,%ymm5 + vmovdqa64 %ymm22,%ymm6 + vmovdqa64 %ymm23,%ymm7 + vmovdqa64 %ymm24,%ymm8 + vmovdqa64 %ymm25,%ymm9 + vmovdqa64 %ymm26,%ymm10 + vmovdqa64 %ymm27,%ymm11 + vmovdqa64 %ymm28,%ymm12 + vmovdqa64 %ymm29,%ymm13 + vmovdqa64 %ymm30,%ymm14 + vmovdqa64 %ymm31,%ymm15 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + + movl $10,%eax + jmp .Loop8xvl + +.align 32 +.Loop8xvl: + vpaddd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm7,%ymm3,%ymm3 + vpxor %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm3,%ymm15,%ymm15 + vprold $16,%ymm12,%ymm12 + vprold $16,%ymm13,%ymm13 + vprold $16,%ymm14,%ymm14 + vprold $16,%ymm15,%ymm15 + vpaddd %ymm12,%ymm8,%ymm8 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm11,%ymm7,%ymm7 + vprold $12,%ymm4,%ymm4 + vprold $12,%ymm5,%ymm5 + vprold $12,%ymm6,%ymm6 + vprold $12,%ymm7,%ymm7 + vpaddd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm7,%ymm3,%ymm3 + vpxor %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm3,%ymm15,%ymm15 + vprold $8,%ymm12,%ymm12 + vprold $8,%ymm13,%ymm13 + vprold $8,%ymm14,%ymm14 + vprold $8,%ymm15,%ymm15 + vpaddd %ymm12,%ymm8,%ymm8 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm11,%ymm7,%ymm7 + vprold $7,%ymm4,%ymm4 + vprold $7,%ymm5,%ymm5 + vprold $7,%ymm6,%ymm6 + vprold $7,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpaddd %ymm6,%ymm1,%ymm1 + vpaddd %ymm7,%ymm2,%ymm2 + vpaddd %ymm4,%ymm3,%ymm3 + vpxor %ymm0,%ymm15,%ymm15 + vpxor %ymm1,%ymm12,%ymm12 + vpxor %ymm2,%ymm13,%ymm13 + vpxor %ymm3,%ymm14,%ymm14 + vprold $16,%ymm15,%ymm15 + vprold $16,%ymm12,%ymm12 + vprold $16,%ymm13,%ymm13 + vprold $16,%ymm14,%ymm14 + vpaddd %ymm15,%ymm10,%ymm10 + vpaddd %ymm12,%ymm11,%ymm11 + 
vpaddd %ymm13,%ymm8,%ymm8 + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm10,%ymm5,%ymm5 + vpxor %ymm11,%ymm6,%ymm6 + vpxor %ymm8,%ymm7,%ymm7 + vpxor %ymm9,%ymm4,%ymm4 + vprold $12,%ymm5,%ymm5 + vprold $12,%ymm6,%ymm6 + vprold $12,%ymm7,%ymm7 + vprold $12,%ymm4,%ymm4 + vpaddd %ymm5,%ymm0,%ymm0 + vpaddd %ymm6,%ymm1,%ymm1 + vpaddd %ymm7,%ymm2,%ymm2 + vpaddd %ymm4,%ymm3,%ymm3 + vpxor %ymm0,%ymm15,%ymm15 + vpxor %ymm1,%ymm12,%ymm12 + vpxor %ymm2,%ymm13,%ymm13 + vpxor %ymm3,%ymm14,%ymm14 + vprold $8,%ymm15,%ymm15 + vprold $8,%ymm12,%ymm12 + vprold $8,%ymm13,%ymm13 + vprold $8,%ymm14,%ymm14 + vpaddd %ymm15,%ymm10,%ymm10 + vpaddd %ymm12,%ymm11,%ymm11 + vpaddd %ymm13,%ymm8,%ymm8 + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm10,%ymm5,%ymm5 + vpxor %ymm11,%ymm6,%ymm6 + vpxor %ymm8,%ymm7,%ymm7 + vpxor %ymm9,%ymm4,%ymm4 + vprold $7,%ymm5,%ymm5 + vprold $7,%ymm6,%ymm6 + vprold $7,%ymm7,%ymm7 + vprold $7,%ymm4,%ymm4 + decl %eax + jnz .Loop8xvl + + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm18 + vpunpckldq %ymm3,%ymm2,%ymm19 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm19,%ymm18,%ymm1 + vpunpckhqdq %ymm19,%ymm18,%ymm18 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm2 + vpunpckldq %ymm7,%ymm6,%ymm19 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm19,%ymm2,%ymm5 + vpunpckhqdq %ymm19,%ymm2,%ymm2 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vshufi32x4 $0,%ymm5,%ymm1,%ymm19 + vshufi32x4 $3,%ymm5,%ymm1,%ymm5 + vshufi32x4 $0,%ymm2,%ymm18,%ymm1 + vshufi32x4 $3,%ymm2,%ymm18,%ymm2 + vshufi32x4 $0,%ymm7,%ymm3,%ymm18 + vshufi32x4 $3,%ymm7,%ymm3,%ymm7 + vshufi32x4 $0,%ymm4,%ymm0,%ymm3 + vshufi32x4 $3,%ymm4,%ymm0,%ymm4 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm6 + vpunpckldq %ymm11,%ymm10,%ymm0 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm0,%ymm6,%ymm9 + vpunpckhqdq %ymm0,%ymm6,%ymm6 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + vpunpckldq %ymm13,%ymm12,%ymm10 + vpunpckldq %ymm15,%ymm14,%ymm0 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm0,%ymm10,%ymm13 + vpunpckhqdq %ymm0,%ymm10,%ymm10 + vpunpcklqdq %ymm14,%ymm12,%ymm15 + vpunpckhqdq %ymm14,%ymm12,%ymm12 + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + vperm2i128 $0x20,%ymm10,%ymm6,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm6,%ymm10 + vperm2i128 $0x20,%ymm15,%ymm11,%ymm6 + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + vperm2i128 $0x20,%ymm12,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + cmpq $512,%rdx + jb .Ltail8xvl + + movl $0x80,%eax + vpxord 0(%rsi),%ymm19,%ymm19 + vpxor 32(%rsi),%ymm0,%ymm0 + vpxor 64(%rsi),%ymm5,%ymm5 + vpxor 96(%rsi),%ymm13,%ymm13 + leaq (%rsi,%rax,1),%rsi + vmovdqu32 %ymm19,0(%rdi) + vmovdqu %ymm0,32(%rdi) + vmovdqu %ymm5,64(%rdi) + vmovdqu %ymm13,96(%rdi) + leaq (%rdi,%rax,1),%rdi + + vpxor 0(%rsi),%ymm1,%ymm1 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm2,%ymm2 + vpxor 96(%rsi),%ymm10,%ymm10 + leaq (%rsi,%rax,1),%rsi + vmovdqu %ymm1,0(%rdi) + vmovdqu 
%ymm9,32(%rdi) + vmovdqu %ymm2,64(%rdi) + vmovdqu %ymm10,96(%rdi) + leaq (%rdi,%rax,1),%rdi + + vpxord 0(%rsi),%ymm18,%ymm18 + vpxor 32(%rsi),%ymm6,%ymm6 + vpxor 64(%rsi),%ymm7,%ymm7 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq (%rsi,%rax,1),%rsi + vmovdqu32 %ymm18,0(%rdi) + vmovdqu %ymm6,32(%rdi) + vmovdqu %ymm7,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq (%rdi,%rax,1),%rdi + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vpxor 64(%rsi),%ymm4,%ymm4 + vpxor 96(%rsi),%ymm12,%ymm12 + leaq (%rsi,%rax,1),%rsi + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vmovdqu %ymm12,96(%rdi) + leaq (%rdi,%rax,1),%rdi + + vpbroadcastd 0(%r10),%ymm0 + vpbroadcastd 4(%r10),%ymm1 + + subq $512,%rdx + jnz .Loop_outer8xvl + + jmp .Ldone8xvl + +.align 32 +.Ltail8xvl: + vmovdqa64 %ymm19,%ymm8 + xorq %r10,%r10 + subq %rsi,%rdi + cmpq $64,%rdx + jb .Less_than_64_8xvl + vpxor 0(%rsi),%ymm8,%ymm8 + vpxor 32(%rsi),%ymm0,%ymm0 + vmovdqu %ymm8,0(%rdi,%rsi,1) + vmovdqu %ymm0,32(%rdi,%rsi,1) + je .Ldone8xvl + vmovdqa %ymm5,%ymm8 + vmovdqa %ymm13,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $128,%rdx + jb .Less_than_64_8xvl + vpxor 0(%rsi),%ymm5,%ymm5 + vpxor 32(%rsi),%ymm13,%ymm13 + vmovdqu %ymm5,0(%rdi,%rsi,1) + vmovdqu %ymm13,32(%rdi,%rsi,1) + je .Ldone8xvl + vmovdqa %ymm1,%ymm8 + vmovdqa %ymm9,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $192,%rdx + jb .Less_than_64_8xvl + vpxor 0(%rsi),%ymm1,%ymm1 + vpxor 32(%rsi),%ymm9,%ymm9 + vmovdqu %ymm1,0(%rdi,%rsi,1) + vmovdqu %ymm9,32(%rdi,%rsi,1) + je .Ldone8xvl + vmovdqa %ymm2,%ymm8 + vmovdqa %ymm10,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $256,%rdx + jb .Less_than_64_8xvl + vpxor 0(%rsi),%ymm2,%ymm2 + vpxor 32(%rsi),%ymm10,%ymm10 + vmovdqu %ymm2,0(%rdi,%rsi,1) + vmovdqu %ymm10,32(%rdi,%rsi,1) + je .Ldone8xvl + vmovdqa32 %ymm18,%ymm8 + vmovdqa %ymm6,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $320,%rdx + jb .Less_than_64_8xvl + vpxord 0(%rsi),%ymm18,%ymm18 + vpxor 32(%rsi),%ymm6,%ymm6 + vmovdqu32 %ymm18,0(%rdi,%rsi,1) + vmovdqu %ymm6,32(%rdi,%rsi,1) + je .Ldone8xvl + vmovdqa %ymm7,%ymm8 + vmovdqa %ymm15,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $384,%rdx + jb .Less_than_64_8xvl + vpxor 0(%rsi),%ymm7,%ymm7 + vpxor 32(%rsi),%ymm15,%ymm15 + vmovdqu %ymm7,0(%rdi,%rsi,1) + vmovdqu %ymm15,32(%rdi,%rsi,1) + je .Ldone8xvl + vmovdqa %ymm3,%ymm8 + vmovdqa %ymm11,%ymm0 + leaq 64(%rsi),%rsi + + cmpq $448,%rdx + jb .Less_than_64_8xvl + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi,%rsi,1) + vmovdqu %ymm11,32(%rdi,%rsi,1) + je .Ldone8xvl + vmovdqa %ymm4,%ymm8 + vmovdqa %ymm12,%ymm0 + leaq 64(%rsi),%rsi + +.Less_than_64_8xvl: + vmovdqa %ymm8,0(%rsp) + vmovdqa %ymm0,32(%rsp) + leaq (%rdi,%rsi,1),%rdi + andq $63,%rdx + +.Loop_tail8xvl: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail8xvl + + vpxor %ymm8,%ymm8,%ymm8 + vmovdqa %ymm8,0(%rsp) + vmovdqa %ymm8,32(%rsp) + +.Ldone8xvl: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L8xvl_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_8xvl,.-ChaCha20_8xvl diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/ec/ecp_nistz256-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/ec/ecp_nistz256-x86_64.s index 62a7ac611f..eeeed6ba40 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/ec/ecp_nistz256-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/ec/ecp_nistz256-x86_64.s @@ -2393,12 +2393,24 @@ ecp_nistz256_precomputed: .LONE_mont: .quad 0x0000000000000001, 
0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f + .globl ecp_nistz256_mul_by_2 .type ecp_nistz256_mul_by_2,@function .align 64 ecp_nistz256_mul_by_2: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lmul_by_2_body: movq 0(%rsi),%r8 xorq %r13,%r13 @@ -2431,9 +2443,15 @@ ecp_nistz256_mul_by_2: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lmul_by_2_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 @@ -2442,8 +2460,14 @@ ecp_nistz256_mul_by_2: .type ecp_nistz256_div_by_2,@function .align 32 ecp_nistz256_div_by_2: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Ldiv_by_2_body: movq 0(%rsi),%r8 movq 8(%rsi),%r9 @@ -2491,9 +2515,15 @@ ecp_nistz256_div_by_2: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Ldiv_by_2_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 @@ -2502,8 +2532,14 @@ ecp_nistz256_div_by_2: .type ecp_nistz256_mul_by_3,@function .align 32 ecp_nistz256_mul_by_3: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lmul_by_3_body: movq 0(%rsi),%r8 xorq %r13,%r13 @@ -2557,9 +2593,15 @@ ecp_nistz256_mul_by_3: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lmul_by_3_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 @@ -2568,8 +2610,14 @@ ecp_nistz256_mul_by_3: .type ecp_nistz256_add,@function .align 32 ecp_nistz256_add: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Ladd_body: movq 0(%rsi),%r8 xorq %r13,%r13 @@ -2603,9 +2651,15 @@ ecp_nistz256_add: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Ladd_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_add,.-ecp_nistz256_add @@ -2614,8 +2668,14 @@ ecp_nistz256_add: .type ecp_nistz256_sub,@function .align 32 ecp_nistz256_sub: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lsub_body: movq 0(%rsi),%r8 xorq %r13,%r13 @@ -2649,9 +2709,15 @@ ecp_nistz256_sub: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lsub_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_sub,.-ecp_nistz256_sub @@ -2660,8 +2726,14 @@ ecp_nistz256_sub: .type ecp_nistz256_neg,@function .align 32 ecp_nistz256_neg: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lneg_body: 
xorq %r8,%r8 xorq %r9,%r9 @@ -2695,14 +2767,1109 @@ ecp_nistz256_neg: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg + + +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,@function +.align 32 +ecp_nistz256_ord_mul_mont: +.cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_mul_montx + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq .Lord(%rip),%r14 + movq .LordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq 
%rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + + + + + + + +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,@function +.align 32 +ecp_nistz256_ord_sqr_mont: +.cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_sqr_montx + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq .Lord(%rip),%rsi + movq %rdx,%rbx + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax +.byte 102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + 
movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz .Loop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqr_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont + +.type ecp_nistz256_ord_mul_montx,@function +.align 32 +ecp_nistz256_ord_mul_montx: +.cfi_startproc +.Lecp_nistz256_ord_mul_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq 
%rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx + +.type ecp_nistz256_ord_sqr_montx,@function +.align 32 +ecp_nistz256_ord_sqr_montx: +.cfi_startproc +.Lecp_nistz256_ord_sqr_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx + + + + .globl ecp_nistz256_to_mont .type ecp_nistz256_to_mont,@function .align 32 @@ -2723,15 +3890,29 @@ ecp_nistz256_to_mont: .type ecp_nistz256_mul_mont,@function .align 32 ecp_nistz256_mul_mont: 
+.cfi_startproc movl $0x80100,%ecx andl OPENSSL_ia32cap_P+8(%rip),%ecx .Lmul_mont: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmul_body: cmpl $0x80100,%ecx je .Lmul_montx movq %rdx,%rbx @@ -2756,13 +3937,23 @@ ecp_nistz256_mul_mont: call __ecp_nistz256_mul_montx .Lmul_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont .type __ecp_nistz256_mul_montq,@function @@ -2992,14 +4183,28 @@ __ecp_nistz256_mul_montq: .type ecp_nistz256_sqr_mont,@function .align 32 ecp_nistz256_sqr_mont: +.cfi_startproc movl $0x80100,%ecx andl OPENSSL_ia32cap_P+8(%rip),%ecx pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqr_body: cmpl $0x80100,%ecx je .Lsqr_montx movq 0(%rsi),%rax @@ -3020,13 +4225,23 @@ ecp_nistz256_sqr_mont: call __ecp_nistz256_sqr_montx .Lsqr_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont .type __ecp_nistz256_sqr_montq,@function @@ -3494,8 +4709,14 @@ __ecp_nistz256_sqr_montx: .type ecp_nistz256_from_mont,@function .align 32 ecp_nistz256_from_mont: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lfrom_body: movq 0(%rsi),%rax movq .Lpoly+24(%rip),%r13 @@ -3576,9 +4797,15 @@ ecp_nistz256_from_mont: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lfrom_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont @@ -3664,6 +4891,7 @@ ecp_nistz256_gather_w5: movdqu %xmm6,64(%rdi) movdqu %xmm7,80(%rdi) .byte 0xf3,0xc3 +.LSEH_end_ecp_nistz256_gather_w5: .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 @@ -3734,6 +4962,7 @@ ecp_nistz256_gather_w7: movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 +.LSEH_end_ecp_nistz256_gather_w7: .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 @@ -3794,6 +5023,7 @@ ecp_nistz256_avx2_gather_w5: vmovdqu %ymm4,64(%rdi) vzeroupper .byte 0xf3,0xc3 +.LSEH_end_ecp_nistz256_avx2_gather_w5: .size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 @@ -3871,6 
+5101,7 @@ ecp_nistz256_avx2_gather_w7: vmovdqu %ymm3,32(%rdi) vzeroupper .byte 0xf3,0xc3 +.LSEH_end_ecp_nistz256_avx2_gather_w7: .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 .type __ecp_nistz256_add_toq,@function .align 32 @@ -3997,17 +5228,32 @@ __ecp_nistz256_mul_by_2q: .type ecp_nistz256_point_double,@function .align 32 ecp_nistz256_point_double: +.cfi_startproc movl $0x80100,%ecx andl OPENSSL_ia32cap_P+8(%rip),%ecx cmpl $0x80100,%ecx je .Lpoint_doublex pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doubleq_body: .Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 @@ -4190,30 +5436,56 @@ ecp_nistz256_point_double: .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doubleq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_double,.-ecp_nistz256_point_double .globl ecp_nistz256_point_add .type ecp_nistz256_point_add,@function .align 32 ecp_nistz256_point_add: +.cfi_startproc movl $0x80100,%ecx andl OPENSSL_ia32cap_P+8(%rip),%ecx cmpl $0x80100,%ecx je .Lpoint_addx pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addq_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 @@ -4590,30 +5862,56 @@ ecp_nistz256_point_add: movdqu %xmm3,48(%rdi) .Ladd_doneq: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_add,.-ecp_nistz256_point_add .globl ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,@function .align 32 ecp_nistz256_point_add_affine: +.cfi_startproc movl $0x80100,%ecx andl OPENSSL_ia32cap_P+8(%rip),%ecx cmpl $0x80100,%ecx je .Lpoint_add_affinex pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 
+.Ladd_affineq_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx @@ -4896,14 +6194,25 @@ ecp_nistz256_point_add_affine: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affineq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine .type __ecp_nistz256_add_tox,@function .align 32 @@ -5035,14 +6344,29 @@ __ecp_nistz256_mul_by_2x: .type ecp_nistz256_point_doublex,@function .align 32 ecp_nistz256_point_doublex: +.cfi_startproc .Lpoint_doublex: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doublex_body: .Lpoint_double_shortcutx: movdqu 0(%rsi),%xmm0 @@ -5225,26 +6549,52 @@ ecp_nistz256_point_doublex: .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromx - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doublex_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex .type ecp_nistz256_point_addx,@function .align 32 ecp_nistz256_point_addx: +.cfi_startproc .Lpoint_addx: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addx_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 @@ -5621,26 +6971,52 @@ ecp_nistz256_point_addx: movdqu %xmm3,48(%rdi) .Ladd_donex: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addx_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx .type ecp_nistz256_point_add_affinex,@function .align 32 ecp_nistz256_point_add_affinex: +.cfi_startproc .Lpoint_add_affinex: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affinex_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx @@ -5923,12 +7299,23 @@ ecp_nistz256_point_add_affinex: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affinex_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/ec/x25519-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/ec/x25519-x86_64.s new file mode 100644 index 0000000000..2a18eaee28 --- /dev/null +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/ec/x25519-x86_64.s @@ -0,0 +1,792 @@ +.text + +.globl x25519_fe51_mul +.type x25519_fe51_mul,@function +.align 32 +x25519_fe51_mul: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_mul_body: + + movq 0(%rsi),%rax + movq 0(%rdx),%r11 + movq 8(%rdx),%r12 + movq 16(%rdx),%r13 + movq 24(%rdx),%rbp + movq 32(%rdx),%r14 + + movq %rdi,32(%rsp) + movq %rax,%rdi + mulq %r11 + movq %r11,0(%rsp) + movq %rax,%rbx + movq %rdi,%rax + movq %rdx,%rcx + mulq %r12 + movq %r12,8(%rsp) + movq %rax,%r8 + movq %rdi,%rax + leaq (%r14,%r14,8),%r15 + movq %rdx,%r9 + mulq %r13 + movq %r13,16(%rsp) + movq %rax,%r10 + movq %rdi,%rax + leaq (%r14,%r15,2),%rdi + movq %rdx,%r11 + mulq %rbp + movq %rax,%r12 + movq 0(%rsi),%rax + movq %rdx,%r13 + mulq %r14 + movq %rax,%r14 + movq 8(%rsi),%rax + movq %rdx,%r15 + + mulq %rdi + addq %rax,%rbx + movq 16(%rsi),%rax + adcq %rdx,%rcx + mulq %rdi + addq %rax,%r8 + movq 24(%rsi),%rax + adcq %rdx,%r9 + mulq %rdi + addq %rax,%r10 + movq 32(%rsi),%rax + adcq %rdx,%r11 + mulq %rdi + imulq $19,%rbp,%rdi + addq %rax,%r12 + movq 8(%rsi),%rax + adcq %rdx,%r13 + mulq %rbp + movq 16(%rsp),%rbp + addq %rax,%r14 + movq 16(%rsi),%rax + adcq %rdx,%r15 + + mulq %rdi + addq %rax,%rbx + movq 24(%rsi),%rax + adcq %rdx,%rcx + mulq %rdi + addq %rax,%r8 + movq 32(%rsi),%rax + adcq %rdx,%r9 + mulq %rdi + imulq $19,%rbp,%rdi + addq %rax,%r10 + movq 8(%rsi),%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r12 + movq 16(%rsi),%rax + adcq %rdx,%r13 + mulq %rbp + movq 8(%rsp),%rbp + addq %rax,%r14 + movq 24(%rsi),%rax + adcq %rdx,%r15 + + mulq %rdi + addq %rax,%rbx + movq 32(%rsi),%rax + adcq %rdx,%rcx + mulq %rdi + addq %rax,%r8 + movq 8(%rsi),%rax + adcq %rdx,%r9 + mulq %rbp + imulq $19,%rbp,%rdi + addq %rax,%r10 + movq 16(%rsi),%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r12 + movq 24(%rsi),%rax + adcq %rdx,%r13 + mulq %rbp + movq 0(%rsp),%rbp + addq %rax,%r14 + movq 32(%rsi),%rax + adcq %rdx,%r15 + + mulq %rdi + addq %rax,%rbx + movq 8(%rsi),%rax + 
adcq %rdx,%rcx + mulq %rbp + addq %rax,%r8 + movq 16(%rsi),%rax + adcq %rdx,%r9 + mulq %rbp + addq %rax,%r10 + movq 24(%rsi),%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r12 + movq 32(%rsi),%rax + adcq %rdx,%r13 + mulq %rbp + addq %rax,%r14 + adcq %rdx,%r15 + + movq 32(%rsp),%rdi + jmp .Lreduce51 +.Lfe51_mul_epilogue: +.cfi_endproc +.size x25519_fe51_mul,.-x25519_fe51_mul + +.globl x25519_fe51_sqr +.type x25519_fe51_sqr,@function +.align 32 +x25519_fe51_sqr: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_sqr_body: + + movq 0(%rsi),%rax + movq 16(%rsi),%r15 + movq 32(%rsi),%rbp + + movq %rdi,32(%rsp) + leaq (%rax,%rax,1),%r14 + mulq %rax + movq %rax,%rbx + movq 8(%rsi),%rax + movq %rdx,%rcx + mulq %r14 + movq %rax,%r8 + movq %r15,%rax + movq %r15,0(%rsp) + movq %rdx,%r9 + mulq %r14 + movq %rax,%r10 + movq 24(%rsi),%rax + movq %rdx,%r11 + imulq $19,%rbp,%rdi + mulq %r14 + movq %rax,%r12 + movq %rbp,%rax + movq %rdx,%r13 + mulq %r14 + movq %rax,%r14 + movq %rbp,%rax + movq %rdx,%r15 + + mulq %rdi + addq %rax,%r12 + movq 8(%rsi),%rax + adcq %rdx,%r13 + + movq 24(%rsi),%rsi + leaq (%rax,%rax,1),%rbp + mulq %rax + addq %rax,%r10 + movq 0(%rsp),%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + mulq %rsi + addq %rax,%r14 + movq %rbp,%rax + adcq %rdx,%r15 + imulq $19,%rsi,%rbp + mulq %rdi + addq %rax,%rbx + leaq (%rsi,%rsi,1),%rax + adcq %rdx,%rcx + + mulq %rdi + addq %rax,%r10 + movq %rsi,%rax + adcq %rdx,%r11 + mulq %rbp + addq %rax,%r8 + movq 0(%rsp),%rax + adcq %rdx,%r9 + + leaq (%rax,%rax,1),%rsi + mulq %rax + addq %rax,%r14 + movq %rbp,%rax + adcq %rdx,%r15 + mulq %rsi + addq %rax,%rbx + movq %rsi,%rax + adcq %rdx,%rcx + mulq %rdi + addq %rax,%r8 + adcq %rdx,%r9 + + movq 32(%rsp),%rdi + jmp .Lreduce51 + +.align 32 +.Lreduce51: + movq $0x7ffffffffffff,%rbp + + movq %r10,%rdx + shrq $51,%r10 + shlq $13,%r11 + andq %rbp,%rdx + orq %r10,%r11 + addq %r11,%r12 + adcq $0,%r13 + + movq %rbx,%rax + shrq $51,%rbx + shlq $13,%rcx + andq %rbp,%rax + orq %rbx,%rcx + addq %rcx,%r8 + adcq $0,%r9 + + movq %r12,%rbx + shrq $51,%r12 + shlq $13,%r13 + andq %rbp,%rbx + orq %r12,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq %r8,%rcx + shrq $51,%r8 + shlq $13,%r9 + andq %rbp,%rcx + orq %r8,%r9 + addq %r9,%rdx + + movq %r14,%r10 + shrq $51,%r14 + shlq $13,%r15 + andq %rbp,%r10 + orq %r14,%r15 + + leaq (%r15,%r15,8),%r14 + leaq (%r15,%r14,2),%r15 + addq %r15,%rax + + movq %rdx,%r8 + andq %rbp,%rdx + shrq $51,%r8 + addq %r8,%rbx + + movq %rax,%r9 + andq %rbp,%rax + shrq $51,%r9 + addq %r9,%rcx + + movq %rax,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,16(%rdi) + movq %rbx,24(%rdi) + movq %r10,32(%rdi) + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset 88 +.Lfe51_sqr_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe51_sqr,.-x25519_fe51_sqr + +.globl x25519_fe51_mul121666 +.type x25519_fe51_mul121666,@function +.align 32 +x25519_fe51_mul121666: +.cfi_startproc 
+ pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_mul121666_body: + movl $121666,%eax + + mulq 0(%rsi) + movq %rax,%rbx + movl $121666,%eax + movq %rdx,%rcx + mulq 8(%rsi) + movq %rax,%r8 + movl $121666,%eax + movq %rdx,%r9 + mulq 16(%rsi) + movq %rax,%r10 + movl $121666,%eax + movq %rdx,%r11 + mulq 24(%rsi) + movq %rax,%r12 + movl $121666,%eax + movq %rdx,%r13 + mulq 32(%rsi) + movq %rax,%r14 + movq %rdx,%r15 + + jmp .Lreduce51 +.Lfe51_mul121666_epilogue: +.cfi_endproc +.size x25519_fe51_mul121666,.-x25519_fe51_mul121666 + +.globl x25519_fe64_eligible +.type x25519_fe64_eligible,@function +.align 32 +x25519_fe64_eligible: + movl OPENSSL_ia32cap_P+8(%rip),%ecx + xorl %eax,%eax + andl $0x80100,%ecx + cmpl $0x80100,%ecx + cmovel %ecx,%eax + .byte 0xf3,0xc3 +.size x25519_fe64_eligible,.-x25519_fe64_eligible + +.globl x25519_fe64_mul +.type x25519_fe64_mul,@function +.align 32 +x25519_fe64_mul: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 +.cfi_offset %rdi,-64 + leaq -16(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_mul_body: + + movq %rdx,%rax + movq 0(%rdx),%rbp + movq 0(%rsi),%rdx + movq 8(%rax),%rcx + movq 16(%rax),%r14 + movq 24(%rax),%r15 + + mulxq %rbp,%r8,%rax + xorl %edi,%edi + mulxq %rcx,%r9,%rbx + adcxq %rax,%r9 + mulxq %r14,%r10,%rax + adcxq %rbx,%r10 + mulxq %r15,%r11,%r12 + movq 8(%rsi),%rdx + adcxq %rax,%r11 + movq %r14,(%rsp) + adcxq %rdi,%r12 + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r9 + adcxq %rbx,%r10 + mulxq %rcx,%rax,%rbx + adoxq %rax,%r10 + adcxq %rbx,%r11 + mulxq %r14,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %r15,%rax,%r13 + movq 16(%rsi),%rdx + adoxq %rax,%r12 + adcxq %rdi,%r13 + adoxq %rdi,%r13 + + mulxq %rbp,%rax,%rbx + adcxq %rax,%r10 + adoxq %rbx,%r11 + mulxq %rcx,%rax,%rbx + adcxq %rax,%r11 + adoxq %rbx,%r12 + mulxq %r14,%rax,%rbx + adcxq %rax,%r12 + adoxq %rbx,%r13 + mulxq %r15,%rax,%r14 + movq 24(%rsi),%rdx + adcxq %rax,%r13 + adoxq %rdi,%r14 + adcxq %rdi,%r14 + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %rcx,%rax,%rbx + adoxq %rax,%r12 + adcxq %rbx,%r13 + mulxq (%rsp),%rax,%rbx + adoxq %rax,%r13 + adcxq %rbx,%r14 + mulxq %r15,%rax,%r15 + movl $38,%edx + adoxq %rax,%r14 + adcxq %rdi,%r15 + adoxq %rdi,%r15 + + jmp .Lreduce64 +.Lfe64_mul_epilogue: +.cfi_endproc +.size x25519_fe64_mul,.-x25519_fe64_mul + +.globl x25519_fe64_sqr +.type x25519_fe64_sqr,@function +.align 32 +x25519_fe64_sqr: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %rdi,-64 + leaq -16(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_sqr_body: + + movq 0(%rsi),%rdx + movq 8(%rsi),%rcx + movq 16(%rsi),%rbp + movq 24(%rsi),%rsi + + + mulxq %rdx,%r8,%r15 + mulxq %rcx,%r9,%rax + xorl %edi,%edi + mulxq %rbp,%r10,%rbx + adcxq %rax,%r10 + mulxq %rsi,%r11,%r12 + movq %rcx,%rdx + adcxq %rbx,%r11 + adcxq %rdi,%r12 + + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %rsi,%rax,%r13 + movq %rbp,%rdx + adoxq %rax,%r12 + adcxq %rdi,%r13 + + + mulxq %rsi,%rax,%r14 + movq %rcx,%rdx + adoxq %rax,%r13 + adcxq %rdi,%r14 + adoxq %rdi,%r14 + + adcxq %r9,%r9 + adoxq %r15,%r9 + adcxq %r10,%r10 + mulxq %rdx,%rax,%rbx + movq %rbp,%rdx + adcxq %r11,%r11 + adoxq %rax,%r10 + adcxq %r12,%r12 + adoxq %rbx,%r11 + mulxq %rdx,%rax,%rbx + movq %rsi,%rdx + adcxq %r13,%r13 + adoxq %rax,%r12 + adcxq %r14,%r14 + adoxq %rbx,%r13 + mulxq %rdx,%rax,%r15 + movl $38,%edx + adoxq %rax,%r14 + adcxq %rdi,%r15 + adoxq %rdi,%r15 + jmp .Lreduce64 + +.align 32 +.Lreduce64: + mulxq %r12,%rax,%rbx + adcxq %rax,%r8 + adoxq %rbx,%r9 + mulxq %r13,%rax,%rbx + adcxq %rax,%r9 + adoxq %rbx,%r10 + mulxq %r14,%rax,%rbx + adcxq %rax,%r10 + adoxq %rbx,%r11 + mulxq %r15,%rax,%r12 + adcxq %rax,%r11 + adoxq %rdi,%r12 + adcxq %rdi,%r12 + + movq 16(%rsp),%rdi + imulq %rdx,%r12 + + addq %r12,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r8,0(%rdi) + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset 88 +.Lfe64_sqr_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_sqr,.-x25519_fe64_sqr + +.globl x25519_fe64_mul121666 +.type x25519_fe64_mul121666,@function +.align 32 +x25519_fe64_mul121666: +.Lfe64_mul121666_body: + movl $121666,%edx + mulxq 0(%rsi),%r8,%rcx + mulxq 8(%rsi),%r9,%rax + addq %rcx,%r9 + mulxq 16(%rsi),%r10,%rcx + adcq %rax,%r10 + mulxq 24(%rsi),%r11,%rax + adcq %rcx,%r11 + adcq $0,%rax + + imulq $38,%rax,%rax + + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r8,0(%rdi) + +.Lfe64_mul121666_epilogue: + .byte 0xf3,0xc3 +.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 + +.globl x25519_fe64_add +.type x25519_fe64_add,@function +.align 32 +x25519_fe64_add: +.Lfe64_add_body: + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + movq %r9,8(%rdi) + adcq $0,%r11 + movq %r10,16(%rdi) + sbbq %rax,%rax + movq %r11,24(%rdi) + andq $38,%rax + + addq %rax,%r8 + movq %r8,0(%rdi) + +.Lfe64_add_epilogue: + .byte 0xf3,0xc3 +.size x25519_fe64_add,.-x25519_fe64_add + +.globl x25519_fe64_sub +.type x25519_fe64_sub,@function +.align 32 +x25519_fe64_sub: +.Lfe64_sub_body: + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + sbbq 24(%rdx),%r11 + + sbbq %rax,%rax + andq $38,%rax + + subq %rax,%r8 + sbbq $0,%r9 + sbbq $0,%r10 + movq %r9,8(%rdi) + sbbq $0,%r11 + movq %r10,16(%rdi) + sbbq %rax,%rax + movq 
%r11,24(%rdi) + andq $38,%rax + + subq %rax,%r8 + movq %r8,0(%rdi) + +.Lfe64_sub_epilogue: + .byte 0xf3,0xc3 +.size x25519_fe64_sub,.-x25519_fe64_sub + +.globl x25519_fe64_tobytes +.type x25519_fe64_tobytes,@function +.align 32 +x25519_fe64_tobytes: +.Lfe64_to_body: + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + + leaq (%r11,%r11,1),%rax + sarq $63,%r11 + shrq $1,%rax + andq $19,%r11 + addq $19,%r11 + + addq %r11,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rax + + leaq (%rax,%rax,1),%r11 + sarq $63,%rax + shrq $1,%r11 + notq %rax + andq $19,%rax + + subq %rax,%r8 + sbbq $0,%r9 + sbbq $0,%r10 + sbbq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + +.Lfe64_to_epilogue: + .byte 0xf3,0xc3 +.size x25519_fe64_tobytes,.-x25519_fe64_tobytes +.byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/include/internal/dso_conf.h b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/include/internal/dso_conf.h index 7a52dd1f1a..18f6ac3bff 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/include/internal/dso_conf.h +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/include/internal/dso_conf.h @@ -1,7 +1,7 @@ /* WARNING: do not edit! */ /* Generated by Makefile from crypto/include/internal/dso_conf.h.in */ /* - * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the OpenSSL license (the "License"). You may not use * this file except in compliance with the License. 
You can obtain a copy @@ -12,5 +12,8 @@ #ifndef HEADER_DSO_CONF_H # define HEADER_DSO_CONF_H +# define DSO_DLFCN +# define HAVE_DLFCN_H # define DSO_EXTENSION ".so" + #endif diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/md5/md5-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/md5/md5-x86_64.s index 0defe666bb..348ebe4962 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/md5/md5-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/md5/md5-x86_64.s @@ -4,11 +4,22 @@ .globl md5_block_asm_data_order .type md5_block_asm_data_order,@function md5_block_asm_data_order: +.cfi_startproc pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-40 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-48 .Lprologue: @@ -655,11 +666,18 @@ md5_block_asm_data_order: movl %edx,12(%rbp) movq (%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r12 +.cfi_restore %r12 movq 24(%rsp),%rbx +.cfi_restore %rbx movq 32(%rsp),%rbp +.cfi_restore %rbp addq $40,%rsp +.cfi_adjust_cfa_offset -40 .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size md5_block_asm_data_order,.-md5_block_asm_data_order diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/modes/aesni-gcm-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/modes/aesni-gcm-x86_64.s index 21e49925f1..6a7a9577c7 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/modes/aesni-gcm-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/modes/aesni-gcm-x86_64.s @@ -31,23 +31,6 @@ _aesni_ctr32_ghash_6x: vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 - - - - - - - - - - - - - - - - - xorq %r12,%r12 cmpq %r14,%r15 @@ -332,20 +315,25 @@ _aesni_ctr32_ghash_6x: .type aesni_gcm_decrypt,@function .align 32 aesni_gcm_decrypt: +.cfi_startproc xorq %r10,%r10 - - - cmpq $0x60,%rdx jb .Lgcm_dec_abort leaq (%rsp),%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 vzeroupper vmovdqu (%r8),%xmm1 @@ -374,15 +362,7 @@ aesni_gcm_decrypt: vmovdqu 80(%rdi),%xmm7 leaq (%rdi),%r14 vmovdqu 64(%rdi),%xmm4 - - - - - - - leaq -192(%rdi,%rdx,1),%r15 - vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %r10,%r10 @@ -415,15 +395,23 @@ aesni_gcm_decrypt: vzeroupper movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lgcm_dec_abort: movq %r10,%rax .byte 0xf3,0xc3 +.cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt .type _aesni_ctr32_6x,@function .align 32 @@ -520,21 +508,25 @@ _aesni_ctr32_6x: .type aesni_gcm_encrypt,@function .align 32 aesni_gcm_encrypt: +.cfi_startproc xorq %r10,%r10 - - - - cmpq $288,%rdx jb .Lgcm_enc_abort leaq (%rsp),%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 vzeroupper vmovdqu (%r8),%xmm1 @@ -558,16 +550,7 @@ aesni_gcm_encrypt: .Lenc_no_key_aliasing: leaq (%rsi),%r14 - - - - - - 
- - leaq -192(%rsi,%rdx,1),%r15 - shrq $4,%rdx call _aesni_ctr32_6x @@ -769,15 +752,23 @@ aesni_gcm_encrypt: vzeroupper movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lgcm_enc_abort: movq %r10,%rax .byte 0xf3,0xc3 +.cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt .align 64 .Lbswap_mask: diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/modes/ghash-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/modes/ghash-x86_64.s index 0116ef1c94..29c297f04b 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/modes/ghash-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/modes/ghash-x86_64.s @@ -5,9 +5,27 @@ .type gcm_gmult_4bit,@function .align 16 gcm_gmult_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzbq 15(%rdi),%r8 @@ -84,22 +102,41 @@ gcm_gmult_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit .globl gcm_ghash_4bit .type gcm_ghash_4bit,@function .align 16 gcm_ghash_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -644,16 +681,25 @@ gcm_ghash_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit .globl gcm_init_clmul .type gcm_init_clmul,@function diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/poly1305/poly1305-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/poly1305/poly1305-x86_64.s index 8b2e361ea1..deb4f74bfb 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/poly1305/poly1305-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/poly1305/poly1305-x86_64.s @@ -31,6 +31,11 @@ poly1305_init: leaq poly1305_blocks_avx2(%rip),%rax btq $37,%r9 cmovcq %rax,%r10 + movq $2149646336,%rax + shrq $32,%r9 + andq %rax,%r9 + cmpq %rax,%r9 + je .Linit_base2_44 movq 
$0x0ffffffc0fffffff,%rax movq $0x0ffffffc0ffffffc,%rcx andq 0(%rsi),%rax @@ -47,16 +52,29 @@ poly1305_init: .type poly1305_blocks,@function .align 32 poly1305_blocks: +.cfi_startproc .Lblocks: shrq $4,%rdx jz .Lno_data pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 .Lblocks_body: movq %rdx,%r15 @@ -127,15 +145,23 @@ poly1305_blocks: movq %rbp,16(%rdi) movq 0(%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r13 +.cfi_restore %r13 movq 24(%rsp),%r12 +.cfi_restore %r12 movq 32(%rsp),%rbp +.cfi_restore %rbp movq 40(%rsp),%rbx +.cfi_restore %rbx leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data: .Lblocks_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,@function @@ -371,6 +397,7 @@ __poly1305_init_avx: .type poly1305_blocks_avx,@function .align 32 poly1305_blocks_avx: +.cfi_startproc movl 20(%rdi),%r8d cmpq $128,%rdx jae .Lblocks_avx @@ -390,11 +417,23 @@ poly1305_blocks_avx: jz .Leven_avx pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 .Lblocks_avx_body: movq %rdx,%r15 @@ -497,24 +536,45 @@ poly1305_blocks_avx: .align 16 .Ldone_avx: movq 0(%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r13 +.cfi_restore %r13 movq 24(%rsp),%r12 +.cfi_restore %r12 movq 32(%rsp),%rbp +.cfi_restore %rbp movq 40(%rsp),%rbx +.cfi_restore %rbx leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data_avx: .Lblocks_avx_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .align 32 .Lbase2_64_avx: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 .Lbase2_64_avx_body: movq %rdx,%r15 @@ -574,18 +634,27 @@ poly1305_blocks_avx: movq %r15,%rdx movq 0(%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r13 +.cfi_restore %r13 movq 24(%rsp),%r12 +.cfi_restore %r12 movq 32(%rsp),%rbp +.cfi_restore %rbp movq 40(%rsp),%rbx +.cfi_restore %rbx leaq 48(%rsp),%rax leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lbase2_64_avx_epilogue: jmp .Ldo_avx +.cfi_endproc .align 32 .Leven_avx: +.cfi_startproc vmovd 0(%rdi),%xmm0 vmovd 4(%rdi),%xmm1 vmovd 8(%rdi),%xmm2 @@ -594,6 +663,7 @@ poly1305_blocks_avx: .Ldo_avx: leaq -88(%rsp),%r11 +.cfi_def_cfa %r11,0x60 subq $0x178,%rsp subq $64,%rdx leaq -32(%rsi),%rax @@ -1153,8 +1223,10 @@ poly1305_blocks_avx: vmovd %xmm13,-100(%rdi) vmovd %xmm14,-96(%rdi) leaq 88(%r11),%rsp +.cfi_def_cfa %rsp,8 vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size poly1305_blocks_avx,.-poly1305_blocks_avx .type poly1305_emit_avx,@function @@ -1214,6 +1286,7 @@ poly1305_emit_avx: .type poly1305_blocks_avx2,@function .align 32 poly1305_blocks_avx2: +.cfi_startproc movl 
20(%rdi),%r8d cmpq $128,%rdx jae .Lblocks_avx2 @@ -1233,11 +1306,23 @@ poly1305_blocks_avx2: jz .Leven_avx2 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 .Lblocks_avx2_body: movq %rdx,%r15 @@ -1346,24 +1431,45 @@ poly1305_blocks_avx2: .align 16 .Ldone_avx2: movq 0(%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r13 +.cfi_restore %r13 movq 24(%rsp),%r12 +.cfi_restore %r12 movq 32(%rsp),%rbp +.cfi_restore %rbp movq 40(%rsp),%rbx +.cfi_restore %rbx leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data_avx2: .Lblocks_avx2_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .align 32 .Lbase2_64_avx2: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 .Lbase2_64_avx2_body: movq %rdx,%r15 @@ -1426,20 +1532,32 @@ poly1305_blocks_avx2: .Lproceed_avx2: movq %r15,%rdx + movl OPENSSL_ia32cap_P+8(%rip),%r10d + movl $3221291008,%r11d movq 0(%rsp),%r15 +.cfi_restore %r15 movq 8(%rsp),%r14 +.cfi_restore %r14 movq 16(%rsp),%r13 +.cfi_restore %r13 movq 24(%rsp),%r12 +.cfi_restore %r12 movq 32(%rsp),%rbp +.cfi_restore %rbp movq 40(%rsp),%rbx +.cfi_restore %rbx leaq 48(%rsp),%rax leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lbase2_64_avx2_epilogue: jmp .Ldo_avx2 +.cfi_endproc .align 32 .Leven_avx2: +.cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%r10d vmovd 0(%rdi),%xmm0 vmovd 4(%rdi),%xmm1 vmovd 8(%rdi),%xmm2 @@ -1447,10 +1565,18 @@ poly1305_blocks_avx2: vmovd 16(%rdi),%xmm4 .Ldo_avx2: + cmpq $512,%rdx + jb .Lskip_avx512 + andl %r11d,%r10d + testl $65536,%r10d + jnz .Lblocks_avx512 +.Lskip_avx512: leaq -8(%rsp),%r11 +.cfi_def_cfa %r11,16 subq $0x128,%rsp - leaq 48+64(%rdi),%rdi leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm7 vmovdqu -64(%rdi),%xmm9 @@ -1460,36 +1586,28 @@ poly1305_blocks_avx2: vmovdqu -16(%rdi),%xmm11 vmovdqu 0(%rdi),%xmm12 vmovdqu 16(%rdi),%xmm13 + leaq 144(%rsp),%rax vmovdqu 32(%rdi),%xmm14 - vpermq $0x15,%ymm9,%ymm9 + vpermd %ymm9,%ymm7,%ymm9 vmovdqu 48(%rdi),%xmm15 - vpermq $0x15,%ymm10,%ymm10 - vpshufd $0xc8,%ymm9,%ymm9 + vpermd %ymm10,%ymm7,%ymm10 vmovdqu 64(%rdi),%xmm5 - vpermq $0x15,%ymm6,%ymm6 - vpshufd $0xc8,%ymm10,%ymm10 + vpermd %ymm6,%ymm7,%ymm6 vmovdqa %ymm9,0(%rsp) - vpermq $0x15,%ymm11,%ymm11 - vpshufd $0xc8,%ymm6,%ymm6 - vmovdqa %ymm10,32(%rsp) - vpermq $0x15,%ymm12,%ymm12 - vpshufd $0xc8,%ymm11,%ymm11 - vmovdqa %ymm6,64(%rsp) - vpermq $0x15,%ymm13,%ymm13 - vpshufd $0xc8,%ymm12,%ymm12 - vmovdqa %ymm11,96(%rsp) - vpermq $0x15,%ymm14,%ymm14 - vpshufd $0xc8,%ymm13,%ymm13 - vmovdqa %ymm12,128(%rsp) - vpermq $0x15,%ymm15,%ymm15 - vpshufd $0xc8,%ymm14,%ymm14 - vmovdqa %ymm13,160(%rsp) - vpermq $0x15,%ymm5,%ymm5 - vpshufd $0xc8,%ymm15,%ymm15 - vmovdqa %ymm14,192(%rsp) - vpshufd $0xc8,%ymm5,%ymm5 - vmovdqa %ymm15,224(%rsp) - vmovdqa %ymm5,256(%rsp) + vpermd %ymm11,%ymm7,%ymm11 + vmovdqa %ymm10,32-144(%rax) + vpermd %ymm12,%ymm7,%ymm12 + vmovdqa %ymm6,64-144(%rax) + vpermd %ymm13,%ymm7,%ymm13 + vmovdqa 
%ymm11,96-144(%rax) + vpermd %ymm14,%ymm7,%ymm14 + vmovdqa %ymm12,128-144(%rax) + vpermd %ymm15,%ymm7,%ymm15 + vmovdqa %ymm13,160-144(%rax) + vpermd %ymm5,%ymm7,%ymm5 + vmovdqa %ymm14,192-144(%rax) + vmovdqa %ymm15,224-144(%rax) + vmovdqa %ymm5,256-144(%rax) vmovdqa 64(%rcx),%ymm5 @@ -1516,7 +1634,6 @@ poly1305_blocks_avx2: vpand %ymm5,%ymm10,%ymm10 vpor 32(%rcx),%ymm6,%ymm6 - leaq 144(%rsp),%rax vpaddq %ymm2,%ymm9,%ymm2 subq $64,%rdx jz .Ltail_avx2 @@ -1811,9 +1928,1506 @@ poly1305_blocks_avx2: vmovd %xmm3,-100(%rdi) vmovd %xmm4,-96(%rdi) leaq 8(%r11),%rsp +.cfi_def_cfa %rsp,8 vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size poly1305_blocks_avx2,.-poly1305_blocks_avx2 +.type poly1305_blocks_avx512,@function +.align 32 +poly1305_blocks_avx512: +.cfi_startproc +.Lblocks_avx512: + movl $15,%eax + kmovw %eax,%k2 + leaq -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm9 + + + vmovdqu -64(%rdi),%xmm11 + andq $-512,%rsp + vmovdqu -48(%rdi),%xmm12 + movq $0x20,%rax + vmovdqu -32(%rdi),%xmm7 + vmovdqu -16(%rdi),%xmm13 + vmovdqu 0(%rdi),%xmm8 + vmovdqu 16(%rdi),%xmm14 + vmovdqu 32(%rdi),%xmm10 + vmovdqu 48(%rdi),%xmm15 + vmovdqu 64(%rdi),%xmm6 + vpermd %zmm11,%zmm9,%zmm16 + vpbroadcastq 64(%rcx),%zmm5 + vpermd %zmm12,%zmm9,%zmm17 + vpermd %zmm7,%zmm9,%zmm21 + vpermd %zmm13,%zmm9,%zmm18 + vmovdqa64 %zmm16,0(%rsp){%k2} + vpsrlq $32,%zmm16,%zmm7 + vpermd %zmm8,%zmm9,%zmm22 + vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2} + vpsrlq $32,%zmm17,%zmm8 + vpermd %zmm14,%zmm9,%zmm19 + vmovdqa64 %zmm21,64(%rsp){%k2} + vpermd %zmm10,%zmm9,%zmm23 + vpermd %zmm15,%zmm9,%zmm20 + vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2} + vpermd %zmm6,%zmm9,%zmm24 + vmovdqa64 %zmm22,128(%rsp){%k2} + vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2} + vmovdqa64 %zmm23,192(%rsp){%k2} + vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2} + vmovdqa64 %zmm24,256(%rsp){%k2} + + + + + + + + + + + vpmuludq %zmm7,%zmm16,%zmm11 + vpmuludq %zmm7,%zmm17,%zmm12 + vpmuludq %zmm7,%zmm18,%zmm13 + vpmuludq %zmm7,%zmm19,%zmm14 + vpmuludq %zmm7,%zmm20,%zmm15 + vpsrlq $32,%zmm18,%zmm9 + + vpmuludq %zmm8,%zmm24,%zmm25 + vpmuludq %zmm8,%zmm16,%zmm26 + vpmuludq %zmm8,%zmm17,%zmm27 + vpmuludq %zmm8,%zmm18,%zmm28 + vpmuludq %zmm8,%zmm19,%zmm29 + vpsrlq $32,%zmm19,%zmm10 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + + vpmuludq %zmm9,%zmm23,%zmm25 + vpmuludq %zmm9,%zmm24,%zmm26 + vpmuludq %zmm9,%zmm17,%zmm28 + vpmuludq %zmm9,%zmm18,%zmm29 + vpmuludq %zmm9,%zmm16,%zmm27 + vpsrlq $32,%zmm20,%zmm6 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm10,%zmm22,%zmm25 + vpmuludq %zmm10,%zmm16,%zmm28 + vpmuludq %zmm10,%zmm17,%zmm29 + vpmuludq %zmm10,%zmm23,%zmm26 + vpmuludq %zmm10,%zmm24,%zmm27 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm6,%zmm24,%zmm28 + vpmuludq %zmm6,%zmm16,%zmm29 + vpmuludq %zmm6,%zmm21,%zmm25 + vpmuludq %zmm6,%zmm22,%zmm26 + vpmuludq %zmm6,%zmm23,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + + + vmovdqu64 0(%rsi),%zmm10 + vmovdqu64 64(%rsi),%zmm6 + leaq 128(%rsi),%rsi + + + + + vpsrlq $26,%zmm14,%zmm28 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq 
%zmm28,%zmm15,%zmm15 + + vpsrlq $26,%zmm11,%zmm25 + vpandq %zmm5,%zmm11,%zmm11 + vpaddq %zmm25,%zmm12,%zmm12 + + vpsrlq $26,%zmm15,%zmm29 + vpandq %zmm5,%zmm15,%zmm15 + + vpsrlq $26,%zmm12,%zmm26 + vpandq %zmm5,%zmm12,%zmm12 + vpaddq %zmm26,%zmm13,%zmm13 + + vpaddq %zmm29,%zmm11,%zmm11 + vpsllq $2,%zmm29,%zmm29 + vpaddq %zmm29,%zmm11,%zmm11 + + vpsrlq $26,%zmm13,%zmm27 + vpandq %zmm5,%zmm13,%zmm13 + vpaddq %zmm27,%zmm14,%zmm14 + + vpsrlq $26,%zmm11,%zmm25 + vpandq %zmm5,%zmm11,%zmm11 + vpaddq %zmm25,%zmm12,%zmm12 + + vpsrlq $26,%zmm14,%zmm28 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm28,%zmm15,%zmm15 + + + + + + vpunpcklqdq %zmm6,%zmm10,%zmm7 + vpunpckhqdq %zmm6,%zmm10,%zmm6 + + + + + + + vmovdqa32 128(%rcx),%zmm25 + movl $0x7777,%eax + kmovw %eax,%k1 + + vpermd %zmm16,%zmm25,%zmm16 + vpermd %zmm17,%zmm25,%zmm17 + vpermd %zmm18,%zmm25,%zmm18 + vpermd %zmm19,%zmm25,%zmm19 + vpermd %zmm20,%zmm25,%zmm20 + + vpermd %zmm11,%zmm25,%zmm16{%k1} + vpermd %zmm12,%zmm25,%zmm17{%k1} + vpermd %zmm13,%zmm25,%zmm18{%k1} + vpermd %zmm14,%zmm25,%zmm19{%k1} + vpermd %zmm15,%zmm25,%zmm20{%k1} + + vpslld $2,%zmm17,%zmm21 + vpslld $2,%zmm18,%zmm22 + vpslld $2,%zmm19,%zmm23 + vpslld $2,%zmm20,%zmm24 + vpaddd %zmm17,%zmm21,%zmm21 + vpaddd %zmm18,%zmm22,%zmm22 + vpaddd %zmm19,%zmm23,%zmm23 + vpaddd %zmm20,%zmm24,%zmm24 + + vpbroadcastq 32(%rcx),%zmm30 + + vpsrlq $52,%zmm7,%zmm9 + vpsllq $12,%zmm6,%zmm10 + vporq %zmm10,%zmm9,%zmm9 + vpsrlq $26,%zmm7,%zmm8 + vpsrlq $14,%zmm6,%zmm10 + vpsrlq $40,%zmm6,%zmm6 + vpandq %zmm5,%zmm9,%zmm9 + vpandq %zmm5,%zmm7,%zmm7 + + + + + vpaddq %zmm2,%zmm9,%zmm2 + subq $192,%rdx + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + vpmuludq %zmm2,%zmm17,%zmm14 + vpaddq %zmm0,%zmm7,%zmm0 + vpmuludq %zmm2,%zmm18,%zmm15 + vpandq %zmm5,%zmm8,%zmm8 + vpmuludq %zmm2,%zmm23,%zmm11 + vpandq %zmm5,%zmm10,%zmm10 + vpmuludq %zmm2,%zmm24,%zmm12 + vporq %zmm30,%zmm6,%zmm6 + vpmuludq %zmm2,%zmm16,%zmm13 + vpaddq %zmm1,%zmm8,%zmm1 + vpaddq %zmm3,%zmm10,%zmm3 + vpaddq %zmm4,%zmm6,%zmm4 + + vmovdqu64 0(%rsi),%zmm10 + vmovdqu64 64(%rsi),%zmm6 + leaq 128(%rsi),%rsi + vpmuludq %zmm0,%zmm19,%zmm28 + vpmuludq %zmm0,%zmm20,%zmm29 + vpmuludq %zmm0,%zmm16,%zmm25 + vpmuludq %zmm0,%zmm17,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + + vpmuludq %zmm1,%zmm18,%zmm28 + vpmuludq %zmm1,%zmm19,%zmm29 + vpmuludq %zmm1,%zmm24,%zmm25 + vpmuludq %zmm0,%zmm18,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm27,%zmm13,%zmm13 + + vpunpcklqdq %zmm6,%zmm10,%zmm7 + vpunpckhqdq %zmm6,%zmm10,%zmm6 + + vpmuludq %zmm3,%zmm16,%zmm28 + vpmuludq %zmm3,%zmm17,%zmm29 + vpmuludq %zmm1,%zmm16,%zmm26 + vpmuludq %zmm1,%zmm17,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm24,%zmm28 + vpmuludq %zmm4,%zmm16,%zmm29 + vpmuludq %zmm3,%zmm22,%zmm25 + vpmuludq %zmm3,%zmm23,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpmuludq %zmm3,%zmm24,%zmm27 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm21,%zmm25 + vpmuludq %zmm4,%zmm22,%zmm26 + vpmuludq %zmm4,%zmm23,%zmm27 + vpaddq %zmm25,%zmm11,%zmm0 + vpaddq %zmm26,%zmm12,%zmm1 + vpaddq %zmm27,%zmm13,%zmm2 + + + + + vpsrlq $52,%zmm7,%zmm9 + vpsllq $12,%zmm6,%zmm10 + + vpsrlq $26,%zmm14,%zmm3 + vpandq 
%zmm5,%zmm14,%zmm14 + vpaddq %zmm3,%zmm15,%zmm4 + + vporq %zmm10,%zmm9,%zmm9 + + vpsrlq $26,%zmm0,%zmm11 + vpandq %zmm5,%zmm0,%zmm0 + vpaddq %zmm11,%zmm1,%zmm1 + + vpandq %zmm5,%zmm9,%zmm9 + + vpsrlq $26,%zmm4,%zmm15 + vpandq %zmm5,%zmm4,%zmm4 + + vpsrlq $26,%zmm1,%zmm12 + vpandq %zmm5,%zmm1,%zmm1 + vpaddq %zmm12,%zmm2,%zmm2 + + vpaddq %zmm15,%zmm0,%zmm0 + vpsllq $2,%zmm15,%zmm15 + vpaddq %zmm15,%zmm0,%zmm0 + + vpaddq %zmm9,%zmm2,%zmm2 + vpsrlq $26,%zmm7,%zmm8 + + vpsrlq $26,%zmm2,%zmm13 + vpandq %zmm5,%zmm2,%zmm2 + vpaddq %zmm13,%zmm14,%zmm3 + + vpsrlq $14,%zmm6,%zmm10 + + vpsrlq $26,%zmm0,%zmm11 + vpandq %zmm5,%zmm0,%zmm0 + vpaddq %zmm11,%zmm1,%zmm1 + + vpsrlq $40,%zmm6,%zmm6 + + vpsrlq $26,%zmm3,%zmm14 + vpandq %zmm5,%zmm3,%zmm3 + vpaddq %zmm14,%zmm4,%zmm4 + + vpandq %zmm5,%zmm7,%zmm7 + + + + + subq $128,%rdx + ja .Loop_avx512 + +.Ltail_avx512: + + + + + + vpsrlq $32,%zmm16,%zmm16 + vpsrlq $32,%zmm17,%zmm17 + vpsrlq $32,%zmm18,%zmm18 + vpsrlq $32,%zmm23,%zmm23 + vpsrlq $32,%zmm24,%zmm24 + vpsrlq $32,%zmm19,%zmm19 + vpsrlq $32,%zmm20,%zmm20 + vpsrlq $32,%zmm21,%zmm21 + vpsrlq $32,%zmm22,%zmm22 + + + + leaq (%rsi,%rdx,1),%rsi + + + vpaddq %zmm0,%zmm7,%zmm0 + + vpmuludq %zmm2,%zmm17,%zmm14 + vpmuludq %zmm2,%zmm18,%zmm15 + vpmuludq %zmm2,%zmm23,%zmm11 + vpandq %zmm5,%zmm8,%zmm8 + vpmuludq %zmm2,%zmm24,%zmm12 + vpandq %zmm5,%zmm10,%zmm10 + vpmuludq %zmm2,%zmm16,%zmm13 + vporq %zmm30,%zmm6,%zmm6 + vpaddq %zmm1,%zmm8,%zmm1 + vpaddq %zmm3,%zmm10,%zmm3 + vpaddq %zmm4,%zmm6,%zmm4 + + vmovdqu 0(%rsi),%xmm7 + vpmuludq %zmm0,%zmm19,%zmm28 + vpmuludq %zmm0,%zmm20,%zmm29 + vpmuludq %zmm0,%zmm16,%zmm25 + vpmuludq %zmm0,%zmm17,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + + vmovdqu 16(%rsi),%xmm8 + vpmuludq %zmm1,%zmm18,%zmm28 + vpmuludq %zmm1,%zmm19,%zmm29 + vpmuludq %zmm1,%zmm24,%zmm25 + vpmuludq %zmm0,%zmm18,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm27,%zmm13,%zmm13 + + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vpmuludq %zmm3,%zmm16,%zmm28 + vpmuludq %zmm3,%zmm17,%zmm29 + vpmuludq %zmm1,%zmm16,%zmm26 + vpmuludq %zmm1,%zmm17,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + vpmuludq %zmm4,%zmm24,%zmm28 + vpmuludq %zmm4,%zmm16,%zmm29 + vpmuludq %zmm3,%zmm22,%zmm25 + vpmuludq %zmm3,%zmm23,%zmm26 + vpmuludq %zmm3,%zmm24,%zmm27 + vpaddq %zmm28,%zmm14,%zmm3 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm21,%zmm25 + vpmuludq %zmm4,%zmm22,%zmm26 + vpmuludq %zmm4,%zmm23,%zmm27 + vpaddq %zmm25,%zmm11,%zmm0 + vpaddq %zmm26,%zmm12,%zmm1 + vpaddq %zmm27,%zmm13,%zmm2 + + + + + movl $1,%eax + vpermq $0xb1,%zmm3,%zmm14 + vpermq $0xb1,%zmm15,%zmm4 + vpermq $0xb1,%zmm0,%zmm11 + vpermq $0xb1,%zmm1,%zmm12 + vpermq $0xb1,%zmm2,%zmm13 + vpaddq %zmm14,%zmm3,%zmm3 + vpaddq %zmm15,%zmm4,%zmm4 + vpaddq %zmm11,%zmm0,%zmm0 + vpaddq %zmm12,%zmm1,%zmm1 + vpaddq %zmm13,%zmm2,%zmm2 + + kmovw %eax,%k3 + vpermq $0x2,%zmm3,%zmm14 + vpermq $0x2,%zmm4,%zmm15 + vpermq $0x2,%zmm0,%zmm11 + vpermq $0x2,%zmm1,%zmm12 + vpermq $0x2,%zmm2,%zmm13 + vpaddq %zmm14,%zmm3,%zmm3 + vpaddq %zmm15,%zmm4,%zmm4 + vpaddq %zmm11,%zmm0,%zmm0 + vpaddq %zmm12,%zmm1,%zmm1 + vpaddq %zmm13,%zmm2,%zmm2 + + vextracti64x4 $0x1,%zmm3,%ymm14 + vextracti64x4 $0x1,%zmm4,%ymm15 + vextracti64x4 $0x1,%zmm0,%ymm11 + 
vextracti64x4 $0x1,%zmm1,%ymm12 + vextracti64x4 $0x1,%zmm2,%ymm13 + vpaddq %zmm14,%zmm3,%zmm3{%k3}{z} + vpaddq %zmm15,%zmm4,%zmm4{%k3}{z} + vpaddq %zmm11,%zmm0,%zmm0{%k3}{z} + vpaddq %zmm12,%zmm1,%zmm1{%k3}{z} + vpaddq %zmm13,%zmm2,%zmm2{%k3}{z} + + + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm2,%ymm9,%ymm2 + vpand %ymm5,%ymm8,%ymm8 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + vpaddq %ymm14,%ymm4,%ymm4 + + leaq 144(%rsp),%rax + addq $64,%rdx + jnz .Ltail_avx2 + + vpsubq %ymm9,%ymm2,%ymm2 + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + vzeroall + leaq 8(%r11),%rsp +.cfi_def_cfa %rsp,8 + .byte 0xf3,0xc3 +.cfi_endproc +.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 +.type poly1305_init_base2_44,@function +.align 32 +poly1305_init_base2_44: + xorq %rax,%rax + movq %rax,0(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + +.Linit_base2_44: + leaq poly1305_blocks_vpmadd52(%rip),%r10 + leaq poly1305_emit_base2_44(%rip),%r11 + + movq $0x0ffffffc0fffffff,%rax + movq $0x0ffffffc0ffffffc,%rcx + andq 0(%rsi),%rax + movq $0x00000fffffffffff,%r8 + andq 8(%rsi),%rcx + movq $0x00000fffffffffff,%r9 + andq %rax,%r8 + shrdq $44,%rcx,%rax + movq %r8,40(%rdi) + andq %r9,%rax + shrq $24,%rcx + movq %rax,48(%rdi) + leaq (%rax,%rax,4),%rax + movq %rcx,56(%rdi) + shlq $2,%rax + leaq (%rcx,%rcx,4),%rcx + shlq $2,%rcx + movq %rax,24(%rdi) + movq %rcx,32(%rdi) + movq $-1,64(%rdi) + movq %r10,0(%rdx) + movq %r11,8(%rdx) + movl $1,%eax + .byte 0xf3,0xc3 +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +.type poly1305_blocks_vpmadd52,@function +.align 32 +poly1305_blocks_vpmadd52: + shrq $4,%rdx + jz .Lno_data_vpmadd52 + + shlq $40,%rcx + movq 64(%rdi),%r8 + + + + + + + movq $3,%rax + movq $1,%r10 + cmpq $4,%rdx + cmovaeq %r10,%rax + testq %r8,%r8 + cmovnsq %r10,%rax + + andq %rdx,%rax + jz .Lblocks_vpmadd52_4x + + subq %rax,%rdx + movl $7,%r10d + movl $1,%r11d + kmovw %r10d,%k7 + leaq .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq %rcx,%xmm21 + vmovdqa64 0(%r10),%ymm19 + vmovdqa64 32(%r10),%ymm20 + vpermq $0xcf,%ymm21,%ymm21 + vmovdqa64 64(%r10),%ymm22 + + vmovdqu64 0(%rdi),%ymm16{%k7}{z} + vmovdqu64 40(%rdi),%ymm3{%k7}{z} + vmovdqu64 32(%rdi),%ymm4{%k7}{z} + vmovdqu64 24(%rdi),%ymm5{%k7}{z} + + vmovdqa64 96(%r10),%ymm23 + vmovdqa64 128(%r10),%ymm24 + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0(%rsi),%xmm18 + leaq 16(%rsi),%rsi + + vpermd %ymm18,%ymm19,%ymm18 + vpsrlvq %ymm20,%ymm18,%ymm18 + vpandq %ymm22,%ymm18,%ymm18 + vporq %ymm21,%ymm18,%ymm18 + + vpaddq %ymm18,%ymm16,%ymm16 + + vpermq $0,%ymm16,%ymm0{%k7}{z} + vpermq 
$85,%ymm16,%ymm1{%k7}{z} + vpermq $170,%ymm16,%ymm2{%k7}{z} + + vpxord %ymm16,%ymm16,%ymm16 + vpxord %ymm17,%ymm17,%ymm17 + + vpmadd52luq %ymm3,%ymm0,%ymm16 + vpmadd52huq %ymm3,%ymm0,%ymm17 + + vpmadd52luq %ymm4,%ymm1,%ymm16 + vpmadd52huq %ymm4,%ymm1,%ymm17 + + vpmadd52luq %ymm5,%ymm2,%ymm16 + vpmadd52huq %ymm5,%ymm2,%ymm17 + + vpsrlvq %ymm23,%ymm16,%ymm18 + vpsllvq %ymm24,%ymm17,%ymm17 + vpandq %ymm22,%ymm16,%ymm16 + + vpaddq %ymm18,%ymm17,%ymm17 + + vpermq $147,%ymm17,%ymm17 + + vpaddq %ymm17,%ymm16,%ymm16 + + vpsrlvq %ymm23,%ymm16,%ymm18 + vpandq %ymm22,%ymm16,%ymm16 + + vpermq $147,%ymm18,%ymm18 + + vpaddq %ymm18,%ymm16,%ymm16 + + vpermq $147,%ymm16,%ymm18{%k1}{z} + + vpaddq %ymm18,%ymm16,%ymm16 + vpsllq $2,%ymm18,%ymm18 + + vpaddq %ymm18,%ymm16,%ymm16 + + decq %rax + jnz .Loop_vpmadd52 + + vmovdqu64 %ymm16,0(%rdi){%k7} + + testq %rdx,%rdx + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + .byte 0xf3,0xc3 +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +.type poly1305_blocks_vpmadd52_4x,@function +.align 32 +poly1305_blocks_vpmadd52_4x: + shrq $4,%rdx + jz .Lno_data_vpmadd52_4x + + shlq $40,%rcx + movq 64(%rdi),%r8 + +.Lblocks_vpmadd52_4x: + vpbroadcastq %rcx,%ymm31 + + vmovdqa64 .Lx_mask44(%rip),%ymm28 + movl $5,%eax + vmovdqa64 .Lx_mask42(%rip),%ymm29 + kmovw %eax,%k1 + + testq %r8,%r8 + js .Linit_vpmadd52 + + vmovq 0(%rdi),%xmm0 + vmovq 8(%rdi),%xmm1 + vmovq 16(%rdi),%xmm2 + + testq $3,%rdx + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64(%rdi),%ymm3 + vpbroadcastq 96(%rdi),%ymm4 + vpbroadcastq 128(%rdi),%ymm5 + vpbroadcastq 160(%rdi),%ymm16 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq $2,%ymm5,%ymm17 + vpaddq %ymm5,%ymm17,%ymm17 + vpsllq $2,%ymm17,%ymm17 + + testq $7,%rdx + jz .Lblocks_vpmadd52_8x + + vmovdqu64 0(%rsi),%ymm26 + vmovdqu64 32(%rsi),%ymm27 + leaq 64(%rsi),%rsi + + vpunpcklqdq %ymm27,%ymm26,%ymm25 + vpunpckhqdq %ymm27,%ymm26,%ymm27 + + + + vpsrlq $24,%ymm27,%ymm26 + vporq %ymm31,%ymm26,%ymm26 + vpaddq %ymm26,%ymm2,%ymm2 + vpandq %ymm28,%ymm25,%ymm24 + vpsrlq $44,%ymm25,%ymm25 + vpsllq $20,%ymm27,%ymm27 + vporq %ymm27,%ymm25,%ymm25 + vpandq %ymm28,%ymm25,%ymm25 + + subq $4,%rdx + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24(%rdi),%xmm16 + vmovq 56(%rdi),%xmm2 + vmovq 32(%rdi),%xmm17 + vmovq 40(%rdi),%xmm3 + vmovq 48(%rdi),%xmm4 + + vmovdqa %ymm3,%ymm0 + vmovdqa %ymm4,%ymm1 + vmovdqa %ymm2,%ymm5 + + movl $2,%eax + +.Lmul_init_vpmadd52: + vpxorq %ymm18,%ymm18,%ymm18 + vpmadd52luq %ymm2,%ymm16,%ymm18 + vpxorq %ymm19,%ymm19,%ymm19 + vpmadd52huq %ymm2,%ymm16,%ymm19 + vpxorq %ymm20,%ymm20,%ymm20 + vpmadd52luq %ymm2,%ymm17,%ymm20 + vpxorq %ymm21,%ymm21,%ymm21 + vpmadd52huq %ymm2,%ymm17,%ymm21 + vpxorq %ymm22,%ymm22,%ymm22 + vpmadd52luq %ymm2,%ymm3,%ymm22 + vpxorq %ymm23,%ymm23,%ymm23 + vpmadd52huq %ymm2,%ymm3,%ymm23 + + vpmadd52luq %ymm0,%ymm3,%ymm18 + vpmadd52huq %ymm0,%ymm3,%ymm19 + vpmadd52luq %ymm0,%ymm4,%ymm20 + vpmadd52huq %ymm0,%ymm4,%ymm21 + vpmadd52luq %ymm0,%ymm5,%ymm22 + vpmadd52huq %ymm0,%ymm5,%ymm23 + + vpmadd52luq %ymm1,%ymm17,%ymm18 + vpmadd52huq %ymm1,%ymm17,%ymm19 + vpmadd52luq %ymm1,%ymm3,%ymm20 + vpmadd52huq %ymm1,%ymm3,%ymm21 + vpmadd52luq %ymm1,%ymm4,%ymm22 + vpmadd52huq %ymm1,%ymm4,%ymm23 + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm0 + vpaddq %ymm30,%ymm19,%ymm19 + + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm1 + vpaddq %ymm30,%ymm21,%ymm21 + + vpaddq 
%ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm2 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + + vpsrlq $44,%ymm0,%ymm30 + vpandq %ymm28,%ymm0,%ymm0 + + vpaddq %ymm30,%ymm1,%ymm1 + + decl %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq %ymm4,%ymm1,%ymm4 + vpbroadcastq %xmm1,%xmm1 + vpunpcklqdq %ymm5,%ymm2,%ymm5 + vpbroadcastq %xmm2,%xmm2 + vpunpcklqdq %ymm3,%ymm0,%ymm3 + vpbroadcastq %xmm0,%xmm0 + + vpsllq $2,%ymm4,%ymm16 + vpsllq $2,%ymm5,%ymm17 + vpaddq %ymm4,%ymm16,%ymm16 + vpaddq %ymm5,%ymm17,%ymm17 + vpsllq $2,%ymm16,%ymm16 + vpsllq $2,%ymm17,%ymm17 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 $1,%xmm4,%ymm1,%ymm4 + vinserti128 $1,%xmm5,%ymm2,%ymm5 + vinserti128 $1,%xmm3,%ymm0,%ymm3 + + vpermq $216,%ymm4,%ymm4 + vpermq $216,%ymm5,%ymm5 + vpermq $216,%ymm3,%ymm3 + + vpsllq $2,%ymm4,%ymm16 + vpaddq %ymm4,%ymm16,%ymm16 + vpsllq $2,%ymm16,%ymm16 + + vmovq 0(%rdi),%xmm0 + vmovq 8(%rdi),%xmm1 + vmovq 16(%rdi),%xmm2 + + testq $3,%rdx + jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 %ymm3,64(%rdi) + vpbroadcastq %xmm3,%ymm3 + vmovdqu64 %ymm4,96(%rdi) + vpbroadcastq %xmm4,%ymm4 + vmovdqu64 %ymm5,128(%rdi) + vpbroadcastq %xmm5,%ymm5 + vmovdqu64 %ymm16,160(%rdi) + vpbroadcastq %xmm16,%ymm16 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 %ymm3,64(%rdi) + vpsrldq $8,%ymm3,%ymm3 + vmovdqu64 %ymm4,96(%rdi) + vpsrldq $8,%ymm4,%ymm4 + vmovdqu64 %ymm5,128(%rdi) + vpsrldq $8,%ymm5,%ymm5 + vmovdqu64 %ymm16,160(%rdi) + vpsrldq $8,%ymm16,%ymm16 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8(%rdi),%ymm5{%k1}{z} + vmovdqu64 160+8(%rdi),%ymm16{%k1}{z} + vmovdqu64 64+8(%rdi),%ymm3{%k1}{z} + vmovdqu64 96+8(%rdi),%ymm4{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 0(%rsi),%ymm26 + vpxorq %ymm27,%ymm27,%ymm27 + leaq 32(%rsi),%rsi + + vpunpcklqdq %ymm27,%ymm26,%ymm25 + vpunpckhqdq %ymm27,%ymm26,%ymm27 + + + + vpsrlq $24,%ymm27,%ymm26 + vporq %ymm31,%ymm26,%ymm26 + vpaddq %ymm26,%ymm2,%ymm2 + vpandq %ymm28,%ymm25,%ymm24 + vpsrlq $44,%ymm25,%ymm25 + vpsllq $20,%ymm27,%ymm27 + vporq %ymm27,%ymm25,%ymm25 + vpandq %ymm28,%ymm25,%ymm25 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + + vpaddq %ymm24,%ymm0,%ymm0 + vpaddq %ymm25,%ymm1,%ymm1 + + vpxorq %ymm18,%ymm18,%ymm18 + vpmadd52luq %ymm2,%ymm16,%ymm18 + vpxorq %ymm19,%ymm19,%ymm19 + vpmadd52huq %ymm2,%ymm16,%ymm19 + vpxorq %ymm20,%ymm20,%ymm20 + vpmadd52luq %ymm2,%ymm17,%ymm20 + vpxorq %ymm21,%ymm21,%ymm21 + vpmadd52huq %ymm2,%ymm17,%ymm21 + vpxorq %ymm22,%ymm22,%ymm22 + vpmadd52luq %ymm2,%ymm3,%ymm22 + vpxorq %ymm23,%ymm23,%ymm23 + vpmadd52huq %ymm2,%ymm3,%ymm23 + + vmovdqu64 0(%rsi),%ymm26 + vmovdqu64 32(%rsi),%ymm27 + leaq 64(%rsi),%rsi + vpmadd52luq %ymm0,%ymm3,%ymm18 + vpmadd52huq %ymm0,%ymm3,%ymm19 + vpmadd52luq %ymm0,%ymm4,%ymm20 + vpmadd52huq %ymm0,%ymm4,%ymm21 + vpmadd52luq %ymm0,%ymm5,%ymm22 + vpmadd52huq %ymm0,%ymm5,%ymm23 + + vpunpcklqdq %ymm27,%ymm26,%ymm25 + vpunpckhqdq %ymm27,%ymm26,%ymm27 + vpmadd52luq %ymm1,%ymm17,%ymm18 + vpmadd52huq %ymm1,%ymm17,%ymm19 + vpmadd52luq %ymm1,%ymm3,%ymm20 + vpmadd52huq %ymm1,%ymm3,%ymm21 + vpmadd52luq %ymm1,%ymm4,%ymm22 + vpmadd52huq %ymm1,%ymm4,%ymm23 + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm0 + vpaddq %ymm30,%ymm19,%ymm19 + + vpsrlq $24,%ymm27,%ymm26 + vporq %ymm31,%ymm26,%ymm26 + 
vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm1 + vpaddq %ymm30,%ymm21,%ymm21 + + vpandq %ymm28,%ymm25,%ymm24 + vpsrlq $44,%ymm25,%ymm25 + vpsllq $20,%ymm27,%ymm27 + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm2 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm26,%ymm2,%ymm2 + vpaddq %ymm23,%ymm0,%ymm0 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + vporq %ymm27,%ymm25,%ymm25 + vpandq %ymm28,%ymm25,%ymm25 + + vpsrlq $44,%ymm0,%ymm30 + vpandq %ymm28,%ymm0,%ymm0 + + vpaddq %ymm30,%ymm1,%ymm1 + + subq $4,%rdx + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128(%rdi),%ymm5 + vmovdqu64 160(%rdi),%ymm16 + vmovdqu64 64(%rdi),%ymm3 + vmovdqu64 96(%rdi),%ymm4 + +.Ltail_vpmadd52_2x: + vpsllq $2,%ymm5,%ymm17 + vpaddq %ymm5,%ymm17,%ymm17 + vpsllq $2,%ymm17,%ymm17 + + + vpaddq %ymm24,%ymm0,%ymm0 + vpaddq %ymm25,%ymm1,%ymm1 + + vpxorq %ymm18,%ymm18,%ymm18 + vpmadd52luq %ymm2,%ymm16,%ymm18 + vpxorq %ymm19,%ymm19,%ymm19 + vpmadd52huq %ymm2,%ymm16,%ymm19 + vpxorq %ymm20,%ymm20,%ymm20 + vpmadd52luq %ymm2,%ymm17,%ymm20 + vpxorq %ymm21,%ymm21,%ymm21 + vpmadd52huq %ymm2,%ymm17,%ymm21 + vpxorq %ymm22,%ymm22,%ymm22 + vpmadd52luq %ymm2,%ymm3,%ymm22 + vpxorq %ymm23,%ymm23,%ymm23 + vpmadd52huq %ymm2,%ymm3,%ymm23 + + vpmadd52luq %ymm0,%ymm3,%ymm18 + vpmadd52huq %ymm0,%ymm3,%ymm19 + vpmadd52luq %ymm0,%ymm4,%ymm20 + vpmadd52huq %ymm0,%ymm4,%ymm21 + vpmadd52luq %ymm0,%ymm5,%ymm22 + vpmadd52huq %ymm0,%ymm5,%ymm23 + + vpmadd52luq %ymm1,%ymm17,%ymm18 + vpmadd52huq %ymm1,%ymm17,%ymm19 + vpmadd52luq %ymm1,%ymm3,%ymm20 + vpmadd52huq %ymm1,%ymm3,%ymm21 + vpmadd52luq %ymm1,%ymm4,%ymm22 + vpmadd52huq %ymm1,%ymm4,%ymm23 + + + + + movl $1,%eax + kmovw %eax,%k1 + vpsrldq $8,%ymm18,%ymm24 + vpsrldq $8,%ymm19,%ymm0 + vpsrldq $8,%ymm20,%ymm25 + vpsrldq $8,%ymm21,%ymm1 + vpaddq %ymm24,%ymm18,%ymm18 + vpaddq %ymm0,%ymm19,%ymm19 + vpsrldq $8,%ymm22,%ymm26 + vpsrldq $8,%ymm23,%ymm2 + vpaddq %ymm25,%ymm20,%ymm20 + vpaddq %ymm1,%ymm21,%ymm21 + vpermq $0x2,%ymm18,%ymm24 + vpermq $0x2,%ymm19,%ymm0 + vpaddq %ymm26,%ymm22,%ymm22 + vpaddq %ymm2,%ymm23,%ymm23 + + vpermq $0x2,%ymm20,%ymm25 + vpermq $0x2,%ymm21,%ymm1 + vpaddq %ymm24,%ymm18,%ymm18{%k1}{z} + vpaddq %ymm0,%ymm19,%ymm19{%k1}{z} + vpermq $0x2,%ymm22,%ymm26 + vpermq $0x2,%ymm23,%ymm2 + vpaddq %ymm25,%ymm20,%ymm20{%k1}{z} + vpaddq %ymm1,%ymm21,%ymm21{%k1}{z} + vpaddq %ymm26,%ymm22,%ymm22{%k1}{z} + vpaddq %ymm2,%ymm23,%ymm23{%k1}{z} + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm0 + vpaddq %ymm30,%ymm19,%ymm19 + + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm1 + vpaddq %ymm30,%ymm21,%ymm21 + + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm2 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + + vpsrlq $44,%ymm0,%ymm30 + vpandq %ymm28,%ymm0,%ymm0 + + vpaddq %ymm30,%ymm1,%ymm1 + + + subq $2,%rdx + ja .Lblocks_vpmadd52_4x_do + + vmovq %xmm0,0(%rdi) + vmovq %xmm1,8(%rdi) + vmovq %xmm2,16(%rdi) + vzeroall + +.Lno_data_vpmadd52_4x: + .byte 0xf3,0xc3 +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +.type poly1305_blocks_vpmadd52_8x,@function +.align 32 +poly1305_blocks_vpmadd52_8x: + shrq $4,%rdx + jz .Lno_data_vpmadd52_8x + + shlq $40,%rcx + movq 64(%rdi),%r8 + + vmovdqa64 .Lx_mask44(%rip),%ymm28 + 
vmovdqa64 .Lx_mask42(%rip),%ymm29 + + testq %r8,%r8 + js .Linit_vpmadd52 + + vmovq 0(%rdi),%xmm0 + vmovq 8(%rdi),%xmm1 + vmovq 16(%rdi),%xmm2 + +.Lblocks_vpmadd52_8x: + + + + vmovdqu64 128(%rdi),%ymm5 + vmovdqu64 160(%rdi),%ymm16 + vmovdqu64 64(%rdi),%ymm3 + vmovdqu64 96(%rdi),%ymm4 + + vpsllq $2,%ymm5,%ymm17 + vpaddq %ymm5,%ymm17,%ymm17 + vpsllq $2,%ymm17,%ymm17 + + vpbroadcastq %xmm5,%ymm8 + vpbroadcastq %xmm3,%ymm6 + vpbroadcastq %xmm4,%ymm7 + + vpxorq %ymm18,%ymm18,%ymm18 + vpmadd52luq %ymm8,%ymm16,%ymm18 + vpxorq %ymm19,%ymm19,%ymm19 + vpmadd52huq %ymm8,%ymm16,%ymm19 + vpxorq %ymm20,%ymm20,%ymm20 + vpmadd52luq %ymm8,%ymm17,%ymm20 + vpxorq %ymm21,%ymm21,%ymm21 + vpmadd52huq %ymm8,%ymm17,%ymm21 + vpxorq %ymm22,%ymm22,%ymm22 + vpmadd52luq %ymm8,%ymm3,%ymm22 + vpxorq %ymm23,%ymm23,%ymm23 + vpmadd52huq %ymm8,%ymm3,%ymm23 + + vpmadd52luq %ymm6,%ymm3,%ymm18 + vpmadd52huq %ymm6,%ymm3,%ymm19 + vpmadd52luq %ymm6,%ymm4,%ymm20 + vpmadd52huq %ymm6,%ymm4,%ymm21 + vpmadd52luq %ymm6,%ymm5,%ymm22 + vpmadd52huq %ymm6,%ymm5,%ymm23 + + vpmadd52luq %ymm7,%ymm17,%ymm18 + vpmadd52huq %ymm7,%ymm17,%ymm19 + vpmadd52luq %ymm7,%ymm3,%ymm20 + vpmadd52huq %ymm7,%ymm3,%ymm21 + vpmadd52luq %ymm7,%ymm4,%ymm22 + vpmadd52huq %ymm7,%ymm4,%ymm23 + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm6 + vpaddq %ymm30,%ymm19,%ymm19 + + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm7 + vpaddq %ymm30,%ymm21,%ymm21 + + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm8 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm6,%ymm6 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm6,%ymm6 + + vpsrlq $44,%ymm6,%ymm30 + vpandq %ymm28,%ymm6,%ymm6 + + vpaddq %ymm30,%ymm7,%ymm7 + + + + + + vpunpcklqdq %ymm5,%ymm8,%ymm26 + vpunpckhqdq %ymm5,%ymm8,%ymm5 + vpunpcklqdq %ymm3,%ymm6,%ymm24 + vpunpckhqdq %ymm3,%ymm6,%ymm3 + vpunpcklqdq %ymm4,%ymm7,%ymm25 + vpunpckhqdq %ymm4,%ymm7,%ymm4 + vshufi64x2 $0x44,%zmm5,%zmm26,%zmm8 + vshufi64x2 $0x44,%zmm3,%zmm24,%zmm6 + vshufi64x2 $0x44,%zmm4,%zmm25,%zmm7 + + vmovdqu64 0(%rsi),%zmm26 + vmovdqu64 64(%rsi),%zmm27 + leaq 128(%rsi),%rsi + + vpsllq $2,%zmm8,%zmm10 + vpsllq $2,%zmm7,%zmm9 + vpaddq %zmm8,%zmm10,%zmm10 + vpaddq %zmm7,%zmm9,%zmm9 + vpsllq $2,%zmm10,%zmm10 + vpsllq $2,%zmm9,%zmm9 + + vpbroadcastq %rcx,%zmm31 + vpbroadcastq %xmm28,%zmm28 + vpbroadcastq %xmm29,%zmm29 + + vpbroadcastq %xmm9,%zmm16 + vpbroadcastq %xmm10,%zmm17 + vpbroadcastq %xmm6,%zmm3 + vpbroadcastq %xmm7,%zmm4 + vpbroadcastq %xmm8,%zmm5 + + vpunpcklqdq %zmm27,%zmm26,%zmm25 + vpunpckhqdq %zmm27,%zmm26,%zmm27 + + + + vpsrlq $24,%zmm27,%zmm26 + vporq %zmm31,%zmm26,%zmm26 + vpaddq %zmm26,%zmm2,%zmm2 + vpandq %zmm28,%zmm25,%zmm24 + vpsrlq $44,%zmm25,%zmm25 + vpsllq $20,%zmm27,%zmm27 + vporq %zmm27,%zmm25,%zmm25 + vpandq %zmm28,%zmm25,%zmm25 + + subq $8,%rdx + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + + vpaddq %zmm24,%zmm0,%zmm0 + vpaddq %zmm25,%zmm1,%zmm1 + + vpxorq %zmm18,%zmm18,%zmm18 + vpmadd52luq %zmm2,%zmm16,%zmm18 + vpxorq %zmm19,%zmm19,%zmm19 + vpmadd52huq %zmm2,%zmm16,%zmm19 + vpxorq %zmm20,%zmm20,%zmm20 + vpmadd52luq %zmm2,%zmm17,%zmm20 + vpxorq %zmm21,%zmm21,%zmm21 + vpmadd52huq %zmm2,%zmm17,%zmm21 + vpxorq %zmm22,%zmm22,%zmm22 + vpmadd52luq %zmm2,%zmm3,%zmm22 + vpxorq %zmm23,%zmm23,%zmm23 + vpmadd52huq %zmm2,%zmm3,%zmm23 + + vmovdqu64 0(%rsi),%zmm26 + vmovdqu64 64(%rsi),%zmm27 + leaq 128(%rsi),%rsi + vpmadd52luq 
%zmm0,%zmm3,%zmm18 + vpmadd52huq %zmm0,%zmm3,%zmm19 + vpmadd52luq %zmm0,%zmm4,%zmm20 + vpmadd52huq %zmm0,%zmm4,%zmm21 + vpmadd52luq %zmm0,%zmm5,%zmm22 + vpmadd52huq %zmm0,%zmm5,%zmm23 + + vpunpcklqdq %zmm27,%zmm26,%zmm25 + vpunpckhqdq %zmm27,%zmm26,%zmm27 + vpmadd52luq %zmm1,%zmm17,%zmm18 + vpmadd52huq %zmm1,%zmm17,%zmm19 + vpmadd52luq %zmm1,%zmm3,%zmm20 + vpmadd52huq %zmm1,%zmm3,%zmm21 + vpmadd52luq %zmm1,%zmm4,%zmm22 + vpmadd52huq %zmm1,%zmm4,%zmm23 + + + + vpsrlq $44,%zmm18,%zmm30 + vpsllq $8,%zmm19,%zmm19 + vpandq %zmm28,%zmm18,%zmm0 + vpaddq %zmm30,%zmm19,%zmm19 + + vpsrlq $24,%zmm27,%zmm26 + vporq %zmm31,%zmm26,%zmm26 + vpaddq %zmm19,%zmm20,%zmm20 + + vpsrlq $44,%zmm20,%zmm30 + vpsllq $8,%zmm21,%zmm21 + vpandq %zmm28,%zmm20,%zmm1 + vpaddq %zmm30,%zmm21,%zmm21 + + vpandq %zmm28,%zmm25,%zmm24 + vpsrlq $44,%zmm25,%zmm25 + vpsllq $20,%zmm27,%zmm27 + vpaddq %zmm21,%zmm22,%zmm22 + + vpsrlq $42,%zmm22,%zmm30 + vpsllq $10,%zmm23,%zmm23 + vpandq %zmm29,%zmm22,%zmm2 + vpaddq %zmm30,%zmm23,%zmm23 + + vpaddq %zmm26,%zmm2,%zmm2 + vpaddq %zmm23,%zmm0,%zmm0 + vpsllq $2,%zmm23,%zmm23 + + vpaddq %zmm23,%zmm0,%zmm0 + vporq %zmm27,%zmm25,%zmm25 + vpandq %zmm28,%zmm25,%zmm25 + + vpsrlq $44,%zmm0,%zmm30 + vpandq %zmm28,%zmm0,%zmm0 + + vpaddq %zmm30,%zmm1,%zmm1 + + subq $8,%rdx + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + + vpaddq %zmm24,%zmm0,%zmm0 + vpaddq %zmm25,%zmm1,%zmm1 + + vpxorq %zmm18,%zmm18,%zmm18 + vpmadd52luq %zmm2,%zmm9,%zmm18 + vpxorq %zmm19,%zmm19,%zmm19 + vpmadd52huq %zmm2,%zmm9,%zmm19 + vpxorq %zmm20,%zmm20,%zmm20 + vpmadd52luq %zmm2,%zmm10,%zmm20 + vpxorq %zmm21,%zmm21,%zmm21 + vpmadd52huq %zmm2,%zmm10,%zmm21 + vpxorq %zmm22,%zmm22,%zmm22 + vpmadd52luq %zmm2,%zmm6,%zmm22 + vpxorq %zmm23,%zmm23,%zmm23 + vpmadd52huq %zmm2,%zmm6,%zmm23 + + vpmadd52luq %zmm0,%zmm6,%zmm18 + vpmadd52huq %zmm0,%zmm6,%zmm19 + vpmadd52luq %zmm0,%zmm7,%zmm20 + vpmadd52huq %zmm0,%zmm7,%zmm21 + vpmadd52luq %zmm0,%zmm8,%zmm22 + vpmadd52huq %zmm0,%zmm8,%zmm23 + + vpmadd52luq %zmm1,%zmm10,%zmm18 + vpmadd52huq %zmm1,%zmm10,%zmm19 + vpmadd52luq %zmm1,%zmm6,%zmm20 + vpmadd52huq %zmm1,%zmm6,%zmm21 + vpmadd52luq %zmm1,%zmm7,%zmm22 + vpmadd52huq %zmm1,%zmm7,%zmm23 + + + + + movl $1,%eax + kmovw %eax,%k1 + vpsrldq $8,%zmm18,%zmm24 + vpsrldq $8,%zmm19,%zmm0 + vpsrldq $8,%zmm20,%zmm25 + vpsrldq $8,%zmm21,%zmm1 + vpaddq %zmm24,%zmm18,%zmm18 + vpaddq %zmm0,%zmm19,%zmm19 + vpsrldq $8,%zmm22,%zmm26 + vpsrldq $8,%zmm23,%zmm2 + vpaddq %zmm25,%zmm20,%zmm20 + vpaddq %zmm1,%zmm21,%zmm21 + vpermq $0x2,%zmm18,%zmm24 + vpermq $0x2,%zmm19,%zmm0 + vpaddq %zmm26,%zmm22,%zmm22 + vpaddq %zmm2,%zmm23,%zmm23 + + vpermq $0x2,%zmm20,%zmm25 + vpermq $0x2,%zmm21,%zmm1 + vpaddq %zmm24,%zmm18,%zmm18 + vpaddq %zmm0,%zmm19,%zmm19 + vpermq $0x2,%zmm22,%zmm26 + vpermq $0x2,%zmm23,%zmm2 + vpaddq %zmm25,%zmm20,%zmm20 + vpaddq %zmm1,%zmm21,%zmm21 + vextracti64x4 $1,%zmm18,%ymm24 + vextracti64x4 $1,%zmm19,%ymm0 + vpaddq %zmm26,%zmm22,%zmm22 + vpaddq %zmm2,%zmm23,%zmm23 + + vextracti64x4 $1,%zmm20,%ymm25 + vextracti64x4 $1,%zmm21,%ymm1 + vextracti64x4 $1,%zmm22,%ymm26 + vextracti64x4 $1,%zmm23,%ymm2 + vpaddq %ymm24,%ymm18,%ymm18{%k1}{z} + vpaddq %ymm0,%ymm19,%ymm19{%k1}{z} + vpaddq %ymm25,%ymm20,%ymm20{%k1}{z} + vpaddq %ymm1,%ymm21,%ymm21{%k1}{z} + vpaddq %ymm26,%ymm22,%ymm22{%k1}{z} + vpaddq %ymm2,%ymm23,%ymm23{%k1}{z} + + + + vpsrlq $44,%ymm18,%ymm30 + vpsllq $8,%ymm19,%ymm19 + vpandq %ymm28,%ymm18,%ymm0 + vpaddq %ymm30,%ymm19,%ymm19 + + vpaddq %ymm19,%ymm20,%ymm20 + + vpsrlq $44,%ymm20,%ymm30 + vpsllq $8,%ymm21,%ymm21 + vpandq %ymm28,%ymm20,%ymm1 + 
vpaddq %ymm30,%ymm21,%ymm21 + + vpaddq %ymm21,%ymm22,%ymm22 + + vpsrlq $42,%ymm22,%ymm30 + vpsllq $10,%ymm23,%ymm23 + vpandq %ymm29,%ymm22,%ymm2 + vpaddq %ymm30,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + vpsllq $2,%ymm23,%ymm23 + + vpaddq %ymm23,%ymm0,%ymm0 + + vpsrlq $44,%ymm0,%ymm30 + vpandq %ymm28,%ymm0,%ymm0 + + vpaddq %ymm30,%ymm1,%ymm1 + + + + vmovq %xmm0,0(%rdi) + vmovq %xmm1,8(%rdi) + vmovq %xmm2,16(%rdi) + vzeroall + +.Lno_data_vpmadd52_8x: + .byte 0xf3,0xc3 +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +.type poly1305_emit_base2_44,@function +.align 32 +poly1305_emit_base2_44: + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + + movq %r9,%rax + shrq $20,%r9 + shlq $44,%rax + movq %r10,%rcx + shrq $40,%r10 + shlq $24,%rcx + + addq %rax,%r8 + adcq %rcx,%r9 + adcq $0,%r10 + + movq %r8,%rax + addq $5,%r8 + movq %r9,%rcx + adcq $0,%r9 + adcq $0,%r10 + shrq $2,%r10 + cmovnzq %r8,%rax + cmovnzq %r9,%rcx + + addq 0(%rdx),%rax + adcq 8(%rdx),%rcx + movq %rax,0(%rsi) + movq %rcx,8(%rsi) + + .byte 0xf3,0xc3 +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 .align 64 .Lconst: .Lmask24: @@ -1822,7 +3436,125 @@ poly1305_blocks_avx2: .long 16777216,0,16777216,0,16777216,0,16777216,0 .Lmask26: .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lfive: -.long 5,0,5,0,5,0,5,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,@function +.align 16 +xor128_encrypt_n_pad: + subq %rdx,%rsi + subq %rdx,%rdi + movq %rcx,%r10 + shrq $4,%rcx + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu (%rsi,%rdx,1),%xmm0 + pxor (%rdx),%xmm0 + movdqu %xmm0,(%rdi,%rdx,1) + movdqa %xmm0,(%rdx) + leaq 16(%rdx),%rdx + decq %rcx + jnz .Loop_enc_xmm + + andq $15,%r10 + jz .Ldone_enc + +.Ltail_enc: + movq $16,%rcx + subq %r10,%rcx + xorl %eax,%eax +.Loop_enc_byte: + movb (%rsi,%rdx,1),%al + xorb (%rdx),%al + movb %al,(%rdi,%rdx,1) + movb %al,(%rdx) + leaq 1(%rdx),%rdx + decq %r10 + jnz .Loop_enc_byte + + xorl %eax,%eax +.Loop_enc_pad: + movb %al,(%rdx) + leaq 1(%rdx),%rdx + decq %rcx + jnz .Loop_enc_pad + +.Ldone_enc: + movq %rdx,%rax + .byte 0xf3,0xc3 +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,@function +.align 16 +xor128_decrypt_n_pad: + subq %rdx,%rsi + subq %rdx,%rdi + movq %rcx,%r10 + shrq $4,%rcx + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu (%rsi,%rdx,1),%xmm0 + movdqa (%rdx),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,(%rdi,%rdx,1) + movdqa %xmm0,(%rdx) + leaq 16(%rdx),%rdx + decq %rcx + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + andq $15,%r10 + jz .Ldone_dec + +.Ltail_dec: + movq $16,%rcx + subq %r10,%rcx + xorl %eax,%eax + xorq %r11,%r11 +.Loop_dec_byte: + movb (%rsi,%rdx,1),%r11b + movb (%rdx),%al 
+ xorb %r11b,%al + movb %al,(%rdi,%rdx,1) + movb %r11b,(%rdx) + leaq 1(%rdx),%rdx + decq %r10 + jnz .Loop_dec_byte + + xorl %eax,%eax +.Loop_dec_pad: + movb %al,(%rdx) + leaq 1(%rdx),%rdx + decq %rcx + jnz .Loop_dec_pad + +.Ldone_dec: + movq %rdx,%rax + .byte 0xf3,0xc3 +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/rc4/rc4-md5-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/rc4/rc4-md5-x86_64.s index aab3c6db13..03fbca89de 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/rc4/rc4-md5-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/rc4/rc4-md5-x86_64.s @@ -4,15 +4,29 @@ .globl rc4_md5_enc .type rc4_md5_enc,@function rc4_md5_enc: +.cfi_startproc cmpq $0,%r9 je .Labort pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $40,%rsp +.cfi_adjust_cfa_offset 40 .Lbody: movq %rcx,%r11 movq %r9,%r12 @@ -1247,13 +1261,21 @@ rc4_md5_enc: movl %ecx,-4(%rdi) movq 40(%rsp),%r15 +.cfi_restore %r15 movq 48(%rsp),%r14 +.cfi_restore %r14 movq 56(%rsp),%r13 +.cfi_restore %r13 movq 64(%rsp),%r12 +.cfi_restore %r12 movq 72(%rsp),%rbp +.cfi_restore %rbp movq 80(%rsp),%rbx +.cfi_restore %rbx leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -88 .Lepilogue: .Labort: .byte 0xf3,0xc3 +.cfi_endproc .size rc4_md5_enc,.-rc4_md5_enc diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/rc4/rc4-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/rc4/rc4-x86_64.s index 781b48b9eb..fba70351d4 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/rc4/rc4-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/rc4/rc4-x86_64.s @@ -8,9 +8,16 @@ RC4: orq %rsi,%rsi jne .Lentry .byte 0xf3,0xc3 .Lentry: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-24 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-32 .Lprologue: movq %rsi,%r11 movq %rdx,%r12 @@ -511,11 +518,16 @@ RC4: orq %rsi,%rsi movl %ecx,-4(%rdi) movq (%rsp),%r13 +.cfi_restore %r13 movq 8(%rsp),%r12 +.cfi_restore %r12 movq 16(%rsp),%rbx +.cfi_restore %rbx addq $24,%rsp +.cfi_adjust_cfa_offset -24 .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size RC4,.-RC4 .globl RC4_set_key .type RC4_set_key,@function diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/keccak1600-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/keccak1600-x86_64.s new file mode 100644 index 0000000000..e511f25035 --- /dev/null +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/keccak1600-x86_64.s @@ -0,0 +1,522 @@ +.text + +.type __KeccakF1600,@function +.align 32 +__KeccakF1600: + movq 60(%rdi),%rax + movq 68(%rdi),%rbx + movq 76(%rdi),%rcx + movq 84(%rdi),%rdx + movq 92(%rdi),%rbp + jmp .Loop + +.align 32 +.Loop: + movq -100(%rdi),%r8 + movq -52(%rdi),%r9 + movq -4(%rdi),%r10 + movq 44(%rdi),%r11 + + xorq -84(%rdi),%rcx + xorq -76(%rdi),%rdx + xorq %r8,%rax + xorq -92(%rdi),%rbx + xorq -44(%rdi),%rcx + xorq -60(%rdi),%rax + movq %rbp,%r12 + xorq -68(%rdi),%rbp + + xorq %r10,%rcx + xorq -20(%rdi),%rax + xorq -36(%rdi),%rdx + xorq %r9,%rbx + xorq -28(%rdi),%rbp + + xorq 36(%rdi),%rcx + xorq 20(%rdi),%rax + xorq 4(%rdi),%rdx + xorq -12(%rdi),%rbx + xorq 
12(%rdi),%rbp + + movq %rcx,%r13 + rolq $1,%rcx + xorq %rax,%rcx + xorq %r11,%rdx + + rolq $1,%rax + xorq %rdx,%rax + xorq 28(%rdi),%rbx + + rolq $1,%rdx + xorq %rbx,%rdx + xorq 52(%rdi),%rbp + + rolq $1,%rbx + xorq %rbp,%rbx + + rolq $1,%rbp + xorq %r13,%rbp + xorq %rcx,%r9 + xorq %rdx,%r10 + rolq $44,%r9 + xorq %rbp,%r11 + xorq %rax,%r12 + rolq $43,%r10 + xorq %rbx,%r8 + movq %r9,%r13 + rolq $21,%r11 + orq %r10,%r9 + xorq %r8,%r9 + rolq $14,%r12 + + xorq (%r15),%r9 + leaq 8(%r15),%r15 + + movq %r12,%r14 + andq %r11,%r12 + movq %r9,-100(%rsi) + xorq %r10,%r12 + notq %r10 + movq %r12,-84(%rsi) + + orq %r11,%r10 + movq 76(%rdi),%r12 + xorq %r13,%r10 + movq %r10,-92(%rsi) + + andq %r8,%r13 + movq -28(%rdi),%r9 + xorq %r14,%r13 + movq -20(%rdi),%r10 + movq %r13,-68(%rsi) + + orq %r8,%r14 + movq -76(%rdi),%r8 + xorq %r11,%r14 + movq 28(%rdi),%r11 + movq %r14,-76(%rsi) + + + xorq %rbp,%r8 + xorq %rdx,%r12 + rolq $28,%r8 + xorq %rcx,%r11 + xorq %rax,%r9 + rolq $61,%r12 + rolq $45,%r11 + xorq %rbx,%r10 + rolq $20,%r9 + movq %r8,%r13 + orq %r12,%r8 + rolq $3,%r10 + + xorq %r11,%r8 + movq %r8,-36(%rsi) + + movq %r9,%r14 + andq %r13,%r9 + movq -92(%rdi),%r8 + xorq %r12,%r9 + notq %r12 + movq %r9,-28(%rsi) + + orq %r11,%r12 + movq -44(%rdi),%r9 + xorq %r10,%r12 + movq %r12,-44(%rsi) + + andq %r10,%r11 + movq 60(%rdi),%r12 + xorq %r14,%r11 + movq %r11,-52(%rsi) + + orq %r10,%r14 + movq 4(%rdi),%r10 + xorq %r13,%r14 + movq 52(%rdi),%r11 + movq %r14,-60(%rsi) + + + xorq %rbp,%r10 + xorq %rax,%r11 + rolq $25,%r10 + xorq %rdx,%r9 + rolq $8,%r11 + xorq %rbx,%r12 + rolq $6,%r9 + xorq %rcx,%r8 + rolq $18,%r12 + movq %r10,%r13 + andq %r11,%r10 + rolq $1,%r8 + + notq %r11 + xorq %r9,%r10 + movq %r10,-12(%rsi) + + movq %r12,%r14 + andq %r11,%r12 + movq -12(%rdi),%r10 + xorq %r13,%r12 + movq %r12,-4(%rsi) + + orq %r9,%r13 + movq 84(%rdi),%r12 + xorq %r8,%r13 + movq %r13,-20(%rsi) + + andq %r8,%r9 + xorq %r14,%r9 + movq %r9,12(%rsi) + + orq %r8,%r14 + movq -60(%rdi),%r9 + xorq %r11,%r14 + movq 36(%rdi),%r11 + movq %r14,4(%rsi) + + + movq -68(%rdi),%r8 + + xorq %rcx,%r10 + xorq %rdx,%r11 + rolq $10,%r10 + xorq %rbx,%r9 + rolq $15,%r11 + xorq %rbp,%r12 + rolq $36,%r9 + xorq %rax,%r8 + rolq $56,%r12 + movq %r10,%r13 + orq %r11,%r10 + rolq $27,%r8 + + notq %r11 + xorq %r9,%r10 + movq %r10,28(%rsi) + + movq %r12,%r14 + orq %r11,%r12 + xorq %r13,%r12 + movq %r12,36(%rsi) + + andq %r9,%r13 + xorq %r8,%r13 + movq %r13,20(%rsi) + + orq %r8,%r9 + xorq %r14,%r9 + movq %r9,52(%rsi) + + andq %r14,%r8 + xorq %r11,%r8 + movq %r8,44(%rsi) + + + xorq -84(%rdi),%rdx + xorq -36(%rdi),%rbp + rolq $62,%rdx + xorq 68(%rdi),%rcx + rolq $55,%rbp + xorq 12(%rdi),%rax + rolq $2,%rcx + xorq 20(%rdi),%rbx + xchgq %rsi,%rdi + rolq $39,%rax + rolq $41,%rbx + movq %rdx,%r13 + andq %rbp,%rdx + notq %rbp + xorq %rcx,%rdx + movq %rdx,92(%rdi) + + movq %rax,%r14 + andq %rbp,%rax + xorq %r13,%rax + movq %rax,60(%rdi) + + orq %rcx,%r13 + xorq %rbx,%r13 + movq %r13,84(%rdi) + + andq %rbx,%rcx + xorq %r14,%rcx + movq %rcx,76(%rdi) + + orq %r14,%rbx + xorq %rbp,%rbx + movq %rbx,68(%rdi) + + movq %rdx,%rbp + movq %r13,%rdx + + testq $255,%r15 + jnz .Loop + + leaq -192(%r15),%r15 + .byte 0xf3,0xc3 +.size __KeccakF1600,.-__KeccakF1600 + +.type KeccakF1600,@function +.align 32 +KeccakF1600: +.cfi_startproc + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq 
%r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + leaq 100(%rdi),%rdi + subq $200,%rsp +.cfi_adjust_cfa_offset 200 + + notq -92(%rdi) + notq -84(%rdi) + notq -36(%rdi) + notq -4(%rdi) + notq 36(%rdi) + notq 60(%rdi) + + leaq iotas(%rip),%r15 + leaq 100(%rsp),%rsi + + call __KeccakF1600 + + notq -92(%rdi) + notq -84(%rdi) + notq -36(%rdi) + notq -4(%rdi) + notq 36(%rdi) + notq 60(%rdi) + leaq -100(%rdi),%rdi + + addq $200,%rsp +.cfi_adjust_cfa_offset -200 + + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + .byte 0xf3,0xc3 +.cfi_endproc +.size KeccakF1600,.-KeccakF1600 +.globl SHA3_absorb +.type SHA3_absorb,@function +.align 32 +SHA3_absorb: +.cfi_startproc + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + leaq 100(%rdi),%rdi + subq $232,%rsp +.cfi_adjust_cfa_offset 232 + + movq %rsi,%r9 + leaq 100(%rsp),%rsi + + notq -92(%rdi) + notq -84(%rdi) + notq -36(%rdi) + notq -4(%rdi) + notq 36(%rdi) + notq 60(%rdi) + leaq iotas(%rip),%r15 + + movq %rcx,216-100(%rsi) + +.Loop_absorb: + cmpq %rcx,%rdx + jc .Ldone_absorb + + shrq $3,%rcx + leaq -100(%rdi),%r8 + +.Lblock_absorb: + movq (%r9),%rax + leaq 8(%r9),%r9 + xorq (%r8),%rax + leaq 8(%r8),%r8 + subq $8,%rdx + movq %rax,-8(%r8) + subq $1,%rcx + jnz .Lblock_absorb + + movq %r9,200-100(%rsi) + movq %rdx,208-100(%rsi) + call __KeccakF1600 + movq 200-100(%rsi),%r9 + movq 208-100(%rsi),%rdx + movq 216-100(%rsi),%rcx + jmp .Loop_absorb + +.align 32 +.Ldone_absorb: + movq %rdx,%rax + + notq -92(%rdi) + notq -84(%rdi) + notq -36(%rdi) + notq -4(%rdi) + notq 36(%rdi) + notq 60(%rdi) + + addq $232,%rsp +.cfi_adjust_cfa_offset -232 + + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + .byte 0xf3,0xc3 +.cfi_endproc +.size SHA3_absorb,.-SHA3_absorb +.globl SHA3_squeeze +.type SHA3_squeeze,@function +.align 32 +SHA3_squeeze: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-32 + + shrq $3,%rcx + movq %rdi,%r8 + movq %rsi,%r12 + movq %rdx,%r13 + movq %rcx,%r14 + jmp .Loop_squeeze + +.align 32 +.Loop_squeeze: + cmpq $8,%r13 + jb .Ltail_squeeze + + movq (%r8),%rax + leaq 8(%r8),%r8 + movq %rax,(%r12) + leaq 8(%r12),%r12 + subq $8,%r13 + jz .Ldone_squeeze + + subq $1,%rcx + jnz .Loop_squeeze + + call KeccakF1600 + movq %rdi,%r8 + movq %r14,%rcx + jmp .Loop_squeeze + +.Ltail_squeeze: + movq %r8,%rsi + movq %r12,%rdi + movq %r13,%rcx +.byte 0xf3,0xa4 + +.Ldone_squeeze: + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 
+.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + .byte 0xf3,0xc3 +.cfi_endproc +.size SHA3_squeeze,.-SHA3_squeeze +.align 256 +.quad 0,0,0,0,0,0,0,0 +.type iotas,@object +iotas: +.quad 0x0000000000000001 +.quad 0x0000000000008082 +.quad 0x800000000000808a +.quad 0x8000000080008000 +.quad 0x000000000000808b +.quad 0x0000000080000001 +.quad 0x8000000080008081 +.quad 0x8000000000008009 +.quad 0x000000000000008a +.quad 0x0000000000000088 +.quad 0x0000000080008009 +.quad 0x000000008000000a +.quad 0x000000008000808b +.quad 0x800000000000008b +.quad 0x8000000000008089 +.quad 0x8000000000008003 +.quad 0x8000000000008002 +.quad 0x8000000000000080 +.quad 0x000000000000800a +.quad 0x800000008000000a +.quad 0x8000000080008081 +.quad 0x8000000000008080 +.quad 0x0000000080000001 +.quad 0x8000000080008008 +.size iotas,.-iotas +.byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha1-mb-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha1-mb-x86_64.s index d266d776ec..1a0de0f100 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha1-mb-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha1-mb-x86_64.s @@ -6,17 +6,22 @@ .type sha1_multi_block,@function .align 32 sha1_multi_block: +.cfi_startproc movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut testl $268435456,%ecx jnz _avx_shortcut movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 .Lbody: leaq K_XX_XX(%rip),%rbp leaq 256(%rsp),%rbx @@ -2546,19 +2551,28 @@ sha1_multi_block: .Ldone: movq 272(%rsp),%rax +.cfi_def_cfa %rax,8 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_multi_block,.-sha1_multi_block .type sha1_multi_block_shaext,@function .align 32 sha1_multi_block_shaext: +.cfi_startproc _shaext_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 subq $288,%rsp shll $1,%edx andq $-256,%rsp @@ -2914,14 +2928,19 @@ _shaext_shortcut: .Ldone_shaext: movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_shaext: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_multi_block_shaext,.-sha1_multi_block_shaext .type sha1_multi_block_avx,@function .align 32 sha1_multi_block_avx: +.cfi_startproc _avx_shortcut: shrq $32,%rcx cmpl $2,%edx @@ -2932,11 +2951,15 @@ _avx_shortcut: .align 32 .Lavx: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 .Lbody_avx: leaq K_XX_XX(%rip),%rbp leaq 256(%rsp),%rbx @@ -4986,27 +5009,41 @@ _avx_shortcut: .Ldone_avx: movq 272(%rsp),%rax +.cfi_def_cfa %rax,8 vzeroupper movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size
sha1_multi_block_avx,.-sha1_multi_block_avx .type sha1_multi_block_avx2,@function .align 32 sha1_multi_block_avx2: +.cfi_startproc _avx2_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 subq $576,%rsp andq $-256,%rsp movq %rax,544(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08 .Lbody_avx2: leaq K_XX_XX(%rip),%rbp shrl $1,%edx @@ -7193,16 +7230,25 @@ _avx2_shortcut: .Ldone_avx2: movq 544(%rsp),%rax +.cfi_def_cfa %rax,8 vzeroupper movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_multi_block_avx2,.-sha1_multi_block_avx2 .align 256 diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha1-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha1-x86_64.s index dbeebed9a0..e436521a04 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha1-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha1-x86_64.s @@ -5,6 +5,7 @@ .type sha1_block_data_order,@function .align 16 sha1_block_data_order: +.cfi_startproc movl OPENSSL_ia32cap_P+0(%rip),%r9d movl OPENSSL_ia32cap_P+4(%rip),%r8d movl OPENSSL_ia32cap_P+8(%rip),%r10d @@ -25,17 +26,24 @@ sha1_block_data_order: .align 16 .Lialu: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 movq %rax,64(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%r8),%esi @@ -1230,19 +1238,28 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order .type sha1_block_data_order_shaext,@function .align 32 sha1_block_data_order_shaext: _shaext_shortcut: +.cfi_startproc movdqu (%rdi),%xmm0 movd 16(%rdi),%xmm1 movdqa K_XX_XX+160(%rip),%xmm3 @@ -1404,20 +1421,27 @@ _shaext_shortcut: pshufd $27,%xmm1,%xmm1 movdqu %xmm0,(%rdi) movd %xmm1,16(%rdi) +.cfi_endproc .byte 0xf3,0xc3 .size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: - movq %rsp,%rax +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -1425,7 +1449,7 @@ _ssse3_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1437,8 +1461,8 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 
64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -1514,7 +1538,7 @@ _ssse3_shortcut: pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx - movdqa -64(%r11),%xmm10 + movdqa -64(%r14),%xmm10 roll $5,%ecx addl %edi,%ebx andl %edx,%esi @@ -1575,7 +1599,7 @@ _ssse3_shortcut: pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp - movdqa -32(%r11),%xmm8 + movdqa -32(%r14),%xmm8 roll $5,%edx addl %edi,%ecx andl %ebp,%esi @@ -1636,7 +1660,7 @@ _ssse3_shortcut: pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax - movdqa -32(%r11),%xmm9 + movdqa -32(%r14),%xmm9 roll $5,%ebp addl %edi,%edx andl %eax,%esi @@ -1697,7 +1721,7 @@ _ssse3_shortcut: pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx - movdqa -32(%r11),%xmm10 + movdqa -32(%r14),%xmm10 roll $5,%eax addl %edi,%ebp andl %ebx,%esi @@ -1808,7 +1832,7 @@ _ssse3_shortcut: pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 0(%r11),%xmm10 + movdqa 0(%r14),%xmm10 rorl $7,%ecx paddd %xmm1,%xmm9 addl %ebx,%eax @@ -2043,7 +2067,7 @@ _ssse3_shortcut: pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - movdqa 32(%r11),%xmm9 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi paddd %xmm6,%xmm8 xorl %edx,%ecx @@ -2334,8 +2358,8 @@ _ssse3_shortcut: addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2572,29 +2596,41 @@ _ssse3_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 .type sha1_block_data_order_avx,@function .align 16 sha1_block_data_order_avx: _avx_shortcut: - movq %rsp,%rax +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp vzeroupper - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -2602,7 +2638,7 @@ _avx_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -2614,8 +2650,8 @@ _avx_shortcut: xorl %edx,%edi andl %edi,%esi - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -2740,7 +2776,7 @@ _avx_shortcut: vpxor %xmm10,%xmm5,%xmm5 xorl %eax,%ebp shldl $5,%edx,%edx - vmovdqa -32(%r11),%xmm11 + vmovdqa -32(%r14),%xmm11 addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp @@ -2953,7 +2989,7 @@ _avx_shortcut: addl %esi,%eax xorl %edx,%edi vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r11),%xmm11 + vmovdqa 0(%r14),%xmm11 shrdl $7,%ecx,%ecx addl %ebx,%eax vpxor %xmm8,%xmm2,%xmm2 @@ -3172,7 +3208,7 @@ _avx_shortcut: movl %ebx,%edi xorl %edx,%esi vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r11),%xmm11 + vmovdqa 32(%r14),%xmm11 shldl $5,%ebx,%ebx addl %esi,%eax vpxor %xmm8,%xmm7,%xmm7 @@ -3451,8 +3487,8 @@ _avx_shortcut: addl %edx,%ecx cmpq %r10,%r9 je .Ldone_avx - vmovdqa 
64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -3688,28 +3724,40 @@ _avx_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_avx,.-sha1_block_data_order_avx .type sha1_block_data_order_avx2,@function .align 16 sha1_block_data_order_avx2: _avx2_shortcut: - movq %rsp,%rax +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 vzeroupper - movq %rax,%r14 movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 @@ -3719,7 +3767,7 @@ _avx2_shortcut: leaq 64(%r9),%r13 andq $-128,%rsp addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax cmpq %r10,%r13 @@ -3728,7 +3776,7 @@ _avx2_shortcut: movl 8(%r8),%ecx movl 12(%r8),%edx movl 16(%r8),%esi - vmovdqu 64(%r11),%ymm6 + vmovdqu 64(%r14),%ymm6 vmovdqu (%r9),%xmm0 vmovdqu 16(%r9),%xmm1 @@ -3742,7 +3790,7 @@ _avx2_shortcut: vpshufb %ymm6,%ymm1,%ymm1 vinserti128 $1,48(%r13),%ymm3,%ymm3 vpshufb %ymm6,%ymm2,%ymm2 - vmovdqu -64(%r11),%ymm11 + vmovdqu -64(%r14),%ymm11 vpshufb %ymm6,%ymm3,%ymm3 vpaddd %ymm11,%ymm0,%ymm4 @@ -3774,7 +3822,7 @@ _avx2_shortcut: vpxor %ymm3,%ymm8,%ymm8 vpxor %ymm8,%ymm5,%ymm5 vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r11),%ymm11 + vmovdqu -32(%r14),%ymm11 vpslldq $12,%ymm5,%ymm10 vpaddd %ymm5,%ymm5,%ymm5 vpsrld $30,%ymm10,%ymm9 @@ -3928,7 +3976,7 @@ _avx2_shortcut: addl -56(%r13),%ebp andnl %esi,%ebx,%edi vpxor %ymm3,%ymm2,%ymm2 - vmovdqu 0(%r11),%ymm11 + vmovdqu 0(%r14),%ymm11 addl %ecx,%ebp rorxl $27,%ebx,%r12d rorxl $2,%ebx,%ecx @@ -4159,7 +4207,7 @@ _avx2_shortcut: addl -116(%r13),%eax leal (%rax,%rbx,1),%eax vpxor %ymm0,%ymm7,%ymm7 - vmovdqu 32(%r11),%ymm11 + vmovdqu 32(%r14),%ymm11 rorxl $27,%ebp,%r12d rorxl $2,%ebp,%ebx xorl %ecx,%ebp @@ -4604,7 +4652,7 @@ _avx2_shortcut: cmpq %r10,%r9 je .Ldone_avx2 - vmovdqu 64(%r11),%ymm6 + vmovdqu 64(%r14),%ymm6 cmpq %r10,%rdi ja .Last_avx2 @@ -4820,7 +4868,7 @@ _avx2_shortcut: xorl %ebx,%eax addl %r12d,%esi xorl %ecx,%eax - vmovdqu -64(%r11),%ymm11 + vmovdqu -64(%r14),%ymm11 vpshufb %ymm6,%ymm0,%ymm0 addl 68(%r13),%edx leal (%rdx,%rax,1),%edx @@ -5176,7 +5224,7 @@ _avx2_shortcut: xorl %ebp,%esi addl %r12d,%edx vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r11),%ymm11 + vmovdqu -32(%r14),%ymm11 xorl %ebx,%esi addl 104(%r13),%ecx leal (%rcx,%rsi,1),%ecx @@ -5369,15 +5417,21 @@ _avx2_shortcut: .Ldone_avx2: vzeroupper - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 .align 64 
K_XX_XX: diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha256-mb-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha256-mb-x86_64.s index f2896b4d6e..59cf9c984e 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha256-mb-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha256-mb-x86_64.s @@ -6,17 +6,22 @@ .type sha256_multi_block,@function .align 32 sha256_multi_block: +.cfi_startproc movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut testl $268435456,%ecx jnz _avx_shortcut movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 .Lbody: leaq K256+128(%rip),%rbp leaq 256(%rsp),%rbx @@ -2615,19 +2620,28 @@ sha256_multi_block: .Ldone: movq 272(%rsp),%rax +.cfi_def_cfa %rax,8 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_multi_block,.-sha256_multi_block .type sha256_multi_block_shaext,@function .align 32 sha256_multi_block_shaext: +.cfi_startproc _shaext_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 subq $288,%rsp shll $1,%edx andq $-256,%rsp @@ -3102,14 +3116,19 @@ _shaext_shortcut: .Ldone_shaext: movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_shaext: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_multi_block_shaext,.-sha256_multi_block_shaext .type sha256_multi_block_avx,@function .align 32 sha256_multi_block_avx: +.cfi_startproc _avx_shortcut: shrq $32,%rcx cmpl $2,%edx @@ -3120,11 +3139,15 @@ _avx_shortcut: .align 32 .Lavx: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 .Lbody_avx: leaq K256+128(%rip),%rbp leaq 256(%rsp),%rbx @@ -5353,27 +5376,41 @@ _avx_shortcut: .Ldone_avx: movq 272(%rsp),%rax +.cfi_def_cfa %rax,8 vzeroupper movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_multi_block_avx,.-sha256_multi_block_avx .type sha256_multi_block_avx2,@function .align 32 sha256_multi_block_avx2: +.cfi_startproc _avx2_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 subq $576,%rsp andq $-256,%rsp movq %rax,544(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08 .Lbody_avx2: leaq K256+128(%rip),%rbp leaq 128(%rdi),%rdi @@ -7738,16 +7775,25 @@ _avx2_shortcut: .Ldone_avx2: movq 544(%rsp),%rax +.cfi_def_cfa %rax,8 vzeroupper movq -48(%rax),%r15 +.cfi_restore %r15 movq -40(%rax),%r14 +.cfi_restore %r14 movq -32(%rax),%r13 +.cfi_restore %r13 movq -24(%rax),%r12 +.cfi_restore %r12 movq -16(%rax),%rbp +.cfi_restore %rbp movq -8(%rax),%rbx +.cfi_restore %rbx leaq (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_multi_block_avx2,.-sha256_multi_block_avx2 .align 256 K256: diff --git 
a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha256-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha256-x86_64.s index 8264a7dbdf..42b24df18e 100644 --- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha256-x86_64.s +++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha256-x86_64.s @@ -5,6 +5,7 @@ .type sha256_block_data_order,@function .align 16 sha256_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -21,13 +22,20 @@ sha256_block_data_order: je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 - movq %rsp,%r11 +.cfi_offset %r15,-56 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -35,7 +43,8 @@ sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%rdi),%eax @@ -1699,16 +1708,25 @@ sha256_block_data_order: movl %r11d,28(%rdi) jb .Lloop - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order,.-sha256_block_data_order .align 64 .type K256,@object @@ -1963,14 +1981,22 @@ _shaext_shortcut: .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: +.cfi_startproc .Lssse3_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 - movq %rsp,%r11 +.cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1978,7 +2004,8 @@ sha256_block_data_order_ssse3: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_ssse3: movl 0(%rdi),%eax @@ -3044,28 +3071,45 @@ sha256_block_data_order_ssse3: movl %r11d,28(%rdi) jb .Lloop_ssse3 - movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 .type sha256_block_data_order_avx,@function .align 64 sha256_block_data_order_avx: +.cfi_startproc .Lavx_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 
diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha512-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha512-x86_64.s
index 6f8488a38a..5931a2a932 100644
--- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha512-x86_64.s
+++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/sha/sha512-x86_64.s
@@ -5,6 +5,7 @@
 .type sha512_block_data_order,@function
 .align 16
 sha512_block_data_order:
+.cfi_startproc
 leaq OPENSSL_ia32cap_P(%rip),%r11
 movl 0(%r11),%r9d
 movl 4(%r11),%r10d
@@ -19,13 +20,20 @@ sha512_block_data_order:
 orl %r9d,%r10d
 cmpl $1342177792,%r10d
 je .Lavx_shortcut
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
 pushq %rbx
+.cfi_offset %rbx,-16
 pushq %rbp
+.cfi_offset %rbp,-24
 pushq %r12
+.cfi_offset %r12,-32
 pushq %r13
+.cfi_offset %r13,-40
 pushq %r14
+.cfi_offset %r14,-48
 pushq %r15
- movq %rsp,%r11
+.cfi_offset %r15,-56
 shlq $4,%rdx
 subq $128+32,%rsp
 leaq (%rsi,%rdx,8),%rdx
@@ -33,7 +41,8 @@ sha512_block_data_order:
 movq %rdi,128+0(%rsp)
 movq %rsi,128+8(%rsp)
 movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
 .Lprologue:
 movq 0(%rdi),%rax
@@ -1697,16 +1706,25 @@ sha512_block_data_order:
 movq %r11,56(%rdi)
 jb .Lloop
- movq 128+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue:
 .byte 0xf3,0xc3
+.cfi_endproc
 .size sha512_block_data_order,.-sha512_block_data_order
 .align 64
 .type K512,@object
@@ -1798,14 +1816,22 @@ K512:
 .type sha512_block_data_order_xop,@function
 .align 64
 sha512_block_data_order_xop:
+.cfi_startproc
 .Lxop_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
 pushq %rbx
+.cfi_offset %rbx,-16
 pushq %rbp
+.cfi_offset %rbp,-24
 pushq %r12
+.cfi_offset %r12,-32
 pushq %r13
+.cfi_offset %r13,-40
 pushq %r14
+.cfi_offset %r14,-48
 pushq %r15
- movq %rsp,%r11
+.cfi_offset %r15,-56
 shlq $4,%rdx
 subq $160,%rsp
 leaq (%rsi,%rdx,8),%rdx
@@ -1813,7 +1839,8 @@ sha512_block_data_order_xop:
 movq %rdi,128+0(%rsp)
 movq %rsi,128+8(%rsp)
 movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
 .Lprologue_xop:
 vzeroupper
@@ -2866,29 +2893,46 @@ sha512_block_data_order_xop:
 movq %r11,56(%rdi)
 jb .Lloop_xop
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
 vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_xop:
 .byte 0xf3,0xc3
+.cfi_endproc
 .size sha512_block_data_order_xop,.-sha512_block_data_order_xop
 .type sha512_block_data_order_avx,@function
 .align 64
 sha512_block_data_order_avx:
+.cfi_startproc
 .Lavx_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
 pushq %rbx
+.cfi_offset %rbx,-16
 pushq %rbp
+.cfi_offset %rbp,-24
 pushq %r12
+.cfi_offset %r12,-32
 pushq %r13
+.cfi_offset %r13,-40
 pushq %r14
+.cfi_offset %r14,-48
 pushq %r15
- movq %rsp,%r11
+.cfi_offset %r15,-56
 shlq $4,%rdx
 subq $160,%rsp
 leaq (%rsi,%rdx,8),%rdx
@@ -2896,7 +2940,8 @@ sha512_block_data_order_avx:
 movq %rdi,128+0(%rsp)
 movq %rsi,128+8(%rsp)
 movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
 .Lprologue_avx:
 vzeroupper
@@ -4013,29 +4058,46 @@ sha512_block_data_order_avx:
 movq %r11,56(%rdi)
 jb .Lloop_avx
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
 vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_avx:
 .byte 0xf3,0xc3
+.cfi_endproc
 .size sha512_block_data_order_avx,.-sha512_block_data_order_avx
 .type sha512_block_data_order_avx2,@function
 .align 64
 sha512_block_data_order_avx2:
+.cfi_startproc
 .Lavx2_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
 pushq %rbx
+.cfi_offset %rbx,-16
 pushq %rbp
+.cfi_offset %rbp,-24
 pushq %r12
+.cfi_offset %r12,-32
 pushq %r13
+.cfi_offset %r13,-40
 pushq %r14
+.cfi_offset %r14,-48
 pushq %r15
- movq %rsp,%r11
+.cfi_offset %r15,-56
 subq $1312,%rsp
 shlq $4,%rdx
 andq $-2048,%rsp
@@ -4044,7 +4106,8 @@ sha512_block_data_order_avx2:
 movq %rdi,128+0(%rsp)
 movq %rsi,128+8(%rsp)
 movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
 .Lprologue_avx2:
 vzeroupper
@@ -5351,15 +5414,24 @@ sha512_block_data_order_avx2:
 .Ldone_avx2:
 leaq (%rbp),%rsp
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
 vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_avx2:
 .byte 0xf3,0xc3
+.cfi_endproc
 .size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2
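sha512 reuses the template with a different spill offset, 152 = 128+24, because its message schedule occupies 128 rather than 64 bytes. Under the decoding given after the sha256-mb diff, the only byte group that varies between these escapes is the SLEB128 offset of the spill slot, and it always matches the `movq %rax,...` store; for example 0x98,0x01 decodes as 0x18 + (0x01 << 7) = 24 + 128 = 152. A cross-check of every escape seen so far (offsets taken from the hunks above):

    # .cfi_escape 0x0f,0x06,0x77,<sleb128>,0x06,0x23,0x08  ->  CFA = *(%rsp+off)+8
    #   0xd8,0x00 -> 88    sha256_block_data_order*   movq %rax,88(%rsp)   (88 = 64+24)
    #   0x98,0x01 -> 152   sha512_block_data_order*   movq %rax,152(%rsp)  (152 = 128+24)
    #   0x90,0x02 -> 272   sha256_multi_block(_avx)   movq %rax,272(%rsp)
    #   0xa0,0x04 -> 544   sha256_multi_block_avx2    movq %rax,544(%rsp)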
diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/whrlpool/wp-x86_64.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/whrlpool/wp-x86_64.s
index a4d55b6afc..2c261f398a 100644
--- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/whrlpool/wp-x86_64.s
+++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/whrlpool/wp-x86_64.s
@@ -4,14 +4,22 @@
 .type whirlpool_block,@function
 .align 16
 whirlpool_block:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
 pushq %rbx
+.cfi_offset %rbx,-16
 pushq %rbp
+.cfi_offset %rbp,-24
 pushq %r12
+.cfi_offset %r12,-32
 pushq %r13
+.cfi_offset %r13,-40
 pushq %r14
+.cfi_offset %r14,-48
 pushq %r15
+.cfi_offset %r15,-56
- movq %rsp,%r11
 subq $128+40,%rsp
 andq $-64,%rsp
@@ -19,7 +27,8 @@ whirlpool_block:
 movq %rdi,0(%r10)
 movq %rsi,8(%r10)
 movq %rdx,16(%r10)
- movq %r11,32(%r10)
+ movq %rax,32(%r10)
+.cfi_escape 0x0f,0x06,0x77,0xa0,0x01,0x06,0x23,0x08
 .Lprologue:
 movq %r10,%rbx
@@ -579,15 +588,24 @@ whirlpool_block:
 jmp .Louterloop
 .Lalldone:
 movq 32(%rbx),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue:
 .byte 0xf3,0xc3
+.cfi_endproc
 .size whirlpool_block,.-whirlpool_block
 .align 64
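whirlpool_block is the one function here that spills through a frame pointer rather than %rsp: the store is `movq %rax,32(%r10)`, yet the escape must still express the slot relative to %rsp. Its SLEB128 operand 0xa0,0x01 decodes to 0x20 + (0x01 << 7) = 160, so the annotation claims CFA = *(%rsp + 160) + 8; for both to name the same slot, %r10 has to sit 128 bytes above the aligned %rsp at that point. That is an inference from the hunk (the instruction that sets %r10 falls outside the context shown), sketched below with the assumed line marked:

        subq    $128+40,%rsp           # 128-byte table area + 40 bytes of saved state
        andq    $-64,%rsp              # align the frame to 64
        # assumed, not shown in the hunk:  leaq 128(%rsp),%r10
        movq    %rax,32(%r10)          # = 160(%rsp) if %r10 = %rsp+128
    .cfi_escape 0x0f,0x06,0x77,0xa0,0x01,0x06,0x23,0x08   # CFA = *(%rsp+160)+8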
diff --git a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/x86_64cpuid.s b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/x86_64cpuid.s
index 7e1f5e2740..fd17eaaba4 100644
--- a/deps/openssl/config/archs/BSD-x86_64/asm/crypto/x86_64cpuid.s
+++ b/deps/openssl/config/archs/BSD-x86_64/asm/crypto/x86_64cpuid.s
@@ -36,10 +36,12 @@ OPENSSL_rdtsc:
 .type OPENSSL_ia32_cpuid,@function
 .align 16
 OPENSSL_ia32_cpuid:
+.cfi_startproc
 movq %rbx,%r8
+.cfi_register %rbx,%r8
 xorl %eax,%eax
- movl %eax,8(%rdi)
+ movq %rax,8(%rdi)
 cpuid
 movl %eax,%r11d
@@ -110,6 +112,7 @@ OPENSSL_ia32_cpuid:
 .Lnocacheinfo:
 movl $1,%eax
 cpuid
+ movd %eax,%xmm0
 andl $0xbfefffff,%edx
 cmpl $0,%r9d
 jne .Lnotintel
@@ -157,26 +160,45 @@ OPENSSL_ia32_cpuid:
 jc .Lnotknights
 andl $0xfff7ffff,%ebx
 .Lnotknights:
+ movd %xmm0,%eax
+ andl $0x0fff0ff0,%eax
+ cmpl $0x00050650,%eax
+ jne .Lnotskylakex
+ andl $0xfffeffff,%ebx
+
+.Lnotskylakex:
 movl %ebx,8(%rdi)
+ movl %ecx,12(%rdi)
 .Lno_extended_info:
 btl $27,%r9d
 jnc .Lclear_avx
 xorl %ecx,%ecx
 .byte 0x0f,0x01,0xd0
+ andl $0xe6,%eax
+ cmpl $0xe6,%eax
+ je .Ldone
+ andl $0x3fdeffff,8(%rdi)
+
+
+
+ andl $6,%eax
 cmpl $6,%eax
 je .Ldone
 .Lclear_avx:
 movl $0xefffe7ff,%eax
 andl %eax,%r9d
- andl $0xffffffdf,8(%rdi)
+ movl $0x3fdeffdf,%eax
+ andl %eax,8(%rdi)
 .Ldone:
 shlq $32,%r9
 movl %r10d,%eax
 movq %r8,%rbx
+.cfi_restore %rbx
 orq %r9,%rax
 .byte 0xf3,0xc3
+.cfi_endproc
 .size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
 .globl OPENSSL_cleanse
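Beyond the CFI bookkeeping, this file picks up functional changes. `movq %rax,8(%rdi)` zeroes eight bytes instead of four, covering the additional capability word at 12(%rdi) that `movl %ecx,12(%rdi)` later fills with CPUID.(EAX=7).ECX. `movd %eax,%xmm0` parks the CPUID.(EAX=1).EAX signature so it can be re-examined after leaf 7; masked with 0x0fff0ff0 (family and model fields, stepping dropped), the value 0x00050650 is signature 06_55h, i.e. Skylake-X/SP, on which the code suppresses EBX bit 16 (AVX512F), presumably to steer dispatch away from 512-bit paths on those parts. The xgetbv check is also tightened, annotated here from the hunk (the bit names are the standard XCR0 assignments, added for readability):

        xorl    %ecx,%ecx
        .byte   0x0f,0x01,0xd0         # xgetbv: XCR0 -> %edx:%eax
        andl    $0xe6,%eax             # XCR0 bits 1,2,5,6,7: XMM, YMM, opmask, ZMM_Hi256, Hi16_ZMM
        cmpl    $0xe6,%eax
        je      .Ldone                 # OS saves full AVX512 state: keep the feature bits
        andl    $0x3fdeffff,8(%rdi)    # else clear bits 16,21,30,31 (AVX512F/IFMA/BW/VL)

On the `.Lclear_avx` path (no OSXSAVE at all) the mask likewise grows from clearing only bit 5 (AVX2) to 0x3fdeffdf, which drops AVX2 together with the same four AVX512 bits.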
@@ -222,6 +244,18 @@ CRYPTO_memcmp:
 xorq %r10,%r10
 cmpq $0,%rdx
 je .Lno_data
+ cmpq $16,%rdx
+ jne .Loop_cmp
+ movq (%rdi),%r10
+ movq 8(%rdi),%r11
+ movq $1,%rdx
+ xorq (%rsi),%r10
+ xorq 8(%rsi),%r11
+ orq %r11,%r10
+ cmovnzq %rdx,%rax
+ .byte 0xf3,0xc3
+
+.align 16
 .Loop_cmp:
 movb (%rdi),%r10b
 leaq 1(%rdi),%rdi
@@ -345,21 +379,6 @@ OPENSSL_instrument_bus2:
 subq %rcx,%rax
 .byte 0xf3,0xc3
 .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
-.globl OPENSSL_ia32_rdrand
-.type OPENSSL_ia32_rdrand,@function
-.align 16
-OPENSSL_ia32_rdrand:
- movl $8,%ecx
-.Loop_rdrand:
-.byte 72,15,199,240
- jc .Lbreak_rdrand
- loop .Loop_rdrand
-.Lbreak_rdrand:
- cmpq $0,%rax
- cmoveq %rcx,%rax
- .byte 0xf3,0xc3
-.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
-
 .globl OPENSSL_ia32_rdrand_bytes
 .type OPENSSL_ia32_rdrand_bytes,@function
 .align 16
@@ -393,28 +412,14 @@ OPENSSL_ia32_rdrand_bytes:
 movb %r10b,(%rdi)
 leaq 1(%rdi),%rdi
 incq %rax
- shrq $8,%r8
+ shrq $8,%r10
 decq %rsi
 jnz .Ltail_rdrand_bytes
 .Ldone_rdrand_bytes:
+ xorq %r10,%r10
 .byte 0xf3,0xc3
 .size OPENSSL_ia32_rdrand_bytes,.-OPENSSL_ia32_rdrand_bytes
-.globl OPENSSL_ia32_rdseed
-.type OPENSSL_ia32_rdseed,@function
-.align 16
-OPENSSL_ia32_rdseed:
- movl $8,%ecx
-.Loop_rdseed:
-.byte 72,15,199,248
- jc .Lbreak_rdseed
- loop .Loop_rdseed
-.Lbreak_rdseed:
- cmpq $0,%rax
- cmoveq %rcx,%rax
- .byte 0xf3,0xc3
-.size OPENSSL_ia32_rdseed,.-OPENSSL_ia32_rdseed
-
 .globl OPENSSL_ia32_rdseed_bytes
 .type OPENSSL_ia32_rdseed_bytes,@function
 .align 16
@@ -448,10 +453,11 @@ OPENSSL_ia32_rdseed_bytes:
 movb %r10b,(%rdi)
 leaq 1(%rdi),%rdi
 incq %rax
- shrq $8,%r8
+ shrq $8,%r10
 decq %rsi
 jnz .Ltail_rdseed_bytes
 .Ldone_rdseed_bytes:
+ xorq %r10,%r10
 .byte 0xf3,0xc3
 .size OPENSSL_ia32_rdseed_bytes,.-OPENSSL_ia32_rdseed_bytes
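The CRYPTO_memcmp hunk adds a branch-free fast path for the common 16-byte case (tag and MAC comparisons): both operands are read in two 8-byte halves, differences are accumulated with xor/or, and the result is materialized with cmovnz, so after the length check no instruction depends on the data values. Annotated from the hunk:

        cmpq    $16,%rdx
        jne     .Loop_cmp          # any other length takes the byte-wise loop
        movq    (%rdi),%r10
        movq    8(%rdi),%r11
        movq    $1,%rdx
        xorq    (%rsi),%r10        # %r10 = difference of the low halves
        xorq    8(%rsi),%r11       # %r11 = difference of the high halves
        orq     %r11,%r10          # nonzero iff the blocks differ anywhere
        cmovnzq %rdx,%rax          # %rax = 1 on mismatch, 0 on match, with no branch
        .byte   0xf3,0xc3

The rdrand/rdseed hunks fix a real bug in the byte-granular tails: the loop stores %r10b, so it must shift %r10 (`shrq $8,%r10`), but the old code shifted %r8, which made a 1-7 byte tail repeat the same output byte. The added `xorq %r10,%r10` also scrubs the last random word from the register before returning, and the single-word OPENSSL_ia32_rdrand/OPENSSL_ia32_rdseed helpers are removed outright in favor of the _bytes variants.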