diff options
Diffstat (limited to 'deps/openssl/config/archs/VC-WIN64A/asm/crypto')
31 files changed, 8810 insertions, 795 deletions
diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aes-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aes-x86_64.asm index 923e31ec9e..5babb865fa 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aes-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aes-x86_64.asm @@ -346,15 +346,23 @@ $L$SEH_begin_AES_encrypt: mov rdx,r8 + + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r10,rsp + lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -364,7 +372,8 @@ $L$SEH_begin_AES_encrypt: sub rsp,32 mov QWORD[16+rsp],rsi - mov QWORD[24+rsp],r10 + mov QWORD[24+rsp],rax + $L$enc_prologue: mov r15,rdx @@ -391,22 +400,31 @@ $L$enc_prologue: mov r9,QWORD[16+rsp] mov rsi,QWORD[24+rsp] + mov DWORD[r9],eax mov DWORD[4+r9],ebx mov DWORD[8+r9],ecx mov DWORD[12+r9],edx - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_AES_encrypt: ALIGN 16 @@ -804,15 +822,23 @@ $L$SEH_begin_AES_decrypt: mov rdx,r8 + + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r10,rsp + lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -822,7 +848,8 @@ $L$SEH_begin_AES_decrypt: sub rsp,32 mov QWORD[16+rsp],rsi - mov QWORD[24+rsp],r10 + mov QWORD[24+rsp],rax + $L$dec_prologue: mov r15,rdx @@ -851,22 +878,31 @@ $L$dec_prologue: mov r9,QWORD[16+rsp] mov rsi,QWORD[24+rsp] + mov DWORD[r9],eax mov DWORD[4+r9],ebx mov DWORD[8+r9],ecx mov DWORD[12+r9],edx - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_AES_decrypt: global AES_set_encrypt_key @@ -881,24 +917,36 @@ $L$SEH_begin_AES_set_encrypt_key: mov rdx,r8 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,8 + $L$enc_key_prologue: call _x86_64_AES_set_encrypt_key mov rbp,QWORD[40+rsp] + mov rbx,QWORD[48+rsp] + add rsp,56 + $L$enc_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_AES_set_encrypt_key: @@ -1153,13 +1201,21 @@ $L$SEH_begin_AES_set_decrypt_key: mov rdx,r8 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + push rdx + $L$dec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1327,16 +1383,24 @@ $L$permute: xor rax,rax $L$abort: mov r15,QWORD[8+rsp] + mov r14,QWORD[16+rsp] + mov r13,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov rbp,QWORD[40+rsp] + mov rbx,QWORD[48+rsp] + add rsp,56 + $L$dec_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_AES_set_decrypt_key: global AES_cbc_encrypt @@ -1358,25 +1422,32 @@ $L$SEH_begin_AES_cbc_encrypt: mov r9,QWORD[48+rsp] + cmp rdx,0 je NEAR $L$cbc_epilogue pushfq + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$cbc_prologue: cld mov r9d,r9d lea r14,[$L$AES_Te] + lea r10,[$L$AES_Td] cmp r9,0 - jne NEAR $L$cbc_picked_te - lea r14,[$L$AES_Td] -$L$cbc_picked_te: + cmove r14,r10 mov r10d,DWORD[OPENSSL_ia32cap_P] cmp rdx,512 @@ -1413,7 +1484,9 @@ $L$cbc_te_ok: xchg r15,rsp + mov QWORD[16+rsp],r15 + $L$cbc_fast_body: mov QWORD[24+rsp],rdi mov QWORD[32+rsp],rsi @@ -1795,19 +1868,29 @@ $L$cbc_slow_dec_partial: ALIGN 16 $L$cbc_exit: mov rsi,QWORD[16+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] + $L$cbc_popfq: popfq + $L$cbc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_AES_cbc_encrypt: ALIGN 64 $L$AES_Te: @@ -2632,7 +2715,6 @@ block_se_handler: jae NEAR $L$in_block_prologue mov rax,QWORD[24+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-mb-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-mb-x86_64.asm index e7fdb2142a..9891df39f0 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-mb-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-mb-x86_64.asm @@ -20,6 +20,7 @@ $L$SEH_begin_aesni_multi_cbc_encrypt: mov rdx,r8 + cmp edx,2 jb NEAR $L$enc_non_avx mov ecx,DWORD[((OPENSSL_ia32cap_P+4))] @@ -29,12 +30,19 @@ $L$SEH_begin_aesni_multi_cbc_encrypt: ALIGN 16 $L$enc_non_avx: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -56,6 +64,7 @@ $L$enc_non_avx: and rsp,-64 mov QWORD[16+rsp],rax + $L$enc4x_body: movdqu xmm12,XMMWORD[rsi] lea rsi,[120+rsi] @@ -264,6 +273,7 @@ DB 102,15,56,221,232 jnz NEAR $L$oop_enc4x mov rax,QWORD[16+rsp] + mov edx,DWORD[24+rsp] @@ -291,16 +301,24 @@ $L$enc4x_done: mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$enc4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_multi_cbc_encrypt: global aesni_multi_cbc_decrypt @@ -316,6 +334,7 @@ $L$SEH_begin_aesni_multi_cbc_decrypt: mov rdx,r8 + cmp edx,2 jb NEAR $L$dec_non_avx mov ecx,DWORD[((OPENSSL_ia32cap_P+4))] @@ -325,12 +344,19 @@ $L$SEH_begin_aesni_multi_cbc_decrypt: ALIGN 16 $L$dec_non_avx: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -352,6 +378,7 @@ $L$dec_non_avx: and rsp,-64 mov QWORD[16+rsp],rax + $L$dec4x_body: movdqu xmm12,XMMWORD[rsi] lea rsi,[120+rsi] @@ -560,6 +587,7 @@ DB 102,65,15,56,223,233 jnz NEAR $L$oop_dec4x mov rax,QWORD[16+rsp] + mov edx,DWORD[24+rsp] lea rdi,[160+rdi] @@ -578,16 +606,24 @@ $L$dec4x_done: mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$dec4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_multi_cbc_decrypt: ALIGN 32 @@ -601,14 +637,22 @@ $L$SEH_begin_aesni_multi_cbc_encrypt_avx: mov rdx,r8 + _avx_cbc_enc_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -632,6 +676,7 @@ _avx_cbc_enc_shortcut: and rsp,-128 mov QWORD[16+rsp],rax + $L$enc8x_body: vzeroupper vmovdqu xmm15,XMMWORD[rsi] @@ -1033,6 +1078,7 @@ $L$enc8x_tail: + $L$enc8x_done: vzeroupper movaps xmm6,XMMWORD[((-216))+rax] @@ -1046,16 +1092,24 @@ $L$enc8x_done: movaps xmm14,XMMWORD[((-88))+rax] movaps xmm15,XMMWORD[((-72))+rax] mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$enc8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_multi_cbc_encrypt_avx: @@ -1070,14 +1124,22 @@ $L$SEH_begin_aesni_multi_cbc_decrypt_avx: mov rdx,r8 + _avx_cbc_dec_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -1103,6 +1165,7 @@ _avx_cbc_dec_shortcut: sub rsp,192 mov QWORD[16+rsp],rax + $L$dec8x_body: vzeroupper vmovdqu xmm15,XMMWORD[rsi] @@ -1542,6 +1605,7 @@ $L$dec8x_tail: + $L$dec8x_done: vzeroupper movaps xmm6,XMMWORD[((-216))+rax] @@ -1555,16 +1619,24 @@ $L$dec8x_done: movaps xmm14,XMMWORD[((-88))+rax] movaps xmm15,XMMWORD[((-72))+rax] mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$dec8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_multi_cbc_decrypt_avx: EXTERN __imp_RtlVirtualUnwind diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-sha1-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-sha1-x86_64.asm index 45fa82e223..925d1be94a 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-sha1-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-sha1-x86_64.asm @@ -38,18 +38,26 @@ $L$SEH_begin_aesni_cbc_sha1_enc_ssse3: mov r9,QWORD[48+rsp] + mov r10,QWORD[56+rsp] push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-264))+rsp] + movaps XMMWORD[(96+0)+rsp],xmm6 movaps XMMWORD[(96+16)+rsp],xmm7 movaps XMMWORD[(96+32)+rsp],xmm8 @@ -1400,17 +1408,26 @@ DB 102,15,56,221,209 movaps xmm14,XMMWORD[((96+128))+rsp] movaps xmm15,XMMWORD[((96+144))+rsp] lea rsi,[264+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] + $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_cbc_sha1_enc_ssse3: ALIGN 32 @@ -1427,18 +1444,26 @@ $L$SEH_begin_aesni_cbc_sha1_enc_avx: mov r9,QWORD[48+rsp] + mov r10,QWORD[56+rsp] push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-264))+rsp] + movaps XMMWORD[(96+0)+rsp],xmm6 movaps XMMWORD[(96+16)+rsp],xmm7 movaps XMMWORD[(96+32)+rsp],xmm8 @@ -2733,17 +2758,26 @@ $L$vaesenclast10: movaps xmm14,XMMWORD[((96+128))+rsp] movaps xmm15,XMMWORD[((96+144))+rsp] lea rsi,[264+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_cbc_sha1_enc_avx: ALIGN 64 K_XX_XX: diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-sha256-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-sha256-x86_64.asm index f148890c00..b5d50c74db 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-sha256-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-sha256-x86_64.asm @@ -98,15 +98,23 @@ $L$SEH_begin_aesni_cbc_sha256_enc_xop: mov r9,QWORD[48+rsp] + $L$xop_shortcut: mov r10,QWORD[56+rsp] + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + sub rsp,288 and rsp,-64 @@ -122,7 +130,8 @@ $L$xop_shortcut: mov QWORD[((64+32))+rsp],r8 mov QWORD[((64+40))+rsp],r9 mov QWORD[((64+48))+rsp],r10 - mov QWORD[((64+56))+rsp],r11 + mov QWORD[120+rsp],rax + movaps XMMWORD[128+rsp],xmm6 movaps XMMWORD[144+rsp],xmm7 movaps XMMWORD[160+rsp],xmm8 @@ -1238,7 +1247,8 @@ DB 143,232,120,194,239,2 jb NEAR $L$loop_xop mov r8,QWORD[((64+32))+rsp] - mov rsi,QWORD[((64+56))+rsp] + mov rsi,QWORD[120+rsp] + vmovdqu XMMWORD[r8],xmm8 vzeroall movaps xmm6,XMMWORD[128+rsp] @@ -1251,17 +1261,25 @@ DB 143,232,120,194,239,2 movaps xmm13,XMMWORD[240+rsp] movaps xmm14,XMMWORD[256+rsp] movaps xmm15,XMMWORD[272+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_xop: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_cbc_sha256_enc_xop: ALIGN 64 @@ -1278,15 +1296,23 @@ $L$SEH_begin_aesni_cbc_sha256_enc_avx: mov r9,QWORD[48+rsp] + $L$avx_shortcut: mov r10,QWORD[56+rsp] + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + sub rsp,288 and rsp,-64 @@ -1302,7 +1328,8 @@ $L$avx_shortcut: mov QWORD[((64+32))+rsp],r8 mov QWORD[((64+40))+rsp],r9 mov QWORD[((64+48))+rsp],r10 - mov QWORD[((64+56))+rsp],r11 + mov QWORD[120+rsp],rax + movaps XMMWORD[128+rsp],xmm6 movaps XMMWORD[144+rsp],xmm7 movaps XMMWORD[160+rsp],xmm8 @@ -2449,7 +2476,8 @@ $L$avx_00_47: jb NEAR $L$loop_avx mov r8,QWORD[((64+32))+rsp] - mov rsi,QWORD[((64+56))+rsp] + mov rsi,QWORD[120+rsp] + vmovdqu XMMWORD[r8],xmm8 vzeroall movaps xmm6,XMMWORD[128+rsp] @@ -2462,17 +2490,25 @@ $L$avx_00_47: movaps xmm13,XMMWORD[240+rsp] movaps xmm14,XMMWORD[256+rsp] movaps xmm15,XMMWORD[272+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_cbc_sha256_enc_avx: ALIGN 64 @@ -2489,15 +2525,23 @@ $L$SEH_begin_aesni_cbc_sha256_enc_avx2: mov r9,QWORD[48+rsp] + $L$avx2_shortcut: mov r10,QWORD[56+rsp] + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + sub rsp,736 and rsp,-256*4 add rsp,448 @@ -2514,7 +2558,8 @@ $L$avx2_shortcut: mov QWORD[((64+32))+rsp],r8 mov QWORD[((64+40))+rsp],r9 mov QWORD[((64+48))+rsp],r10 - mov QWORD[((64+56))+rsp],r11 + mov QWORD[120+rsp],rax + movaps XMMWORD[128+rsp],xmm6 movaps XMMWORD[144+rsp],xmm7 movaps XMMWORD[160+rsp],xmm8 @@ -4086,7 +4131,8 @@ $L$ower_avx2: $L$done_avx2: lea rsp,[rbp] mov r8,QWORD[((64+32))+rsp] - mov rsi,QWORD[((64+56))+rsp] + mov rsi,QWORD[120+rsp] + vmovdqu XMMWORD[r8],xmm8 vzeroall movaps xmm6,XMMWORD[128+rsp] @@ -4099,17 +4145,25 @@ $L$done_avx2: movaps xmm13,XMMWORD[240+rsp] movaps xmm14,XMMWORD[256+rsp] movaps xmm15,XMMWORD[272+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_avx2: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_cbc_sha256_enc_avx2: ALIGN 32 @@ -4554,7 +4608,6 @@ $L$not_in_shaext: $L$not_in_avx2: mov rsi,rax mov rax,QWORD[((64+56))+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-x86_64.asm index 0f4790eead..3daf8476c3 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/aesni-x86_64.asm @@ -1100,6 +1100,7 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks: mov r8,QWORD[40+rsp] + cmp rdx,1 jne NEAR $L$ctr32_bulk @@ -1129,22 +1130,23 @@ DB 102,15,56,221,209 ALIGN 16 $L$ctr32_bulk: - lea rax,[rsp] + lea r11,[rsp] + push rbp + sub rsp,288 and rsp,-16 - movaps XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$ctr32_body: - lea rbp,[((-8))+rax] @@ -1153,7 +1155,7 @@ $L$ctr32_body: movdqu xmm0,XMMWORD[rcx] mov r8d,DWORD[12+r8] pxor xmm2,xmm0 - mov r11d,DWORD[12+rcx] + mov ebp,DWORD[12+rcx] movdqa XMMWORD[rsp],xmm2 bswap r8d movdqa xmm3,xmm2 @@ -1169,8 +1171,8 @@ $L$ctr32_body: lea rdx,[2+r8] bswap eax bswap edx - xor eax,r11d - xor edx,r11d + xor eax,ebp + xor edx,ebp DB 102,15,58,34,216,3 lea rax,[3+r8] movdqa XMMWORD[16+rsp],xmm3 @@ -1179,25 +1181,25 @@ DB 102,15,58,34,226,3 mov rdx,r10 lea r10,[4+r8] movdqa XMMWORD[32+rsp],xmm4 - xor eax,r11d + xor eax,ebp bswap r10d DB 102,15,58,34,232,3 - xor r10d,r11d + xor r10d,ebp movdqa XMMWORD[48+rsp],xmm5 lea r9,[5+r8] mov DWORD[((64+12))+rsp],r10d bswap r9d lea r10,[6+r8] mov eax,DWORD[240+rcx] - xor r9d,r11d + xor r9d,ebp bswap r10d mov DWORD[((80+12))+rsp],r9d - xor r10d,r11d + xor r10d,ebp lea r9,[7+r8] mov DWORD[((96+12))+rsp],r10d bswap r9d mov r10d,DWORD[((OPENSSL_ia32cap_P+4))] - xor r9d,r11d + xor r9d,ebp and r10d,71303168 mov DWORD[((112+12))+rsp],r9d @@ -1221,7 +1223,7 @@ ALIGN 16 $L$ctr32_6x: shl eax,4 mov r10d,48 - bswap r11d + bswap ebp lea rcx,[32+rax*1+rcx] sub r10,rax jmp NEAR $L$ctr32_loop6 @@ -1232,32 +1234,32 @@ $L$ctr32_loop6: movups xmm0,XMMWORD[((-48))+r10*1+rcx] DB 102,15,56,220,209 mov eax,r8d - xor eax,r11d + xor eax,ebp DB 102,15,56,220,217 DB 0x0f,0x38,0xf1,0x44,0x24,12 lea eax,[1+r8] DB 102,15,56,220,225 - xor eax,r11d + xor eax,ebp DB 0x0f,0x38,0xf1,0x44,0x24,28 DB 102,15,56,220,233 lea eax,[2+r8] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,241 DB 0x0f,0x38,0xf1,0x44,0x24,44 lea eax,[3+r8] DB 102,15,56,220,249 movups xmm1,XMMWORD[((-32))+r10*1+rcx] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,208 DB 0x0f,0x38,0xf1,0x44,0x24,60 lea eax,[4+r8] DB 102,15,56,220,216 - xor eax,r11d + xor eax,ebp DB 0x0f,0x38,0xf1,0x44,0x24,76 DB 102,15,56,220,224 lea eax,[5+r8] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,232 DB 0x0f,0x38,0xf1,0x44,0x24,92 mov rax,r10 @@ -1318,7 +1320,7 @@ DB 102,15,56,220,217 bswap r9d movups xmm0,XMMWORD[((32-128))+rcx] DB 102,15,56,220,225 - xor r9d,r11d + xor r9d,ebp nop DB 102,15,56,220,233 mov DWORD[((0+12))+rsp],r9d @@ -1331,7 +1333,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1345,7 +1347,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1359,7 +1361,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1373,7 +1375,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1387,7 +1389,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1401,7 +1403,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1416,7 +1418,7 @@ DB 102,68,15,56,220,201 DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 - xor r9d,r11d + xor r9d,ebp movdqu xmm10,XMMWORD[rdi] DB 102,15,56,220,232 mov DWORD[((112+12))+rsp],r9d @@ -1651,32 +1653,32 @@ DB 102,15,56,221,225 $L$ctr32_done: xorps xmm0,xmm0 - xor r11d,r11d + xor ebp,ebp pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -1685,12 +1687,15 @@ $L$ctr32_done: movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 movaps XMMWORD[112+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + $L$ctr32_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_ctr32_encrypt_blocks: global aesni_xts_encrypt @@ -1708,22 +1713,24 @@ $L$SEH_begin_aesni_xts_encrypt: mov r9,QWORD[48+rsp] - lea rax,[rsp] + + lea r11,[rsp] + push rbp + sub rsp,272 and rsp,-16 - movaps XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$xts_enc_body: - lea rbp,[((-8))+rax] movups xmm2,XMMWORD[r9] mov eax,DWORD[240+r8] mov r10d,DWORD[240+rcx] @@ -1739,7 +1746,7 @@ DB 102,15,56,220,209 jnz NEAR $L$oop_enc1_8 DB 102,15,56,221,209 movups xmm0,XMMWORD[rcx] - mov r11,rcx + mov rbp,rcx mov eax,r10d shl r10d,4 mov r9,rdx @@ -1795,9 +1802,9 @@ DB 102,15,56,221,209 jc NEAR $L$xts_enc_short mov eax,16+96 - lea rcx,[32+r10*1+r11] + lea rcx,[32+r10*1+rbp] sub rax,r10 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] mov r10,rax lea r8,[$L$xts_magic] jmp NEAR $L$xts_enc_grandloop @@ -1822,7 +1829,7 @@ DB 102,15,56,220,225 movdqa xmm9,XMMWORD[96+rsp] pxor xmm6,xmm14 DB 102,15,56,220,233 - movups xmm0,XMMWORD[32+r11] + movups xmm0,XMMWORD[32+rbp] lea rdi,[96+rdi] pxor xmm7,xmm8 @@ -1831,7 +1838,7 @@ DB 102,15,56,220,241 pxor xmm11,xmm9 movdqa XMMWORD[rsp],xmm10 DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+r11] + movups xmm1,XMMWORD[48+rbp] pxor xmm12,xmm9 DB 102,15,56,220,208 @@ -1846,7 +1853,7 @@ DB 102,15,56,220,232 movdqa XMMWORD[64+rsp],xmm14 DB 102,15,56,220,240 DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+r11] + movups xmm0,XMMWORD[64+rbp] movdqa XMMWORD[80+rsp],xmm8 pshufd xmm9,xmm15,0x5f jmp NEAR $L$xts_enc_loop6 @@ -1878,7 +1885,7 @@ DB 102,15,56,220,209 psrad xmm14,31 DB 102,15,56,220,217 pand xmm14,xmm8 - movups xmm10,XMMWORD[r11] + movups xmm10,XMMWORD[rbp] DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 @@ -1946,10 +1953,10 @@ DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 pxor xmm15,xmm0 - movups xmm0,XMMWORD[r11] + movups xmm0,XMMWORD[rbp] DB 102,15,56,220,241 DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] pxor xmm14,xmm15 DB 102,15,56,221,84,36,0 @@ -1976,7 +1983,7 @@ DB 102,15,56,221,124,36,80 mov eax,16+96 sub eax,r10d - mov rcx,r11 + mov rcx,rbp shr eax,4 $L$xts_enc_short: @@ -2132,7 +2139,7 @@ $L$xts_enc_steal: jnz NEAR $L$xts_enc_steal sub rsi,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[((-16))+rsi] @@ -2158,26 +2165,26 @@ $L$xts_enc_ret: pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -2185,12 +2192,15 @@ $L$xts_enc_ret: movaps XMMWORD[64+rsp],xmm0 movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + $L$xts_enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_xts_encrypt: global aesni_xts_decrypt @@ -2208,22 +2218,24 @@ $L$SEH_begin_aesni_xts_decrypt: mov r9,QWORD[48+rsp] - lea rax,[rsp] + + lea r11,[rsp] + push rbp + sub rsp,272 and rsp,-16 - movaps XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$xts_dec_body: - lea rbp,[((-8))+rax] movups xmm2,XMMWORD[r9] mov eax,DWORD[240+r8] mov r10d,DWORD[240+rcx] @@ -2245,7 +2257,7 @@ DB 102,15,56,221,209 sub rdx,rax movups xmm0,XMMWORD[rcx] - mov r11,rcx + mov rbp,rcx mov eax,r10d shl r10d,4 mov r9,rdx @@ -2301,9 +2313,9 @@ DB 102,15,56,221,209 jc NEAR $L$xts_dec_short mov eax,16+96 - lea rcx,[32+r10*1+r11] + lea rcx,[32+r10*1+rbp] sub rax,r10 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] mov r10,rax lea r8,[$L$xts_magic] jmp NEAR $L$xts_dec_grandloop @@ -2328,7 +2340,7 @@ DB 102,15,56,222,225 movdqa xmm9,XMMWORD[96+rsp] pxor xmm6,xmm14 DB 102,15,56,222,233 - movups xmm0,XMMWORD[32+r11] + movups xmm0,XMMWORD[32+rbp] lea rdi,[96+rdi] pxor xmm7,xmm8 @@ -2337,7 +2349,7 @@ DB 102,15,56,222,241 pxor xmm11,xmm9 movdqa XMMWORD[rsp],xmm10 DB 102,15,56,222,249 - movups xmm1,XMMWORD[48+r11] + movups xmm1,XMMWORD[48+rbp] pxor xmm12,xmm9 DB 102,15,56,222,208 @@ -2352,7 +2364,7 @@ DB 102,15,56,222,232 movdqa XMMWORD[64+rsp],xmm14 DB 102,15,56,222,240 DB 102,15,56,222,248 - movups xmm0,XMMWORD[64+r11] + movups xmm0,XMMWORD[64+rbp] movdqa XMMWORD[80+rsp],xmm8 pshufd xmm9,xmm15,0x5f jmp NEAR $L$xts_dec_loop6 @@ -2384,7 +2396,7 @@ DB 102,15,56,222,209 psrad xmm14,31 DB 102,15,56,222,217 pand xmm14,xmm8 - movups xmm10,XMMWORD[r11] + movups xmm10,XMMWORD[rbp] DB 102,15,56,222,225 DB 102,15,56,222,233 DB 102,15,56,222,241 @@ -2452,10 +2464,10 @@ DB 102,15,56,222,217 DB 102,15,56,222,225 DB 102,15,56,222,233 pxor xmm15,xmm0 - movups xmm0,XMMWORD[r11] + movups xmm0,XMMWORD[rbp] DB 102,15,56,222,241 DB 102,15,56,222,249 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] pxor xmm14,xmm15 DB 102,15,56,223,84,36,0 @@ -2482,7 +2494,7 @@ DB 102,15,56,223,124,36,80 mov eax,16+96 sub eax,r10d - mov rcx,r11 + mov rcx,rbp shr eax,4 $L$xts_dec_short: @@ -2639,7 +2651,7 @@ $L$xts_dec_done: jz NEAR $L$xts_dec_ret $L$xts_dec_done2: mov rdx,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[rdi] @@ -2669,7 +2681,7 @@ $L$xts_dec_steal: jnz NEAR $L$xts_dec_steal sub rsi,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[rsi] @@ -2695,26 +2707,26 @@ $L$xts_dec_ret: pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -2722,12 +2734,15 @@ $L$xts_dec_ret: movaps XMMWORD[64+rsp],xmm0 movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + $L$xts_dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_xts_decrypt: global aesni_ocb_encrypt @@ -2745,12 +2760,18 @@ $L$SEH_begin_aesni_ocb_encrypt: mov r9,QWORD[48+rsp] + lea rax,[rsp] push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -2950,16 +2971,23 @@ $L$ocb_enc_done: movaps XMMWORD[144+rsp],xmm0 lea rax,[((160+40))+rsp] $L$ocb_enc_pop: - lea rsp,[160+rsp] - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + $L$ocb_enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_ocb_encrypt: @@ -3184,12 +3212,18 @@ $L$SEH_begin_aesni_ocb_decrypt: mov r9,QWORD[48+rsp] + lea rax,[rsp] push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -3411,16 +3445,23 @@ $L$ocb_dec_done: movaps XMMWORD[144+rsp],xmm0 lea rax,[((160+40))+rsp] $L$ocb_dec_pop: - lea rsp,[160+rsp] - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + $L$ocb_dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_ocb_decrypt: @@ -3633,6 +3674,7 @@ $L$SEH_begin_aesni_cbc_encrypt: mov r9,QWORD[48+rsp] + test rdx,rdx jz NEAR $L$cbc_ret @@ -3725,8 +3767,10 @@ DB 102,15,56,223,209 jmp NEAR $L$cbc_ret ALIGN 16 $L$cbc_decrypt_bulk: - lea rax,[rsp] + lea r11,[rsp] + push rbp + sub rsp,176 and rsp,-16 movaps XMMWORD[16+rsp],xmm6 @@ -3740,7 +3784,7 @@ $L$cbc_decrypt_bulk: movaps XMMWORD[144+rsp],xmm14 movaps XMMWORD[160+rsp],xmm15 $L$cbc_decrypt_body: - lea rbp,[((-8))+rax] + mov rbp,rcx movups xmm10,XMMWORD[r8] mov eax,r10d cmp rdx,0x50 @@ -3780,7 +3824,7 @@ $L$cbc_dec_loop8_enter: pxor xmm3,xmm0 movups xmm1,XMMWORD[((16-112))+rcx] pxor xmm4,xmm0 - xor r11,r11 + mov rbp,-1 cmp rdx,0x70 pxor xmm5,xmm0 pxor xmm6,xmm0 @@ -3796,10 +3840,10 @@ DB 102,15,56,222,233 DB 102,15,56,222,241 DB 102,15,56,222,249 DB 102,68,15,56,222,193 - setnc r11b - shl r11,7 + adc rbp,0 + and rbp,128 DB 102,68,15,56,222,201 - add r11,rdi + add rbp,rdi movups xmm1,XMMWORD[((48-112))+rcx] DB 102,15,56,222,208 DB 102,15,56,222,216 @@ -3937,18 +3981,18 @@ DB 102,65,15,56,223,219 movdqu xmm0,XMMWORD[112+rdi] DB 102,65,15,56,223,228 lea rdi,[128+rdi] - movdqu xmm11,XMMWORD[r11] + movdqu xmm11,XMMWORD[rbp] DB 102,65,15,56,223,237 DB 102,65,15,56,223,246 - movdqu xmm12,XMMWORD[16+r11] - movdqu xmm13,XMMWORD[32+r11] + movdqu xmm12,XMMWORD[16+rbp] + movdqu xmm13,XMMWORD[32+rbp] DB 102,65,15,56,223,255 DB 102,68,15,56,223,193 - movdqu xmm14,XMMWORD[48+r11] - movdqu xmm15,XMMWORD[64+r11] + movdqu xmm14,XMMWORD[48+rbp] + movdqu xmm15,XMMWORD[64+rbp] DB 102,69,15,56,223,202 movdqa xmm10,xmm0 - movdqu xmm1,XMMWORD[80+r11] + movdqu xmm1,XMMWORD[80+rbp] movups xmm0,XMMWORD[((-112))+rcx] movups XMMWORD[rsi],xmm2 @@ -4067,7 +4111,7 @@ $L$cbc_dec_loop6_enter: pxor xmm5,xmm13 movdqu XMMWORD[32+rsi],xmm4 pxor xmm6,xmm14 - mov rcx,r11 + mov rcx,rbp movdqu XMMWORD[48+rsi],xmm5 pxor xmm7,xmm15 mov eax,r10d @@ -4236,18 +4280,23 @@ $L$cbc_dec_ret: movaps XMMWORD[144+rsp],xmm0 movaps xmm15,XMMWORD[160+rsp] movaps XMMWORD[160+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + $L$cbc_ret: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_cbc_encrypt: global aesni_set_decrypt_key ALIGN 16 aesni_set_decrypt_key: + DB 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key shl edx,4 test eax,eax @@ -4280,7 +4329,9 @@ DB 102,15,56,219,192 pxor xmm0,xmm0 $L$dec_key_ret: add rsp,8 + DB 0F3h,0C3h ;repret + $L$SEH_end_set_decrypt_key: global aesni_set_encrypt_key @@ -4288,7 +4339,9 @@ global aesni_set_encrypt_key ALIGN 16 aesni_set_encrypt_key: __aesni_set_encrypt_key: + DB 0x48,0x83,0xEC,0x08 + mov rax,-1 test rcx,rcx jz NEAR $L$enc_key_ret @@ -4581,7 +4634,9 @@ $L$enc_key_ret: pxor xmm4,xmm4 pxor xmm5,xmm5 add rsp,8 + DB 0F3h,0C3h ;repret + $L$SEH_end_set_encrypt_key: ALIGN 16 @@ -4753,13 +4808,16 @@ ctr_xts_se_handler: cmp rbx,r10 jae NEAR $L$common_seh_tail - mov rax,QWORD[160+r8] - lea rsi,[((-160))+rax] + mov rax,QWORD[208+r8] + + lea rsi,[((-168))+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc - jmp NEAR $L$common_rbp_tail + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp + jmp NEAR $L$common_seh_tail @@ -4841,9 +4899,13 @@ cbc_se_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail + mov rax,QWORD[120+r8] + lea r10,[$L$cbc_decrypt_body] cmp rbx,r10 - jb NEAR $L$restore_cbc_rax + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] lea r10,[$L$cbc_ret] cmp rbx,r10 @@ -4854,15 +4916,10 @@ cbc_se_handler: mov ecx,20 DD 0xa548f3fc -$L$common_rbp_tail: - mov rax,QWORD[160+r8] - mov rbp,QWORD[rax] - lea rax,[8+rax] - mov QWORD[160+r8],rbp - jmp NEAR $L$common_seh_tail + mov rax,QWORD[208+r8] -$L$restore_cbc_rax: - mov rax,QWORD[120+r8] + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp $L$common_seh_tail: mov rdi,QWORD[8+rax] diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/bsaes-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/bsaes-x86_64.asm index 6d75248d1f..9ea8253d7c 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/bsaes-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/aes/bsaes-x86_64.asm @@ -1072,6 +1072,7 @@ global bsaes_cbc_encrypt ALIGN 16 bsaes_cbc_encrypt: + mov r11d,DWORD[48+rsp] cmp r11d,0 jne NEAR asm_AES_cbc_encrypt @@ -1081,12 +1082,19 @@ bsaes_cbc_encrypt: mov rax,rsp $L$cbc_dec_prologue: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-72))+rsp] + mov r10,QWORD[160+rsp] lea rsp,[((-160))+rsp] movaps XMMWORD[64+rsp],xmm6 @@ -1101,6 +1109,7 @@ $L$cbc_dec_prologue: movaps XMMWORD[208+rsp],xmm15 $L$cbc_dec_body: mov rbp,rsp + mov eax,DWORD[240+r9] mov r12,rcx mov r13,rdx @@ -1319,7 +1328,8 @@ $L$cbc_dec_bzero: cmp rbp,rax ja NEAR $L$cbc_dec_bzero - lea rsp,[rbp] + lea rax,[120+rbp] + movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -1330,32 +1340,48 @@ $L$cbc_dec_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$cbc_dec_tail: + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbx,QWORD[((-16))+rax] + + mov rbp,QWORD[((-8))+rax] + + lea rsp,[rax] + $L$cbc_dec_epilogue: DB 0F3h,0C3h ;repret + global bsaes_ctr32_encrypt_blocks ALIGN 16 bsaes_ctr32_encrypt_blocks: + mov rax,rsp $L$ctr_enc_prologue: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-72))+rsp] + mov r10,QWORD[160+rsp] lea rsp,[((-160))+rsp] movaps XMMWORD[64+rsp],xmm6 @@ -1370,6 +1396,7 @@ $L$ctr_enc_prologue: movaps XMMWORD[208+rsp],xmm15 $L$ctr_enc_body: mov rbp,rsp + movdqu xmm0,XMMWORD[r10] mov eax,DWORD[240+r9] mov r12,rcx @@ -1543,7 +1570,8 @@ $L$ctr_enc_bzero: cmp rbp,rax ja NEAR $L$ctr_enc_bzero - lea rsp,[rbp] + lea rax,[120+rbp] + movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -1554,31 +1582,47 @@ $L$ctr_enc_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$ctr_enc_tail: + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbx,QWORD[((-16))+rax] + + mov rbp,QWORD[((-8))+rax] + + lea rsp,[rax] + $L$ctr_enc_epilogue: DB 0F3h,0C3h ;repret + global bsaes_xts_encrypt ALIGN 16 bsaes_xts_encrypt: + mov rax,rsp $L$xts_enc_prologue: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-72))+rsp] + mov r10,QWORD[160+rsp] mov r11,QWORD[168+rsp] lea rsp,[((-160))+rsp] @@ -1594,6 +1638,7 @@ $L$xts_enc_prologue: movaps XMMWORD[208+rsp],xmm15 $L$xts_enc_body: mov rbp,rsp + mov r12,rcx mov r13,rdx mov r14,r8 @@ -2019,7 +2064,8 @@ $L$xts_enc_bzero: cmp rbp,rax ja NEAR $L$xts_enc_bzero - lea rsp,[rbp] + lea rax,[120+rbp] + movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -2030,32 +2076,48 @@ $L$xts_enc_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$xts_enc_tail: + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbx,QWORD[((-16))+rax] + + mov rbp,QWORD[((-8))+rax] + + lea rsp,[rax] + $L$xts_enc_epilogue: DB 0F3h,0C3h ;repret + global bsaes_xts_decrypt ALIGN 16 bsaes_xts_decrypt: + mov rax,rsp $L$xts_dec_prologue: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-72))+rsp] + mov r10,QWORD[160+rsp] mov r11,QWORD[168+rsp] lea rsp,[((-160))+rsp] @@ -2522,7 +2584,8 @@ $L$xts_dec_bzero: cmp rbp,rax ja NEAR $L$xts_dec_bzero - lea rsp,[rbp] + lea rax,[120+rbp] + movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -2533,19 +2596,27 @@ $L$xts_dec_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$xts_dec_tail: + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbx,QWORD[((-16))+rax] + + mov rbp,QWORD[((-8))+rax] + + lea rsp,[rax] + $L$xts_dec_epilogue: DB 0F3h,0C3h ;repret + ALIGN 64 _bsaes_const: $L$M0ISR: @@ -2628,30 +2699,33 @@ se_handler: mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] + jbe NEAR $L$in_prologue mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$in_prologue + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_tail + mov rax,QWORD[160+r8] lea rsi,[64+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc - lea rax,[160+rax] - - mov rbp,QWORD[112+rax] - mov rbx,QWORD[104+rax] - mov r12,QWORD[96+rax] - mov r13,QWORD[88+rax] - mov r14,QWORD[80+rax] - mov r15,QWORD[72+rax] - lea rax,[120+rax] + lea rax,[((160+120))+rax] + +$L$in_tail: + mov rbp,QWORD[((-48))+rax] + mov rbx,QWORD[((-40))+rax] + mov r12,QWORD[((-32))+rax] + mov r13,QWORD[((-24))+rax] + mov r14,QWORD[((-16))+rax] + mov r15,QWORD[((-8))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 @@ -2719,15 +2793,23 @@ $L$cbc_dec_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$cbc_dec_body wrt ..imagebase,$L$cbc_dec_epilogue wrt ..imagebase + DD $L$cbc_dec_tail wrt ..imagebase + DD 0 $L$ctr_enc_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase + DD $L$ctr_enc_tail wrt ..imagebase + DD 0 $L$xts_enc_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase + DD $L$xts_enc_tail wrt ..imagebase + DD 0 $L$xts_dec_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase + DD $L$xts_dec_tail wrt ..imagebase + DD 0 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/rsaz-avx2.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/rsaz-avx2.asm index 86d26158d0..02a518607d 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/rsaz-avx2.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/rsaz-avx2.asm @@ -20,13 +20,21 @@ $L$SEH_begin_rsaz_1024_sqr_avx2: mov r8,QWORD[40+rsp] + lea rax,[rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + vzeroupper lea rsp,[((-168))+rsp] vmovaps XMMWORD[(-216)+rax],xmm6 @@ -41,6 +49,7 @@ $L$SEH_begin_rsaz_1024_sqr_avx2: vmovaps XMMWORD[(-72)+rax],xmm15 $L$sqr_1024_body: mov rbp,rax + mov r13,rdx sub rsp,832 mov r15,r13 @@ -653,6 +662,8 @@ DB 0x67 vzeroall mov rax,rbp + +$L$sqr_1024_in_tail: movaps xmm6,XMMWORD[((-216))+rax] movaps xmm7,XMMWORD[((-200))+rax] movaps xmm8,XMMWORD[((-184))+rax] @@ -664,16 +675,24 @@ DB 0x67 movaps xmm14,XMMWORD[((-88))+rax] movaps xmm15,XMMWORD[((-72))+rax] mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$sqr_1024_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_rsaz_1024_sqr_avx2: global rsaz_1024_mul_avx2 @@ -690,13 +709,21 @@ $L$SEH_begin_rsaz_1024_mul_avx2: mov r8,QWORD[40+rsp] + lea rax,[rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + vzeroupper lea rsp,[((-168))+rsp] vmovaps XMMWORD[(-216)+rax],xmm6 @@ -711,6 +738,7 @@ $L$SEH_begin_rsaz_1024_mul_avx2: vmovaps XMMWORD[(-72)+rax],xmm15 $L$mul_1024_body: mov rbp,rax + vzeroall mov r13,rdx sub rsp,64 @@ -1226,6 +1254,8 @@ $L$oop_mul_1024: vzeroupper mov rax,rbp + +$L$mul_1024_in_tail: movaps xmm6,XMMWORD[((-216))+rax] movaps xmm7,XMMWORD[((-200))+rax] movaps xmm8,XMMWORD[((-184))+rax] @@ -1237,16 +1267,24 @@ $L$oop_mul_1024: movaps xmm14,XMMWORD[((-88))+rax] movaps xmm15,XMMWORD[((-72))+rax] mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$mul_1024_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_rsaz_1024_mul_avx2: global rsaz_1024_red2norm_avx2 @@ -1631,8 +1669,10 @@ global rsaz_1024_gather5_avx2 ALIGN 32 rsaz_1024_gather5_avx2: + vzeroupper mov r11,rsp + lea rax,[((-136))+rsp] $L$SEH_begin_rsaz_1024_gather5: @@ -1764,10 +1804,12 @@ $L$oop_gather_1024: movaps xmm13,XMMWORD[((-56))+r11] movaps xmm14,XMMWORD[((-40))+r11] movaps xmm15,XMMWORD[((-24))+r11] -$L$SEH_end_rsaz_1024_gather5: lea rsp,[r11] + DB 0F3h,0C3h ;repret +$L$SEH_end_rsaz_1024_gather5: + EXTERN OPENSSL_ia32cap_P global rsaz_avx2_eligible @@ -1822,14 +1864,17 @@ rsaz_se_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail - mov rax,QWORD[152+r8] - mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail - mov rax,QWORD[160+r8] + mov rbp,QWORD[160+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + cmovc rax,rbp mov r15,QWORD[((-48))+rax] mov r14,QWORD[((-40))+rax] @@ -1907,11 +1952,13 @@ ALIGN 8 $L$SEH_info_rsaz_1024_sqr_avx2: DB 9,0,0,0 DD rsaz_se_handler wrt ..imagebase - DD $L$sqr_1024_body wrt ..imagebase,$L$sqr_1024_epilogue wrt ..imagebase + DD $L$sqr_1024_body wrt ..imagebase,$L$sqr_1024_epilogue wrt ..imagebase,$L$sqr_1024_in_tail wrt ..imagebase + DD 0 $L$SEH_info_rsaz_1024_mul_avx2: DB 9,0,0,0 DD rsaz_se_handler wrt ..imagebase - DD $L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase + DD $L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase,$L$mul_1024_in_tail wrt ..imagebase + DD 0 $L$SEH_info_rsaz_1024_gather5: DB 0x01,0x36,0x17,0x0b DB 0x36,0xf8,0x09,0x00 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/rsaz-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/rsaz-x86_64.asm index b6384fc421..603a8d17b8 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/rsaz-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/rsaz-x86_64.asm @@ -22,14 +22,22 @@ $L$SEH_begin_rsaz_512_sqr: mov r8,QWORD[40+rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,128+24 + $L$sqr_body: mov rbp,rdx mov rdx,QWORD[rsi] @@ -674,17 +682,26 @@ DB 102,72,15,126,205 $L$sqr_tail: lea rax,[((128+24+48))+rsp] + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$sqr_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_rsaz_512_sqr: global rsaz_512_mul @@ -701,14 +718,22 @@ $L$SEH_begin_rsaz_512_mul: mov r8,QWORD[40+rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,128+24 + $L$mul_body: DB 102,72,15,110,199 DB 102,72,15,110,201 @@ -770,17 +795,26 @@ $L$mul_tail: call __rsaz_512_subtract lea rax,[((128+24+48))+rsp] + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_rsaz_512_mul: global rsaz_512_mul_gather4 @@ -798,14 +832,22 @@ $L$SEH_begin_rsaz_512_mul_gather4: mov r9,QWORD[48+rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,328 + movaps XMMWORD[160+rsp],xmm6 movaps XMMWORD[176+rsp],xmm7 movaps XMMWORD[192+rsp],xmm8 @@ -1215,17 +1257,26 @@ $L$mul_gather_tail: movaps xmm14,XMMWORD[((288-200))+rax] movaps xmm15,XMMWORD[((304-200))+rax] lea rax,[176+rax] + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$mul_gather4_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_rsaz_512_mul_gather4: global rsaz_512_mul_scatter4 @@ -1243,15 +1294,23 @@ $L$SEH_begin_rsaz_512_mul_scatter4: mov r9,QWORD[48+rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r9d,r9d sub rsp,128+24 + $L$mul_scatter4_body: lea r8,[r9*8+r8] DB 102,72,15,110,199 @@ -1326,17 +1385,26 @@ DB 102,72,15,126,214 mov QWORD[896+rsi],r15 lea rax,[((128+24+48))+rsp] + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$mul_scatter4_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_rsaz_512_mul_scatter4: global rsaz_512_mul_by_one @@ -1352,14 +1420,22 @@ $L$SEH_begin_rsaz_512_mul_by_one: mov rcx,r9 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,128+24 + $L$mul_by_one_body: mov eax,DWORD[((OPENSSL_ia32cap_P+8))] mov rbp,rdx @@ -1402,17 +1478,26 @@ $L$by_one_tail: mov QWORD[56+rdi],r15 lea rax,[((128+24+48))+rsp] + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$mul_by_one_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_rsaz_512_mul_by_one: ALIGN 32 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-gf2m.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-gf2m.asm index 053a3f86a6..8123fd11b6 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-gf2m.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-gf2m.asm @@ -8,7 +8,9 @@ section .text code align=64 ALIGN 16 _mul_1x1: + sub rsp,128+8 + mov r9,-1 lea rsi,[rax*1+rax] shr r9,3 @@ -198,16 +200,20 @@ DB 102,72,15,126,199 xor rdx,rdi add rsp,128+8 + DB 0F3h,0C3h ;repret $L$end_mul_1x1: + EXTERN OPENSSL_ia32cap_P global bn_GF2m_mul_2x2 ALIGN 16 bn_GF2m_mul_2x2: - mov rax,QWORD[OPENSSL_ia32cap_P] - bt rax,33 + + mov rax,rsp + mov r10,QWORD[OPENSSL_ia32cap_P] + bt r10,33 jnc NEAR $L$vanilla_mul_2x2 DB 102,72,15,110,194 @@ -235,14 +241,20 @@ DB 102,15,58,68,229,0 ALIGN 16 $L$vanilla_mul_2x2: lea rsp,[((-136))+rsp] + mov r10,QWORD[176+rsp] mov QWORD[120+rsp],rdi mov QWORD[128+rsp],rsi mov QWORD[80+rsp],r14 + mov QWORD[88+rsp],r13 + mov QWORD[96+rsp],r12 + mov QWORD[104+rsp],rbp + mov QWORD[112+rsp],rbx + $L$body_mul_2x2: mov QWORD[32+rsp],rcx mov QWORD[40+rsp],rdx @@ -287,16 +299,24 @@ $L$body_mul_2x2: mov QWORD[8+rbp],rax mov r14,QWORD[80+rsp] + mov r13,QWORD[88+rsp] + mov r12,QWORD[96+rsp] + mov rbp,QWORD[104+rsp] + mov rbx,QWORD[112+rsp] + mov rdi,QWORD[120+rsp] mov rsi,QWORD[128+rsp] lea rsp,[136+rsp] + +$L$epilogue_mul_2x2: DB 0F3h,0C3h ;repret $L$end_mul_2x2: + DB 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105 DB 99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 @@ -319,13 +339,19 @@ se_handler: pushfq sub rsp,64 - mov rax,QWORD[152+r8] + mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] lea r10,[$L$body_mul_2x2] cmp rbx,r10 jb NEAR $L$in_prologue + mov rax,QWORD[152+r8] + + lea r10,[$L$epilogue_mul_2x2] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov r14,QWORD[80+rax] mov r13,QWORD[88+rax] mov r12,QWORD[96+rax] @@ -342,8 +368,9 @@ se_handler: mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 -$L$in_prologue: lea rax,[136+rax] + +$L$in_prologue: mov QWORD[152+r8],rax mov rdi,QWORD[40+r9] diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-mont.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-mont.asm index 26908c313b..81c205803e 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-mont.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-mont.asm @@ -23,8 +23,10 @@ $L$SEH_begin_bn_mul_mont: mov r9,QWORD[48+rsp] + mov r9d,r9d mov rax,rsp + test r9d,3 jnz NEAR $L$mul_enter cmp r9d,8 @@ -39,12 +41,18 @@ $L$SEH_begin_bn_mul_mont: ALIGN 16 $L$mul_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + neg r9 mov r11,rsp lea r10,[((-16))+r9*8+rsp] @@ -76,6 +84,7 @@ $L$mul_page_walk: $L$mul_page_walk_done: mov QWORD[8+r9*8+rsp],rax + $L$mul_body: mov r12,rdx mov r8,QWORD[r8] @@ -243,18 +252,27 @@ $L$copy: jnz NEAR $L$copy mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul_mont: ALIGN 16 @@ -271,19 +289,27 @@ $L$SEH_begin_bn_mul4x_mont: mov r9,QWORD[48+rsp] + mov r9d,r9d mov rax,rsp + $L$mul4x_enter: and r11d,0x80100 cmp r11d,0x80100 je NEAR $L$mulx4x_enter push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + neg r9 mov r11,rsp lea r10,[((-32))+r9*8+rsp] @@ -306,6 +332,7 @@ $L$mul4x_page_walk: $L$mul4x_page_walk_done: mov QWORD[8+r9*8+rsp],rax + $L$mul4x_body: mov QWORD[16+r9*8+rsp],rdi mov r12,rdx @@ -673,18 +700,27 @@ $L$copy4x: dec r15 jnz NEAR $L$copy4x mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mul4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul4x_mont: EXTERN bn_sqrx8x_internal EXTERN bn_sqr8x_internal @@ -704,14 +740,22 @@ $L$SEH_begin_bn_sqr8x_mont: mov r9,QWORD[48+rsp] + mov rax,rsp + $L$sqr8x_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$sqr8x_prologue: mov r10d,r9d @@ -767,6 +811,7 @@ $L$sqr8x_page_walk_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$sqr8x_body: DB 102,72,15,110,209 @@ -832,6 +877,7 @@ DB 102,72,15,110,200 pxor xmm0,xmm0 pshufd xmm1,xmm1,0 mov rsi,QWORD[40+rsp] + jmp NEAR $L$sqr8x_cond_copy ALIGN 32 @@ -861,16 +907,24 @@ $L$sqr8x_cond_copy: mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$sqr8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_sqr8x_mont: ALIGN 32 @@ -887,14 +941,22 @@ $L$SEH_begin_bn_mulx4x_mont: mov r9,QWORD[48+rsp] + mov rax,rsp + $L$mulx4x_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$mulx4x_prologue: shl r9d,3 @@ -940,6 +1002,7 @@ $L$mulx4x_page_walk_done: mov QWORD[24+rsp],r8 mov QWORD[32+rsp],rdi mov QWORD[40+rsp],rax + mov QWORD[48+rsp],r9 jmp NEAR $L$mulx4x_body @@ -1184,6 +1247,7 @@ DB 102,73,15,110,207 pxor xmm0,xmm0 pshufd xmm1,xmm1,0 mov rsi,QWORD[40+rsp] + jmp NEAR $L$mulx4x_cond_copy ALIGN 32 @@ -1213,16 +1277,24 @@ $L$mulx4x_cond_copy: mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mulx4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mulx4x_mont: DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-mont5.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-mont5.asm index de93630c8f..d1855c5acf 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-mont5.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/bn/x86_64-mont5.asm @@ -23,8 +23,10 @@ $L$SEH_begin_bn_mul_mont_gather5: mov r9,QWORD[48+rsp] + mov r9d,r9d mov rax,rsp + test r9d,7 jnz NEAR $L$mul_enter mov r11d,DWORD[((OPENSSL_ia32cap_P+8))] @@ -34,12 +36,18 @@ ALIGN 16 $L$mul_enter: movd xmm5,DWORD[56+rsp] push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + neg r9 mov r11,rsp lea r10,[((-280))+r9*8+rsp] @@ -71,6 +79,7 @@ $L$mul_page_walk_done: lea r10,[$L$inc] mov QWORD[8+r9*8+rsp],rax + $L$mul_body: lea r12,[128+rdx] @@ -428,19 +437,28 @@ $L$copy: jnz NEAR $L$copy mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul_mont_gather5: ALIGN 32 @@ -457,18 +475,26 @@ $L$SEH_begin_bn_mul4x_mont_gather5: mov r9,QWORD[48+rsp] + DB 0x67 mov rax,rsp + $L$mul4x_enter: and r11d,0x80108 cmp r11d,0x80108 je NEAR $L$mulx4x_enter push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$mul4x_prologue: DB 0x67 @@ -524,24 +550,34 @@ $L$mul4x_page_walk_done: neg r9 mov QWORD[40+rsp],rax + $L$mul4x_body: call mul4x_internal mov rsi,QWORD[40+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mul4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul4x_mont_gather5: @@ -1085,17 +1121,25 @@ $L$SEH_begin_bn_power5: mov r9,QWORD[48+rsp] + mov rax,rsp + mov r11d,DWORD[((OPENSSL_ia32cap_P+8))] and r11d,0x80108 cmp r11d,0x80108 je NEAR $L$powerx5_enter push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$power5_prologue: shl r9d,3 @@ -1160,6 +1204,7 @@ $L$pwr_page_walk_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$power5_body: DB 102,72,15,110,207 DB 102,72,15,110,209 @@ -1186,18 +1231,27 @@ DB 102,72,15,126,226 call mul4x_internal mov rsi,QWORD[40+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$power5_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_power5: global bn_sqr8x_internal @@ -2060,14 +2114,22 @@ $L$SEH_begin_bn_from_mont8x: mov r9,QWORD[48+rsp] + DB 0x67 mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$from_prologue: shl r9d,3 @@ -2132,6 +2194,7 @@ $L$from_page_walk_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$from_body: mov r11,r9 lea rax,[48+rsp] @@ -2173,7 +2236,6 @@ DB 102,73,15,110,218 pxor xmm0,xmm0 lea rax,[48+rsp] - mov rsi,QWORD[40+rsp] jmp NEAR $L$from_mont_zero ALIGN 32 @@ -2183,11 +2245,12 @@ $L$from_mont_nox: pxor xmm0,xmm0 lea rax,[48+rsp] - mov rsi,QWORD[40+rsp] jmp NEAR $L$from_mont_zero ALIGN 32 $L$from_mont_zero: + mov rsi,QWORD[40+rsp] + movdqa XMMWORD[rax],xmm0 movdqa XMMWORD[16+rax],xmm0 movdqa XMMWORD[32+rax],xmm0 @@ -2198,16 +2261,24 @@ $L$from_mont_zero: mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$from_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_from_mont8x: ALIGN 32 @@ -2224,14 +2295,22 @@ $L$SEH_begin_bn_mulx4x_mont_gather5: mov r9,QWORD[48+rsp] + mov rax,rsp + $L$mulx4x_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$mulx4x_prologue: shl r9d,3 @@ -2297,23 +2376,33 @@ $L$mulx4x_page_walk_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$mulx4x_body: call mulx4x_internal mov rsi,QWORD[40+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mulx4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mulx4x_mont_gather5: @@ -2753,14 +2842,22 @@ $L$SEH_begin_bn_powerx5: mov r9,QWORD[48+rsp] + mov rax,rsp + $L$powerx5_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$powerx5_prologue: shl r9d,3 @@ -2832,6 +2929,7 @@ DB 102,73,15,110,218 DB 102,72,15,110,226 mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$powerx5_body: call __bn_sqrx8x_internal @@ -2854,19 +2952,28 @@ DB 102,72,15,126,226 call mulx4x_internal mov rsi,QWORD[40+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$powerx5_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_powerx5: global bn_sqrx8x_internal diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/buildinf.h b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/buildinf.h index c0b70ab335..3cfa6a4a22 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/buildinf.h +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/buildinf.h @@ -1,12 +1,23 @@ -/* auto-generated by util/mkbuildinf.pl for crypto/cversion.c */ -#define CFLAGS cflags /* - * Generate CFLAGS as an array of individual characters. This is a + * WARNING: do not edit! + * Generated by util/mkbuildinf.pl + * + * Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#define PLATFORM "platform: " +#define DATE "built on: Thu Nov 22 19:36:01 2018 UTC" + +/* + * Generate compiler_flags as an array of individual characters. This is a * workaround for the situation where CFLAGS gets too long for a C90 string * literal */ -static const char cflags[] = { - 'c','o','m','p','i','l','e','r',':',' ','c','c','\0' +static const char compiler_flags[] = { + 'c','o','m','p','i','l','e','r',':',' ','c','c',' ',' ','\0' }; -#define PLATFORM "platform: " -#define DATE "built on: Tue Nov 20 09:39:06 2018" diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/camellia/cmll-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/camellia/cmll-x86_64.asm index cb91061570..7a0f351e51 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/camellia/cmll-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/camellia/cmll-x86_64.asm @@ -32,11 +32,17 @@ $L$SEH_begin_Camellia_EncryptBlock_Rounds: mov rcx,r9 + push rbx + push rbp + push r13 + push r14 + push r15 + $L$enc_prologue: @@ -68,15 +74,22 @@ $L$enc_prologue: mov DWORD[12+r13],r11d mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov rbp,QWORD[24+rsp] + mov rbx,QWORD[32+rsp] + lea rsp,[40+rsp] + $L$enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_Camellia_EncryptBlock_Rounds: @@ -313,11 +326,17 @@ $L$SEH_begin_Camellia_DecryptBlock_Rounds: mov rcx,r9 + push rbx + push rbp + push r13 + push r14 + push r15 + $L$dec_prologue: @@ -349,15 +368,22 @@ $L$dec_prologue: mov DWORD[12+r13],r11d mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov rbp,QWORD[24+rsp] + mov rbx,QWORD[32+rsp] + lea rsp,[40+rsp] + $L$dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_Camellia_DecryptBlock_Rounds: @@ -580,11 +606,17 @@ $L$SEH_begin_Camellia_Ekeygen: mov rdx,r8 + push rbx + push rbp + push r13 + push r14 + push r15 + $L$key_prologue: mov r15d,edi @@ -1112,15 +1144,22 @@ $L$2nd256: mov eax,4 $L$done: mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov rbp,QWORD[24+rsp] + mov rbx,QWORD[32+rsp] + lea rsp,[40+rsp] + $L$key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_Camellia_Ekeygen: ALIGN 64 $L$Camellia_SIGMA: @@ -1657,17 +1696,25 @@ $L$SEH_begin_Camellia_cbc_encrypt: mov r9,QWORD[48+rsp] + cmp rdx,0 je NEAR $L$cbc_abort push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$cbc_prologue: mov rbp,rsp + sub rsp,64 and rsp,-64 @@ -1689,6 +1736,7 @@ $L$cbc_prologue: mov QWORD[40+rsp],r8 mov QWORD[48+rsp],rbp + $L$cbc_body: lea rbp,[$L$Camellia_SBOX] @@ -1876,17 +1924,26 @@ $L$cbc_dec_popf: ALIGN 16 $L$cbc_done: mov rcx,QWORD[48+rsp] + mov r15,QWORD[rcx] + mov r14,QWORD[8+rcx] + mov r13,QWORD[16+rcx] + mov r12,QWORD[24+rcx] + mov rbp,QWORD[32+rcx] + mov rbx,QWORD[40+rcx] + lea rsp,[48+rcx] + $L$cbc_abort: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_Camellia_cbc_encrypt: DB 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/chacha/chacha-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/chacha/chacha-x86_64.asm index ce4751884f..1a2003ea1f 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/chacha/chacha-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/chacha/chacha-x86_64.asm @@ -24,6 +24,17 @@ $L$rot16: DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd $L$rot24: DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe +$L$twoy: + DD 2,0,0,0,2,0,0,0 +ALIGN 64 +$L$zeroz: + DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 +$L$fourz: + DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 +$L$incz: + DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +$L$sixteen: + DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 $L$sigma: DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 DB 0 @@ -46,20 +57,33 @@ $L$SEH_begin_ChaCha20_ctr32: mov r8,QWORD[40+rsp] + cmp rdx,0 je NEAR $L$no_data mov r10,QWORD[((OPENSSL_ia32cap_P+4))] + bt r10,48 + jc NEAR $L$ChaCha20_avx512 + test r10,r10 + js NEAR $L$ChaCha20_avx512vl test r10d,512 jnz NEAR $L$ChaCha20_ssse3 push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,64+24 +$L$ctr32_body: + movdqu xmm1,XMMWORD[rcx] movdqu xmm2,XMMWORD[16+rcx] @@ -296,17 +320,27 @@ $L$oop_tail: jnz NEAR $L$oop_tail $L$done: - add rsp,64+24 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + lea rsi,[((64+24+48))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$no_data: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_ctr32: ALIGN 32 @@ -322,23 +356,21 @@ $L$SEH_begin_ChaCha20_ssse3: mov r8,QWORD[40+rsp] + $L$ChaCha20_ssse3: + mov r9,rsp + test r10d,2048 jnz NEAR $L$ChaCha20_4xop cmp rdx,128 + je NEAR $L$ChaCha20_128 ja NEAR $L$ChaCha20_4x $L$do_sse3_after_all: - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - - sub rsp,64+72 - movaps XMMWORD[(64+32)+rsp],xmm6 - movaps XMMWORD[(64+48)+rsp],xmm7 + sub rsp,64+40 + movaps XMMWORD[(-40)+r9],xmm6 + movaps XMMWORD[(-24)+r9],xmm7 +$L$ssse3_body: movdqa xmm0,XMMWORD[$L$sigma] movdqu xmm1,XMMWORD[rcx] movdqu xmm2,XMMWORD[16+rcx] @@ -350,7 +382,7 @@ $L$do_sse3_after_all: movdqa XMMWORD[16+rsp],xmm1 movdqa XMMWORD[32+rsp],xmm2 movdqa XMMWORD[48+rsp],xmm3 - mov ebp,10 + mov r8,10 jmp NEAR $L$oop_ssse3 ALIGN 32 @@ -360,7 +392,7 @@ $L$oop_outer_ssse3: movdqa xmm1,XMMWORD[16+rsp] movdqa xmm2,XMMWORD[32+rsp] paddd xmm3,XMMWORD[48+rsp] - mov ebp,10 + mov r8,10 movdqa XMMWORD[48+rsp],xmm3 jmp NEAR $L$oop_ssse3 @@ -409,7 +441,7 @@ DB 102,15,56,0,223 pshufd xmm2,xmm2,78 pshufd xmm1,xmm1,147 pshufd xmm3,xmm3,57 - dec ebp + dec r8 jnz NEAR $L$oop_ssse3 paddd xmm0,XMMWORD[rsp] paddd xmm1,XMMWORD[16+rsp] @@ -446,33 +478,212 @@ $L$tail_ssse3: movdqa XMMWORD[16+rsp],xmm1 movdqa XMMWORD[32+rsp],xmm2 movdqa XMMWORD[48+rsp],xmm3 - xor rbx,rbx + xor r8,r8 $L$oop_tail_ssse3: - movzx eax,BYTE[rbx*1+rsi] - movzx ecx,BYTE[rbx*1+rsp] - lea rbx,[1+rbx] + movzx eax,BYTE[r8*1+rsi] + movzx ecx,BYTE[r8*1+rsp] + lea r8,[1+r8] xor eax,ecx - mov BYTE[((-1))+rbx*1+rdi],al + mov BYTE[((-1))+r8*1+rdi],al dec rdx jnz NEAR $L$oop_tail_ssse3 $L$done_ssse3: - movaps xmm6,XMMWORD[((64+32))+rsp] - movaps xmm7,XMMWORD[((64+48))+rsp] - add rsp,64+72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + movaps xmm6,XMMWORD[((-40))+r9] + movaps xmm7,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$ssse3_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_ssse3: ALIGN 32 +ChaCha20_128: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_128: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_128: + mov r9,rsp + + sub rsp,64+104 + movaps XMMWORD[(-104)+r9],xmm6 + movaps XMMWORD[(-88)+r9],xmm7 + movaps XMMWORD[(-72)+r9],xmm8 + movaps XMMWORD[(-56)+r9],xmm9 + movaps XMMWORD[(-40)+r9],xmm10 + movaps XMMWORD[(-24)+r9],xmm11 +$L$128_body: + movdqa xmm8,XMMWORD[$L$sigma] + movdqu xmm9,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm1,XMMWORD[$L$one] + movdqa xmm6,XMMWORD[$L$rot16] + movdqa xmm7,XMMWORD[$L$rot24] + + movdqa xmm10,xmm8 + movdqa XMMWORD[rsp],xmm8 + movdqa xmm11,xmm9 + movdqa XMMWORD[16+rsp],xmm9 + movdqa xmm0,xmm2 + movdqa XMMWORD[32+rsp],xmm2 + paddd xmm1,xmm3 + movdqa XMMWORD[48+rsp],xmm3 + mov r8,10 + jmp NEAR $L$oop_128 + +ALIGN 32 +$L$oop_128: + paddd xmm8,xmm9 + pxor xmm3,xmm8 + paddd xmm10,xmm11 + pxor xmm1,xmm10 +DB 102,15,56,0,222 +DB 102,15,56,0,206 + paddd xmm2,xmm3 + paddd xmm0,xmm1 + pxor xmm9,xmm2 + pxor xmm11,xmm0 + movdqa xmm4,xmm9 + psrld xmm9,20 + movdqa xmm5,xmm11 + pslld xmm4,12 + psrld xmm11,20 + por xmm9,xmm4 + pslld xmm5,12 + por xmm11,xmm5 + paddd xmm8,xmm9 + pxor xmm3,xmm8 + paddd xmm10,xmm11 + pxor xmm1,xmm10 +DB 102,15,56,0,223 +DB 102,15,56,0,207 + paddd xmm2,xmm3 + paddd xmm0,xmm1 + pxor xmm9,xmm2 + pxor xmm11,xmm0 + movdqa xmm4,xmm9 + psrld xmm9,25 + movdqa xmm5,xmm11 + pslld xmm4,7 + psrld xmm11,25 + por xmm9,xmm4 + pslld xmm5,7 + por xmm11,xmm5 + pshufd xmm2,xmm2,78 + pshufd xmm9,xmm9,57 + pshufd xmm3,xmm3,147 + pshufd xmm0,xmm0,78 + pshufd xmm11,xmm11,57 + pshufd xmm1,xmm1,147 + paddd xmm8,xmm9 + pxor xmm3,xmm8 + paddd xmm10,xmm11 + pxor xmm1,xmm10 +DB 102,15,56,0,222 +DB 102,15,56,0,206 + paddd xmm2,xmm3 + paddd xmm0,xmm1 + pxor xmm9,xmm2 + pxor xmm11,xmm0 + movdqa xmm4,xmm9 + psrld xmm9,20 + movdqa xmm5,xmm11 + pslld xmm4,12 + psrld xmm11,20 + por xmm9,xmm4 + pslld xmm5,12 + por xmm11,xmm5 + paddd xmm8,xmm9 + pxor xmm3,xmm8 + paddd xmm10,xmm11 + pxor xmm1,xmm10 +DB 102,15,56,0,223 +DB 102,15,56,0,207 + paddd xmm2,xmm3 + paddd xmm0,xmm1 + pxor xmm9,xmm2 + pxor xmm11,xmm0 + movdqa xmm4,xmm9 + psrld xmm9,25 + movdqa xmm5,xmm11 + pslld xmm4,7 + psrld xmm11,25 + por xmm9,xmm4 + pslld xmm5,7 + por xmm11,xmm5 + pshufd xmm2,xmm2,78 + pshufd xmm9,xmm9,147 + pshufd xmm3,xmm3,57 + pshufd xmm0,xmm0,78 + pshufd xmm11,xmm11,147 + pshufd xmm1,xmm1,57 + dec r8 + jnz NEAR $L$oop_128 + paddd xmm8,XMMWORD[rsp] + paddd xmm9,XMMWORD[16+rsp] + paddd xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + paddd xmm1,XMMWORD[$L$one] + paddd xmm10,XMMWORD[rsp] + paddd xmm11,XMMWORD[16+rsp] + paddd xmm0,XMMWORD[32+rsp] + paddd xmm1,XMMWORD[48+rsp] + + movdqu xmm4,XMMWORD[rsi] + movdqu xmm5,XMMWORD[16+rsi] + pxor xmm8,xmm4 + movdqu xmm4,XMMWORD[32+rsi] + pxor xmm9,xmm5 + movdqu xmm5,XMMWORD[48+rsi] + pxor xmm2,xmm4 + movdqu xmm4,XMMWORD[64+rsi] + pxor xmm3,xmm5 + movdqu xmm5,XMMWORD[80+rsi] + pxor xmm10,xmm4 + movdqu xmm4,XMMWORD[96+rsi] + pxor xmm11,xmm5 + movdqu xmm5,XMMWORD[112+rsi] + pxor xmm0,xmm4 + pxor xmm1,xmm5 + + movdqu XMMWORD[rdi],xmm8 + movdqu XMMWORD[16+rdi],xmm9 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + movdqu XMMWORD[64+rdi],xmm10 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm0 + movdqu XMMWORD[112+rdi],xmm1 + movaps xmm6,XMMWORD[((-104))+r9] + movaps xmm7,XMMWORD[((-88))+r9] + movaps xmm8,XMMWORD[((-72))+r9] + movaps xmm9,XMMWORD[((-56))+r9] + movaps xmm10,XMMWORD[((-40))+r9] + movaps xmm11,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$128_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_128: + +ALIGN 32 ChaCha20_4x: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi @@ -485,7 +696,10 @@ $L$SEH_begin_ChaCha20_4x: mov r8,QWORD[40+rsp] + $L$ChaCha20_4x: + mov r9,rsp + mov r11,r10 shr r10,32 test r10,32 @@ -498,18 +712,18 @@ $L$ChaCha20_4x: je NEAR $L$do_sse3_after_all $L$proceed4x: - lea r11,[((-120))+rsp] - sub rsp,0x148+160 - movaps XMMWORD[(-48)+r11],xmm6 - movaps XMMWORD[(-32)+r11],xmm7 - movaps XMMWORD[(-16)+r11],xmm8 - movaps XMMWORD[r11],xmm9 - movaps XMMWORD[16+r11],xmm10 - movaps XMMWORD[32+r11],xmm11 - movaps XMMWORD[48+r11],xmm12 - movaps XMMWORD[64+r11],xmm13 - movaps XMMWORD[80+r11],xmm14 - movaps XMMWORD[96+r11],xmm15 + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4x_body: movdqa xmm11,XMMWORD[$L$sigma] movdqu xmm15,XMMWORD[rcx] movdqu xmm7,XMMWORD[16+rcx] @@ -1036,21 +1250,23 @@ $L$oop_tail4x: jnz NEAR $L$oop_tail4x $L$done4x: - lea r11,[((320+48))+rsp] - movaps xmm6,XMMWORD[((-48))+r11] - movaps xmm7,XMMWORD[((-32))+r11] - movaps xmm8,XMMWORD[((-16))+r11] - movaps xmm9,XMMWORD[r11] - movaps xmm10,XMMWORD[16+r11] - movaps xmm11,XMMWORD[32+r11] - movaps xmm12,XMMWORD[48+r11] - movaps xmm13,XMMWORD[64+r11] - movaps xmm14,XMMWORD[80+r11] - movaps xmm15,XMMWORD[96+r11] - add rsp,0x148+160 + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_4x: ALIGN 32 @@ -1066,19 +1282,22 @@ $L$SEH_begin_ChaCha20_4xop: mov r8,QWORD[40+rsp] + $L$ChaCha20_4xop: - lea r11,[((-120))+rsp] - sub rsp,0x148+160 - movaps XMMWORD[(-48)+r11],xmm6 - movaps XMMWORD[(-32)+r11],xmm7 - movaps XMMWORD[(-16)+r11],xmm8 - movaps XMMWORD[r11],xmm9 - movaps XMMWORD[16+r11],xmm10 - movaps XMMWORD[32+r11],xmm11 - movaps XMMWORD[48+r11],xmm12 - movaps XMMWORD[64+r11],xmm13 - movaps XMMWORD[80+r11],xmm14 - movaps XMMWORD[96+r11],xmm15 + mov r9,rsp + + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4xop_body: vzeroupper vmovdqa xmm11,XMMWORD[$L$sigma] @@ -1480,21 +1699,23 @@ $L$oop_tail4xop: $L$done4xop: vzeroupper - lea r11,[((320+48))+rsp] - movaps xmm6,XMMWORD[((-48))+r11] - movaps xmm7,XMMWORD[((-32))+r11] - movaps xmm8,XMMWORD[((-16))+r11] - movaps xmm9,XMMWORD[r11] - movaps xmm10,XMMWORD[16+r11] - movaps xmm11,XMMWORD[32+r11] - movaps xmm12,XMMWORD[48+r11] - movaps xmm13,XMMWORD[64+r11] - movaps xmm14,XMMWORD[80+r11] - movaps xmm15,XMMWORD[96+r11] - add rsp,0x148+160 + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$4xop_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_4xop: ALIGN 32 @@ -1510,23 +1731,24 @@ $L$SEH_begin_ChaCha20_8x: mov r8,QWORD[40+rsp] + $L$ChaCha20_8x: - mov r10,rsp - sub rsp,0x280+176 + mov r9,rsp + + sub rsp,0x280+168 and rsp,-32 - lea r11,[((656+48))+rsp] - movaps XMMWORD[(-48)+r11],xmm6 - movaps XMMWORD[(-32)+r11],xmm7 - movaps XMMWORD[(-16)+r11],xmm8 - movaps XMMWORD[r11],xmm9 - movaps XMMWORD[16+r11],xmm10 - movaps XMMWORD[32+r11],xmm11 - movaps XMMWORD[48+r11],xmm12 - movaps XMMWORD[64+r11],xmm13 - movaps XMMWORD[80+r11],xmm14 - movaps XMMWORD[96+r11],xmm15 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8x_body: vzeroupper - mov QWORD[640+rsp],r10 @@ -2117,19 +2339,1579 @@ $L$oop_tail8x: $L$done8x: vzeroall - lea r11,[((656+48))+rsp] - movaps xmm6,XMMWORD[((-48))+r11] - movaps xmm7,XMMWORD[((-32))+r11] - movaps xmm8,XMMWORD[((-16))+r11] - movaps xmm9,XMMWORD[r11] - movaps xmm10,XMMWORD[16+r11] - movaps xmm11,XMMWORD[32+r11] - movaps xmm12,XMMWORD[48+r11] - movaps xmm13,XMMWORD[64+r11] - movaps xmm14,XMMWORD[80+r11] - movaps xmm15,XMMWORD[96+r11] - mov rsp,QWORD[640+rsp] + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_8x: + +ALIGN 32 +ChaCha20_avx512: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_avx512: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_avx512: + mov r9,rsp + + cmp rdx,512 + ja NEAR $L$ChaCha20_16x + + sub rsp,64+40 + movaps XMMWORD[(-40)+r9],xmm6 + movaps XMMWORD[(-24)+r9],xmm7 +$L$avx512_body: + vbroadcasti32x4 zmm0,ZMMWORD[$L$sigma] + vbroadcasti32x4 zmm1,ZMMWORD[rcx] + vbroadcasti32x4 zmm2,ZMMWORD[16+rcx] + vbroadcasti32x4 zmm3,ZMMWORD[r8] + + vmovdqa32 zmm16,zmm0 + vmovdqa32 zmm17,zmm1 + vmovdqa32 zmm18,zmm2 + vpaddd zmm3,zmm3,ZMMWORD[$L$zeroz] + vmovdqa32 zmm20,ZMMWORD[$L$fourz] + mov r8,10 + vmovdqa32 zmm19,zmm3 + jmp NEAR $L$oop_avx512 + +ALIGN 16 +$L$oop_outer_avx512: + vmovdqa32 zmm0,zmm16 + vmovdqa32 zmm1,zmm17 + vmovdqa32 zmm2,zmm18 + vpaddd zmm3,zmm19,zmm20 + mov r8,10 + vmovdqa32 zmm19,zmm3 + jmp NEAR $L$oop_avx512 + +ALIGN 32 +$L$oop_avx512: + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprold zmm3,zmm3,16 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprold zmm1,zmm1,12 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprold zmm3,zmm3,8 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprold zmm1,zmm1,7 + vpshufd zmm2,zmm2,78 + vpshufd zmm1,zmm1,57 + vpshufd zmm3,zmm3,147 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprold zmm3,zmm3,16 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprold zmm1,zmm1,12 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprold zmm3,zmm3,8 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprold zmm1,zmm1,7 + vpshufd zmm2,zmm2,78 + vpshufd zmm1,zmm1,147 + vpshufd zmm3,zmm3,57 + dec r8 + jnz NEAR $L$oop_avx512 + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm17 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm19 + + sub rdx,64 + jb NEAR $L$tail64_avx512 + + vpxor xmm4,xmm0,XMMWORD[rsi] + vpxor xmm5,xmm1,XMMWORD[16+rsi] + vpxor xmm6,xmm2,XMMWORD[32+rsi] + vpxor xmm7,xmm3,XMMWORD[48+rsi] + lea rsi,[64+rsi] + + vmovdqu XMMWORD[rdi],xmm4 + vmovdqu XMMWORD[16+rdi],xmm5 + vmovdqu XMMWORD[32+rdi],xmm6 + vmovdqu XMMWORD[48+rdi],xmm7 + lea rdi,[64+rdi] + + jz NEAR $L$done_avx512 + + vextracti32x4 xmm4,zmm0,1 + vextracti32x4 xmm5,zmm1,1 + vextracti32x4 xmm6,zmm2,1 + vextracti32x4 xmm7,zmm3,1 + + sub rdx,64 + jb NEAR $L$tail_avx512 + + vpxor xmm4,xmm4,XMMWORD[rsi] + vpxor xmm5,xmm5,XMMWORD[16+rsi] + vpxor xmm6,xmm6,XMMWORD[32+rsi] + vpxor xmm7,xmm7,XMMWORD[48+rsi] + lea rsi,[64+rsi] + + vmovdqu XMMWORD[rdi],xmm4 + vmovdqu XMMWORD[16+rdi],xmm5 + vmovdqu XMMWORD[32+rdi],xmm6 + vmovdqu XMMWORD[48+rdi],xmm7 + lea rdi,[64+rdi] + + jz NEAR $L$done_avx512 + + vextracti32x4 xmm4,zmm0,2 + vextracti32x4 xmm5,zmm1,2 + vextracti32x4 xmm6,zmm2,2 + vextracti32x4 xmm7,zmm3,2 + + sub rdx,64 + jb NEAR $L$tail_avx512 + + vpxor xmm4,xmm4,XMMWORD[rsi] + vpxor xmm5,xmm5,XMMWORD[16+rsi] + vpxor xmm6,xmm6,XMMWORD[32+rsi] + vpxor xmm7,xmm7,XMMWORD[48+rsi] + lea rsi,[64+rsi] + + vmovdqu XMMWORD[rdi],xmm4 + vmovdqu XMMWORD[16+rdi],xmm5 + vmovdqu XMMWORD[32+rdi],xmm6 + vmovdqu XMMWORD[48+rdi],xmm7 + lea rdi,[64+rdi] + + jz NEAR $L$done_avx512 + + vextracti32x4 xmm4,zmm0,3 + vextracti32x4 xmm5,zmm1,3 + vextracti32x4 xmm6,zmm2,3 + vextracti32x4 xmm7,zmm3,3 + + sub rdx,64 + jb NEAR $L$tail_avx512 + + vpxor xmm4,xmm4,XMMWORD[rsi] + vpxor xmm5,xmm5,XMMWORD[16+rsi] + vpxor xmm6,xmm6,XMMWORD[32+rsi] + vpxor xmm7,xmm7,XMMWORD[48+rsi] + lea rsi,[64+rsi] + + vmovdqu XMMWORD[rdi],xmm4 + vmovdqu XMMWORD[16+rdi],xmm5 + vmovdqu XMMWORD[32+rdi],xmm6 + vmovdqu XMMWORD[48+rdi],xmm7 + lea rdi,[64+rdi] + + jnz NEAR $L$oop_outer_avx512 + + jmp NEAR $L$done_avx512 + +ALIGN 16 +$L$tail64_avx512: + vmovdqa XMMWORD[rsp],xmm0 + vmovdqa XMMWORD[16+rsp],xmm1 + vmovdqa XMMWORD[32+rsp],xmm2 + vmovdqa XMMWORD[48+rsp],xmm3 + add rdx,64 + jmp NEAR $L$oop_tail_avx512 + +ALIGN 16 +$L$tail_avx512: + vmovdqa XMMWORD[rsp],xmm4 + vmovdqa XMMWORD[16+rsp],xmm5 + vmovdqa XMMWORD[32+rsp],xmm6 + vmovdqa XMMWORD[48+rsp],xmm7 + add rdx,64 + +$L$oop_tail_avx512: + movzx eax,BYTE[r8*1+rsi] + movzx ecx,BYTE[r8*1+rsp] + lea r8,[1+r8] + xor eax,ecx + mov BYTE[((-1))+r8*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail_avx512 + + vmovdqu32 ZMMWORD[rsp],zmm16 + +$L$done_avx512: + vzeroall + movaps xmm6,XMMWORD[((-40))+r9] + movaps xmm7,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$avx512_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_avx512: + +ALIGN 32 +ChaCha20_avx512vl: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_avx512vl: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_avx512vl: + mov r9,rsp + + cmp rdx,128 + ja NEAR $L$ChaCha20_8xvl + + sub rsp,64+40 + movaps XMMWORD[(-40)+r9],xmm6 + movaps XMMWORD[(-24)+r9],xmm7 +$L$avx512vl_body: + vbroadcasti128 ymm0,XMMWORD[$L$sigma] + vbroadcasti128 ymm1,XMMWORD[rcx] + vbroadcasti128 ymm2,XMMWORD[16+rcx] + vbroadcasti128 ymm3,XMMWORD[r8] + + vmovdqa32 ymm16,ymm0 + vmovdqa32 ymm17,ymm1 + vmovdqa32 ymm18,ymm2 + vpaddd ymm3,ymm3,YMMWORD[$L$zeroz] + vmovdqa32 ymm20,YMMWORD[$L$twoy] + mov r8,10 + vmovdqa32 ymm19,ymm3 + jmp NEAR $L$oop_avx512vl + +ALIGN 16 +$L$oop_outer_avx512vl: + vmovdqa32 ymm2,ymm18 + vpaddd ymm3,ymm19,ymm20 + mov r8,10 + vmovdqa32 ymm19,ymm3 + jmp NEAR $L$oop_avx512vl + +ALIGN 32 +$L$oop_avx512vl: + vpaddd ymm0,ymm0,ymm1 + vpxor ymm3,ymm3,ymm0 + vprold ymm3,ymm3,16 + vpaddd ymm2,ymm2,ymm3 + vpxor ymm1,ymm1,ymm2 + vprold ymm1,ymm1,12 + vpaddd ymm0,ymm0,ymm1 + vpxor ymm3,ymm3,ymm0 + vprold ymm3,ymm3,8 + vpaddd ymm2,ymm2,ymm3 + vpxor ymm1,ymm1,ymm2 + vprold ymm1,ymm1,7 + vpshufd ymm2,ymm2,78 + vpshufd ymm1,ymm1,57 + vpshufd ymm3,ymm3,147 + vpaddd ymm0,ymm0,ymm1 + vpxor ymm3,ymm3,ymm0 + vprold ymm3,ymm3,16 + vpaddd ymm2,ymm2,ymm3 + vpxor ymm1,ymm1,ymm2 + vprold ymm1,ymm1,12 + vpaddd ymm0,ymm0,ymm1 + vpxor ymm3,ymm3,ymm0 + vprold ymm3,ymm3,8 + vpaddd ymm2,ymm2,ymm3 + vpxor ymm1,ymm1,ymm2 + vprold ymm1,ymm1,7 + vpshufd ymm2,ymm2,78 + vpshufd ymm1,ymm1,147 + vpshufd ymm3,ymm3,57 + dec r8 + jnz NEAR $L$oop_avx512vl + vpaddd ymm0,ymm0,ymm16 + vpaddd ymm1,ymm1,ymm17 + vpaddd ymm2,ymm2,ymm18 + vpaddd ymm3,ymm3,ymm19 + + sub rdx,64 + jb NEAR $L$tail64_avx512vl + + vpxor xmm4,xmm0,XMMWORD[rsi] + vpxor xmm5,xmm1,XMMWORD[16+rsi] + vpxor xmm6,xmm2,XMMWORD[32+rsi] + vpxor xmm7,xmm3,XMMWORD[48+rsi] + lea rsi,[64+rsi] + + vmovdqu XMMWORD[rdi],xmm4 + vmovdqu XMMWORD[16+rdi],xmm5 + vmovdqu XMMWORD[32+rdi],xmm6 + vmovdqu XMMWORD[48+rdi],xmm7 + lea rdi,[64+rdi] + + jz NEAR $L$done_avx512vl + + vextracti128 xmm4,ymm0,1 + vextracti128 xmm5,ymm1,1 + vextracti128 xmm6,ymm2,1 + vextracti128 xmm7,ymm3,1 + + sub rdx,64 + jb NEAR $L$tail_avx512vl + + vpxor xmm4,xmm4,XMMWORD[rsi] + vpxor xmm5,xmm5,XMMWORD[16+rsi] + vpxor xmm6,xmm6,XMMWORD[32+rsi] + vpxor xmm7,xmm7,XMMWORD[48+rsi] + lea rsi,[64+rsi] + + vmovdqu XMMWORD[rdi],xmm4 + vmovdqu XMMWORD[16+rdi],xmm5 + vmovdqu XMMWORD[32+rdi],xmm6 + vmovdqu XMMWORD[48+rdi],xmm7 + lea rdi,[64+rdi] + + vmovdqa32 ymm0,ymm16 + vmovdqa32 ymm1,ymm17 + jnz NEAR $L$oop_outer_avx512vl + + jmp NEAR $L$done_avx512vl + +ALIGN 16 +$L$tail64_avx512vl: + vmovdqa XMMWORD[rsp],xmm0 + vmovdqa XMMWORD[16+rsp],xmm1 + vmovdqa XMMWORD[32+rsp],xmm2 + vmovdqa XMMWORD[48+rsp],xmm3 + add rdx,64 + jmp NEAR $L$oop_tail_avx512vl + +ALIGN 16 +$L$tail_avx512vl: + vmovdqa XMMWORD[rsp],xmm4 + vmovdqa XMMWORD[16+rsp],xmm5 + vmovdqa XMMWORD[32+rsp],xmm6 + vmovdqa XMMWORD[48+rsp],xmm7 + add rdx,64 + +$L$oop_tail_avx512vl: + movzx eax,BYTE[r8*1+rsi] + movzx ecx,BYTE[r8*1+rsp] + lea r8,[1+r8] + xor eax,ecx + mov BYTE[((-1))+r8*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail_avx512vl + + vmovdqu32 YMMWORD[rsp],ymm16 + vmovdqu32 YMMWORD[32+rsp],ymm16 + +$L$done_avx512vl: + vzeroall + movaps xmm6,XMMWORD[((-40))+r9] + movaps xmm7,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$avx512vl_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_avx512vl: + +ALIGN 32 +ChaCha20_16x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_16x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_16x: + mov r9,rsp + + sub rsp,64+168 + and rsp,-64 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$16x_body: + vzeroupper + + lea r10,[$L$sigma] + vbroadcasti32x4 zmm3,ZMMWORD[r10] + vbroadcasti32x4 zmm7,ZMMWORD[rcx] + vbroadcasti32x4 zmm11,ZMMWORD[16+rcx] + vbroadcasti32x4 zmm15,ZMMWORD[r8] + + vpshufd zmm0,zmm3,0x00 + vpshufd zmm1,zmm3,0x55 + vpshufd zmm2,zmm3,0xaa + vpshufd zmm3,zmm3,0xff + vmovdqa64 zmm16,zmm0 + vmovdqa64 zmm17,zmm1 + vmovdqa64 zmm18,zmm2 + vmovdqa64 zmm19,zmm3 + + vpshufd zmm4,zmm7,0x00 + vpshufd zmm5,zmm7,0x55 + vpshufd zmm6,zmm7,0xaa + vpshufd zmm7,zmm7,0xff + vmovdqa64 zmm20,zmm4 + vmovdqa64 zmm21,zmm5 + vmovdqa64 zmm22,zmm6 + vmovdqa64 zmm23,zmm7 + + vpshufd zmm8,zmm11,0x00 + vpshufd zmm9,zmm11,0x55 + vpshufd zmm10,zmm11,0xaa + vpshufd zmm11,zmm11,0xff + vmovdqa64 zmm24,zmm8 + vmovdqa64 zmm25,zmm9 + vmovdqa64 zmm26,zmm10 + vmovdqa64 zmm27,zmm11 + + vpshufd zmm12,zmm15,0x00 + vpshufd zmm13,zmm15,0x55 + vpshufd zmm14,zmm15,0xaa + vpshufd zmm15,zmm15,0xff + vpaddd zmm12,zmm12,ZMMWORD[$L$incz] + vmovdqa64 zmm28,zmm12 + vmovdqa64 zmm29,zmm13 + vmovdqa64 zmm30,zmm14 + vmovdqa64 zmm31,zmm15 + + mov eax,10 + jmp NEAR $L$oop16x + +ALIGN 32 +$L$oop_outer16x: + vpbroadcastd zmm0,DWORD[r10] + vpbroadcastd zmm1,DWORD[4+r10] + vpbroadcastd zmm2,DWORD[8+r10] + vpbroadcastd zmm3,DWORD[12+r10] + vpaddd zmm28,zmm28,ZMMWORD[$L$sixteen] + vmovdqa64 zmm4,zmm20 + vmovdqa64 zmm5,zmm21 + vmovdqa64 zmm6,zmm22 + vmovdqa64 zmm7,zmm23 + vmovdqa64 zmm8,zmm24 + vmovdqa64 zmm9,zmm25 + vmovdqa64 zmm10,zmm26 + vmovdqa64 zmm11,zmm27 + vmovdqa64 zmm12,zmm28 + vmovdqa64 zmm13,zmm29 + vmovdqa64 zmm14,zmm30 + vmovdqa64 zmm15,zmm31 + + vmovdqa64 zmm16,zmm0 + vmovdqa64 zmm17,zmm1 + vmovdqa64 zmm18,zmm2 + vmovdqa64 zmm19,zmm3 + + mov eax,10 + jmp NEAR $L$oop16x + +ALIGN 32 +$L$oop16x: + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprold zmm12,zmm12,16 + vprold zmm13,zmm13,16 + vprold zmm14,zmm14,16 + vprold zmm15,zmm15,16 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprold zmm4,zmm4,12 + vprold zmm5,zmm5,12 + vprold zmm6,zmm6,12 + vprold zmm7,zmm7,12 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprold zmm12,zmm12,8 + vprold zmm13,zmm13,8 + vprold zmm14,zmm14,8 + vprold zmm15,zmm15,8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprold zmm4,zmm4,7 + vprold zmm5,zmm5,7 + vprold zmm6,zmm6,7 + vprold zmm7,zmm7,7 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprold zmm15,zmm15,16 + vprold zmm12,zmm12,16 + vprold zmm13,zmm13,16 + vprold zmm14,zmm14,16 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprold zmm5,zmm5,12 + vprold zmm6,zmm6,12 + vprold zmm7,zmm7,12 + vprold zmm4,zmm4,12 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprold zmm15,zmm15,8 + vprold zmm12,zmm12,8 + vprold zmm13,zmm13,8 + vprold zmm14,zmm14,8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprold zmm5,zmm5,7 + vprold zmm6,zmm6,7 + vprold zmm7,zmm7,7 + vprold zmm4,zmm4,7 + dec eax + jnz NEAR $L$oop16x + + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm17 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm19 + + vpunpckldq zmm18,zmm0,zmm1 + vpunpckldq zmm19,zmm2,zmm3 + vpunpckhdq zmm0,zmm0,zmm1 + vpunpckhdq zmm2,zmm2,zmm3 + vpunpcklqdq zmm1,zmm18,zmm19 + vpunpckhqdq zmm18,zmm18,zmm19 + vpunpcklqdq zmm3,zmm0,zmm2 + vpunpckhqdq zmm0,zmm0,zmm2 + vpaddd zmm4,zmm4,zmm20 + vpaddd zmm5,zmm5,zmm21 + vpaddd zmm6,zmm6,zmm22 + vpaddd zmm7,zmm7,zmm23 + + vpunpckldq zmm2,zmm4,zmm5 + vpunpckldq zmm19,zmm6,zmm7 + vpunpckhdq zmm4,zmm4,zmm5 + vpunpckhdq zmm6,zmm6,zmm7 + vpunpcklqdq zmm5,zmm2,zmm19 + vpunpckhqdq zmm2,zmm2,zmm19 + vpunpcklqdq zmm7,zmm4,zmm6 + vpunpckhqdq zmm4,zmm4,zmm6 + vshufi32x4 zmm19,zmm1,zmm5,0x44 + vshufi32x4 zmm5,zmm1,zmm5,0xee + vshufi32x4 zmm1,zmm18,zmm2,0x44 + vshufi32x4 zmm2,zmm18,zmm2,0xee + vshufi32x4 zmm18,zmm3,zmm7,0x44 + vshufi32x4 zmm7,zmm3,zmm7,0xee + vshufi32x4 zmm3,zmm0,zmm4,0x44 + vshufi32x4 zmm4,zmm0,zmm4,0xee + vpaddd zmm8,zmm8,zmm24 + vpaddd zmm9,zmm9,zmm25 + vpaddd zmm10,zmm10,zmm26 + vpaddd zmm11,zmm11,zmm27 + + vpunpckldq zmm6,zmm8,zmm9 + vpunpckldq zmm0,zmm10,zmm11 + vpunpckhdq zmm8,zmm8,zmm9 + vpunpckhdq zmm10,zmm10,zmm11 + vpunpcklqdq zmm9,zmm6,zmm0 + vpunpckhqdq zmm6,zmm6,zmm0 + vpunpcklqdq zmm11,zmm8,zmm10 + vpunpckhqdq zmm8,zmm8,zmm10 + vpaddd zmm12,zmm12,zmm28 + vpaddd zmm13,zmm13,zmm29 + vpaddd zmm14,zmm14,zmm30 + vpaddd zmm15,zmm15,zmm31 + + vpunpckldq zmm10,zmm12,zmm13 + vpunpckldq zmm0,zmm14,zmm15 + vpunpckhdq zmm12,zmm12,zmm13 + vpunpckhdq zmm14,zmm14,zmm15 + vpunpcklqdq zmm13,zmm10,zmm0 + vpunpckhqdq zmm10,zmm10,zmm0 + vpunpcklqdq zmm15,zmm12,zmm14 + vpunpckhqdq zmm12,zmm12,zmm14 + vshufi32x4 zmm0,zmm9,zmm13,0x44 + vshufi32x4 zmm13,zmm9,zmm13,0xee + vshufi32x4 zmm9,zmm6,zmm10,0x44 + vshufi32x4 zmm10,zmm6,zmm10,0xee + vshufi32x4 zmm6,zmm11,zmm15,0x44 + vshufi32x4 zmm15,zmm11,zmm15,0xee + vshufi32x4 zmm11,zmm8,zmm12,0x44 + vshufi32x4 zmm12,zmm8,zmm12,0xee + vshufi32x4 zmm16,zmm19,zmm0,0x88 + vshufi32x4 zmm19,zmm19,zmm0,0xdd + vshufi32x4 zmm0,zmm5,zmm13,0x88 + vshufi32x4 zmm13,zmm5,zmm13,0xdd + vshufi32x4 zmm17,zmm1,zmm9,0x88 + vshufi32x4 zmm1,zmm1,zmm9,0xdd + vshufi32x4 zmm9,zmm2,zmm10,0x88 + vshufi32x4 zmm10,zmm2,zmm10,0xdd + vshufi32x4 zmm14,zmm18,zmm6,0x88 + vshufi32x4 zmm18,zmm18,zmm6,0xdd + vshufi32x4 zmm6,zmm7,zmm15,0x88 + vshufi32x4 zmm15,zmm7,zmm15,0xdd + vshufi32x4 zmm8,zmm3,zmm11,0x88 + vshufi32x4 zmm3,zmm3,zmm11,0xdd + vshufi32x4 zmm11,zmm4,zmm12,0x88 + vshufi32x4 zmm12,zmm4,zmm12,0xdd + cmp rdx,64*16 + jb NEAR $L$tail16x + + vpxord zmm16,zmm16,ZMMWORD[rsi] + vpxord zmm17,zmm17,ZMMWORD[64+rsi] + vpxord zmm14,zmm14,ZMMWORD[128+rsi] + vpxord zmm8,zmm8,ZMMWORD[192+rsi] + vmovdqu32 ZMMWORD[rdi],zmm16 + vmovdqu32 ZMMWORD[64+rdi],zmm17 + vmovdqu32 ZMMWORD[128+rdi],zmm14 + vmovdqu32 ZMMWORD[192+rdi],zmm8 + + vpxord zmm19,zmm19,ZMMWORD[256+rsi] + vpxord zmm1,zmm1,ZMMWORD[320+rsi] + vpxord zmm18,zmm18,ZMMWORD[384+rsi] + vpxord zmm3,zmm3,ZMMWORD[448+rsi] + vmovdqu32 ZMMWORD[256+rdi],zmm19 + vmovdqu32 ZMMWORD[320+rdi],zmm1 + vmovdqu32 ZMMWORD[384+rdi],zmm18 + vmovdqu32 ZMMWORD[448+rdi],zmm3 + + vpxord zmm0,zmm0,ZMMWORD[512+rsi] + vpxord zmm9,zmm9,ZMMWORD[576+rsi] + vpxord zmm6,zmm6,ZMMWORD[640+rsi] + vpxord zmm11,zmm11,ZMMWORD[704+rsi] + vmovdqu32 ZMMWORD[512+rdi],zmm0 + vmovdqu32 ZMMWORD[576+rdi],zmm9 + vmovdqu32 ZMMWORD[640+rdi],zmm6 + vmovdqu32 ZMMWORD[704+rdi],zmm11 + + vpxord zmm13,zmm13,ZMMWORD[768+rsi] + vpxord zmm10,zmm10,ZMMWORD[832+rsi] + vpxord zmm15,zmm15,ZMMWORD[896+rsi] + vpxord zmm12,zmm12,ZMMWORD[960+rsi] + lea rsi,[1024+rsi] + vmovdqu32 ZMMWORD[768+rdi],zmm13 + vmovdqu32 ZMMWORD[832+rdi],zmm10 + vmovdqu32 ZMMWORD[896+rdi],zmm15 + vmovdqu32 ZMMWORD[960+rdi],zmm12 + lea rdi,[1024+rdi] + + sub rdx,64*16 + jnz NEAR $L$oop_outer16x + + jmp NEAR $L$done16x + +ALIGN 32 +$L$tail16x: + xor r10,r10 + sub rdi,rsi + cmp rdx,64*1 + jb NEAR $L$ess_than_64_16x + vpxord zmm16,zmm16,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm16 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm17 + lea rsi,[64+rsi] + + cmp rdx,64*2 + jb NEAR $L$ess_than_64_16x + vpxord zmm17,zmm17,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm17 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm14 + lea rsi,[64+rsi] + + cmp rdx,64*3 + jb NEAR $L$ess_than_64_16x + vpxord zmm14,zmm14,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm14 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm8 + lea rsi,[64+rsi] + + cmp rdx,64*4 + jb NEAR $L$ess_than_64_16x + vpxord zmm8,zmm8,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm8 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm19 + lea rsi,[64+rsi] + + cmp rdx,64*5 + jb NEAR $L$ess_than_64_16x + vpxord zmm19,zmm19,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm19 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm1 + lea rsi,[64+rsi] + + cmp rdx,64*6 + jb NEAR $L$ess_than_64_16x + vpxord zmm1,zmm1,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm1 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm18 + lea rsi,[64+rsi] + + cmp rdx,64*7 + jb NEAR $L$ess_than_64_16x + vpxord zmm18,zmm18,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm18 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm3 + lea rsi,[64+rsi] + + cmp rdx,64*8 + jb NEAR $L$ess_than_64_16x + vpxord zmm3,zmm3,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm3 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm0 + lea rsi,[64+rsi] + + cmp rdx,64*9 + jb NEAR $L$ess_than_64_16x + vpxord zmm0,zmm0,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm0 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm9 + lea rsi,[64+rsi] + + cmp rdx,64*10 + jb NEAR $L$ess_than_64_16x + vpxord zmm9,zmm9,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm9 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm6 + lea rsi,[64+rsi] + + cmp rdx,64*11 + jb NEAR $L$ess_than_64_16x + vpxord zmm6,zmm6,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm6 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm11 + lea rsi,[64+rsi] + + cmp rdx,64*12 + jb NEAR $L$ess_than_64_16x + vpxord zmm11,zmm11,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm11 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm13 + lea rsi,[64+rsi] + + cmp rdx,64*13 + jb NEAR $L$ess_than_64_16x + vpxord zmm13,zmm13,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm13 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm10 + lea rsi,[64+rsi] + + cmp rdx,64*14 + jb NEAR $L$ess_than_64_16x + vpxord zmm10,zmm10,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm10 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm15 + lea rsi,[64+rsi] + + cmp rdx,64*15 + jb NEAR $L$ess_than_64_16x + vpxord zmm15,zmm15,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm15 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm12 + lea rsi,[64+rsi] + +$L$ess_than_64_16x: + vmovdqa32 ZMMWORD[rsp],zmm16 + lea rdi,[rsi*1+rdi] + and rdx,63 + +$L$oop_tail16x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail16x + + vpxord zmm16,zmm16,zmm16 + vmovdqa32 ZMMWORD[rsp],zmm16 + +$L$done16x: + vzeroall + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$16x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_16x: + +ALIGN 32 +ChaCha20_8xvl: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_8xvl: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_8xvl: + mov r9,rsp + + sub rsp,64+168 + and rsp,-64 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8xvl_body: + vzeroupper + + lea r10,[$L$sigma] + vbroadcasti128 ymm3,XMMWORD[r10] + vbroadcasti128 ymm7,XMMWORD[rcx] + vbroadcasti128 ymm11,XMMWORD[16+rcx] + vbroadcasti128 ymm15,XMMWORD[r8] + + vpshufd ymm0,ymm3,0x00 + vpshufd ymm1,ymm3,0x55 + vpshufd ymm2,ymm3,0xaa + vpshufd ymm3,ymm3,0xff + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm1 + vmovdqa64 ymm18,ymm2 + vmovdqa64 ymm19,ymm3 + + vpshufd ymm4,ymm7,0x00 + vpshufd ymm5,ymm7,0x55 + vpshufd ymm6,ymm7,0xaa + vpshufd ymm7,ymm7,0xff + vmovdqa64 ymm20,ymm4 + vmovdqa64 ymm21,ymm5 + vmovdqa64 ymm22,ymm6 + vmovdqa64 ymm23,ymm7 + + vpshufd ymm8,ymm11,0x00 + vpshufd ymm9,ymm11,0x55 + vpshufd ymm10,ymm11,0xaa + vpshufd ymm11,ymm11,0xff + vmovdqa64 ymm24,ymm8 + vmovdqa64 ymm25,ymm9 + vmovdqa64 ymm26,ymm10 + vmovdqa64 ymm27,ymm11 + + vpshufd ymm12,ymm15,0x00 + vpshufd ymm13,ymm15,0x55 + vpshufd ymm14,ymm15,0xaa + vpshufd ymm15,ymm15,0xff + vpaddd ymm12,ymm12,YMMWORD[$L$incy] + vmovdqa64 ymm28,ymm12 + vmovdqa64 ymm29,ymm13 + vmovdqa64 ymm30,ymm14 + vmovdqa64 ymm31,ymm15 + + mov eax,10 + jmp NEAR $L$oop8xvl + +ALIGN 32 +$L$oop_outer8xvl: + + + vpbroadcastd ymm2,DWORD[8+r10] + vpbroadcastd ymm3,DWORD[12+r10] + vpaddd ymm28,ymm28,YMMWORD[$L$eight] + vmovdqa64 ymm4,ymm20 + vmovdqa64 ymm5,ymm21 + vmovdqa64 ymm6,ymm22 + vmovdqa64 ymm7,ymm23 + vmovdqa64 ymm8,ymm24 + vmovdqa64 ymm9,ymm25 + vmovdqa64 ymm10,ymm26 + vmovdqa64 ymm11,ymm27 + vmovdqa64 ymm12,ymm28 + vmovdqa64 ymm13,ymm29 + vmovdqa64 ymm14,ymm30 + vmovdqa64 ymm15,ymm31 + + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm1 + vmovdqa64 ymm18,ymm2 + vmovdqa64 ymm19,ymm3 + + mov eax,10 + jmp NEAR $L$oop8xvl + +ALIGN 32 +$L$oop8xvl: + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxor ymm12,ymm12,ymm0 + vpxor ymm13,ymm13,ymm1 + vpxor ymm14,ymm14,ymm2 + vpxor ymm15,ymm15,ymm3 + vprold ymm12,ymm12,16 + vprold ymm13,ymm13,16 + vprold ymm14,ymm14,16 + vprold ymm15,ymm15,16 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxor ymm4,ymm4,ymm8 + vpxor ymm5,ymm5,ymm9 + vpxor ymm6,ymm6,ymm10 + vpxor ymm7,ymm7,ymm11 + vprold ymm4,ymm4,12 + vprold ymm5,ymm5,12 + vprold ymm6,ymm6,12 + vprold ymm7,ymm7,12 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxor ymm12,ymm12,ymm0 + vpxor ymm13,ymm13,ymm1 + vpxor ymm14,ymm14,ymm2 + vpxor ymm15,ymm15,ymm3 + vprold ymm12,ymm12,8 + vprold ymm13,ymm13,8 + vprold ymm14,ymm14,8 + vprold ymm15,ymm15,8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxor ymm4,ymm4,ymm8 + vpxor ymm5,ymm5,ymm9 + vpxor ymm6,ymm6,ymm10 + vpxor ymm7,ymm7,ymm11 + vprold ymm4,ymm4,7 + vprold ymm5,ymm5,7 + vprold ymm6,ymm6,7 + vprold ymm7,ymm7,7 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxor ymm15,ymm15,ymm0 + vpxor ymm12,ymm12,ymm1 + vpxor ymm13,ymm13,ymm2 + vpxor ymm14,ymm14,ymm3 + vprold ymm15,ymm15,16 + vprold ymm12,ymm12,16 + vprold ymm13,ymm13,16 + vprold ymm14,ymm14,16 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxor ymm5,ymm5,ymm10 + vpxor ymm6,ymm6,ymm11 + vpxor ymm7,ymm7,ymm8 + vpxor ymm4,ymm4,ymm9 + vprold ymm5,ymm5,12 + vprold ymm6,ymm6,12 + vprold ymm7,ymm7,12 + vprold ymm4,ymm4,12 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxor ymm15,ymm15,ymm0 + vpxor ymm12,ymm12,ymm1 + vpxor ymm13,ymm13,ymm2 + vpxor ymm14,ymm14,ymm3 + vprold ymm15,ymm15,8 + vprold ymm12,ymm12,8 + vprold ymm13,ymm13,8 + vprold ymm14,ymm14,8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxor ymm5,ymm5,ymm10 + vpxor ymm6,ymm6,ymm11 + vpxor ymm7,ymm7,ymm8 + vpxor ymm4,ymm4,ymm9 + vprold ymm5,ymm5,7 + vprold ymm6,ymm6,7 + vprold ymm7,ymm7,7 + vprold ymm4,ymm4,7 + dec eax + jnz NEAR $L$oop8xvl + + vpaddd ymm0,ymm0,ymm16 + vpaddd ymm1,ymm1,ymm17 + vpaddd ymm2,ymm2,ymm18 + vpaddd ymm3,ymm3,ymm19 + + vpunpckldq ymm18,ymm0,ymm1 + vpunpckldq ymm19,ymm2,ymm3 + vpunpckhdq ymm0,ymm0,ymm1 + vpunpckhdq ymm2,ymm2,ymm3 + vpunpcklqdq ymm1,ymm18,ymm19 + vpunpckhqdq ymm18,ymm18,ymm19 + vpunpcklqdq ymm3,ymm0,ymm2 + vpunpckhqdq ymm0,ymm0,ymm2 + vpaddd ymm4,ymm4,ymm20 + vpaddd ymm5,ymm5,ymm21 + vpaddd ymm6,ymm6,ymm22 + vpaddd ymm7,ymm7,ymm23 + + vpunpckldq ymm2,ymm4,ymm5 + vpunpckldq ymm19,ymm6,ymm7 + vpunpckhdq ymm4,ymm4,ymm5 + vpunpckhdq ymm6,ymm6,ymm7 + vpunpcklqdq ymm5,ymm2,ymm19 + vpunpckhqdq ymm2,ymm2,ymm19 + vpunpcklqdq ymm7,ymm4,ymm6 + vpunpckhqdq ymm4,ymm4,ymm6 + vshufi32x4 ymm19,ymm1,ymm5,0 + vshufi32x4 ymm5,ymm1,ymm5,3 + vshufi32x4 ymm1,ymm18,ymm2,0 + vshufi32x4 ymm2,ymm18,ymm2,3 + vshufi32x4 ymm18,ymm3,ymm7,0 + vshufi32x4 ymm7,ymm3,ymm7,3 + vshufi32x4 ymm3,ymm0,ymm4,0 + vshufi32x4 ymm4,ymm0,ymm4,3 + vpaddd ymm8,ymm8,ymm24 + vpaddd ymm9,ymm9,ymm25 + vpaddd ymm10,ymm10,ymm26 + vpaddd ymm11,ymm11,ymm27 + + vpunpckldq ymm6,ymm8,ymm9 + vpunpckldq ymm0,ymm10,ymm11 + vpunpckhdq ymm8,ymm8,ymm9 + vpunpckhdq ymm10,ymm10,ymm11 + vpunpcklqdq ymm9,ymm6,ymm0 + vpunpckhqdq ymm6,ymm6,ymm0 + vpunpcklqdq ymm11,ymm8,ymm10 + vpunpckhqdq ymm8,ymm8,ymm10 + vpaddd ymm12,ymm12,ymm28 + vpaddd ymm13,ymm13,ymm29 + vpaddd ymm14,ymm14,ymm30 + vpaddd ymm15,ymm15,ymm31 + + vpunpckldq ymm10,ymm12,ymm13 + vpunpckldq ymm0,ymm14,ymm15 + vpunpckhdq ymm12,ymm12,ymm13 + vpunpckhdq ymm14,ymm14,ymm15 + vpunpcklqdq ymm13,ymm10,ymm0 + vpunpckhqdq ymm10,ymm10,ymm0 + vpunpcklqdq ymm15,ymm12,ymm14 + vpunpckhqdq ymm12,ymm12,ymm14 + vperm2i128 ymm0,ymm9,ymm13,0x20 + vperm2i128 ymm13,ymm9,ymm13,0x31 + vperm2i128 ymm9,ymm6,ymm10,0x20 + vperm2i128 ymm10,ymm6,ymm10,0x31 + vperm2i128 ymm6,ymm11,ymm15,0x20 + vperm2i128 ymm15,ymm11,ymm15,0x31 + vperm2i128 ymm11,ymm8,ymm12,0x20 + vperm2i128 ymm12,ymm8,ymm12,0x31 + cmp rdx,64*8 + jb NEAR $L$tail8xvl + + mov eax,0x80 + vpxord ymm19,ymm19,YMMWORD[rsi] + vpxor ymm0,ymm0,YMMWORD[32+rsi] + vpxor ymm5,ymm5,YMMWORD[64+rsi] + vpxor ymm13,ymm13,YMMWORD[96+rsi] + lea rsi,[rax*1+rsi] + vmovdqu32 YMMWORD[rdi],ymm19 + vmovdqu YMMWORD[32+rdi],ymm0 + vmovdqu YMMWORD[64+rdi],ymm5 + vmovdqu YMMWORD[96+rdi],ymm13 + lea rdi,[rax*1+rdi] + + vpxor ymm1,ymm1,YMMWORD[rsi] + vpxor ymm9,ymm9,YMMWORD[32+rsi] + vpxor ymm2,ymm2,YMMWORD[64+rsi] + vpxor ymm10,ymm10,YMMWORD[96+rsi] + lea rsi,[rax*1+rsi] + vmovdqu YMMWORD[rdi],ymm1 + vmovdqu YMMWORD[32+rdi],ymm9 + vmovdqu YMMWORD[64+rdi],ymm2 + vmovdqu YMMWORD[96+rdi],ymm10 + lea rdi,[rax*1+rdi] + + vpxord ymm18,ymm18,YMMWORD[rsi] + vpxor ymm6,ymm6,YMMWORD[32+rsi] + vpxor ymm7,ymm7,YMMWORD[64+rsi] + vpxor ymm15,ymm15,YMMWORD[96+rsi] + lea rsi,[rax*1+rsi] + vmovdqu32 YMMWORD[rdi],ymm18 + vmovdqu YMMWORD[32+rdi],ymm6 + vmovdqu YMMWORD[64+rdi],ymm7 + vmovdqu YMMWORD[96+rdi],ymm15 + lea rdi,[rax*1+rdi] + + vpxor ymm3,ymm3,YMMWORD[rsi] + vpxor ymm11,ymm11,YMMWORD[32+rsi] + vpxor ymm4,ymm4,YMMWORD[64+rsi] + vpxor ymm12,ymm12,YMMWORD[96+rsi] + lea rsi,[rax*1+rsi] + vmovdqu YMMWORD[rdi],ymm3 + vmovdqu YMMWORD[32+rdi],ymm11 + vmovdqu YMMWORD[64+rdi],ymm4 + vmovdqu YMMWORD[96+rdi],ymm12 + lea rdi,[rax*1+rdi] + + vpbroadcastd ymm0,DWORD[r10] + vpbroadcastd ymm1,DWORD[4+r10] + + sub rdx,64*8 + jnz NEAR $L$oop_outer8xvl + + jmp NEAR $L$done8xvl + +ALIGN 32 +$L$tail8xvl: + vmovdqa64 ymm8,ymm19 + xor r10,r10 + sub rdi,rsi + cmp rdx,64*1 + jb NEAR $L$ess_than_64_8xvl + vpxor ymm8,ymm8,YMMWORD[rsi] + vpxor ymm0,ymm0,YMMWORD[32+rsi] + vmovdqu YMMWORD[rsi*1+rdi],ymm8 + vmovdqu YMMWORD[32+rsi*1+rdi],ymm0 + je NEAR $L$done8xvl + vmovdqa ymm8,ymm5 + vmovdqa ymm0,ymm13 + lea rsi,[64+rsi] + + cmp rdx,64*2 + jb NEAR $L$ess_than_64_8xvl + vpxor ymm5,ymm5,YMMWORD[rsi] + vpxor ymm13,ymm13,YMMWORD[32+rsi] + vmovdqu YMMWORD[rsi*1+rdi],ymm5 + vmovdqu YMMWORD[32+rsi*1+rdi],ymm13 + je NEAR $L$done8xvl + vmovdqa ymm8,ymm1 + vmovdqa ymm0,ymm9 + lea rsi,[64+rsi] + + cmp rdx,64*3 + jb NEAR $L$ess_than_64_8xvl + vpxor ymm1,ymm1,YMMWORD[rsi] + vpxor ymm9,ymm9,YMMWORD[32+rsi] + vmovdqu YMMWORD[rsi*1+rdi],ymm1 + vmovdqu YMMWORD[32+rsi*1+rdi],ymm9 + je NEAR $L$done8xvl + vmovdqa ymm8,ymm2 + vmovdqa ymm0,ymm10 + lea rsi,[64+rsi] + + cmp rdx,64*4 + jb NEAR $L$ess_than_64_8xvl + vpxor ymm2,ymm2,YMMWORD[rsi] + vpxor ymm10,ymm10,YMMWORD[32+rsi] + vmovdqu YMMWORD[rsi*1+rdi],ymm2 + vmovdqu YMMWORD[32+rsi*1+rdi],ymm10 + je NEAR $L$done8xvl + vmovdqa32 ymm8,ymm18 + vmovdqa ymm0,ymm6 + lea rsi,[64+rsi] + + cmp rdx,64*5 + jb NEAR $L$ess_than_64_8xvl + vpxord ymm18,ymm18,YMMWORD[rsi] + vpxor ymm6,ymm6,YMMWORD[32+rsi] + vmovdqu32 YMMWORD[rsi*1+rdi],ymm18 + vmovdqu YMMWORD[32+rsi*1+rdi],ymm6 + je NEAR $L$done8xvl + vmovdqa ymm8,ymm7 + vmovdqa ymm0,ymm15 + lea rsi,[64+rsi] + + cmp rdx,64*6 + jb NEAR $L$ess_than_64_8xvl + vpxor ymm7,ymm7,YMMWORD[rsi] + vpxor ymm15,ymm15,YMMWORD[32+rsi] + vmovdqu YMMWORD[rsi*1+rdi],ymm7 + vmovdqu YMMWORD[32+rsi*1+rdi],ymm15 + je NEAR $L$done8xvl + vmovdqa ymm8,ymm3 + vmovdqa ymm0,ymm11 + lea rsi,[64+rsi] + + cmp rdx,64*7 + jb NEAR $L$ess_than_64_8xvl + vpxor ymm3,ymm3,YMMWORD[rsi] + vpxor ymm11,ymm11,YMMWORD[32+rsi] + vmovdqu YMMWORD[rsi*1+rdi],ymm3 + vmovdqu YMMWORD[32+rsi*1+rdi],ymm11 + je NEAR $L$done8xvl + vmovdqa ymm8,ymm4 + vmovdqa ymm0,ymm12 + lea rsi,[64+rsi] + +$L$ess_than_64_8xvl: + vmovdqa YMMWORD[rsp],ymm8 + vmovdqa YMMWORD[32+rsp],ymm0 + lea rdi,[rsi*1+rdi] + and rdx,63 + +$L$oop_tail8xvl: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail8xvl + + vpxor ymm8,ymm8,ymm8 + vmovdqa YMMWORD[rsp],ymm8 + vmovdqa YMMWORD[32+rsp],ymm8 + +$L$done8xvl: + vzeroall + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$8xvl_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_8xvl: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + lea r10,[$L$ctr32_body] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$no_data] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[((64+24+48))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +simd_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + mov ecx,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + neg rcx + lea rsi,[((-8))+rcx*1+rax] + lea rdi,[512+r8] + neg ecx + shr ecx,3 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_128 wrt ..imagebase + DD $L$SEH_end_ChaCha20_128 wrt ..imagebase + DD $L$SEH_info_ChaCha20_128 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase + DD $L$SEH_end_ChaCha20_4x wrt ..imagebase + DD $L$SEH_info_ChaCha20_4x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_4xop wrt ..imagebase + DD $L$SEH_end_ChaCha20_4xop wrt ..imagebase + DD $L$SEH_info_ChaCha20_4xop wrt ..imagebase + DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase + DD $L$SEH_end_ChaCha20_8x wrt ..imagebase + DD $L$SEH_info_ChaCha20_8x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_avx512 wrt ..imagebase + DD $L$SEH_end_ChaCha20_avx512 wrt ..imagebase + DD $L$SEH_info_ChaCha20_avx512 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_avx512vl wrt ..imagebase + DD $L$SEH_end_ChaCha20_avx512vl wrt ..imagebase + DD $L$SEH_info_ChaCha20_avx512vl wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_16x wrt ..imagebase + DD $L$SEH_end_ChaCha20_16x wrt ..imagebase + DD $L$SEH_info_ChaCha20_16x wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_8xvl wrt ..imagebase + DD $L$SEH_end_ChaCha20_8xvl wrt ..imagebase + DD $L$SEH_info_ChaCha20_8xvl wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ChaCha20_ctr32: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + +$L$SEH_info_ChaCha20_ssse3: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase + DD 0x20,0 + +$L$SEH_info_ChaCha20_128: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$128_body wrt ..imagebase,$L$128_epilogue wrt ..imagebase + DD 0x60,0 + +$L$SEH_info_ChaCha20_4x: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase + DD 0xa0,0 +$L$SEH_info_ChaCha20_4xop: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$4xop_body wrt ..imagebase,$L$4xop_epilogue wrt ..imagebase + DD 0xa0,0 +$L$SEH_info_ChaCha20_8x: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase + DD 0xa0,0 +$L$SEH_info_ChaCha20_avx512: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$avx512_body wrt ..imagebase,$L$avx512_epilogue wrt ..imagebase + DD 0x20,0 + +$L$SEH_info_ChaCha20_avx512vl: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$avx512vl_body wrt ..imagebase,$L$avx512vl_epilogue wrt ..imagebase + DD 0x20,0 + +$L$SEH_info_ChaCha20_16x: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$16x_body wrt ..imagebase,$L$16x_epilogue wrt ..imagebase + DD 0xa0,0 + +$L$SEH_info_ChaCha20_8xvl: +DB 9,0,0,0 + DD simd_handler wrt ..imagebase + DD $L$8xvl_body wrt ..imagebase,$L$8xvl_epilogue wrt ..imagebase + DD 0xa0,0 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/ec/ecp_nistz256-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/ec/ecp_nistz256-x86_64.asm index e0c40d6ec4..9ef88ef1c8 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/ec/ecp_nistz256-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/ec/ecp_nistz256-x86_64.asm @@ -2399,6 +2399,12 @@ $L$Three: $L$ONE_mont: DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe + +$L$ord: + DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +$L$ordK: + DQ 0xccd1c8aaee00bc4f + global ecp_nistz256_mul_by_2 ALIGN 64 @@ -2411,9 +2417,13 @@ $L$SEH_begin_ecp_nistz256_mul_by_2: mov rsi,rdx + push r12 + push r13 +$L$mul_by_2_body: + mov r8,QWORD[rsi] xor r13,r13 mov r9,QWORD[8+rsi] @@ -2445,11 +2455,17 @@ $L$SEH_begin_ecp_nistz256_mul_by_2: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$mul_by_2_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_mul_by_2: @@ -2466,9 +2482,13 @@ $L$SEH_begin_ecp_nistz256_div_by_2: mov rsi,rdx + push r12 + push r13 +$L$div_by_2_body: + mov r8,QWORD[rsi] mov r9,QWORD[8+rsi] mov r10,QWORD[16+rsi] @@ -2515,11 +2535,17 @@ $L$SEH_begin_ecp_nistz256_div_by_2: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$div_by_2_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_div_by_2: @@ -2536,9 +2562,13 @@ $L$SEH_begin_ecp_nistz256_mul_by_3: mov rsi,rdx + push r12 + push r13 +$L$mul_by_3_body: + mov r8,QWORD[rsi] xor r13,r13 mov r9,QWORD[8+rsi] @@ -2591,11 +2621,17 @@ $L$SEH_begin_ecp_nistz256_mul_by_3: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$mul_by_3_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_mul_by_3: @@ -2613,9 +2649,13 @@ $L$SEH_begin_ecp_nistz256_add: mov rdx,r8 + push r12 + push r13 +$L$add_body: + mov r8,QWORD[rsi] xor r13,r13 mov r9,QWORD[8+rsi] @@ -2648,11 +2688,17 @@ $L$SEH_begin_ecp_nistz256_add: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$add_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_add: @@ -2670,9 +2716,13 @@ $L$SEH_begin_ecp_nistz256_sub: mov rdx,r8 + push r12 + push r13 +$L$sub_body: + mov r8,QWORD[rsi] xor r13,r13 mov r9,QWORD[8+rsi] @@ -2705,11 +2755,17 @@ $L$SEH_begin_ecp_nistz256_sub: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$sub_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_sub: @@ -2726,9 +2782,13 @@ $L$SEH_begin_ecp_nistz256_neg: mov rsi,rdx + push r12 + push r13 +$L$neg_body: + xor r8,r8 xor r9,r9 xor r10,r10 @@ -2761,16 +2821,1131 @@ $L$SEH_begin_ecp_nistz256_neg: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$neg_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_neg: + + +global ecp_nistz256_ord_mul_mont + +ALIGN 32 +ecp_nistz256_ord_mul_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + mov ecx,0x80100 + and ecx,DWORD[((OPENSSL_ia32cap_P+8))] + cmp ecx,0x80100 + je NEAR $L$ecp_nistz256_ord_mul_montx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mul_body: + + mov rax,QWORD[rdx] + mov rbx,rdx + lea r14,[$L$ord] + mov r15,QWORD[$L$ordK] + + + mov rcx,rax + mul QWORD[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD[8+rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov r10,rdx + + mul QWORD[16+rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + + mov r13,r8 + imul r8,r15 + + mov r11,rdx + mul QWORD[24+rsi] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r12,rdx + + + mul QWORD[r14] + mov rbp,r8 + add r13,rax + mov rax,r8 + adc rdx,0 + mov rcx,rdx + + sub r10,r8 + sbb r8,0 + + mul QWORD[8+r14] + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,rbp + adc r10,rdx + mov rdx,rbp + adc r8,0 + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[8+rbx] + sbb rbp,rdx + + add r11,r8 + adc r12,rbp + adc r13,0 + + + mov rcx,rax + mul QWORD[rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r10,rbp + adc rdx,0 + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r9 + imul r9,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r12,rbp + adc rdx,0 + xor r8,r8 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + mul QWORD[r14] + mov rbp,r9 + add rcx,rax + mov rax,r9 + adc rcx,rdx + + sub r11,r9 + sbb r9,0 + + mul QWORD[8+r14] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc r11,rdx + mov rdx,rbp + adc r9,0 + + shl rax,32 + shr rdx,32 + sub r12,rax + mov rax,QWORD[16+rbx] + sbb rbp,rdx + + add r12,r9 + adc r13,rbp + adc r8,0 + + + mov rcx,rax + mul QWORD[rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r10 + imul r10,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r13,rbp + adc rdx,0 + xor r9,r9 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + mul QWORD[r14] + mov rbp,r10 + add rcx,rax + mov rax,r10 + adc rcx,rdx + + sub r12,r10 + sbb r10,0 + + mul QWORD[8+r14] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc r12,rdx + mov rdx,rbp + adc r10,0 + + shl rax,32 + shr rdx,32 + sub r13,rax + mov rax,QWORD[24+rbx] + sbb rbp,rdx + + add r13,r10 + adc r8,rbp + adc r9,0 + + + mov rcx,rax + mul QWORD[rsi] + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r11 + imul r11,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r8,rbp + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + mul QWORD[r14] + mov rbp,r11 + add rcx,rax + mov rax,r11 + adc rcx,rdx + + sub r13,r11 + sbb r11,0 + + mul QWORD[8+r14] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc r13,rdx + mov rdx,rbp + adc r11,0 + + shl rax,32 + shr rdx,32 + sub r8,rax + sbb rbp,rdx + + add r8,r11 + adc r9,rbp + adc r10,0 + + + mov rsi,r12 + sub r12,QWORD[r14] + mov r11,r13 + sbb r13,QWORD[8+r14] + mov rcx,r8 + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rsi + cmovc r13,r11 + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_mul_mont: + + + + + + + +global ecp_nistz256_ord_sqr_mont + +ALIGN 32 +ecp_nistz256_ord_sqr_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + mov ecx,0x80100 + and ecx,DWORD[((OPENSSL_ia32cap_P+8))] + cmp ecx,0x80100 + je NEAR $L$ecp_nistz256_ord_sqr_montx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqr_body: + + mov r8,QWORD[rsi] + mov rax,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + lea rsi,[$L$ord] + mov rbx,rdx + jmp NEAR $L$oop_ord_sqr + +ALIGN 32 +$L$oop_ord_sqr: + + mov rbp,rax + mul r8 + mov r9,rax +DB 102,72,15,110,205 + mov rax,r14 + mov r10,rdx + + mul r8 + add r10,rax + mov rax,r15 +DB 102,73,15,110,214 + adc rdx,0 + mov r11,rdx + + mul r8 + add r11,rax + mov rax,r15 +DB 102,73,15,110,223 + adc rdx,0 + mov r12,rdx + + + mul r14 + mov r13,rax + mov rax,r14 + mov r14,rdx + + + mul rbp + add r11,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + + mul rbp + add r12,rax + adc rdx,0 + + add r12,r15 + adc r13,rdx + adc r14,0 + + + xor r15,r15 + mov rax,r8 + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + + mul rax + mov r8,rax +DB 102,72,15,126,200 + mov rbp,rdx + + mul rax + add r9,rbp + adc r10,rax +DB 102,72,15,126,208 + adc rdx,0 + mov rbp,rdx + + mul rax + add r11,rbp + adc r12,rax +DB 102,72,15,126,216 + adc rdx,0 + mov rbp,rdx + + mov rcx,r8 + imul r8,QWORD[32+rsi] + + mul rax + add r13,rbp + adc r14,rax + mov rax,QWORD[rsi] + adc r15,rdx + + + mul r8 + mov rbp,r8 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r10,r8 + sbb rbp,0 + + mul r8 + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,r8 + adc r10,rdx + mov rdx,r8 + adc rbp,0 + + mov rcx,r9 + imul r9,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[rsi] + sbb r8,rdx + + add r11,rbp + adc r8,0 + + + mul r9 + mov rbp,r9 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r11,r9 + sbb rbp,0 + + mul r9 + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,r9 + adc r11,rdx + mov rdx,r9 + adc rbp,0 + + mov rcx,r10 + imul r10,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r8,rax + mov rax,QWORD[rsi] + sbb r9,rdx + + add r8,rbp + adc r9,0 + + + mul r10 + mov rbp,r10 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r8,r10 + sbb rbp,0 + + mul r10 + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,r10 + adc r8,rdx + mov rdx,r10 + adc rbp,0 + + mov rcx,r11 + imul r11,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r9,rax + mov rax,QWORD[rsi] + sbb r10,rdx + + add r9,rbp + adc r10,0 + + + mul r11 + mov rbp,r11 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r9,r11 + sbb rbp,0 + + mul r11 + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + mov rdx,r11 + adc rbp,0 + + shl rax,32 + shr rdx,32 + sub r10,rax + sbb r11,rdx + + add r10,rbp + adc r11,0 + + + xor rdx,rdx + add r8,r12 + adc r9,r13 + mov r12,r8 + adc r10,r14 + adc r11,r15 + mov rax,r9 + adc rdx,0 + + + sub r8,QWORD[rsi] + mov r14,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r15,r11 + sbb r11,QWORD[24+rsi] + sbb rdx,0 + + cmovc r8,r12 + cmovnc rax,r9 + cmovnc r14,r10 + cmovnc r15,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqr + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],rax + pxor xmm1,xmm1 + mov QWORD[16+rdi],r14 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r15 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont: + + +ALIGN 32 +ecp_nistz256_ord_mul_montx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_montx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_mul_montx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mulx_body: + + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + lea r14,[(($L$ord-128))] + mov r15,QWORD[$L$ordK] + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mulx r11,rbp,r11 + add r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + mulx rax,rdx,r15 + adc r10,rbp + adc r11,rcx + adc r12,0 + + + xor r13,r13 + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r8,rcx + adox r9,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[8+rbx] + adcx r11,rcx + adox r12,rbp + adcx r12,r8 + adox r13,r8 + adc r13,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + mulx rax,rdx,r15 + adcx r12,rcx + adox r13,rbp + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[16+rbx] + adcx r12,rcx + adox r13,rbp + adcx r13,r9 + adox r8,r9 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + mulx rax,rdx,r15 + adcx r13,rcx + adox r8,rbp + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[24+rbx] + adcx r13,rcx + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + mulx rax,rdx,r15 + adcx r8,rcx + adox r9,rbp + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + lea r14,[128+r14] + mov rbx,r12 + adcx r8,rcx + adox r9,rbp + mov rdx,r13 + adcx r9,r11 + adox r10,r11 + adc r10,0 + + + + mov rcx,r8 + sub r12,QWORD[r14] + sbb r13,QWORD[8+r14] + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_mul_montx: + + +ALIGN 32 +ecp_nistz256_ord_sqr_montx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_montx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_sqr_montx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqrx_body: + + mov rbx,rdx + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[$L$ord] + jmp NEAR $L$oop_ord_sqrx + +ALIGN 32 +$L$oop_ord_sqrx: + mulx r10,r9,r14 + mulx r11,rcx,r15 + mov rax,rdx +DB 102,73,15,110,206 + mulx r12,rbp,r8 + mov rdx,r14 + add r10,rcx +DB 102,73,15,110,215 + adc r11,rbp + adc r12,0 + xor r13,r13 + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + mulx r14,rcx,r8 + mov rdx,rax +DB 102,73,15,110,216 + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + + mulx rbp,r8,rdx +DB 102,72,15,126,202 + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx +DB 102,72,15,126,210 + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + mulx rbp,rcx,rdx +DB 0x67 +DB 102,72,15,126,218 + adox r11,rax + adcx r15,r15 + adox r12,rcx + adox r13,rbp + mulx rax,rcx,rdx + adox r14,rcx + adox r15,rax + + + mov rdx,r8 + mulx rcx,rdx,QWORD[32+rsi] + + xor rax,rax + mulx rbp,rcx,QWORD[rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r9,rcx + adox r10,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r11,rcx + adox r8,rbp + adcx r8,rax + + + mov rdx,r9 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r10,rcx + adcx r11,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r8,rcx + adcx r9,rbp + adox r9,rax + + + mov rdx,r10 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r11,rcx + adox r8,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r9,rcx + adox r10,rbp + adcx r10,rax + + + mov rdx,r11 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r8,rcx + adcx r9,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r10,rcx + adcx r11,rbp + adox r11,rax + + + add r12,r8 + adc r9,r13 + mov rdx,r12 + adc r10,r14 + adc r11,r15 + mov r14,r9 + adc rax,0 + + + sub r12,QWORD[rsi] + mov r15,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r8,r11 + sbb r11,QWORD[24+rsi] + sbb rax,0 + + cmovnc rdx,r12 + cmovnc r14,r9 + cmovnc r15,r10 + cmovnc r8,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqrx + + mov QWORD[rdi],rdx + mov QWORD[8+rdi],r14 + pxor xmm1,xmm1 + mov QWORD[16+rdi],r15 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r8 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_sqr_montx: + + + + global ecp_nistz256_to_mont ALIGN 32 @@ -2808,15 +3983,23 @@ $L$SEH_begin_ecp_nistz256_mul_mont: mov rdx,r8 + mov ecx,0x80100 and ecx,DWORD[((OPENSSL_ia32cap_P+8))] $L$mul_mont: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + +$L$mul_body: cmp ecx,0x80100 je NEAR $L$mul_montx mov rbx,rdx @@ -2841,15 +4024,25 @@ $L$mul_montx: call __ecp_nistz256_mul_montx $L$mul_mont_done: - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_mul_mont: @@ -3087,14 +4280,22 @@ $L$SEH_begin_ecp_nistz256_sqr_mont: mov rsi,rdx + mov ecx,0x80100 and ecx,DWORD[((OPENSSL_ia32cap_P+8))] push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + +$L$sqr_body: cmp ecx,0x80100 je NEAR $L$sqr_montx mov rax,QWORD[rsi] @@ -3115,15 +4316,25 @@ $L$sqr_montx: call __ecp_nistz256_sqr_montx $L$sqr_mont_done: - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqr_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_sqr_mont: @@ -3599,9 +4810,13 @@ $L$SEH_begin_ecp_nistz256_from_mont: mov rsi,rdx + push r12 + push r13 +$L$from_body: + mov rax,QWORD[rsi] mov r13,QWORD[(($L$poly+24))] mov r9,QWORD[8+rsi] @@ -3681,11 +4896,17 @@ $L$SEH_begin_ecp_nistz256_from_mont: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$from_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_from_mont: @@ -3794,8 +5015,8 @@ $L$select_loop_sse_w5: movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_gather_w5: DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_gather_w5: @@ -3889,8 +5110,8 @@ $L$select_loop_sse_w7: movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_gather_w7: DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_gather_w7: @@ -3900,6 +5121,7 @@ ecp_nistz256_avx2_gather_w5: $L$avx2_gather_w5: vzeroupper lea rax,[((-136))+rsp] + mov r11,rsp $L$SEH_begin_ecp_nistz256_avx2_gather_w5: DB 0x48,0x8d,0x60,0xe0 DB 0xc5,0xf8,0x29,0x70,0xe0 @@ -3973,9 +5195,9 @@ $L$select_loop_avx2_w5: movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] - lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_avx2_gather_w5: + lea rsp,[r11] DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_avx2_gather_w5: @@ -3986,6 +5208,7 @@ ALIGN 32 ecp_nistz256_avx2_gather_w7: $L$avx2_gather_w7: vzeroupper + mov r11,rsp lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_avx2_gather_w7: DB 0x48,0x8d,0x60,0xe0 @@ -4075,9 +5298,9 @@ $L$select_loop_avx2_w7: movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] - lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_avx2_gather_w7: + lea rsp,[r11] DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_avx2_gather_w7: ALIGN 32 @@ -4212,18 +5435,27 @@ $L$SEH_begin_ecp_nistz256_point_double: mov rsi,rdx + mov ecx,0x80100 and ecx,DWORD[((OPENSSL_ia32cap_P+8))] cmp ecx,0x80100 je NEAR $L$point_doublex push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*5+8 +$L$point_doubleq_body: + $L$point_double_shortcutq: movdqu xmm0,XMMWORD[rsi] mov rbx,rsi @@ -4405,16 +5637,27 @@ DB 102,72,15,126,203 DB 102,72,15,126,207 call __ecp_nistz256_sub_fromq - add rsp,32*5+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doubleq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_double: global ecp_nistz256_point_add @@ -4429,18 +5672,27 @@ $L$SEH_begin_ecp_nistz256_point_add: mov rdx,r8 + mov ecx,0x80100 and ecx,DWORD[((OPENSSL_ia32cap_P+8))] cmp ecx,0x80100 je NEAR $L$point_addx push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*18+8 +$L$point_addq_body: + movdqu xmm0,XMMWORD[rsi] movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] @@ -4816,16 +6068,27 @@ DB 102,72,15,126,199 movdqu XMMWORD[48+rdi],xmm3 $L$add_doneq: - add rsp,32*18+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_add: global ecp_nistz256_point_add_affine @@ -4840,18 +6103,27 @@ $L$SEH_begin_ecp_nistz256_point_add_affine: mov rdx,r8 + mov ecx,0x80100 and ecx,DWORD[((OPENSSL_ia32cap_P+8))] cmp ecx,0x80100 je NEAR $L$point_add_affinex push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*15+8 +$L$add_affineq_body: + movdqu xmm0,XMMWORD[rsi] mov rbx,rdx movdqu xmm1,XMMWORD[16+rsi] @@ -5133,16 +6405,27 @@ DB 102,72,15,126,199 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 - add rsp,32*15+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affineq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_add_affine: ALIGN 32 @@ -5282,15 +6565,24 @@ $L$SEH_begin_ecp_nistz256_point_doublex: mov rsi,rdx + $L$point_doublex: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*5+8 +$L$point_doublex_body: + $L$point_double_shortcutx: movdqu xmm0,XMMWORD[rsi] mov rbx,rsi @@ -5472,16 +6764,27 @@ DB 102,72,15,126,203 DB 102,72,15,126,207 call __ecp_nistz256_sub_fromx - add rsp,32*5+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doublex_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_doublex: ALIGN 32 @@ -5495,15 +6798,24 @@ $L$SEH_begin_ecp_nistz256_point_addx: mov rdx,r8 + $L$point_addx: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*18+8 +$L$point_addx_body: + movdqu xmm0,XMMWORD[rsi] movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] @@ -5879,16 +7191,27 @@ DB 102,72,15,126,199 movdqu XMMWORD[48+rdi],xmm3 $L$add_donex: - add rsp,32*18+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_addx: ALIGN 32 @@ -5902,15 +7225,24 @@ $L$SEH_begin_ecp_nistz256_point_add_affinex: mov rdx,r8 + $L$point_add_affinex: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*15+8 +$L$add_affinex_body: + movdqu xmm0,XMMWORD[rsi] mov rbx,rdx movdqu xmm1,XMMWORD[16+rsi] @@ -6192,14 +7524,375 @@ DB 102,72,15,126,199 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 - add rsp,32*15+8 + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affinex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_add_affinex: +EXTERN __imp_RtlVirtualUnwind + + +ALIGN 16 +short_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[16+rax] + + mov r12,QWORD[((-8))+rax] + mov r13,QWORD[((-16))+rax] + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea rax,[r10*1+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq pop r15 pop r14 pop r13 pop r12 - pop rbx pop rbp - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] + pop rbx + pop rdi + pop rsi DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_point_add_affinex: + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ecp_nistz256_mul_by_2 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_by_2 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_by_2 wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_div_by_2 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_div_by_2 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_div_by_2 wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_mul_by_3 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_by_3 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_by_3 wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_add wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_add wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_add wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sub wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sub wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sub wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_neg wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_to_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_to_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_to_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_mul_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_from_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_from_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_from_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_gather_w5 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_gather_w5 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_gather_wX wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_gather_w7 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_gather_w7 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_gather_wX wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_avx2_gather_w5 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_avx2_gather_w5 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_avx2_gather_wX wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_avx2_gather_w7 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_avx2_gather_w7 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_avx2_gather_wX wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_doublex wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_doublex wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_doublex wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_addx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_addx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_addx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affinex wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affinex wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affinex wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ecp_nistz256_mul_by_2: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$mul_by_2_body wrt ..imagebase,$L$mul_by_2_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_div_by_2: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$div_by_2_body wrt ..imagebase,$L$div_by_2_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_mul_by_3: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$mul_by_3_body wrt ..imagebase,$L$mul_by_3_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_add: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$add_body wrt ..imagebase,$L$add_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_sub: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$sub_body wrt ..imagebase,$L$sub_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_neg: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_ord_mul_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_mul_montx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_montx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_to_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_from_mont: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_gather_wX: +DB 0x01,0x33,0x16,0x00 +DB 0x33,0xf8,0x09,0x00 +DB 0x2e,0xe8,0x08,0x00 +DB 0x29,0xd8,0x07,0x00 +DB 0x24,0xc8,0x06,0x00 +DB 0x1f,0xb8,0x05,0x00 +DB 0x1a,0xa8,0x04,0x00 +DB 0x15,0x98,0x03,0x00 +DB 0x10,0x88,0x02,0x00 +DB 0x0c,0x78,0x01,0x00 +DB 0x08,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_avx2_gather_wX: +DB 0x01,0x36,0x17,0x0b +DB 0x36,0xf8,0x09,0x00 +DB 0x31,0xe8,0x08,0x00 +DB 0x2c,0xd8,0x07,0x00 +DB 0x27,0xc8,0x06,0x00 +DB 0x22,0xb8,0x05,0x00 +DB 0x1d,0xa8,0x04,0x00 +DB 0x18,0x98,0x03,0x00 +DB 0x13,0x88,0x02,0x00 +DB 0x0e,0x78,0x01,0x00 +DB 0x09,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +DB 0x00,0xb3,0x00,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase + DD 32*15+56,0 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_doublex: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_addx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affinex: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase + DD 32*15+56,0 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/ec/x25519-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/ec/x25519-x86_64.asm new file mode 100644 index 0000000000..84d55134ac --- /dev/null +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/ec/x25519-x86_64.asm @@ -0,0 +1,1054 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +global x25519_fe51_mul + +ALIGN 32 +x25519_fe51_mul: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe51_mul: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-40))+rsp] + +$L$fe51_mul_body: + + mov rax,QWORD[rsi] + mov r11,QWORD[rdx] + mov r12,QWORD[8+rdx] + mov r13,QWORD[16+rdx] + mov rbp,QWORD[24+rdx] + mov r14,QWORD[32+rdx] + + mov QWORD[32+rsp],rdi + mov rdi,rax + mul r11 + mov QWORD[rsp],r11 + mov rbx,rax + mov rax,rdi + mov rcx,rdx + mul r12 + mov QWORD[8+rsp],r12 + mov r8,rax + mov rax,rdi + lea r15,[r14*8+r14] + mov r9,rdx + mul r13 + mov QWORD[16+rsp],r13 + mov r10,rax + mov rax,rdi + lea rdi,[r15*2+r14] + mov r11,rdx + mul rbp + mov r12,rax + mov rax,QWORD[rsi] + mov r13,rdx + mul r14 + mov r14,rax + mov rax,QWORD[8+rsi] + mov r15,rdx + + mul rdi + add rbx,rax + mov rax,QWORD[16+rsi] + adc rcx,rdx + mul rdi + add r8,rax + mov rax,QWORD[24+rsi] + adc r9,rdx + mul rdi + add r10,rax + mov rax,QWORD[32+rsi] + adc r11,rdx + mul rdi + imul rdi,rbp,19 + add r12,rax + mov rax,QWORD[8+rsi] + adc r13,rdx + mul rbp + mov rbp,QWORD[16+rsp] + add r14,rax + mov rax,QWORD[16+rsi] + adc r15,rdx + + mul rdi + add rbx,rax + mov rax,QWORD[24+rsi] + adc rcx,rdx + mul rdi + add r8,rax + mov rax,QWORD[32+rsi] + adc r9,rdx + mul rdi + imul rdi,rbp,19 + add r10,rax + mov rax,QWORD[8+rsi] + adc r11,rdx + mul rbp + add r12,rax + mov rax,QWORD[16+rsi] + adc r13,rdx + mul rbp + mov rbp,QWORD[8+rsp] + add r14,rax + mov rax,QWORD[24+rsi] + adc r15,rdx + + mul rdi + add rbx,rax + mov rax,QWORD[32+rsi] + adc rcx,rdx + mul rdi + add r8,rax + mov rax,QWORD[8+rsi] + adc r9,rdx + mul rbp + imul rdi,rbp,19 + add r10,rax + mov rax,QWORD[16+rsi] + adc r11,rdx + mul rbp + add r12,rax + mov rax,QWORD[24+rsi] + adc r13,rdx + mul rbp + mov rbp,QWORD[rsp] + add r14,rax + mov rax,QWORD[32+rsi] + adc r15,rdx + + mul rdi + add rbx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + mul rbp + add r8,rax + mov rax,QWORD[16+rsi] + adc r9,rdx + mul rbp + add r10,rax + mov rax,QWORD[24+rsi] + adc r11,rdx + mul rbp + add r12,rax + mov rax,QWORD[32+rsi] + adc r13,rdx + mul rbp + add r14,rax + adc r15,rdx + + mov rdi,QWORD[32+rsp] + jmp NEAR $L$reduce51 +$L$fe51_mul_epilogue: + +$L$SEH_end_x25519_fe51_mul: + +global x25519_fe51_sqr + +ALIGN 32 +x25519_fe51_sqr: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe51_sqr: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-40))+rsp] + +$L$fe51_sqr_body: + + mov rax,QWORD[rsi] + mov r15,QWORD[16+rsi] + mov rbp,QWORD[32+rsi] + + mov QWORD[32+rsp],rdi + lea r14,[rax*1+rax] + mul rax + mov rbx,rax + mov rax,QWORD[8+rsi] + mov rcx,rdx + mul r14 + mov r8,rax + mov rax,r15 + mov QWORD[rsp],r15 + mov r9,rdx + mul r14 + mov r10,rax + mov rax,QWORD[24+rsi] + mov r11,rdx + imul rdi,rbp,19 + mul r14 + mov r12,rax + mov rax,rbp + mov r13,rdx + mul r14 + mov r14,rax + mov rax,rbp + mov r15,rdx + + mul rdi + add r12,rax + mov rax,QWORD[8+rsi] + adc r13,rdx + + mov rsi,QWORD[24+rsi] + lea rbp,[rax*1+rax] + mul rax + add r10,rax + mov rax,QWORD[rsp] + adc r11,rdx + mul rbp + add r12,rax + mov rax,rbp + adc r13,rdx + mul rsi + add r14,rax + mov rax,rbp + adc r15,rdx + imul rbp,rsi,19 + mul rdi + add rbx,rax + lea rax,[rsi*1+rsi] + adc rcx,rdx + + mul rdi + add r10,rax + mov rax,rsi + adc r11,rdx + mul rbp + add r8,rax + mov rax,QWORD[rsp] + adc r9,rdx + + lea rsi,[rax*1+rax] + mul rax + add r14,rax + mov rax,rbp + adc r15,rdx + mul rsi + add rbx,rax + mov rax,rsi + adc rcx,rdx + mul rdi + add r8,rax + adc r9,rdx + + mov rdi,QWORD[32+rsp] + jmp NEAR $L$reduce51 + +ALIGN 32 +$L$reduce51: + mov rbp,0x7ffffffffffff + + mov rdx,r10 + shr r10,51 + shl r11,13 + and rdx,rbp + or r11,r10 + add r12,r11 + adc r13,0 + + mov rax,rbx + shr rbx,51 + shl rcx,13 + and rax,rbp + or rcx,rbx + add r8,rcx + adc r9,0 + + mov rbx,r12 + shr r12,51 + shl r13,13 + and rbx,rbp + or r13,r12 + add r14,r13 + adc r15,0 + + mov rcx,r8 + shr r8,51 + shl r9,13 + and rcx,rbp + or r9,r8 + add rdx,r9 + + mov r10,r14 + shr r14,51 + shl r15,13 + and r10,rbp + or r15,r14 + + lea r14,[r15*8+r15] + lea r15,[r14*2+r15] + add rax,r15 + + mov r8,rdx + and rdx,rbp + shr r8,51 + add rbx,r8 + + mov r9,rax + and rax,rbp + shr r9,51 + add rcx,r9 + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rcx + mov QWORD[16+rdi],rdx + mov QWORD[24+rdi],rbx + mov QWORD[32+rdi],r10 + + mov r15,QWORD[40+rsp] + + mov r14,QWORD[48+rsp] + + mov r13,QWORD[56+rsp] + + mov r12,QWORD[64+rsp] + + mov rbx,QWORD[72+rsp] + + mov rbp,QWORD[80+rsp] + + lea rsp,[88+rsp] + +$L$fe51_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_x25519_fe51_sqr: + +global x25519_fe51_mul121666 + +ALIGN 32 +x25519_fe51_mul121666: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe51_mul121666: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-40))+rsp] + +$L$fe51_mul121666_body: + mov eax,121666 + + mul QWORD[rsi] + mov rbx,rax + mov eax,121666 + mov rcx,rdx + mul QWORD[8+rsi] + mov r8,rax + mov eax,121666 + mov r9,rdx + mul QWORD[16+rsi] + mov r10,rax + mov eax,121666 + mov r11,rdx + mul QWORD[24+rsi] + mov r12,rax + mov eax,121666 + mov r13,rdx + mul QWORD[32+rsi] + mov r14,rax + mov r15,rdx + + jmp NEAR $L$reduce51 +$L$fe51_mul121666_epilogue: + +$L$SEH_end_x25519_fe51_mul121666: +EXTERN OPENSSL_ia32cap_P +global x25519_fe64_eligible + +ALIGN 32 +x25519_fe64_eligible: + mov ecx,DWORD[((OPENSSL_ia32cap_P+8))] + xor eax,eax + and ecx,0x80100 + cmp ecx,0x80100 + cmove eax,ecx + DB 0F3h,0C3h ;repret + + +global x25519_fe64_mul + +ALIGN 32 +x25519_fe64_mul: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe64_mul: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + + lea rsp,[((-16))+rsp] + +$L$fe64_mul_body: + + mov rax,rdx + mov rbp,QWORD[rdx] + mov rdx,QWORD[rsi] + mov rcx,QWORD[8+rax] + mov r14,QWORD[16+rax] + mov r15,QWORD[24+rax] + + mulx rax,r8,rbp + xor edi,edi + mulx rbx,r9,rcx + adcx r9,rax + mulx rax,r10,r14 + adcx r10,rbx + mulx r12,r11,r15 + mov rdx,QWORD[8+rsi] + adcx r11,rax + mov QWORD[rsp],r14 + adcx r12,rdi + + mulx rbx,rax,rbp + adox r9,rax + adcx r10,rbx + mulx rbx,rax,rcx + adox r10,rax + adcx r11,rbx + mulx rbx,rax,r14 + adox r11,rax + adcx r12,rbx + mulx r13,rax,r15 + mov rdx,QWORD[16+rsi] + adox r12,rax + adcx r13,rdi + adox r13,rdi + + mulx rbx,rax,rbp + adcx r10,rax + adox r11,rbx + mulx rbx,rax,rcx + adcx r11,rax + adox r12,rbx + mulx rbx,rax,r14 + adcx r12,rax + adox r13,rbx + mulx r14,rax,r15 + mov rdx,QWORD[24+rsi] + adcx r13,rax + adox r14,rdi + adcx r14,rdi + + mulx rbx,rax,rbp + adox r11,rax + adcx r12,rbx + mulx rbx,rax,rcx + adox r12,rax + adcx r13,rbx + mulx rbx,rax,QWORD[rsp] + adox r13,rax + adcx r14,rbx + mulx r15,rax,r15 + mov edx,38 + adox r14,rax + adcx r15,rdi + adox r15,rdi + + jmp NEAR $L$reduce64 +$L$fe64_mul_epilogue: + +$L$SEH_end_x25519_fe64_mul: + +global x25519_fe64_sqr + +ALIGN 32 +x25519_fe64_sqr: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe64_sqr: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + + lea rsp,[((-16))+rsp] + +$L$fe64_sqr_body: + + mov rdx,QWORD[rsi] + mov rcx,QWORD[8+rsi] + mov rbp,QWORD[16+rsi] + mov rsi,QWORD[24+rsi] + + + mulx r15,r8,rdx + mulx rax,r9,rcx + xor edi,edi + mulx rbx,r10,rbp + adcx r10,rax + mulx r12,r11,rsi + mov rdx,rcx + adcx r11,rbx + adcx r12,rdi + + + mulx rbx,rax,rbp + adox r11,rax + adcx r12,rbx + mulx r13,rax,rsi + mov rdx,rbp + adox r12,rax + adcx r13,rdi + + + mulx r14,rax,rsi + mov rdx,rcx + adox r13,rax + adcx r14,rdi + adox r14,rdi + + adcx r9,r9 + adox r9,r15 + adcx r10,r10 + mulx rbx,rax,rdx + mov rdx,rbp + adcx r11,r11 + adox r10,rax + adcx r12,r12 + adox r11,rbx + mulx rbx,rax,rdx + mov rdx,rsi + adcx r13,r13 + adox r12,rax + adcx r14,r14 + adox r13,rbx + mulx r15,rax,rdx + mov edx,38 + adox r14,rax + adcx r15,rdi + adox r15,rdi + jmp NEAR $L$reduce64 + +ALIGN 32 +$L$reduce64: + mulx rbx,rax,r12 + adcx r8,rax + adox r9,rbx + mulx rbx,rax,r13 + adcx r9,rax + adox r10,rbx + mulx rbx,rax,r14 + adcx r10,rax + adox r11,rbx + mulx r12,rax,r15 + adcx r11,rax + adox r12,rdi + adcx r12,rdi + + mov rdi,QWORD[16+rsp] + imul r12,rdx + + add r8,r12 + adc r9,0 + adc r10,0 + adc r11,0 + + sbb rax,rax + and rax,38 + + add r8,rax + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[rdi],r8 + + mov r15,QWORD[24+rsp] + + mov r14,QWORD[32+rsp] + + mov r13,QWORD[40+rsp] + + mov r12,QWORD[48+rsp] + + mov rbx,QWORD[56+rsp] + + mov rbp,QWORD[64+rsp] + + lea rsp,[72+rsp] + +$L$fe64_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_x25519_fe64_sqr: + +global x25519_fe64_mul121666 + +ALIGN 32 +x25519_fe64_mul121666: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe64_mul121666: + mov rdi,rcx + mov rsi,rdx + + +$L$fe64_mul121666_body: + mov edx,121666 + mulx rcx,r8,QWORD[rsi] + mulx rax,r9,QWORD[8+rsi] + add r9,rcx + mulx rcx,r10,QWORD[16+rsi] + adc r10,rax + mulx rax,r11,QWORD[24+rsi] + adc r11,rcx + adc rax,0 + + imul rax,rax,38 + + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + + sbb rax,rax + and rax,38 + + add r8,rax + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[rdi],r8 + +$L$fe64_mul121666_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_x25519_fe64_mul121666: + +global x25519_fe64_add + +ALIGN 32 +x25519_fe64_add: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe64_add: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$fe64_add_body: + mov r8,QWORD[rsi] + mov r9,QWORD[8+rsi] + mov r10,QWORD[16+rsi] + mov r11,QWORD[24+rsi] + + add r8,QWORD[rdx] + adc r9,QWORD[8+rdx] + adc r10,QWORD[16+rdx] + adc r11,QWORD[24+rdx] + + sbb rax,rax + and rax,38 + + add r8,rax + adc r9,0 + adc r10,0 + mov QWORD[8+rdi],r9 + adc r11,0 + mov QWORD[16+rdi],r10 + sbb rax,rax + mov QWORD[24+rdi],r11 + and rax,38 + + add r8,rax + mov QWORD[rdi],r8 + +$L$fe64_add_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_x25519_fe64_add: + +global x25519_fe64_sub + +ALIGN 32 +x25519_fe64_sub: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe64_sub: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$fe64_sub_body: + mov r8,QWORD[rsi] + mov r9,QWORD[8+rsi] + mov r10,QWORD[16+rsi] + mov r11,QWORD[24+rsi] + + sub r8,QWORD[rdx] + sbb r9,QWORD[8+rdx] + sbb r10,QWORD[16+rdx] + sbb r11,QWORD[24+rdx] + + sbb rax,rax + and rax,38 + + sub r8,rax + sbb r9,0 + sbb r10,0 + mov QWORD[8+rdi],r9 + sbb r11,0 + mov QWORD[16+rdi],r10 + sbb rax,rax + mov QWORD[24+rdi],r11 + and rax,38 + + sub r8,rax + mov QWORD[rdi],r8 + +$L$fe64_sub_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_x25519_fe64_sub: + +global x25519_fe64_tobytes + +ALIGN 32 +x25519_fe64_tobytes: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_x25519_fe64_tobytes: + mov rdi,rcx + mov rsi,rdx + + +$L$fe64_to_body: + mov r8,QWORD[rsi] + mov r9,QWORD[8+rsi] + mov r10,QWORD[16+rsi] + mov r11,QWORD[24+rsi] + + + lea rax,[r11*1+r11] + sar r11,63 + shr rax,1 + and r11,19 + add r11,19 + + add r8,r11 + adc r9,0 + adc r10,0 + adc rax,0 + + lea r11,[rax*1+rax] + sar rax,63 + shr r11,1 + not rax + and rax,19 + + sub r8,rax + sbb r9,0 + sbb r10,0 + sbb r11,0 + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + +$L$fe64_to_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_x25519_fe64_tobytes: +DB 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101 +DB 115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 +DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +EXTERN __imp_RtlVirtualUnwind + + +ALIGN 16 +short_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea rax,[r10*1+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_x25519_fe51_mul wrt ..imagebase + DD $L$SEH_end_x25519_fe51_mul wrt ..imagebase + DD $L$SEH_info_x25519_fe51_mul wrt ..imagebase + + DD $L$SEH_begin_x25519_fe51_sqr wrt ..imagebase + DD $L$SEH_end_x25519_fe51_sqr wrt ..imagebase + DD $L$SEH_info_x25519_fe51_sqr wrt ..imagebase + + DD $L$SEH_begin_x25519_fe51_mul121666 wrt ..imagebase + DD $L$SEH_end_x25519_fe51_mul121666 wrt ..imagebase + DD $L$SEH_info_x25519_fe51_mul121666 wrt ..imagebase + DD $L$SEH_begin_x25519_fe64_mul wrt ..imagebase + DD $L$SEH_end_x25519_fe64_mul wrt ..imagebase + DD $L$SEH_info_x25519_fe64_mul wrt ..imagebase + + DD $L$SEH_begin_x25519_fe64_sqr wrt ..imagebase + DD $L$SEH_end_x25519_fe64_sqr wrt ..imagebase + DD $L$SEH_info_x25519_fe64_sqr wrt ..imagebase + + DD $L$SEH_begin_x25519_fe64_mul121666 wrt ..imagebase + DD $L$SEH_end_x25519_fe64_mul121666 wrt ..imagebase + DD $L$SEH_info_x25519_fe64_mul121666 wrt ..imagebase + + DD $L$SEH_begin_x25519_fe64_add wrt ..imagebase + DD $L$SEH_end_x25519_fe64_add wrt ..imagebase + DD $L$SEH_info_x25519_fe64_add wrt ..imagebase + + DD $L$SEH_begin_x25519_fe64_sub wrt ..imagebase + DD $L$SEH_end_x25519_fe64_sub wrt ..imagebase + DD $L$SEH_info_x25519_fe64_sub wrt ..imagebase + + DD $L$SEH_begin_x25519_fe64_tobytes wrt ..imagebase + DD $L$SEH_end_x25519_fe64_tobytes wrt ..imagebase + DD $L$SEH_info_x25519_fe64_tobytes wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_x25519_fe51_mul: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$fe51_mul_body wrt ..imagebase,$L$fe51_mul_epilogue wrt ..imagebase + DD 88,0 +$L$SEH_info_x25519_fe51_sqr: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$fe51_sqr_body wrt ..imagebase,$L$fe51_sqr_epilogue wrt ..imagebase + DD 88,0 +$L$SEH_info_x25519_fe51_mul121666: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$fe51_mul121666_body wrt ..imagebase,$L$fe51_mul121666_epilogue wrt ..imagebase + DD 88,0 +$L$SEH_info_x25519_fe64_mul: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$fe64_mul_body wrt ..imagebase,$L$fe64_mul_epilogue wrt ..imagebase + DD 72,0 +$L$SEH_info_x25519_fe64_sqr: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$fe64_sqr_body wrt ..imagebase,$L$fe64_sqr_epilogue wrt ..imagebase + DD 72,0 +$L$SEH_info_x25519_fe64_mul121666: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$fe64_mul121666_body wrt ..imagebase,$L$fe64_mul121666_epilogue wrt ..imagebase +$L$SEH_info_x25519_fe64_add: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$fe64_add_body wrt ..imagebase,$L$fe64_add_epilogue wrt ..imagebase +$L$SEH_info_x25519_fe64_sub: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$fe64_sub_body wrt ..imagebase,$L$fe64_sub_epilogue wrt ..imagebase +$L$SEH_info_x25519_fe64_tobytes: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$fe64_to_body wrt ..imagebase,$L$fe64_to_epilogue wrt ..imagebase diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/include/internal/dso_conf.h b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/include/internal/dso_conf.h index 289768d956..dc8306eda3 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/include/internal/dso_conf.h +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/include/internal/dso_conf.h @@ -1,7 +1,7 @@ /* WARNING: do not edit! */ /* Generated by makefile from crypto/include/internal/dso_conf.h.in */ /* - * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the OpenSSL license (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -12,5 +12,7 @@ #ifndef HEADER_DSO_CONF_H # define HEADER_DSO_CONF_H +# define DSO_WIN32 # define DSO_EXTENSION ".dll" + #endif diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/md5/md5-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/md5/md5-x86_64.asm index 8bb2cfb77f..3fd339153b 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/md5/md5-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/md5/md5-x86_64.asm @@ -18,11 +18,17 @@ $L$SEH_begin_md5_block_asm_data_order: mov rdx,r8 + push rbp + push rbx + push r12 + push r14 + push r15 + $L$prologue: @@ -669,15 +675,22 @@ $L$end: mov DWORD[12+rbp],edx mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r12,QWORD[16+rsp] + mov rbx,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + add rsp,40 + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_md5_block_asm_data_order: EXTERN __imp_RtlVirtualUnwind diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/modes/aesni-gcm-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/modes/aesni-gcm-x86_64.asm index 741a9e4f3a..b1d8332457 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/modes/aesni-gcm-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/modes/aesni-gcm-x86_64.asm @@ -36,23 +36,6 @@ $L$resume_ctr32: vpxor xmm12,xmm12,xmm15 vmovups xmm2,XMMWORD[((16-128))+rcx] vpclmulqdq xmm6,xmm7,xmm3,0x01 - - - - - - - - - - - - - - - - - xor r12,r12 cmp r15,r14 @@ -349,20 +332,25 @@ $L$SEH_begin_aesni_gcm_decrypt: mov r9,QWORD[48+rsp] - xor r10,r10 - - + xor r10,r10 cmp rdx,0x60 jb NEAR $L$gcm_dec_abort lea rax,[rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-168))+rsp] movaps XMMWORD[(-216)+rax],xmm6 movaps XMMWORD[(-200)+rax],xmm7 @@ -403,15 +391,7 @@ $L$dec_no_key_aliasing: vmovdqu xmm7,XMMWORD[80+rdi] lea r14,[rdi] vmovdqu xmm4,XMMWORD[64+rdi] - - - - - - - lea r15,[((-192))+rdx*1+rdi] - vmovdqu xmm5,XMMWORD[48+rdi] shr rdx,4 xor r10,r10 @@ -454,17 +434,25 @@ $L$dec_no_key_aliasing: movaps xmm14,XMMWORD[((-88))+rax] movaps xmm15,XMMWORD[((-72))+rax] mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$gcm_dec_abort: mov rax,r10 mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_gcm_decrypt: ALIGN 32 @@ -573,21 +561,25 @@ $L$SEH_begin_aesni_gcm_encrypt: mov r9,QWORD[48+rsp] - xor r10,r10 - - - + xor r10,r10 cmp rdx,0x60*3 jb NEAR $L$gcm_enc_abort lea rax,[rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-168))+rsp] movaps XMMWORD[(-216)+rax],xmm6 movaps XMMWORD[(-200)+rax],xmm7 @@ -623,16 +615,7 @@ $L$gcm_enc_body: $L$enc_no_key_aliasing: lea r14,[rsi] - - - - - - - - lea r15,[((-192))+rdx*1+rsi] - shr rdx,4 call _aesni_ctr32_6x @@ -844,17 +827,25 @@ $L$enc_no_key_aliasing: movaps xmm14,XMMWORD[((-88))+rax] movaps xmm15,XMMWORD[((-72))+rax] mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$gcm_enc_abort: mov rax,r10 mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_aesni_gcm_encrypt: ALIGN 64 $L$bswap_mask: diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/modes/ghash-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/modes/ghash-x86_64.asm index e5204bf81d..b227e2400e 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/modes/ghash-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/modes/ghash-x86_64.asm @@ -18,9 +18,21 @@ $L$SEH_begin_gcm_gmult_4bit: mov rsi,rdx + push rbx + push rbp + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,280 + $L$gmult_prologue: movzx r8,BYTE[15+rdi] @@ -97,12 +109,17 @@ $L$break1: mov QWORD[8+rdi],r8 mov QWORD[rdi],r9 - mov rbx,QWORD[16+rsp] - lea rsp,[24+rsp] + lea rsi,[((280+48))+rsp] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$gmult_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_gcm_gmult_4bit: global gcm_ghash_4bit @@ -118,13 +135,21 @@ $L$SEH_begin_gcm_ghash_4bit: mov rcx,r9 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,280 + $L$ghash_prologue: mov r14,rdx mov r15,rcx @@ -669,18 +694,27 @@ $L$outer_loop: mov QWORD[8+rdi],r8 mov QWORD[rdi],r9 - lea rsi,[280+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + lea rsi,[((280+48))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$ghash_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_gcm_ghash_4bit: global gcm_init_clmul @@ -1916,14 +1950,20 @@ se_handler: cmp rbx,r10 jae NEAR $L$in_prologue - lea rax,[24+rax] + lea rax,[((48+280))+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 $L$in_prologue: mov rdi,QWORD[8+rax] diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm index 15fde3cba6..5717654508 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm @@ -45,6 +45,11 @@ $L$SEH_begin_poly1305_init: lea rax,[poly1305_blocks_avx2] bt r9,37 cmovc r10,rax + mov rax,2149646336 + shr r9,32 + and r9,rax + cmp r9,rax + je NEAR $L$init_base2_44 mov rax,0x0ffffffc0fffffff mov rcx,0x0ffffffc0ffffffc and rax,QWORD[rsi] @@ -73,16 +78,23 @@ $L$SEH_begin_poly1305_blocks: mov rcx,r9 + $L$blocks: shr rdx,4 jz NEAR $L$no_data push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$blocks_body: mov r15,rdx @@ -153,17 +165,25 @@ $L$oop: mov QWORD[16+rdi],rbp mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rsp,[48+rsp] + $L$no_data: $L$blocks_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_poly1305_blocks: @@ -420,6 +440,7 @@ $L$SEH_begin_poly1305_blocks_avx: mov rcx,r9 + mov r8d,DWORD[20+rdi] cmp rdx,128 jae NEAR $L$blocks_avx @@ -439,11 +460,17 @@ $L$blocks_avx: jz NEAR $L$even_avx push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$blocks_avx_body: mov r15,rdx @@ -546,26 +573,41 @@ $L$store_base2_26_avx: ALIGN 16 $L$done_avx: mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rsp,[48+rsp] + $L$no_data_avx: $L$blocks_avx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + ALIGN 32 $L$base2_64_avx: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$base2_64_avx_body: mov r15,rdx @@ -625,18 +667,27 @@ $L$proceed_avx: mov rdx,r15 mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rax,[48+rsp] lea rsp,[48+rsp] + $L$base2_64_avx_epilogue: jmp NEAR $L$do_avx + ALIGN 32 $L$even_avx: + vmovd xmm0,DWORD[rdi] vmovd xmm1,DWORD[4+rdi] vmovd xmm2,DWORD[8+rdi] @@ -1230,6 +1281,7 @@ $L$do_avx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_poly1305_blocks_avx: @@ -1310,6 +1362,7 @@ $L$SEH_begin_poly1305_blocks_avx2: mov rcx,r9 + mov r8d,DWORD[20+rdi] cmp rdx,128 jae NEAR $L$blocks_avx2 @@ -1329,11 +1382,17 @@ $L$blocks_avx2: jz NEAR $L$even_avx2 push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$blocks_avx2_body: mov r15,rdx @@ -1442,26 +1501,41 @@ $L$store_base2_26_avx2: ALIGN 16 $L$done_avx2: mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rsp,[48+rsp] + $L$no_data_avx2: $L$blocks_avx2_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + ALIGN 32 $L$base2_64_avx2: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$base2_64_avx2_body: mov r15,rdx @@ -1524,20 +1598,32 @@ $L$init_avx2: $L$proceed_avx2: mov rdx,r15 + mov r10d,DWORD[((OPENSSL_ia32cap_P+8))] + mov r11d,3221291008 mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rax,[48+rsp] lea rsp,[48+rsp] + $L$base2_64_avx2_epilogue: jmp NEAR $L$do_avx2 + ALIGN 32 $L$even_avx2: + + mov r10d,DWORD[((OPENSSL_ia32cap_P+8))] vmovd xmm0,DWORD[rdi] vmovd xmm1,DWORD[4+rdi] vmovd xmm2,DWORD[8+rdi] @@ -1545,6 +1631,12 @@ $L$even_avx2: vmovd xmm4,DWORD[16+rdi] $L$do_avx2: + cmp rdx,512 + jb NEAR $L$skip_avx512 + and r10d,r11d + test r10d,65536 + jnz NEAR $L$blocks_avx512 +$L$skip_avx512: lea r11,[((-248))+rsp] sub rsp,0x1c8 vmovdqa XMMWORD[80+r11],xmm6 @@ -1558,8 +1650,9 @@ $L$do_avx2: vmovdqa XMMWORD[208+r11],xmm14 vmovdqa XMMWORD[224+r11],xmm15 $L$do_avx2_body: - lea rdi,[((48+64))+rdi] lea rcx,[$L$const] + lea rdi,[((48+64))+rdi] + vmovdqa ymm7,YMMWORD[96+rcx] vmovdqu xmm9,XMMWORD[((-64))+rdi] @@ -1569,36 +1662,28 @@ $L$do_avx2_body: vmovdqu xmm11,XMMWORD[((-16))+rdi] vmovdqu xmm12,XMMWORD[rdi] vmovdqu xmm13,XMMWORD[16+rdi] + lea rax,[144+rsp] vmovdqu xmm14,XMMWORD[32+rdi] - vpermq ymm9,ymm9,0x15 + vpermd ymm9,ymm7,ymm9 vmovdqu xmm15,XMMWORD[48+rdi] - vpermq ymm10,ymm10,0x15 - vpshufd ymm9,ymm9,0xc8 + vpermd ymm10,ymm7,ymm10 vmovdqu xmm5,XMMWORD[64+rdi] - vpermq ymm6,ymm6,0x15 - vpshufd ymm10,ymm10,0xc8 + vpermd ymm6,ymm7,ymm6 vmovdqa YMMWORD[rsp],ymm9 - vpermq ymm11,ymm11,0x15 - vpshufd ymm6,ymm6,0xc8 - vmovdqa YMMWORD[32+rsp],ymm10 - vpermq ymm12,ymm12,0x15 - vpshufd ymm11,ymm11,0xc8 - vmovdqa YMMWORD[64+rsp],ymm6 - vpermq ymm13,ymm13,0x15 - vpshufd ymm12,ymm12,0xc8 - vmovdqa YMMWORD[96+rsp],ymm11 - vpermq ymm14,ymm14,0x15 - vpshufd ymm13,ymm13,0xc8 - vmovdqa YMMWORD[128+rsp],ymm12 - vpermq ymm15,ymm15,0x15 - vpshufd ymm14,ymm14,0xc8 - vmovdqa YMMWORD[160+rsp],ymm13 - vpermq ymm5,ymm5,0x15 - vpshufd ymm15,ymm15,0xc8 - vmovdqa YMMWORD[192+rsp],ymm14 - vpshufd ymm5,ymm5,0xc8 - vmovdqa YMMWORD[224+rsp],ymm15 - vmovdqa YMMWORD[256+rsp],ymm5 + vpermd ymm11,ymm7,ymm11 + vmovdqa YMMWORD[(32-144)+rax],ymm10 + vpermd ymm12,ymm7,ymm12 + vmovdqa YMMWORD[(64-144)+rax],ymm6 + vpermd ymm13,ymm7,ymm13 + vmovdqa YMMWORD[(96-144)+rax],ymm11 + vpermd ymm14,ymm7,ymm14 + vmovdqa YMMWORD[(128-144)+rax],ymm12 + vpermd ymm15,ymm7,ymm15 + vmovdqa YMMWORD[(160-144)+rax],ymm13 + vpermd ymm5,ymm7,ymm5 + vmovdqa YMMWORD[(192-144)+rax],ymm14 + vmovdqa YMMWORD[(224-144)+rax],ymm15 + vmovdqa YMMWORD[(256-144)+rax],ymm5 vmovdqa ymm5,YMMWORD[64+rcx] @@ -1625,7 +1710,6 @@ $L$do_avx2_body: vpand ymm10,ymm10,ymm5 vpor ymm6,ymm6,YMMWORD[32+rcx] - lea rax,[144+rsp] vpaddq ymm2,ymm9,ymm2 sub rdx,64 jz NEAR $L$tail_avx2 @@ -1935,7 +2019,1593 @@ $L$do_avx2_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_poly1305_blocks_avx2: + +ALIGN 32 +poly1305_blocks_avx512: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_blocks_avx512: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + +$L$blocks_avx512: + mov eax,15 + kmovw k2,eax + lea r11,[((-248))+rsp] + sub rsp,0x1c8 + vmovdqa XMMWORD[80+r11],xmm6 + vmovdqa XMMWORD[96+r11],xmm7 + vmovdqa XMMWORD[112+r11],xmm8 + vmovdqa XMMWORD[128+r11],xmm9 + vmovdqa XMMWORD[144+r11],xmm10 + vmovdqa XMMWORD[160+r11],xmm11 + vmovdqa XMMWORD[176+r11],xmm12 + vmovdqa XMMWORD[192+r11],xmm13 + vmovdqa XMMWORD[208+r11],xmm14 + vmovdqa XMMWORD[224+r11],xmm15 +$L$do_avx512_body: + lea rcx,[$L$const] + lea rdi,[((48+64))+rdi] + vmovdqa ymm9,YMMWORD[96+rcx] + + + vmovdqu xmm11,XMMWORD[((-64))+rdi] + and rsp,-512 + vmovdqu xmm12,XMMWORD[((-48))+rdi] + mov rax,0x20 + vmovdqu xmm7,XMMWORD[((-32))+rdi] + vmovdqu xmm13,XMMWORD[((-16))+rdi] + vmovdqu xmm8,XMMWORD[rdi] + vmovdqu xmm14,XMMWORD[16+rdi] + vmovdqu xmm10,XMMWORD[32+rdi] + vmovdqu xmm15,XMMWORD[48+rdi] + vmovdqu xmm6,XMMWORD[64+rdi] + vpermd zmm16,zmm9,zmm11 + vpbroadcastq zmm5,QWORD[64+rcx] + vpermd zmm17,zmm9,zmm12 + vpermd zmm21,zmm9,zmm7 + vpermd zmm18,zmm9,zmm13 + vmovdqa64 ZMMWORD[rsp]{k2},zmm16 + vpsrlq zmm7,zmm16,32 + vpermd zmm22,zmm9,zmm8 + vmovdqu64 ZMMWORD[rax*1+rsp]{k2},zmm17 + vpsrlq zmm8,zmm17,32 + vpermd zmm19,zmm9,zmm14 + vmovdqa64 ZMMWORD[64+rsp]{k2},zmm21 + vpermd zmm23,zmm9,zmm10 + vpermd zmm20,zmm9,zmm15 + vmovdqu64 ZMMWORD[64+rax*1+rsp]{k2},zmm18 + vpermd zmm24,zmm9,zmm6 + vmovdqa64 ZMMWORD[128+rsp]{k2},zmm22 + vmovdqu64 ZMMWORD[128+rax*1+rsp]{k2},zmm19 + vmovdqa64 ZMMWORD[192+rsp]{k2},zmm23 + vmovdqu64 ZMMWORD[192+rax*1+rsp]{k2},zmm20 + vmovdqa64 ZMMWORD[256+rsp]{k2},zmm24 + + + + + + + + + + + vpmuludq zmm11,zmm16,zmm7 + vpmuludq zmm12,zmm17,zmm7 + vpmuludq zmm13,zmm18,zmm7 + vpmuludq zmm14,zmm19,zmm7 + vpmuludq zmm15,zmm20,zmm7 + vpsrlq zmm9,zmm18,32 + + vpmuludq zmm25,zmm24,zmm8 + vpmuludq zmm26,zmm16,zmm8 + vpmuludq zmm27,zmm17,zmm8 + vpmuludq zmm28,zmm18,zmm8 + vpmuludq zmm29,zmm19,zmm8 + vpsrlq zmm10,zmm19,32 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm12,zmm12,zmm26 + vpaddq zmm13,zmm13,zmm27 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + + vpmuludq zmm25,zmm23,zmm9 + vpmuludq zmm26,zmm24,zmm9 + vpmuludq zmm28,zmm17,zmm9 + vpmuludq zmm29,zmm18,zmm9 + vpmuludq zmm27,zmm16,zmm9 + vpsrlq zmm6,zmm20,32 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm12,zmm12,zmm26 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm13,zmm13,zmm27 + + vpmuludq zmm25,zmm22,zmm10 + vpmuludq zmm28,zmm16,zmm10 + vpmuludq zmm29,zmm17,zmm10 + vpmuludq zmm26,zmm23,zmm10 + vpmuludq zmm27,zmm24,zmm10 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm12,zmm12,zmm26 + vpaddq zmm13,zmm13,zmm27 + + vpmuludq zmm28,zmm24,zmm6 + vpmuludq zmm29,zmm16,zmm6 + vpmuludq zmm25,zmm21,zmm6 + vpmuludq zmm26,zmm22,zmm6 + vpmuludq zmm27,zmm23,zmm6 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm12,zmm12,zmm26 + vpaddq zmm13,zmm13,zmm27 + + + + vmovdqu64 zmm10,ZMMWORD[rsi] + vmovdqu64 zmm6,ZMMWORD[64+rsi] + lea rsi,[128+rsi] + + + + + vpsrlq zmm28,zmm14,26 + vpandq zmm14,zmm14,zmm5 + vpaddq zmm15,zmm15,zmm28 + + vpsrlq zmm25,zmm11,26 + vpandq zmm11,zmm11,zmm5 + vpaddq zmm12,zmm12,zmm25 + + vpsrlq zmm29,zmm15,26 + vpandq zmm15,zmm15,zmm5 + + vpsrlq zmm26,zmm12,26 + vpandq zmm12,zmm12,zmm5 + vpaddq zmm13,zmm13,zmm26 + + vpaddq zmm11,zmm11,zmm29 + vpsllq zmm29,zmm29,2 + vpaddq zmm11,zmm11,zmm29 + + vpsrlq zmm27,zmm13,26 + vpandq zmm13,zmm13,zmm5 + vpaddq zmm14,zmm14,zmm27 + + vpsrlq zmm25,zmm11,26 + vpandq zmm11,zmm11,zmm5 + vpaddq zmm12,zmm12,zmm25 + + vpsrlq zmm28,zmm14,26 + vpandq zmm14,zmm14,zmm5 + vpaddq zmm15,zmm15,zmm28 + + + + + + vpunpcklqdq zmm7,zmm10,zmm6 + vpunpckhqdq zmm6,zmm10,zmm6 + + + + + + + vmovdqa32 zmm25,ZMMWORD[128+rcx] + mov eax,0x7777 + kmovw k1,eax + + vpermd zmm16,zmm25,zmm16 + vpermd zmm17,zmm25,zmm17 + vpermd zmm18,zmm25,zmm18 + vpermd zmm19,zmm25,zmm19 + vpermd zmm20,zmm25,zmm20 + + vpermd zmm16{k1},zmm25,zmm11 + vpermd zmm17{k1},zmm25,zmm12 + vpermd zmm18{k1},zmm25,zmm13 + vpermd zmm19{k1},zmm25,zmm14 + vpermd zmm20{k1},zmm25,zmm15 + + vpslld zmm21,zmm17,2 + vpslld zmm22,zmm18,2 + vpslld zmm23,zmm19,2 + vpslld zmm24,zmm20,2 + vpaddd zmm21,zmm21,zmm17 + vpaddd zmm22,zmm22,zmm18 + vpaddd zmm23,zmm23,zmm19 + vpaddd zmm24,zmm24,zmm20 + + vpbroadcastq zmm30,QWORD[32+rcx] + + vpsrlq zmm9,zmm7,52 + vpsllq zmm10,zmm6,12 + vporq zmm9,zmm9,zmm10 + vpsrlq zmm8,zmm7,26 + vpsrlq zmm10,zmm6,14 + vpsrlq zmm6,zmm6,40 + vpandq zmm9,zmm9,zmm5 + vpandq zmm7,zmm7,zmm5 + + + + + vpaddq zmm2,zmm9,zmm2 + sub rdx,192 + jbe NEAR $L$tail_avx512 + jmp NEAR $L$oop_avx512 + +ALIGN 32 +$L$oop_avx512: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + vpmuludq zmm14,zmm17,zmm2 + vpaddq zmm0,zmm7,zmm0 + vpmuludq zmm15,zmm18,zmm2 + vpandq zmm8,zmm8,zmm5 + vpmuludq zmm11,zmm23,zmm2 + vpandq zmm10,zmm10,zmm5 + vpmuludq zmm12,zmm24,zmm2 + vporq zmm6,zmm6,zmm30 + vpmuludq zmm13,zmm16,zmm2 + vpaddq zmm1,zmm8,zmm1 + vpaddq zmm3,zmm10,zmm3 + vpaddq zmm4,zmm6,zmm4 + + vmovdqu64 zmm10,ZMMWORD[rsi] + vmovdqu64 zmm6,ZMMWORD[64+rsi] + lea rsi,[128+rsi] + vpmuludq zmm28,zmm19,zmm0 + vpmuludq zmm29,zmm20,zmm0 + vpmuludq zmm25,zmm16,zmm0 + vpmuludq zmm26,zmm17,zmm0 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm12,zmm12,zmm26 + + vpmuludq zmm28,zmm18,zmm1 + vpmuludq zmm29,zmm19,zmm1 + vpmuludq zmm25,zmm24,zmm1 + vpmuludq zmm27,zmm18,zmm0 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm13,zmm13,zmm27 + + vpunpcklqdq zmm7,zmm10,zmm6 + vpunpckhqdq zmm6,zmm10,zmm6 + + vpmuludq zmm28,zmm16,zmm3 + vpmuludq zmm29,zmm17,zmm3 + vpmuludq zmm26,zmm16,zmm1 + vpmuludq zmm27,zmm17,zmm1 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm12,zmm12,zmm26 + vpaddq zmm13,zmm13,zmm27 + + vpmuludq zmm28,zmm24,zmm4 + vpmuludq zmm29,zmm16,zmm4 + vpmuludq zmm25,zmm22,zmm3 + vpmuludq zmm26,zmm23,zmm3 + vpaddq zmm14,zmm14,zmm28 + vpmuludq zmm27,zmm24,zmm3 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm12,zmm12,zmm26 + vpaddq zmm13,zmm13,zmm27 + + vpmuludq zmm25,zmm21,zmm4 + vpmuludq zmm26,zmm22,zmm4 + vpmuludq zmm27,zmm23,zmm4 + vpaddq zmm0,zmm11,zmm25 + vpaddq zmm1,zmm12,zmm26 + vpaddq zmm2,zmm13,zmm27 + + + + + vpsrlq zmm9,zmm7,52 + vpsllq zmm10,zmm6,12 + + vpsrlq zmm3,zmm14,26 + vpandq zmm14,zmm14,zmm5 + vpaddq zmm4,zmm15,zmm3 + + vporq zmm9,zmm9,zmm10 + + vpsrlq zmm11,zmm0,26 + vpandq zmm0,zmm0,zmm5 + vpaddq zmm1,zmm1,zmm11 + + vpandq zmm9,zmm9,zmm5 + + vpsrlq zmm15,zmm4,26 + vpandq zmm4,zmm4,zmm5 + + vpsrlq zmm12,zmm1,26 + vpandq zmm1,zmm1,zmm5 + vpaddq zmm2,zmm2,zmm12 + + vpaddq zmm0,zmm0,zmm15 + vpsllq zmm15,zmm15,2 + vpaddq zmm0,zmm0,zmm15 + + vpaddq zmm2,zmm2,zmm9 + vpsrlq zmm8,zmm7,26 + + vpsrlq zmm13,zmm2,26 + vpandq zmm2,zmm2,zmm5 + vpaddq zmm3,zmm14,zmm13 + + vpsrlq zmm10,zmm6,14 + + vpsrlq zmm11,zmm0,26 + vpandq zmm0,zmm0,zmm5 + vpaddq zmm1,zmm1,zmm11 + + vpsrlq zmm6,zmm6,40 + + vpsrlq zmm14,zmm3,26 + vpandq zmm3,zmm3,zmm5 + vpaddq zmm4,zmm4,zmm14 + + vpandq zmm7,zmm7,zmm5 + + + + + sub rdx,128 + ja NEAR $L$oop_avx512 + +$L$tail_avx512: + + + + + + vpsrlq zmm16,zmm16,32 + vpsrlq zmm17,zmm17,32 + vpsrlq zmm18,zmm18,32 + vpsrlq zmm23,zmm23,32 + vpsrlq zmm24,zmm24,32 + vpsrlq zmm19,zmm19,32 + vpsrlq zmm20,zmm20,32 + vpsrlq zmm21,zmm21,32 + vpsrlq zmm22,zmm22,32 + + + + lea rsi,[rdx*1+rsi] + + + vpaddq zmm0,zmm7,zmm0 + + vpmuludq zmm14,zmm17,zmm2 + vpmuludq zmm15,zmm18,zmm2 + vpmuludq zmm11,zmm23,zmm2 + vpandq zmm8,zmm8,zmm5 + vpmuludq zmm12,zmm24,zmm2 + vpandq zmm10,zmm10,zmm5 + vpmuludq zmm13,zmm16,zmm2 + vporq zmm6,zmm6,zmm30 + vpaddq zmm1,zmm8,zmm1 + vpaddq zmm3,zmm10,zmm3 + vpaddq zmm4,zmm6,zmm4 + + vmovdqu xmm7,XMMWORD[rsi] + vpmuludq zmm28,zmm19,zmm0 + vpmuludq zmm29,zmm20,zmm0 + vpmuludq zmm25,zmm16,zmm0 + vpmuludq zmm26,zmm17,zmm0 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm12,zmm12,zmm26 + + vmovdqu xmm8,XMMWORD[16+rsi] + vpmuludq zmm28,zmm18,zmm1 + vpmuludq zmm29,zmm19,zmm1 + vpmuludq zmm25,zmm24,zmm1 + vpmuludq zmm27,zmm18,zmm0 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm13,zmm13,zmm27 + + vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1 + vpmuludq zmm28,zmm16,zmm3 + vpmuludq zmm29,zmm17,zmm3 + vpmuludq zmm26,zmm16,zmm1 + vpmuludq zmm27,zmm17,zmm1 + vpaddq zmm14,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm12,zmm12,zmm26 + vpaddq zmm13,zmm13,zmm27 + + vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1 + vpmuludq zmm28,zmm24,zmm4 + vpmuludq zmm29,zmm16,zmm4 + vpmuludq zmm25,zmm22,zmm3 + vpmuludq zmm26,zmm23,zmm3 + vpmuludq zmm27,zmm24,zmm3 + vpaddq zmm3,zmm14,zmm28 + vpaddq zmm15,zmm15,zmm29 + vpaddq zmm11,zmm11,zmm25 + vpaddq zmm12,zmm12,zmm26 + vpaddq zmm13,zmm13,zmm27 + + vpmuludq zmm25,zmm21,zmm4 + vpmuludq zmm26,zmm22,zmm4 + vpmuludq zmm27,zmm23,zmm4 + vpaddq zmm0,zmm11,zmm25 + vpaddq zmm1,zmm12,zmm26 + vpaddq zmm2,zmm13,zmm27 + + + + + mov eax,1 + vpermq zmm14,zmm3,0xb1 + vpermq zmm4,zmm15,0xb1 + vpermq zmm11,zmm0,0xb1 + vpermq zmm12,zmm1,0xb1 + vpermq zmm13,zmm2,0xb1 + vpaddq zmm3,zmm3,zmm14 + vpaddq zmm4,zmm4,zmm15 + vpaddq zmm0,zmm0,zmm11 + vpaddq zmm1,zmm1,zmm12 + vpaddq zmm2,zmm2,zmm13 + + kmovw k3,eax + vpermq zmm14,zmm3,0x2 + vpermq zmm15,zmm4,0x2 + vpermq zmm11,zmm0,0x2 + vpermq zmm12,zmm1,0x2 + vpermq zmm13,zmm2,0x2 + vpaddq zmm3,zmm3,zmm14 + vpaddq zmm4,zmm4,zmm15 + vpaddq zmm0,zmm0,zmm11 + vpaddq zmm1,zmm1,zmm12 + vpaddq zmm2,zmm2,zmm13 + + vextracti64x4 ymm14,zmm3,0x1 + vextracti64x4 ymm15,zmm4,0x1 + vextracti64x4 ymm11,zmm0,0x1 + vextracti64x4 ymm12,zmm1,0x1 + vextracti64x4 ymm13,zmm2,0x1 + vpaddq zmm3{k3}{z},zmm3,zmm14 + vpaddq zmm4{k3}{z},zmm4,zmm15 + vpaddq zmm0{k3}{z},zmm0,zmm11 + vpaddq zmm1{k3}{z},zmm1,zmm12 + vpaddq zmm2{k3}{z},zmm2,zmm13 + + + + vpsrlq ymm14,ymm3,26 + vpand ymm3,ymm3,ymm5 + vpsrldq ymm9,ymm7,6 + vpsrldq ymm10,ymm8,6 + vpunpckhqdq ymm6,ymm7,ymm8 + vpaddq ymm4,ymm4,ymm14 + + vpsrlq ymm11,ymm0,26 + vpand ymm0,ymm0,ymm5 + vpunpcklqdq ymm9,ymm9,ymm10 + vpunpcklqdq ymm7,ymm7,ymm8 + vpaddq ymm1,ymm1,ymm11 + + vpsrlq ymm15,ymm4,26 + vpand ymm4,ymm4,ymm5 + + vpsrlq ymm12,ymm1,26 + vpand ymm1,ymm1,ymm5 + vpsrlq ymm10,ymm9,30 + vpsrlq ymm9,ymm9,4 + vpaddq ymm2,ymm2,ymm12 + + vpaddq ymm0,ymm0,ymm15 + vpsllq ymm15,ymm15,2 + vpsrlq ymm8,ymm7,26 + vpsrlq ymm6,ymm6,40 + vpaddq ymm0,ymm0,ymm15 + + vpsrlq ymm13,ymm2,26 + vpand ymm2,ymm2,ymm5 + vpand ymm9,ymm9,ymm5 + vpand ymm7,ymm7,ymm5 + vpaddq ymm3,ymm3,ymm13 + + vpsrlq ymm11,ymm0,26 + vpand ymm0,ymm0,ymm5 + vpaddq ymm2,ymm9,ymm2 + vpand ymm8,ymm8,ymm5 + vpaddq ymm1,ymm1,ymm11 + + vpsrlq ymm14,ymm3,26 + vpand ymm3,ymm3,ymm5 + vpand ymm10,ymm10,ymm5 + vpor ymm6,ymm6,YMMWORD[32+rcx] + vpaddq ymm4,ymm4,ymm14 + + lea rax,[144+rsp] + add rdx,64 + jnz NEAR $L$tail_avx2 + + vpsubq ymm2,ymm2,ymm9 + vmovd DWORD[(-112)+rdi],xmm0 + vmovd DWORD[(-108)+rdi],xmm1 + vmovd DWORD[(-104)+rdi],xmm2 + vmovd DWORD[(-100)+rdi],xmm3 + vmovd DWORD[(-96)+rdi],xmm4 + vzeroall + movdqa xmm6,XMMWORD[80+r11] + movdqa xmm7,XMMWORD[96+r11] + movdqa xmm8,XMMWORD[112+r11] + movdqa xmm9,XMMWORD[128+r11] + movdqa xmm10,XMMWORD[144+r11] + movdqa xmm11,XMMWORD[160+r11] + movdqa xmm12,XMMWORD[176+r11] + movdqa xmm13,XMMWORD[192+r11] + movdqa xmm14,XMMWORD[208+r11] + movdqa xmm15,XMMWORD[224+r11] + lea rsp,[248+r11] +$L$do_avx512_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_poly1305_blocks_avx512: + +ALIGN 32 +poly1305_init_base2_44: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_init_base2_44: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + xor rax,rax + mov QWORD[rdi],rax + mov QWORD[8+rdi],rax + mov QWORD[16+rdi],rax + +$L$init_base2_44: + lea r10,[poly1305_blocks_vpmadd52] + lea r11,[poly1305_emit_base2_44] + + mov rax,0x0ffffffc0fffffff + mov rcx,0x0ffffffc0ffffffc + and rax,QWORD[rsi] + mov r8,0x00000fffffffffff + and rcx,QWORD[8+rsi] + mov r9,0x00000fffffffffff + and r8,rax + shrd rax,rcx,44 + mov QWORD[40+rdi],r8 + and rax,r9 + shr rcx,24 + mov QWORD[48+rdi],rax + lea rax,[rax*4+rax] + mov QWORD[56+rdi],rcx + shl rax,2 + lea rcx,[rcx*4+rcx] + shl rcx,2 + mov QWORD[24+rdi],rax + mov QWORD[32+rdi],rcx + mov QWORD[64+rdi],-1 + mov QWORD[rdx],r10 + mov QWORD[8+rdx],r11 + mov eax,1 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_init_base2_44: + +ALIGN 32 +poly1305_blocks_vpmadd52: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_blocks_vpmadd52: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + shr rdx,4 + jz NEAR $L$no_data_vpmadd52 + + shl rcx,40 + mov r8,QWORD[64+rdi] + + + + + + + mov rax,3 + mov r10,1 + cmp rdx,4 + cmovae rax,r10 + test r8,r8 + cmovns rax,r10 + + and rax,rdx + jz NEAR $L$blocks_vpmadd52_4x + + sub rdx,rax + mov r10d,7 + mov r11d,1 + kmovw k7,r10d + lea r10,[$L$2_44_inp_permd] + kmovw k1,r11d + + vmovq xmm21,rcx + vmovdqa64 ymm19,YMMWORD[r10] + vmovdqa64 ymm20,YMMWORD[32+r10] + vpermq ymm21,ymm21,0xcf + vmovdqa64 ymm22,YMMWORD[64+r10] + + vmovdqu64 ymm16{k7}{z},[rdi] + vmovdqu64 ymm3{k7}{z},[40+rdi] + vmovdqu64 ymm4{k7}{z},[32+rdi] + vmovdqu64 ymm5{k7}{z},[24+rdi] + + vmovdqa64 ymm23,YMMWORD[96+r10] + vmovdqa64 ymm24,YMMWORD[128+r10] + + jmp NEAR $L$oop_vpmadd52 + +ALIGN 32 +$L$oop_vpmadd52: + vmovdqu32 xmm18,XMMWORD[rsi] + lea rsi,[16+rsi] + + vpermd ymm18,ymm19,ymm18 + vpsrlvq ymm18,ymm18,ymm20 + vpandq ymm18,ymm18,ymm22 + vporq ymm18,ymm18,ymm21 + + vpaddq ymm16,ymm16,ymm18 + + vpermq ymm0{k7}{z},ymm16,0 + vpermq ymm1{k7}{z},ymm16,85 + vpermq ymm2{k7}{z},ymm16,170 + + vpxord ymm16,ymm16,ymm16 + vpxord ymm17,ymm17,ymm17 + + vpmadd52luq ymm16,ymm0,ymm3 + vpmadd52huq ymm17,ymm0,ymm3 + + vpmadd52luq ymm16,ymm1,ymm4 + vpmadd52huq ymm17,ymm1,ymm4 + + vpmadd52luq ymm16,ymm2,ymm5 + vpmadd52huq ymm17,ymm2,ymm5 + + vpsrlvq ymm18,ymm16,ymm23 + vpsllvq ymm17,ymm17,ymm24 + vpandq ymm16,ymm16,ymm22 + + vpaddq ymm17,ymm17,ymm18 + + vpermq ymm17,ymm17,147 + + vpaddq ymm16,ymm16,ymm17 + + vpsrlvq ymm18,ymm16,ymm23 + vpandq ymm16,ymm16,ymm22 + + vpermq ymm18,ymm18,147 + + vpaddq ymm16,ymm16,ymm18 + + vpermq ymm18{k1}{z},ymm16,147 + + vpaddq ymm16,ymm16,ymm18 + vpsllq ymm18,ymm18,2 + + vpaddq ymm16,ymm16,ymm18 + + dec rax + jnz NEAR $L$oop_vpmadd52 + + vmovdqu64 YMMWORD[rdi]{k7},ymm16 + + test rdx,rdx + jnz NEAR $L$blocks_vpmadd52_4x + +$L$no_data_vpmadd52: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_blocks_vpmadd52: + +ALIGN 32 +poly1305_blocks_vpmadd52_4x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_blocks_vpmadd52_4x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + shr rdx,4 + jz NEAR $L$no_data_vpmadd52_4x + + shl rcx,40 + mov r8,QWORD[64+rdi] + +$L$blocks_vpmadd52_4x: + vpbroadcastq ymm31,rcx + + vmovdqa64 ymm28,YMMWORD[$L$x_mask44] + mov eax,5 + vmovdqa64 ymm29,YMMWORD[$L$x_mask42] + kmovw k1,eax + + test r8,r8 + js NEAR $L$init_vpmadd52 + + vmovq xmm0,QWORD[rdi] + vmovq xmm1,QWORD[8+rdi] + vmovq xmm2,QWORD[16+rdi] + + test rdx,3 + jnz NEAR $L$blocks_vpmadd52_2x_do + +$L$blocks_vpmadd52_4x_do: + vpbroadcastq ymm3,QWORD[64+rdi] + vpbroadcastq ymm4,QWORD[96+rdi] + vpbroadcastq ymm5,QWORD[128+rdi] + vpbroadcastq ymm16,QWORD[160+rdi] + +$L$blocks_vpmadd52_4x_key_loaded: + vpsllq ymm17,ymm5,2 + vpaddq ymm17,ymm17,ymm5 + vpsllq ymm17,ymm17,2 + + test rdx,7 + jz NEAR $L$blocks_vpmadd52_8x + + vmovdqu64 ymm26,YMMWORD[rsi] + vmovdqu64 ymm27,YMMWORD[32+rsi] + lea rsi,[64+rsi] + + vpunpcklqdq ymm25,ymm26,ymm27 + vpunpckhqdq ymm27,ymm26,ymm27 + + + + vpsrlq ymm26,ymm27,24 + vporq ymm26,ymm26,ymm31 + vpaddq ymm2,ymm2,ymm26 + vpandq ymm24,ymm25,ymm28 + vpsrlq ymm25,ymm25,44 + vpsllq ymm27,ymm27,20 + vporq ymm25,ymm25,ymm27 + vpandq ymm25,ymm25,ymm28 + + sub rdx,4 + jz NEAR $L$tail_vpmadd52_4x + jmp NEAR $L$oop_vpmadd52_4x + ud2 + +ALIGN 32 +$L$init_vpmadd52: + vmovq xmm16,QWORD[24+rdi] + vmovq xmm2,QWORD[56+rdi] + vmovq xmm17,QWORD[32+rdi] + vmovq xmm3,QWORD[40+rdi] + vmovq xmm4,QWORD[48+rdi] + + vmovdqa ymm0,ymm3 + vmovdqa ymm1,ymm4 + vmovdqa ymm5,ymm2 + + mov eax,2 + +$L$mul_init_vpmadd52: + vpxorq ymm18,ymm18,ymm18 + vpmadd52luq ymm18,ymm16,ymm2 + vpxorq ymm19,ymm19,ymm19 + vpmadd52huq ymm19,ymm16,ymm2 + vpxorq ymm20,ymm20,ymm20 + vpmadd52luq ymm20,ymm17,ymm2 + vpxorq ymm21,ymm21,ymm21 + vpmadd52huq ymm21,ymm17,ymm2 + vpxorq ymm22,ymm22,ymm22 + vpmadd52luq ymm22,ymm3,ymm2 + vpxorq ymm23,ymm23,ymm23 + vpmadd52huq ymm23,ymm3,ymm2 + + vpmadd52luq ymm18,ymm3,ymm0 + vpmadd52huq ymm19,ymm3,ymm0 + vpmadd52luq ymm20,ymm4,ymm0 + vpmadd52huq ymm21,ymm4,ymm0 + vpmadd52luq ymm22,ymm5,ymm0 + vpmadd52huq ymm23,ymm5,ymm0 + + vpmadd52luq ymm18,ymm17,ymm1 + vpmadd52huq ymm19,ymm17,ymm1 + vpmadd52luq ymm20,ymm3,ymm1 + vpmadd52huq ymm21,ymm3,ymm1 + vpmadd52luq ymm22,ymm4,ymm1 + vpmadd52huq ymm23,ymm4,ymm1 + + + + vpsrlq ymm30,ymm18,44 + vpsllq ymm19,ymm19,8 + vpandq ymm0,ymm18,ymm28 + vpaddq ymm19,ymm19,ymm30 + + vpaddq ymm20,ymm20,ymm19 + + vpsrlq ymm30,ymm20,44 + vpsllq ymm21,ymm21,8 + vpandq ymm1,ymm20,ymm28 + vpaddq ymm21,ymm21,ymm30 + + vpaddq ymm22,ymm22,ymm21 + + vpsrlq ymm30,ymm22,42 + vpsllq ymm23,ymm23,10 + vpandq ymm2,ymm22,ymm29 + vpaddq ymm23,ymm23,ymm30 + + vpaddq ymm0,ymm0,ymm23 + vpsllq ymm23,ymm23,2 + + vpaddq ymm0,ymm0,ymm23 + + vpsrlq ymm30,ymm0,44 + vpandq ymm0,ymm0,ymm28 + + vpaddq ymm1,ymm1,ymm30 + + dec eax + jz NEAR $L$done_init_vpmadd52 + + vpunpcklqdq ymm4,ymm1,ymm4 + vpbroadcastq xmm1,xmm1 + vpunpcklqdq ymm5,ymm2,ymm5 + vpbroadcastq xmm2,xmm2 + vpunpcklqdq ymm3,ymm0,ymm3 + vpbroadcastq xmm0,xmm0 + + vpsllq ymm16,ymm4,2 + vpsllq ymm17,ymm5,2 + vpaddq ymm16,ymm16,ymm4 + vpaddq ymm17,ymm17,ymm5 + vpsllq ymm16,ymm16,2 + vpsllq ymm17,ymm17,2 + + jmp NEAR $L$mul_init_vpmadd52 + ud2 + +ALIGN 32 +$L$done_init_vpmadd52: + vinserti128 ymm4,ymm1,xmm4,1 + vinserti128 ymm5,ymm2,xmm5,1 + vinserti128 ymm3,ymm0,xmm3,1 + + vpermq ymm4,ymm4,216 + vpermq ymm5,ymm5,216 + vpermq ymm3,ymm3,216 + + vpsllq ymm16,ymm4,2 + vpaddq ymm16,ymm16,ymm4 + vpsllq ymm16,ymm16,2 + + vmovq xmm0,QWORD[rdi] + vmovq xmm1,QWORD[8+rdi] + vmovq xmm2,QWORD[16+rdi] + + test rdx,3 + jnz NEAR $L$done_init_vpmadd52_2x + + vmovdqu64 YMMWORD[64+rdi],ymm3 + vpbroadcastq ymm3,xmm3 + vmovdqu64 YMMWORD[96+rdi],ymm4 + vpbroadcastq ymm4,xmm4 + vmovdqu64 YMMWORD[128+rdi],ymm5 + vpbroadcastq ymm5,xmm5 + vmovdqu64 YMMWORD[160+rdi],ymm16 + vpbroadcastq ymm16,xmm16 + + jmp NEAR $L$blocks_vpmadd52_4x_key_loaded + ud2 + +ALIGN 32 +$L$done_init_vpmadd52_2x: + vmovdqu64 YMMWORD[64+rdi],ymm3 + vpsrldq ymm3,ymm3,8 + vmovdqu64 YMMWORD[96+rdi],ymm4 + vpsrldq ymm4,ymm4,8 + vmovdqu64 YMMWORD[128+rdi],ymm5 + vpsrldq ymm5,ymm5,8 + vmovdqu64 YMMWORD[160+rdi],ymm16 + vpsrldq ymm16,ymm16,8 + jmp NEAR $L$blocks_vpmadd52_2x_key_loaded + ud2 + +ALIGN 32 +$L$blocks_vpmadd52_2x_do: + vmovdqu64 ymm5{k1}{z},[((128+8))+rdi] + vmovdqu64 ymm16{k1}{z},[((160+8))+rdi] + vmovdqu64 ymm3{k1}{z},[((64+8))+rdi] + vmovdqu64 ymm4{k1}{z},[((96+8))+rdi] + +$L$blocks_vpmadd52_2x_key_loaded: + vmovdqu64 ymm26,YMMWORD[rsi] + vpxorq ymm27,ymm27,ymm27 + lea rsi,[32+rsi] + + vpunpcklqdq ymm25,ymm26,ymm27 + vpunpckhqdq ymm27,ymm26,ymm27 + + + + vpsrlq ymm26,ymm27,24 + vporq ymm26,ymm26,ymm31 + vpaddq ymm2,ymm2,ymm26 + vpandq ymm24,ymm25,ymm28 + vpsrlq ymm25,ymm25,44 + vpsllq ymm27,ymm27,20 + vporq ymm25,ymm25,ymm27 + vpandq ymm25,ymm25,ymm28 + + jmp NEAR $L$tail_vpmadd52_2x + ud2 + +ALIGN 32 +$L$oop_vpmadd52_4x: + + vpaddq ymm0,ymm0,ymm24 + vpaddq ymm1,ymm1,ymm25 + + vpxorq ymm18,ymm18,ymm18 + vpmadd52luq ymm18,ymm16,ymm2 + vpxorq ymm19,ymm19,ymm19 + vpmadd52huq ymm19,ymm16,ymm2 + vpxorq ymm20,ymm20,ymm20 + vpmadd52luq ymm20,ymm17,ymm2 + vpxorq ymm21,ymm21,ymm21 + vpmadd52huq ymm21,ymm17,ymm2 + vpxorq ymm22,ymm22,ymm22 + vpmadd52luq ymm22,ymm3,ymm2 + vpxorq ymm23,ymm23,ymm23 + vpmadd52huq ymm23,ymm3,ymm2 + + vmovdqu64 ymm26,YMMWORD[rsi] + vmovdqu64 ymm27,YMMWORD[32+rsi] + lea rsi,[64+rsi] + vpmadd52luq ymm18,ymm3,ymm0 + vpmadd52huq ymm19,ymm3,ymm0 + vpmadd52luq ymm20,ymm4,ymm0 + vpmadd52huq ymm21,ymm4,ymm0 + vpmadd52luq ymm22,ymm5,ymm0 + vpmadd52huq ymm23,ymm5,ymm0 + + vpunpcklqdq ymm25,ymm26,ymm27 + vpunpckhqdq ymm27,ymm26,ymm27 + vpmadd52luq ymm18,ymm17,ymm1 + vpmadd52huq ymm19,ymm17,ymm1 + vpmadd52luq ymm20,ymm3,ymm1 + vpmadd52huq ymm21,ymm3,ymm1 + vpmadd52luq ymm22,ymm4,ymm1 + vpmadd52huq ymm23,ymm4,ymm1 + + + + vpsrlq ymm30,ymm18,44 + vpsllq ymm19,ymm19,8 + vpandq ymm0,ymm18,ymm28 + vpaddq ymm19,ymm19,ymm30 + + vpsrlq ymm26,ymm27,24 + vporq ymm26,ymm26,ymm31 + vpaddq ymm20,ymm20,ymm19 + + vpsrlq ymm30,ymm20,44 + vpsllq ymm21,ymm21,8 + vpandq ymm1,ymm20,ymm28 + vpaddq ymm21,ymm21,ymm30 + + vpandq ymm24,ymm25,ymm28 + vpsrlq ymm25,ymm25,44 + vpsllq ymm27,ymm27,20 + vpaddq ymm22,ymm22,ymm21 + + vpsrlq ymm30,ymm22,42 + vpsllq ymm23,ymm23,10 + vpandq ymm2,ymm22,ymm29 + vpaddq ymm23,ymm23,ymm30 + + vpaddq ymm2,ymm2,ymm26 + vpaddq ymm0,ymm0,ymm23 + vpsllq ymm23,ymm23,2 + + vpaddq ymm0,ymm0,ymm23 + vporq ymm25,ymm25,ymm27 + vpandq ymm25,ymm25,ymm28 + + vpsrlq ymm30,ymm0,44 + vpandq ymm0,ymm0,ymm28 + + vpaddq ymm1,ymm1,ymm30 + + sub rdx,4 + jnz NEAR $L$oop_vpmadd52_4x + +$L$tail_vpmadd52_4x: + vmovdqu64 ymm5,YMMWORD[128+rdi] + vmovdqu64 ymm16,YMMWORD[160+rdi] + vmovdqu64 ymm3,YMMWORD[64+rdi] + vmovdqu64 ymm4,YMMWORD[96+rdi] + +$L$tail_vpmadd52_2x: + vpsllq ymm17,ymm5,2 + vpaddq ymm17,ymm17,ymm5 + vpsllq ymm17,ymm17,2 + + + vpaddq ymm0,ymm0,ymm24 + vpaddq ymm1,ymm1,ymm25 + + vpxorq ymm18,ymm18,ymm18 + vpmadd52luq ymm18,ymm16,ymm2 + vpxorq ymm19,ymm19,ymm19 + vpmadd52huq ymm19,ymm16,ymm2 + vpxorq ymm20,ymm20,ymm20 + vpmadd52luq ymm20,ymm17,ymm2 + vpxorq ymm21,ymm21,ymm21 + vpmadd52huq ymm21,ymm17,ymm2 + vpxorq ymm22,ymm22,ymm22 + vpmadd52luq ymm22,ymm3,ymm2 + vpxorq ymm23,ymm23,ymm23 + vpmadd52huq ymm23,ymm3,ymm2 + + vpmadd52luq ymm18,ymm3,ymm0 + vpmadd52huq ymm19,ymm3,ymm0 + vpmadd52luq ymm20,ymm4,ymm0 + vpmadd52huq ymm21,ymm4,ymm0 + vpmadd52luq ymm22,ymm5,ymm0 + vpmadd52huq ymm23,ymm5,ymm0 + + vpmadd52luq ymm18,ymm17,ymm1 + vpmadd52huq ymm19,ymm17,ymm1 + vpmadd52luq ymm20,ymm3,ymm1 + vpmadd52huq ymm21,ymm3,ymm1 + vpmadd52luq ymm22,ymm4,ymm1 + vpmadd52huq ymm23,ymm4,ymm1 + + + + + mov eax,1 + kmovw k1,eax + vpsrldq ymm24,ymm18,8 + vpsrldq ymm0,ymm19,8 + vpsrldq ymm25,ymm20,8 + vpsrldq ymm1,ymm21,8 + vpaddq ymm18,ymm18,ymm24 + vpaddq ymm19,ymm19,ymm0 + vpsrldq ymm26,ymm22,8 + vpsrldq ymm2,ymm23,8 + vpaddq ymm20,ymm20,ymm25 + vpaddq ymm21,ymm21,ymm1 + vpermq ymm24,ymm18,0x2 + vpermq ymm0,ymm19,0x2 + vpaddq ymm22,ymm22,ymm26 + vpaddq ymm23,ymm23,ymm2 + + vpermq ymm25,ymm20,0x2 + vpermq ymm1,ymm21,0x2 + vpaddq ymm18{k1}{z},ymm18,ymm24 + vpaddq ymm19{k1}{z},ymm19,ymm0 + vpermq ymm26,ymm22,0x2 + vpermq ymm2,ymm23,0x2 + vpaddq ymm20{k1}{z},ymm20,ymm25 + vpaddq ymm21{k1}{z},ymm21,ymm1 + vpaddq ymm22{k1}{z},ymm22,ymm26 + vpaddq ymm23{k1}{z},ymm23,ymm2 + + + + vpsrlq ymm30,ymm18,44 + vpsllq ymm19,ymm19,8 + vpandq ymm0,ymm18,ymm28 + vpaddq ymm19,ymm19,ymm30 + + vpaddq ymm20,ymm20,ymm19 + + vpsrlq ymm30,ymm20,44 + vpsllq ymm21,ymm21,8 + vpandq ymm1,ymm20,ymm28 + vpaddq ymm21,ymm21,ymm30 + + vpaddq ymm22,ymm22,ymm21 + + vpsrlq ymm30,ymm22,42 + vpsllq ymm23,ymm23,10 + vpandq ymm2,ymm22,ymm29 + vpaddq ymm23,ymm23,ymm30 + + vpaddq ymm0,ymm0,ymm23 + vpsllq ymm23,ymm23,2 + + vpaddq ymm0,ymm0,ymm23 + + vpsrlq ymm30,ymm0,44 + vpandq ymm0,ymm0,ymm28 + + vpaddq ymm1,ymm1,ymm30 + + + sub rdx,2 + ja NEAR $L$blocks_vpmadd52_4x_do + + vmovq QWORD[rdi],xmm0 + vmovq QWORD[8+rdi],xmm1 + vmovq QWORD[16+rdi],xmm2 + vzeroall + +$L$no_data_vpmadd52_4x: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_blocks_vpmadd52_4x: + +ALIGN 32 +poly1305_blocks_vpmadd52_8x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_blocks_vpmadd52_8x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + shr rdx,4 + jz NEAR $L$no_data_vpmadd52_8x + + shl rcx,40 + mov r8,QWORD[64+rdi] + + vmovdqa64 ymm28,YMMWORD[$L$x_mask44] + vmovdqa64 ymm29,YMMWORD[$L$x_mask42] + + test r8,r8 + js NEAR $L$init_vpmadd52 + + vmovq xmm0,QWORD[rdi] + vmovq xmm1,QWORD[8+rdi] + vmovq xmm2,QWORD[16+rdi] + +$L$blocks_vpmadd52_8x: + + + + vmovdqu64 ymm5,YMMWORD[128+rdi] + vmovdqu64 ymm16,YMMWORD[160+rdi] + vmovdqu64 ymm3,YMMWORD[64+rdi] + vmovdqu64 ymm4,YMMWORD[96+rdi] + + vpsllq ymm17,ymm5,2 + vpaddq ymm17,ymm17,ymm5 + vpsllq ymm17,ymm17,2 + + vpbroadcastq ymm8,xmm5 + vpbroadcastq ymm6,xmm3 + vpbroadcastq ymm7,xmm4 + + vpxorq ymm18,ymm18,ymm18 + vpmadd52luq ymm18,ymm16,ymm8 + vpxorq ymm19,ymm19,ymm19 + vpmadd52huq ymm19,ymm16,ymm8 + vpxorq ymm20,ymm20,ymm20 + vpmadd52luq ymm20,ymm17,ymm8 + vpxorq ymm21,ymm21,ymm21 + vpmadd52huq ymm21,ymm17,ymm8 + vpxorq ymm22,ymm22,ymm22 + vpmadd52luq ymm22,ymm3,ymm8 + vpxorq ymm23,ymm23,ymm23 + vpmadd52huq ymm23,ymm3,ymm8 + + vpmadd52luq ymm18,ymm3,ymm6 + vpmadd52huq ymm19,ymm3,ymm6 + vpmadd52luq ymm20,ymm4,ymm6 + vpmadd52huq ymm21,ymm4,ymm6 + vpmadd52luq ymm22,ymm5,ymm6 + vpmadd52huq ymm23,ymm5,ymm6 + + vpmadd52luq ymm18,ymm17,ymm7 + vpmadd52huq ymm19,ymm17,ymm7 + vpmadd52luq ymm20,ymm3,ymm7 + vpmadd52huq ymm21,ymm3,ymm7 + vpmadd52luq ymm22,ymm4,ymm7 + vpmadd52huq ymm23,ymm4,ymm7 + + + + vpsrlq ymm30,ymm18,44 + vpsllq ymm19,ymm19,8 + vpandq ymm6,ymm18,ymm28 + vpaddq ymm19,ymm19,ymm30 + + vpaddq ymm20,ymm20,ymm19 + + vpsrlq ymm30,ymm20,44 + vpsllq ymm21,ymm21,8 + vpandq ymm7,ymm20,ymm28 + vpaddq ymm21,ymm21,ymm30 + + vpaddq ymm22,ymm22,ymm21 + + vpsrlq ymm30,ymm22,42 + vpsllq ymm23,ymm23,10 + vpandq ymm8,ymm22,ymm29 + vpaddq ymm23,ymm23,ymm30 + + vpaddq ymm6,ymm6,ymm23 + vpsllq ymm23,ymm23,2 + + vpaddq ymm6,ymm6,ymm23 + + vpsrlq ymm30,ymm6,44 + vpandq ymm6,ymm6,ymm28 + + vpaddq ymm7,ymm7,ymm30 + + + + + + vpunpcklqdq ymm26,ymm8,ymm5 + vpunpckhqdq ymm5,ymm8,ymm5 + vpunpcklqdq ymm24,ymm6,ymm3 + vpunpckhqdq ymm3,ymm6,ymm3 + vpunpcklqdq ymm25,ymm7,ymm4 + vpunpckhqdq ymm4,ymm7,ymm4 + vshufi64x2 zmm8,zmm26,zmm5,0x44 + vshufi64x2 zmm6,zmm24,zmm3,0x44 + vshufi64x2 zmm7,zmm25,zmm4,0x44 + + vmovdqu64 zmm26,ZMMWORD[rsi] + vmovdqu64 zmm27,ZMMWORD[64+rsi] + lea rsi,[128+rsi] + + vpsllq zmm10,zmm8,2 + vpsllq zmm9,zmm7,2 + vpaddq zmm10,zmm10,zmm8 + vpaddq zmm9,zmm9,zmm7 + vpsllq zmm10,zmm10,2 + vpsllq zmm9,zmm9,2 + + vpbroadcastq zmm31,rcx + vpbroadcastq zmm28,xmm28 + vpbroadcastq zmm29,xmm29 + + vpbroadcastq zmm16,xmm9 + vpbroadcastq zmm17,xmm10 + vpbroadcastq zmm3,xmm6 + vpbroadcastq zmm4,xmm7 + vpbroadcastq zmm5,xmm8 + + vpunpcklqdq zmm25,zmm26,zmm27 + vpunpckhqdq zmm27,zmm26,zmm27 + + + + vpsrlq zmm26,zmm27,24 + vporq zmm26,zmm26,zmm31 + vpaddq zmm2,zmm2,zmm26 + vpandq zmm24,zmm25,zmm28 + vpsrlq zmm25,zmm25,44 + vpsllq zmm27,zmm27,20 + vporq zmm25,zmm25,zmm27 + vpandq zmm25,zmm25,zmm28 + + sub rdx,8 + jz NEAR $L$tail_vpmadd52_8x + jmp NEAR $L$oop_vpmadd52_8x + +ALIGN 32 +$L$oop_vpmadd52_8x: + + vpaddq zmm0,zmm0,zmm24 + vpaddq zmm1,zmm1,zmm25 + + vpxorq zmm18,zmm18,zmm18 + vpmadd52luq zmm18,zmm16,zmm2 + vpxorq zmm19,zmm19,zmm19 + vpmadd52huq zmm19,zmm16,zmm2 + vpxorq zmm20,zmm20,zmm20 + vpmadd52luq zmm20,zmm17,zmm2 + vpxorq zmm21,zmm21,zmm21 + vpmadd52huq zmm21,zmm17,zmm2 + vpxorq zmm22,zmm22,zmm22 + vpmadd52luq zmm22,zmm3,zmm2 + vpxorq zmm23,zmm23,zmm23 + vpmadd52huq zmm23,zmm3,zmm2 + + vmovdqu64 zmm26,ZMMWORD[rsi] + vmovdqu64 zmm27,ZMMWORD[64+rsi] + lea rsi,[128+rsi] + vpmadd52luq zmm18,zmm3,zmm0 + vpmadd52huq zmm19,zmm3,zmm0 + vpmadd52luq zmm20,zmm4,zmm0 + vpmadd52huq zmm21,zmm4,zmm0 + vpmadd52luq zmm22,zmm5,zmm0 + vpmadd52huq zmm23,zmm5,zmm0 + + vpunpcklqdq zmm25,zmm26,zmm27 + vpunpckhqdq zmm27,zmm26,zmm27 + vpmadd52luq zmm18,zmm17,zmm1 + vpmadd52huq zmm19,zmm17,zmm1 + vpmadd52luq zmm20,zmm3,zmm1 + vpmadd52huq zmm21,zmm3,zmm1 + vpmadd52luq zmm22,zmm4,zmm1 + vpmadd52huq zmm23,zmm4,zmm1 + + + + vpsrlq zmm30,zmm18,44 + vpsllq zmm19,zmm19,8 + vpandq zmm0,zmm18,zmm28 + vpaddq zmm19,zmm19,zmm30 + + vpsrlq zmm26,zmm27,24 + vporq zmm26,zmm26,zmm31 + vpaddq zmm20,zmm20,zmm19 + + vpsrlq zmm30,zmm20,44 + vpsllq zmm21,zmm21,8 + vpandq zmm1,zmm20,zmm28 + vpaddq zmm21,zmm21,zmm30 + + vpandq zmm24,zmm25,zmm28 + vpsrlq zmm25,zmm25,44 + vpsllq zmm27,zmm27,20 + vpaddq zmm22,zmm22,zmm21 + + vpsrlq zmm30,zmm22,42 + vpsllq zmm23,zmm23,10 + vpandq zmm2,zmm22,zmm29 + vpaddq zmm23,zmm23,zmm30 + + vpaddq zmm2,zmm2,zmm26 + vpaddq zmm0,zmm0,zmm23 + vpsllq zmm23,zmm23,2 + + vpaddq zmm0,zmm0,zmm23 + vporq zmm25,zmm25,zmm27 + vpandq zmm25,zmm25,zmm28 + + vpsrlq zmm30,zmm0,44 + vpandq zmm0,zmm0,zmm28 + + vpaddq zmm1,zmm1,zmm30 + + sub rdx,8 + jnz NEAR $L$oop_vpmadd52_8x + +$L$tail_vpmadd52_8x: + + vpaddq zmm0,zmm0,zmm24 + vpaddq zmm1,zmm1,zmm25 + + vpxorq zmm18,zmm18,zmm18 + vpmadd52luq zmm18,zmm9,zmm2 + vpxorq zmm19,zmm19,zmm19 + vpmadd52huq zmm19,zmm9,zmm2 + vpxorq zmm20,zmm20,zmm20 + vpmadd52luq zmm20,zmm10,zmm2 + vpxorq zmm21,zmm21,zmm21 + vpmadd52huq zmm21,zmm10,zmm2 + vpxorq zmm22,zmm22,zmm22 + vpmadd52luq zmm22,zmm6,zmm2 + vpxorq zmm23,zmm23,zmm23 + vpmadd52huq zmm23,zmm6,zmm2 + + vpmadd52luq zmm18,zmm6,zmm0 + vpmadd52huq zmm19,zmm6,zmm0 + vpmadd52luq zmm20,zmm7,zmm0 + vpmadd52huq zmm21,zmm7,zmm0 + vpmadd52luq zmm22,zmm8,zmm0 + vpmadd52huq zmm23,zmm8,zmm0 + + vpmadd52luq zmm18,zmm10,zmm1 + vpmadd52huq zmm19,zmm10,zmm1 + vpmadd52luq zmm20,zmm6,zmm1 + vpmadd52huq zmm21,zmm6,zmm1 + vpmadd52luq zmm22,zmm7,zmm1 + vpmadd52huq zmm23,zmm7,zmm1 + + + + + mov eax,1 + kmovw k1,eax + vpsrldq zmm24,zmm18,8 + vpsrldq zmm0,zmm19,8 + vpsrldq zmm25,zmm20,8 + vpsrldq zmm1,zmm21,8 + vpaddq zmm18,zmm18,zmm24 + vpaddq zmm19,zmm19,zmm0 + vpsrldq zmm26,zmm22,8 + vpsrldq zmm2,zmm23,8 + vpaddq zmm20,zmm20,zmm25 + vpaddq zmm21,zmm21,zmm1 + vpermq zmm24,zmm18,0x2 + vpermq zmm0,zmm19,0x2 + vpaddq zmm22,zmm22,zmm26 + vpaddq zmm23,zmm23,zmm2 + + vpermq zmm25,zmm20,0x2 + vpermq zmm1,zmm21,0x2 + vpaddq zmm18,zmm18,zmm24 + vpaddq zmm19,zmm19,zmm0 + vpermq zmm26,zmm22,0x2 + vpermq zmm2,zmm23,0x2 + vpaddq zmm20,zmm20,zmm25 + vpaddq zmm21,zmm21,zmm1 + vextracti64x4 ymm24,zmm18,1 + vextracti64x4 ymm0,zmm19,1 + vpaddq zmm22,zmm22,zmm26 + vpaddq zmm23,zmm23,zmm2 + + vextracti64x4 ymm25,zmm20,1 + vextracti64x4 ymm1,zmm21,1 + vextracti64x4 ymm26,zmm22,1 + vextracti64x4 ymm2,zmm23,1 + vpaddq ymm18{k1}{z},ymm18,ymm24 + vpaddq ymm19{k1}{z},ymm19,ymm0 + vpaddq ymm20{k1}{z},ymm20,ymm25 + vpaddq ymm21{k1}{z},ymm21,ymm1 + vpaddq ymm22{k1}{z},ymm22,ymm26 + vpaddq ymm23{k1}{z},ymm23,ymm2 + + + + vpsrlq ymm30,ymm18,44 + vpsllq ymm19,ymm19,8 + vpandq ymm0,ymm18,ymm28 + vpaddq ymm19,ymm19,ymm30 + + vpaddq ymm20,ymm20,ymm19 + + vpsrlq ymm30,ymm20,44 + vpsllq ymm21,ymm21,8 + vpandq ymm1,ymm20,ymm28 + vpaddq ymm21,ymm21,ymm30 + + vpaddq ymm22,ymm22,ymm21 + + vpsrlq ymm30,ymm22,42 + vpsllq ymm23,ymm23,10 + vpandq ymm2,ymm22,ymm29 + vpaddq ymm23,ymm23,ymm30 + + vpaddq ymm0,ymm0,ymm23 + vpsllq ymm23,ymm23,2 + + vpaddq ymm0,ymm0,ymm23 + + vpsrlq ymm30,ymm0,44 + vpandq ymm0,ymm0,ymm28 + + vpaddq ymm1,ymm1,ymm30 + + + + vmovq QWORD[rdi],xmm0 + vmovq QWORD[8+rdi],xmm1 + vmovq QWORD[16+rdi],xmm2 + vzeroall + +$L$no_data_vpmadd52_8x: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_blocks_vpmadd52_8x: + +ALIGN 32 +poly1305_emit_base2_44: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_emit_base2_44: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov r8,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + + mov rax,r9 + shr r9,20 + shl rax,44 + mov rcx,r10 + shr r10,40 + shl rcx,24 + + add r8,rax + adc r9,rcx + adc r10,0 + + mov rax,r8 + add r8,5 + mov rcx,r9 + adc r9,0 + adc r10,0 + shr r10,2 + cmovnz rax,r8 + cmovnz rcx,r9 + + add rax,QWORD[rdx] + adc rcx,QWORD[8+rdx] + mov QWORD[rsi],rax + mov QWORD[8+rsi],rcx + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_emit_base2_44: ALIGN 64 $L$const: $L$mask24: @@ -1944,13 +3614,131 @@ $L$129: DD 16777216,0,16777216,0,16777216,0,16777216,0 $L$mask26: DD 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -$L$five: - DD 5,0,5,0,5,0,5,0 +$L$permd_avx2: + DD 2,2,2,3,2,0,2,1 +$L$permd_avx512: + DD 0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7 + +$L$2_44_inp_permd: + DD 0,1,1,2,2,3,7,7 +$L$2_44_inp_shift: + DQ 0,12,24,64 +$L$2_44_mask: + DQ 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +$L$2_44_shift_rgt: + DQ 44,44,42,64 +$L$2_44_shift_lft: + DQ 8,8,10,64 + +ALIGN 64 +$L$x_mask44: + DQ 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff + DQ 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +$L$x_mask42: + DQ 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff + DQ 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff DB 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 DB 108,46,111,114,103,62,0 ALIGN 16 +global xor128_encrypt_n_pad + +ALIGN 16 +xor128_encrypt_n_pad: + sub rdx,r8 + sub rcx,r8 + mov r10,r9 + shr r9,4 + jz NEAR $L$tail_enc + nop +$L$oop_enc_xmm: + movdqu xmm0,XMMWORD[r8*1+rdx] + pxor xmm0,XMMWORD[r8] + movdqu XMMWORD[r8*1+rcx],xmm0 + movdqa XMMWORD[r8],xmm0 + lea r8,[16+r8] + dec r9 + jnz NEAR $L$oop_enc_xmm + + and r10,15 + jz NEAR $L$done_enc + +$L$tail_enc: + mov r9,16 + sub r9,r10 + xor eax,eax +$L$oop_enc_byte: + mov al,BYTE[r8*1+rdx] + xor al,BYTE[r8] + mov BYTE[r8*1+rcx],al + mov BYTE[r8],al + lea r8,[1+r8] + dec r10 + jnz NEAR $L$oop_enc_byte + + xor eax,eax +$L$oop_enc_pad: + mov BYTE[r8],al + lea r8,[1+r8] + dec r9 + jnz NEAR $L$oop_enc_pad + +$L$done_enc: + mov rax,r8 + DB 0F3h,0C3h ;repret + + +global xor128_decrypt_n_pad + +ALIGN 16 +xor128_decrypt_n_pad: + sub rdx,r8 + sub rcx,r8 + mov r10,r9 + shr r9,4 + jz NEAR $L$tail_dec + nop +$L$oop_dec_xmm: + movdqu xmm0,XMMWORD[r8*1+rdx] + movdqa xmm1,XMMWORD[r8] + pxor xmm1,xmm0 + movdqu XMMWORD[r8*1+rcx],xmm1 + movdqa XMMWORD[r8],xmm0 + lea r8,[16+r8] + dec r9 + jnz NEAR $L$oop_dec_xmm + + pxor xmm1,xmm1 + and r10,15 + jz NEAR $L$done_dec + +$L$tail_dec: + mov r9,16 + sub r9,r10 + xor eax,eax + xor r11,r11 +$L$oop_dec_byte: + mov r11b,BYTE[r8*1+rdx] + mov al,BYTE[r8] + xor al,r11b + mov BYTE[r8*1+rcx],al + mov BYTE[r8],r11b + lea r8,[1+r8] + dec r10 + jnz NEAR $L$oop_dec_byte + + xor eax,eax +$L$oop_dec_pad: + mov BYTE[r8],al + lea r8,[1+r8] + dec r9 + jnz NEAR $L$oop_dec_pad + +$L$done_dec: + mov rax,r8 + DB 0F3h,0C3h ;repret + EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -2121,6 +3909,9 @@ ALIGN 4 DD $L$even_avx2 wrt ..imagebase DD $L$SEH_end_poly1305_blocks_avx2 wrt ..imagebase DD $L$SEH_info_poly1305_blocks_avx2_3 wrt ..imagebase + DD $L$SEH_begin_poly1305_blocks_avx512 wrt ..imagebase + DD $L$SEH_end_poly1305_blocks_avx512 wrt ..imagebase + DD $L$SEH_info_poly1305_blocks_avx512 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_poly1305_init: @@ -2170,3 +3961,7 @@ $L$SEH_info_poly1305_blocks_avx2_3: DB 9,0,0,0 DD avx_handler wrt ..imagebase DD $L$do_avx2_body wrt ..imagebase,$L$do_avx2_epilogue wrt ..imagebase +$L$SEH_info_poly1305_blocks_avx512: +DB 9,0,0,0 + DD avx_handler wrt ..imagebase + DD $L$do_avx512_body wrt ..imagebase,$L$do_avx512_epilogue wrt ..imagebase diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/rc4/rc4-md5-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/rc4/rc4-md5-x86_64.asm index f1ea9652d9..5e42fe63df 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/rc4/rc4-md5-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/rc4/rc4-md5-x86_64.asm @@ -21,15 +21,23 @@ $L$SEH_begin_rc4_md5_enc: mov r9,QWORD[48+rsp] + cmp r9,0 je NEAR $L$abort push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,40 + $L$body: mov r11,rcx mov r12,r9 @@ -1264,17 +1272,25 @@ $L$oop: mov DWORD[((-4))+rdi],ecx mov r15,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r13,QWORD[56+rsp] + mov r12,QWORD[64+rsp] + mov rbp,QWORD[72+rsp] + mov rbx,QWORD[80+rsp] + lea rsp,[88+rsp] + $L$epilogue: $L$abort: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_rc4_md5_enc: EXTERN __imp_RtlVirtualUnwind diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/rc4/rc4-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/rc4/rc4-x86_64.asm index 9c042ec082..5732b40ed6 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/rc4/rc4-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/rc4/rc4-x86_64.asm @@ -25,9 +25,13 @@ $L$SEH_begin_RC4: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$entry: + push rbx + push r12 + push r13 + $L$prologue: mov r11,rsi mov r12,rdx @@ -528,13 +532,18 @@ $L$exit: mov DWORD[((-4))+rdi],ecx mov r13,QWORD[rsp] + mov r12,QWORD[8+rsp] + mov rbx,QWORD[16+rsp] + add rsp,24 + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_RC4: global RC4_set_key diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/keccak1600-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/keccak1600-x86_64.asm new file mode 100644 index 0000000000..d0471cb3b3 --- /dev/null +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/keccak1600-x86_64.asm @@ -0,0 +1,525 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + + +ALIGN 32 +__KeccakF1600: + mov rax,QWORD[60+rdi] + mov rbx,QWORD[68+rdi] + mov rcx,QWORD[76+rdi] + mov rdx,QWORD[84+rdi] + mov rbp,QWORD[92+rdi] + jmp NEAR $L$oop + +ALIGN 32 +$L$oop: + mov r8,QWORD[((-100))+rdi] + mov r9,QWORD[((-52))+rdi] + mov r10,QWORD[((-4))+rdi] + mov r11,QWORD[44+rdi] + + xor rcx,QWORD[((-84))+rdi] + xor rdx,QWORD[((-76))+rdi] + xor rax,r8 + xor rbx,QWORD[((-92))+rdi] + xor rcx,QWORD[((-44))+rdi] + xor rax,QWORD[((-60))+rdi] + mov r12,rbp + xor rbp,QWORD[((-68))+rdi] + + xor rcx,r10 + xor rax,QWORD[((-20))+rdi] + xor rdx,QWORD[((-36))+rdi] + xor rbx,r9 + xor rbp,QWORD[((-28))+rdi] + + xor rcx,QWORD[36+rdi] + xor rax,QWORD[20+rdi] + xor rdx,QWORD[4+rdi] + xor rbx,QWORD[((-12))+rdi] + xor rbp,QWORD[12+rdi] + + mov r13,rcx + rol rcx,1 + xor rcx,rax + xor rdx,r11 + + rol rax,1 + xor rax,rdx + xor rbx,QWORD[28+rdi] + + rol rdx,1 + xor rdx,rbx + xor rbp,QWORD[52+rdi] + + rol rbx,1 + xor rbx,rbp + + rol rbp,1 + xor rbp,r13 + xor r9,rcx + xor r10,rdx + rol r9,44 + xor r11,rbp + xor r12,rax + rol r10,43 + xor r8,rbx + mov r13,r9 + rol r11,21 + or r9,r10 + xor r9,r8 + rol r12,14 + + xor r9,QWORD[r15] + lea r15,[8+r15] + + mov r14,r12 + and r12,r11 + mov QWORD[((-100))+rsi],r9 + xor r12,r10 + not r10 + mov QWORD[((-84))+rsi],r12 + + or r10,r11 + mov r12,QWORD[76+rdi] + xor r10,r13 + mov QWORD[((-92))+rsi],r10 + + and r13,r8 + mov r9,QWORD[((-28))+rdi] + xor r13,r14 + mov r10,QWORD[((-20))+rdi] + mov QWORD[((-68))+rsi],r13 + + or r14,r8 + mov r8,QWORD[((-76))+rdi] + xor r14,r11 + mov r11,QWORD[28+rdi] + mov QWORD[((-76))+rsi],r14 + + + xor r8,rbp + xor r12,rdx + rol r8,28 + xor r11,rcx + xor r9,rax + rol r12,61 + rol r11,45 + xor r10,rbx + rol r9,20 + mov r13,r8 + or r8,r12 + rol r10,3 + + xor r8,r11 + mov QWORD[((-36))+rsi],r8 + + mov r14,r9 + and r9,r13 + mov r8,QWORD[((-92))+rdi] + xor r9,r12 + not r12 + mov QWORD[((-28))+rsi],r9 + + or r12,r11 + mov r9,QWORD[((-44))+rdi] + xor r12,r10 + mov QWORD[((-44))+rsi],r12 + + and r11,r10 + mov r12,QWORD[60+rdi] + xor r11,r14 + mov QWORD[((-52))+rsi],r11 + + or r14,r10 + mov r10,QWORD[4+rdi] + xor r14,r13 + mov r11,QWORD[52+rdi] + mov QWORD[((-60))+rsi],r14 + + + xor r10,rbp + xor r11,rax + rol r10,25 + xor r9,rdx + rol r11,8 + xor r12,rbx + rol r9,6 + xor r8,rcx + rol r12,18 + mov r13,r10 + and r10,r11 + rol r8,1 + + not r11 + xor r10,r9 + mov QWORD[((-12))+rsi],r10 + + mov r14,r12 + and r12,r11 + mov r10,QWORD[((-12))+rdi] + xor r12,r13 + mov QWORD[((-4))+rsi],r12 + + or r13,r9 + mov r12,QWORD[84+rdi] + xor r13,r8 + mov QWORD[((-20))+rsi],r13 + + and r9,r8 + xor r9,r14 + mov QWORD[12+rsi],r9 + + or r14,r8 + mov r9,QWORD[((-60))+rdi] + xor r14,r11 + mov r11,QWORD[36+rdi] + mov QWORD[4+rsi],r14 + + + mov r8,QWORD[((-68))+rdi] + + xor r10,rcx + xor r11,rdx + rol r10,10 + xor r9,rbx + rol r11,15 + xor r12,rbp + rol r9,36 + xor r8,rax + rol r12,56 + mov r13,r10 + or r10,r11 + rol r8,27 + + not r11 + xor r10,r9 + mov QWORD[28+rsi],r10 + + mov r14,r12 + or r12,r11 + xor r12,r13 + mov QWORD[36+rsi],r12 + + and r13,r9 + xor r13,r8 + mov QWORD[20+rsi],r13 + + or r9,r8 + xor r9,r14 + mov QWORD[52+rsi],r9 + + and r8,r14 + xor r8,r11 + mov QWORD[44+rsi],r8 + + + xor rdx,QWORD[((-84))+rdi] + xor rbp,QWORD[((-36))+rdi] + rol rdx,62 + xor rcx,QWORD[68+rdi] + rol rbp,55 + xor rax,QWORD[12+rdi] + rol rcx,2 + xor rbx,QWORD[20+rdi] + xchg rdi,rsi + rol rax,39 + rol rbx,41 + mov r13,rdx + and rdx,rbp + not rbp + xor rdx,rcx + mov QWORD[92+rdi],rdx + + mov r14,rax + and rax,rbp + xor rax,r13 + mov QWORD[60+rdi],rax + + or r13,rcx + xor r13,rbx + mov QWORD[84+rdi],r13 + + and rcx,rbx + xor rcx,r14 + mov QWORD[76+rdi],rcx + + or rbx,r14 + xor rbx,rbp + mov QWORD[68+rdi],rbx + + mov rbp,rdx + mov rdx,r13 + + test r15,255 + jnz NEAR $L$oop + + lea r15,[((-192))+r15] + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +KeccakF1600: + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + lea rdi,[100+rdi] + sub rsp,200 + + + not QWORD[((-92))+rdi] + not QWORD[((-84))+rdi] + not QWORD[((-36))+rdi] + not QWORD[((-4))+rdi] + not QWORD[36+rdi] + not QWORD[60+rdi] + + lea r15,[iotas] + lea rsi,[100+rsp] + + call __KeccakF1600 + + not QWORD[((-92))+rdi] + not QWORD[((-84))+rdi] + not QWORD[((-36))+rdi] + not QWORD[((-4))+rdi] + not QWORD[36+rdi] + not QWORD[60+rdi] + lea rdi,[((-100))+rdi] + + add rsp,200 + + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + pop rbx + + DB 0F3h,0C3h ;repret + + +global SHA3_absorb + +ALIGN 32 +SHA3_absorb: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_SHA3_absorb: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + lea rdi,[100+rdi] + sub rsp,232 + + + mov r9,rsi + lea rsi,[100+rsp] + + not QWORD[((-92))+rdi] + not QWORD[((-84))+rdi] + not QWORD[((-36))+rdi] + not QWORD[((-4))+rdi] + not QWORD[36+rdi] + not QWORD[60+rdi] + lea r15,[iotas] + + mov QWORD[((216-100))+rsi],rcx + +$L$oop_absorb: + cmp rdx,rcx + jc NEAR $L$done_absorb + + shr rcx,3 + lea r8,[((-100))+rdi] + +$L$block_absorb: + mov rax,QWORD[r9] + lea r9,[8+r9] + xor rax,QWORD[r8] + lea r8,[8+r8] + sub rdx,8 + mov QWORD[((-8))+r8],rax + sub rcx,1 + jnz NEAR $L$block_absorb + + mov QWORD[((200-100))+rsi],r9 + mov QWORD[((208-100))+rsi],rdx + call __KeccakF1600 + mov r9,QWORD[((200-100))+rsi] + mov rdx,QWORD[((208-100))+rsi] + mov rcx,QWORD[((216-100))+rsi] + jmp NEAR $L$oop_absorb + +ALIGN 32 +$L$done_absorb: + mov rax,rdx + + not QWORD[((-92))+rdi] + not QWORD[((-84))+rdi] + not QWORD[((-36))+rdi] + not QWORD[((-4))+rdi] + not QWORD[36+rdi] + not QWORD[60+rdi] + + add rsp,232 + + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + pop rbx + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_SHA3_absorb: +global SHA3_squeeze + +ALIGN 32 +SHA3_squeeze: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_SHA3_squeeze: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push r12 + + push r13 + + push r14 + + + shr rcx,3 + mov r8,rdi + mov r12,rsi + mov r13,rdx + mov r14,rcx + jmp NEAR $L$oop_squeeze + +ALIGN 32 +$L$oop_squeeze: + cmp r13,8 + jb NEAR $L$tail_squeeze + + mov rax,QWORD[r8] + lea r8,[8+r8] + mov QWORD[r12],rax + lea r12,[8+r12] + sub r13,8 + jz NEAR $L$done_squeeze + + sub rcx,1 + jnz NEAR $L$oop_squeeze + + call KeccakF1600 + mov r8,rdi + mov rcx,r14 + jmp NEAR $L$oop_squeeze + +$L$tail_squeeze: + mov rsi,r8 + mov rdi,r12 + mov rcx,r13 +DB 0xf3,0xa4 + +$L$done_squeeze: + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_SHA3_squeeze: +ALIGN 256 + DQ 0,0,0,0,0,0,0,0 + +iotas: + DQ 0x0000000000000001 + DQ 0x0000000000008082 + DQ 0x800000000000808a + DQ 0x8000000080008000 + DQ 0x000000000000808b + DQ 0x0000000080000001 + DQ 0x8000000080008081 + DQ 0x8000000000008009 + DQ 0x000000000000008a + DQ 0x0000000000000088 + DQ 0x0000000080008009 + DQ 0x000000008000000a + DQ 0x000000008000808b + DQ 0x800000000000008b + DQ 0x8000000000008089 + DQ 0x8000000000008003 + DQ 0x8000000000008002 + DQ 0x8000000000000080 + DQ 0x000000000000800a + DQ 0x800000008000000a + DQ 0x8000000080008081 + DQ 0x8000000000008080 + DQ 0x0000000080000001 + DQ 0x8000000080008008 + +DB 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111 +DB 114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102 +DB 111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84 +DB 79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64 +DB 111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-mb-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-mb-x86_64.asm index cfc4874e52..725bf4e796 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-mb-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-mb-x86_64.asm @@ -20,14 +20,18 @@ $L$SEH_begin_sha1_multi_block: mov rdx,r8 + mov rcx,QWORD[((OPENSSL_ia32cap_P+4))] bt rcx,61 jc NEAR _shaext_shortcut test ecx,268435456 jnz NEAR _avx_shortcut mov rax,rsp + push rbx + push rbp + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -42,6 +46,7 @@ $L$SEH_begin_sha1_multi_block: sub rsp,288 and rsp,-256 mov QWORD[272+rsp],rax + $L$body: lea rbp,[K_XX_XX] lea rbx,[256+rsp] @@ -2571,6 +2576,7 @@ DB 102,15,56,0,197 $L$done: mov rax,QWORD[272+rsp] + movaps xmm6,XMMWORD[((-184))+rax] movaps xmm7,XMMWORD[((-168))+rax] movaps xmm8,XMMWORD[((-152))+rax] @@ -2582,12 +2588,16 @@ $L$done: movaps xmm14,XMMWORD[((-56))+rax] movaps xmm15,XMMWORD[((-40))+rax] mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_multi_block: ALIGN 32 @@ -2601,10 +2611,14 @@ $L$SEH_begin_sha1_multi_block_shaext: mov rdx,r8 + _shaext_shortcut: mov rax,rsp + push rbx + push rbp + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -2981,12 +2995,16 @@ $L$done_shaext: movaps xmm14,XMMWORD[((-56))+rax] movaps xmm15,XMMWORD[((-40))+rax] mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$epilogue_shaext: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_multi_block_shaext: ALIGN 32 @@ -3000,6 +3018,7 @@ $L$SEH_begin_sha1_multi_block_avx: mov rdx,r8 + _avx_shortcut: shr rcx,32 cmp edx,2 @@ -3010,8 +3029,11 @@ _avx_shortcut: ALIGN 32 $L$avx: mov rax,rsp + push rbx + push rbp + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -3026,6 +3048,7 @@ $L$avx: sub rsp,288 and rsp,-256 mov QWORD[272+rsp],rax + $L$body_avx: lea rbp,[K_XX_XX] lea rbx,[256+rsp] @@ -5075,6 +5098,7 @@ $L$oop_avx: $L$done_avx: mov rax,QWORD[272+rsp] + vzeroupper movaps xmm6,XMMWORD[((-184))+rax] movaps xmm7,XMMWORD[((-168))+rax] @@ -5087,12 +5111,16 @@ $L$done_avx: movaps xmm14,XMMWORD[((-56))+rax] movaps xmm15,XMMWORD[((-40))+rax] mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_multi_block_avx: ALIGN 32 @@ -5106,14 +5134,22 @@ $L$SEH_begin_sha1_multi_block_avx2: mov rdx,r8 + _avx2_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -5128,6 +5164,7 @@ _avx2_shortcut: sub rsp,576 and rsp,-256 mov QWORD[544+rsp],rax + $L$body_avx2: lea rbp,[K_XX_XX] shr edx,1 @@ -7314,6 +7351,7 @@ $L$oop_avx2: $L$done_avx2: mov rax,QWORD[544+rsp] + vzeroupper movaps xmm6,XMMWORD[((-216))+rax] movaps xmm7,XMMWORD[((-200))+rax] @@ -7326,16 +7364,24 @@ $L$done_avx2: movaps xmm14,XMMWORD[((-88))+rax] movaps xmm15,XMMWORD[((-72))+rax] mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$epilogue_avx2: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_multi_block_avx2: ALIGN 256 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-x86_64.asm index 6282079ede..d796380ae8 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-x86_64.asm @@ -19,6 +19,7 @@ $L$SEH_begin_sha1_block_data_order: mov rdx,r8 + mov r9d,DWORD[((OPENSSL_ia32cap_P+0))] mov r8d,DWORD[((OPENSSL_ia32cap_P+4))] mov r10d,DWORD[((OPENSSL_ia32cap_P+8))] @@ -39,17 +40,24 @@ $L$SEH_begin_sha1_block_data_order: ALIGN 16 $L$ialu: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + mov r8,rdi sub rsp,72 mov r9,rsi and rsp,-64 mov r10,rdx mov QWORD[64+rsp],rax + $L$prologue: mov esi,DWORD[r8] @@ -1244,16 +1252,24 @@ $L$loop: jnz NEAR $L$loop mov rsi,QWORD[64+rsp] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order: ALIGN 32 @@ -1268,6 +1284,7 @@ $L$SEH_begin_sha1_block_data_order_shaext: _shaext_shortcut: + lea rsp,[((-72))+rsp] movaps XMMWORD[(-8-64)+rax],xmm6 movaps XMMWORD[(-8-48)+rax],xmm7 @@ -1441,6 +1458,7 @@ DB 102,15,56,0,251 movaps xmm9,XMMWORD[((-8-16))+rax] mov rsp,rax $L$epilogue_shaext: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret @@ -1458,21 +1476,27 @@ $L$SEH_begin_sha1_block_data_order_ssse3: _ssse3_shortcut: - mov rax,rsp + + mov r11,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] - movaps XMMWORD[(-40-96)+rax],xmm6 - movaps XMMWORD[(-40-80)+rax],xmm7 - movaps XMMWORD[(-40-64)+rax],xmm8 - movaps XMMWORD[(-40-48)+rax],xmm9 - movaps XMMWORD[(-40-32)+rax],xmm10 - movaps XMMWORD[(-40-16)+rax],xmm11 + movaps XMMWORD[(-40-96)+r11],xmm6 + movaps XMMWORD[(-40-80)+r11],xmm7 + movaps XMMWORD[(-40-64)+r11],xmm8 + movaps XMMWORD[(-40-48)+r11],xmm9 + movaps XMMWORD[(-40-32)+r11],xmm10 + movaps XMMWORD[(-40-16)+r11],xmm11 $L$prologue_ssse3: - mov r14,rax and rsp,-64 mov r8,rdi mov r9,rsi @@ -1480,7 +1504,7 @@ $L$prologue_ssse3: shl r10,6 add r10,r9 - lea r11,[((K_XX_XX+64))] + lea r14,[((K_XX_XX+64))] mov eax,DWORD[r8] mov ebx,DWORD[4+r8] @@ -1492,8 +1516,8 @@ $L$prologue_ssse3: xor edi,edx and esi,edi - movdqa xmm6,XMMWORD[64+r11] - movdqa xmm9,XMMWORD[((-64))+r11] + movdqa xmm6,XMMWORD[64+r14] + movdqa xmm9,XMMWORD[((-64))+r14] movdqu xmm0,XMMWORD[r9] movdqu xmm1,XMMWORD[16+r9] movdqu xmm2,XMMWORD[32+r9] @@ -1569,7 +1593,7 @@ $L$oop_ssse3: pslld xmm9,2 pxor xmm4,xmm10 xor edx,ebp - movdqa xmm10,XMMWORD[((-64))+r11] + movdqa xmm10,XMMWORD[((-64))+r14] rol ecx,5 add ebx,edi and esi,edx @@ -1630,7 +1654,7 @@ $L$oop_ssse3: pslld xmm10,2 pxor xmm5,xmm8 xor ebp,eax - movdqa xmm8,XMMWORD[((-32))+r11] + movdqa xmm8,XMMWORD[((-32))+r14] rol edx,5 add ecx,edi and esi,ebp @@ -1691,7 +1715,7 @@ $L$oop_ssse3: pslld xmm8,2 pxor xmm6,xmm9 xor eax,ebx - movdqa xmm9,XMMWORD[((-32))+r11] + movdqa xmm9,XMMWORD[((-32))+r14] rol ebp,5 add edx,edi and esi,eax @@ -1752,7 +1776,7 @@ $L$oop_ssse3: pslld xmm9,2 pxor xmm7,xmm10 xor ebx,ecx - movdqa xmm10,XMMWORD[((-32))+r11] + movdqa xmm10,XMMWORD[((-32))+r14] rol eax,5 add ebp,edi and esi,ebx @@ -1863,7 +1887,7 @@ $L$oop_ssse3: pxor xmm2,xmm3 add eax,esi xor edi,edx - movdqa xmm10,XMMWORD[r11] + movdqa xmm10,XMMWORD[r14] ror ecx,7 paddd xmm9,xmm1 add eax,ebx @@ -2098,7 +2122,7 @@ $L$oop_ssse3: pxor xmm7,xmm0 rol ebx,5 add eax,esi - movdqa xmm9,XMMWORD[32+r11] + movdqa xmm9,XMMWORD[32+r14] xor edi,ecx paddd xmm8,xmm6 xor ecx,edx @@ -2389,8 +2413,8 @@ $L$oop_ssse3: add ecx,edx cmp r9,r10 je NEAR $L$done_ssse3 - movdqa xmm6,XMMWORD[64+r11] - movdqa xmm9,XMMWORD[((-64))+r11] + movdqa xmm6,XMMWORD[64+r14] + movdqa xmm9,XMMWORD[((-64))+r14] movdqu xmm0,XMMWORD[r9] movdqu xmm1,XMMWORD[16+r9] movdqu xmm2,XMMWORD[32+r9] @@ -2627,23 +2651,29 @@ $L$done_ssse3: mov DWORD[8+r8],ecx mov DWORD[12+r8],edx mov DWORD[16+r8],ebp - movaps xmm6,XMMWORD[((-40-96))+r14] - movaps xmm7,XMMWORD[((-40-80))+r14] - movaps xmm8,XMMWORD[((-40-64))+r14] - movaps xmm9,XMMWORD[((-40-48))+r14] - movaps xmm10,XMMWORD[((-40-32))+r14] - movaps xmm11,XMMWORD[((-40-16))+r14] - lea rsi,[r14] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + + mov r13,QWORD[((-32))+r11] + + mov r12,QWORD[((-24))+r11] + + mov rbp,QWORD[((-16))+r11] + + mov rbx,QWORD[((-8))+r11] + + lea rsp,[r11] + $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order_ssse3: ALIGN 16 @@ -2658,22 +2688,28 @@ $L$SEH_begin_sha1_block_data_order_avx: _avx_shortcut: - mov rax,rsp + + mov r11,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] vzeroupper - vmovaps XMMWORD[(-40-96)+rax],xmm6 - vmovaps XMMWORD[(-40-80)+rax],xmm7 - vmovaps XMMWORD[(-40-64)+rax],xmm8 - vmovaps XMMWORD[(-40-48)+rax],xmm9 - vmovaps XMMWORD[(-40-32)+rax],xmm10 - vmovaps XMMWORD[(-40-16)+rax],xmm11 + vmovaps XMMWORD[(-40-96)+r11],xmm6 + vmovaps XMMWORD[(-40-80)+r11],xmm7 + vmovaps XMMWORD[(-40-64)+r11],xmm8 + vmovaps XMMWORD[(-40-48)+r11],xmm9 + vmovaps XMMWORD[(-40-32)+r11],xmm10 + vmovaps XMMWORD[(-40-16)+r11],xmm11 $L$prologue_avx: - mov r14,rax and rsp,-64 mov r8,rdi mov r9,rsi @@ -2681,7 +2717,7 @@ $L$prologue_avx: shl r10,6 add r10,r9 - lea r11,[((K_XX_XX+64))] + lea r14,[((K_XX_XX+64))] mov eax,DWORD[r8] mov ebx,DWORD[4+r8] @@ -2693,8 +2729,8 @@ $L$prologue_avx: xor edi,edx and esi,edi - vmovdqa xmm6,XMMWORD[64+r11] - vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqa xmm6,XMMWORD[64+r14] + vmovdqa xmm11,XMMWORD[((-64))+r14] vmovdqu xmm0,XMMWORD[r9] vmovdqu xmm1,XMMWORD[16+r9] vmovdqu xmm2,XMMWORD[32+r9] @@ -2819,7 +2855,7 @@ $L$oop_avx: vpxor xmm5,xmm5,xmm10 xor ebp,eax shld edx,edx,5 - vmovdqa xmm11,XMMWORD[((-32))+r11] + vmovdqa xmm11,XMMWORD[((-32))+r14] add ecx,edi and esi,ebp xor ebp,eax @@ -3032,7 +3068,7 @@ $L$oop_avx: add eax,esi xor edi,edx vpaddd xmm9,xmm11,xmm1 - vmovdqa xmm11,XMMWORD[r11] + vmovdqa xmm11,XMMWORD[r14] shrd ecx,ecx,7 add eax,ebx vpxor xmm2,xmm2,xmm8 @@ -3251,7 +3287,7 @@ $L$oop_avx: mov edi,ebx xor esi,edx vpaddd xmm9,xmm11,xmm6 - vmovdqa xmm11,XMMWORD[32+r11] + vmovdqa xmm11,XMMWORD[32+r14] shld ebx,ebx,5 add eax,esi vpxor xmm7,xmm7,xmm8 @@ -3530,8 +3566,8 @@ $L$oop_avx: add ecx,edx cmp r9,r10 je NEAR $L$done_avx - vmovdqa xmm6,XMMWORD[64+r11] - vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqa xmm6,XMMWORD[64+r14] + vmovdqa xmm11,XMMWORD[((-64))+r14] vmovdqu xmm0,XMMWORD[r9] vmovdqu xmm1,XMMWORD[16+r9] vmovdqu xmm2,XMMWORD[32+r9] @@ -3767,23 +3803,29 @@ $L$done_avx: mov DWORD[8+r8],ecx mov DWORD[12+r8],edx mov DWORD[16+r8],ebp - movaps xmm6,XMMWORD[((-40-96))+r14] - movaps xmm7,XMMWORD[((-40-80))+r14] - movaps xmm8,XMMWORD[((-40-64))+r14] - movaps xmm9,XMMWORD[((-40-48))+r14] - movaps xmm10,XMMWORD[((-40-32))+r14] - movaps xmm11,XMMWORD[((-40-16))+r14] - lea rsi,[r14] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + + mov r13,QWORD[((-32))+r11] + + mov r12,QWORD[((-24))+r11] + + mov rbp,QWORD[((-16))+r11] + + mov rbx,QWORD[((-8))+r11] + + lea rsp,[r11] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order_avx: ALIGN 16 @@ -3798,22 +3840,28 @@ $L$SEH_begin_sha1_block_data_order_avx2: _avx2_shortcut: - mov rax,rsp + + mov r11,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + vzeroupper lea rsp,[((-96))+rsp] - vmovaps XMMWORD[(-40-96)+rax],xmm6 - vmovaps XMMWORD[(-40-80)+rax],xmm7 - vmovaps XMMWORD[(-40-64)+rax],xmm8 - vmovaps XMMWORD[(-40-48)+rax],xmm9 - vmovaps XMMWORD[(-40-32)+rax],xmm10 - vmovaps XMMWORD[(-40-16)+rax],xmm11 + vmovaps XMMWORD[(-40-96)+r11],xmm6 + vmovaps XMMWORD[(-40-80)+r11],xmm7 + vmovaps XMMWORD[(-40-64)+r11],xmm8 + vmovaps XMMWORD[(-40-48)+r11],xmm9 + vmovaps XMMWORD[(-40-32)+r11],xmm10 + vmovaps XMMWORD[(-40-16)+r11],xmm11 $L$prologue_avx2: - mov r14,rax mov r8,rdi mov r9,rsi mov r10,rdx @@ -3823,7 +3871,7 @@ $L$prologue_avx2: lea r13,[64+r9] and rsp,-128 add r10,r9 - lea r11,[((K_XX_XX+64))] + lea r14,[((K_XX_XX+64))] mov eax,DWORD[r8] cmp r13,r10 @@ -3832,7 +3880,7 @@ $L$prologue_avx2: mov ecx,DWORD[8+r8] mov edx,DWORD[12+r8] mov esi,DWORD[16+r8] - vmovdqu ymm6,YMMWORD[64+r11] + vmovdqu ymm6,YMMWORD[64+r14] vmovdqu xmm0,XMMWORD[r9] vmovdqu xmm1,XMMWORD[16+r9] @@ -3846,7 +3894,7 @@ $L$prologue_avx2: vpshufb ymm1,ymm1,ymm6 vinserti128 ymm3,ymm3,XMMWORD[48+r13],1 vpshufb ymm2,ymm2,ymm6 - vmovdqu ymm11,YMMWORD[((-64))+r11] + vmovdqu ymm11,YMMWORD[((-64))+r14] vpshufb ymm3,ymm3,ymm6 vpaddd ymm4,ymm0,ymm11 @@ -3878,7 +3926,7 @@ $L$prologue_avx2: vpxor ymm8,ymm8,ymm3 vpxor ymm5,ymm5,ymm8 vpsrld ymm8,ymm5,31 - vmovdqu ymm11,YMMWORD[((-32))+r11] + vmovdqu ymm11,YMMWORD[((-32))+r14] vpslldq ymm10,ymm5,12 vpaddd ymm5,ymm5,ymm5 vpsrld ymm9,ymm10,30 @@ -4032,7 +4080,7 @@ $L$align32_1: add ebp,DWORD[((-56))+r13] andn edi,ebx,esi vpxor ymm2,ymm2,ymm3 - vmovdqu ymm11,YMMWORD[r11] + vmovdqu ymm11,YMMWORD[r14] add ebp,ecx rorx r12d,ebx,27 rorx ecx,ebx,2 @@ -4263,7 +4311,7 @@ $L$align32_1: add eax,DWORD[((-116))+r13] lea eax,[rbx*1+rax] vpxor ymm7,ymm7,ymm0 - vmovdqu ymm11,YMMWORD[32+r11] + vmovdqu ymm11,YMMWORD[32+r14] rorx r12d,ebp,27 rorx ebx,ebp,2 xor ebp,ecx @@ -4708,7 +4756,7 @@ $L$align32_2: cmp r9,r10 je NEAR $L$done_avx2 - vmovdqu ymm6,YMMWORD[64+r11] + vmovdqu ymm6,YMMWORD[64+r14] cmp rdi,r10 ja NEAR $L$ast_avx2 @@ -4924,7 +4972,7 @@ $L$ast_avx2: xor eax,ebx add esi,r12d xor eax,ecx - vmovdqu ymm11,YMMWORD[((-64))+r11] + vmovdqu ymm11,YMMWORD[((-64))+r14] vpshufb ymm0,ymm0,ymm6 add edx,DWORD[68+r13] lea edx,[rax*1+rdx] @@ -5280,7 +5328,7 @@ $L$align32_3: xor esi,ebp add edx,r12d vpsrld ymm8,ymm5,31 - vmovdqu ymm11,YMMWORD[((-32))+r11] + vmovdqu ymm11,YMMWORD[((-32))+r14] xor esi,ebx add ecx,DWORD[104+r13] lea ecx,[rsi*1+rcx] @@ -5473,23 +5521,29 @@ $L$align32_3: $L$done_avx2: vzeroupper - movaps xmm6,XMMWORD[((-40-96))+r14] - movaps xmm7,XMMWORD[((-40-80))+r14] - movaps xmm8,XMMWORD[((-40-64))+r14] - movaps xmm9,XMMWORD[((-40-48))+r14] - movaps xmm10,XMMWORD[((-40-32))+r14] - movaps xmm11,XMMWORD[((-40-16))+r14] - lea rsi,[r14] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + + mov r13,QWORD[((-32))+r11] + + mov r12,QWORD[((-24))+r11] + + mov rbp,QWORD[((-16))+r11] + + mov rbx,QWORD[((-8))+r11] + + lea rsp,[r11] + $L$epilogue_avx2: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order_avx2: ALIGN 64 K_XX_XX: @@ -5610,15 +5664,13 @@ ssse3_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail - mov rax,QWORD[152+r8] + mov rax,QWORD[208+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail - mov rax,QWORD[232+r8] - lea rsi,[((-40-96))+rax] lea rdi,[512+r8] mov ecx,12 diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha256-mb-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha256-mb-x86_64.asm index 9efc2ad7f0..f0754004b2 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha256-mb-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha256-mb-x86_64.asm @@ -20,14 +20,18 @@ $L$SEH_begin_sha256_multi_block: mov rdx,r8 + mov rcx,QWORD[((OPENSSL_ia32cap_P+4))] bt rcx,61 jc NEAR _shaext_shortcut test ecx,268435456 jnz NEAR _avx_shortcut mov rax,rsp + push rbx + push rbp + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -42,6 +46,7 @@ $L$SEH_begin_sha256_multi_block: sub rsp,288 and rsp,-256 mov QWORD[272+rsp],rax + $L$body: lea rbp,[((K256+128))] lea rbx,[256+rsp] @@ -2640,6 +2645,7 @@ $L$oop_16_xx: $L$done: mov rax,QWORD[272+rsp] + movaps xmm6,XMMWORD[((-184))+rax] movaps xmm7,XMMWORD[((-168))+rax] movaps xmm8,XMMWORD[((-152))+rax] @@ -2651,12 +2657,16 @@ $L$done: movaps xmm14,XMMWORD[((-56))+rax] movaps xmm15,XMMWORD[((-40))+rax] mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_multi_block: ALIGN 32 @@ -2670,10 +2680,14 @@ $L$SEH_begin_sha256_multi_block_shaext: mov rdx,r8 + _shaext_shortcut: mov rax,rsp + push rbx + push rbp + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -3169,12 +3183,16 @@ $L$done_shaext: movaps xmm14,XMMWORD[((-56))+rax] movaps xmm15,XMMWORD[((-40))+rax] mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$epilogue_shaext: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_multi_block_shaext: ALIGN 32 @@ -3188,6 +3206,7 @@ $L$SEH_begin_sha256_multi_block_avx: mov rdx,r8 + _avx_shortcut: shr rcx,32 cmp edx,2 @@ -3198,8 +3217,11 @@ _avx_shortcut: ALIGN 32 $L$avx: mov rax,rsp + push rbx + push rbp + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -3214,6 +3236,7 @@ $L$avx: sub rsp,288 and rsp,-256 mov QWORD[272+rsp],rax + $L$body_avx: lea rbp,[((K256+128))] lea rbx,[256+rsp] @@ -5442,6 +5465,7 @@ $L$oop_16_xx_avx: $L$done_avx: mov rax,QWORD[272+rsp] + vzeroupper movaps xmm6,XMMWORD[((-184))+rax] movaps xmm7,XMMWORD[((-168))+rax] @@ -5454,12 +5478,16 @@ $L$done_avx: movaps xmm14,XMMWORD[((-56))+rax] movaps xmm15,XMMWORD[((-40))+rax] mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_multi_block_avx: ALIGN 32 @@ -5473,14 +5501,22 @@ $L$SEH_begin_sha256_multi_block_avx2: mov rdx,r8 + _avx2_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,[((-168))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -5495,6 +5531,7 @@ _avx2_shortcut: sub rsp,576 and rsp,-256 mov QWORD[544+rsp],rax + $L$body_avx2: lea rbp,[((K256+128))] lea rdi,[128+rdi] @@ -7859,6 +7896,7 @@ $L$oop_16_xx_avx2: $L$done_avx2: mov rax,QWORD[544+rsp] + vzeroupper movaps xmm6,XMMWORD[((-216))+rax] movaps xmm7,XMMWORD[((-200))+rax] @@ -7871,16 +7909,24 @@ $L$done_avx2: movaps xmm14,XMMWORD[((-88))+rax] movaps xmm15,XMMWORD[((-72))+rax] mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] + $L$epilogue_avx2: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_multi_block_avx2: ALIGN 256 K256: diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha256-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha256-x86_64.asm index 31a5279fc3..fc102444ff 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha256-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha256-x86_64.asm @@ -19,6 +19,7 @@ $L$SEH_begin_sha256_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] @@ -35,13 +36,20 @@ $L$SEH_begin_sha256_block_data_order: je NEAR $L$avx_shortcut test r10d,512 jnz NEAR $L$ssse3_shortcut + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + shl rdx,4 sub rsp,16*4+4*8 lea rdx,[rdx*4+rsi] @@ -49,7 +57,8 @@ $L$SEH_begin_sha256_block_data_order: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[88+rsp],rax + $L$prologue: mov eax,DWORD[rdi] @@ -1713,18 +1722,27 @@ $L$rounds_16_xx: mov DWORD[28+rdi],r11d jb NEAR $L$loop - mov rsi,QWORD[((64+24))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov rsi,QWORD[88+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order: ALIGN 64 @@ -2017,14 +2035,22 @@ $L$SEH_begin_sha256_block_data_order_ssse3: mov rdx,r8 + $L$ssse3_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -2032,7 +2058,8 @@ $L$ssse3_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[88+rsp],rax + movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -3102,22 +3129,31 @@ DB 102,15,58,15,249,4 mov DWORD[28+rdi],r11d jb NEAR $L$loop_ssse3 - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order_ssse3: ALIGN 64 @@ -3131,14 +3167,22 @@ $L$SEH_begin_sha256_block_data_order_avx: mov rdx,r8 + $L$avx_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -3146,7 +3190,8 @@ $L$avx_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[88+rsp],rax + movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -4177,23 +4222,32 @@ $L$avx_00_47: mov DWORD[28+rdi],r11d jb NEAR $L$loop_avx - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + vzeroupper movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order_avx: ALIGN 64 @@ -4207,14 +4261,22 @@ $L$SEH_begin_sha256_block_data_order_avx2: mov rdx,r8 + $L$avx2_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + sub rsp,608 shl rdx,4 and rsp,-256*4 @@ -4223,7 +4285,8 @@ $L$avx2_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[88+rsp],rax + movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -5440,23 +5503,32 @@ $L$ower_avx2: $L$done_avx2: lea rsp,[rbp] - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + vzeroupper movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_avx2: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order_avx2: EXTERN __imp_RtlVirtualUnwind @@ -5499,7 +5571,6 @@ se_handler: $L$not_in_avx2: mov rsi,rax mov rax,QWORD[((64+24))+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha512-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha512-x86_64.asm index 0e99bed5a7..1a9935d7b6 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha512-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha512-x86_64.asm @@ -19,6 +19,7 @@ $L$SEH_begin_sha512_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] @@ -33,13 +34,20 @@ $L$SEH_begin_sha512_block_data_order: or r10d,r9d cmp r10d,1342177792 je NEAR $L$avx_shortcut + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + shl rdx,4 sub rsp,16*8+4*8 lea rdx,[rdx*8+rsi] @@ -47,7 +55,8 @@ $L$SEH_begin_sha512_block_data_order: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[152+rsp],rax + $L$prologue: mov rax,QWORD[rdi] @@ -1711,18 +1720,27 @@ $L$rounds_16_xx: mov QWORD[56+rdi],r11 jb NEAR $L$loop - mov rsi,QWORD[((128+24))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov rsi,QWORD[152+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha512_block_data_order: ALIGN 64 @@ -1827,14 +1845,22 @@ $L$SEH_begin_sha512_block_data_order_xop: mov rdx,r8 + $L$xop_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] @@ -1842,7 +1868,8 @@ $L$xop_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[152+rsp],rax + movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -2901,7 +2928,8 @@ DB 143,72,120,195,203,42 mov QWORD[56+rdi],r11 jb NEAR $L$loop_xop - mov rsi,QWORD[((128+24))+rsp] + mov rsi,QWORD[152+rsp] + vzeroupper movaps xmm6,XMMWORD[((128+32))+rsp] movaps xmm7,XMMWORD[((128+48))+rsp] @@ -2909,17 +2937,25 @@ DB 143,72,120,195,203,42 movaps xmm9,XMMWORD[((128+80))+rsp] movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_xop: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha512_block_data_order_xop: ALIGN 64 @@ -2933,14 +2969,22 @@ $L$SEH_begin_sha512_block_data_order_avx: mov rdx,r8 + $L$avx_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] @@ -2948,7 +2992,8 @@ $L$avx_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[152+rsp],rax + movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -4071,7 +4116,8 @@ $L$avx_00_47: mov QWORD[56+rdi],r11 jb NEAR $L$loop_avx - mov rsi,QWORD[((128+24))+rsp] + mov rsi,QWORD[152+rsp] + vzeroupper movaps xmm6,XMMWORD[((128+32))+rsp] movaps xmm7,XMMWORD[((128+48))+rsp] @@ -4079,17 +4125,25 @@ $L$avx_00_47: movaps xmm9,XMMWORD[((128+80))+rsp] movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha512_block_data_order_avx: ALIGN 64 @@ -4103,14 +4157,22 @@ $L$SEH_begin_sha512_block_data_order_avx2: mov rdx,r8 + $L$avx2_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + sub rsp,1408 shl rdx,4 and rsp,-256*8 @@ -4119,7 +4181,8 @@ $L$avx2_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[152+rsp],rax + movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -5432,7 +5495,8 @@ $L$ower_avx2: $L$done_avx2: lea rsp,[rbp] - mov rsi,QWORD[((128+24))+rsp] + mov rsi,QWORD[152+rsp] + vzeroupper movaps xmm6,XMMWORD[((128+32))+rsp] movaps xmm7,XMMWORD[((128+48))+rsp] @@ -5440,17 +5504,25 @@ $L$done_avx2: movaps xmm9,XMMWORD[((128+80))+rsp] movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue_avx2: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha512_block_data_order_avx2: EXTERN __imp_RtlVirtualUnwind @@ -5493,7 +5565,6 @@ se_handler: $L$not_in_avx2: mov rsi,rax mov rax,QWORD[((128+24))+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/whrlpool/wp-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/whrlpool/wp-x86_64.asm index 065697e58d..76f7b07678 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/whrlpool/wp-x86_64.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/whrlpool/wp-x86_64.asm @@ -18,14 +18,22 @@ $L$SEH_begin_whirlpool_block: mov rdx,r8 + + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r11,rsp + sub rsp,128+40 and rsp,-64 @@ -33,7 +41,8 @@ $L$SEH_begin_whirlpool_block: mov QWORD[r10],rdi mov QWORD[8+r10],rsi mov QWORD[16+r10],rdx - mov QWORD[32+r10],r11 + mov QWORD[32+r10],rax + $L$prologue: mov rbx,r10 @@ -593,17 +602,26 @@ $L$roundsdone: jmp NEAR $L$outerloop $L$alldone: mov rsi,QWORD[32+rbx] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_whirlpool_block: ALIGN 64 @@ -904,7 +922,6 @@ se_handler: jae NEAR $L$in_prologue mov rax,QWORD[((128+32))+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/x86_64cpuid.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/x86_64cpuid.asm index 2aede40d9e..e2fec12d22 100644 --- a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/x86_64cpuid.asm +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/x86_64cpuid.asm @@ -48,10 +48,12 @@ $L$SEH_begin_OPENSSL_ia32_cpuid: mov rdi,rcx + mov r8,rbx + xor eax,eax - mov DWORD[8+rdi],eax + mov QWORD[8+rdi],rax cpuid mov r11d,eax @@ -122,6 +124,7 @@ $L$intel: $L$nocacheinfo: mov eax,1 cpuid + movd xmm0,eax and edx,0xbfefffff cmp r9d,0 jne NEAR $L$notintel @@ -169,28 +172,47 @@ $L$generic: jc NEAR $L$notknights and ebx,0xfff7ffff $L$notknights: + movd eax,xmm0 + and eax,0x0fff0ff0 + cmp eax,0x00050650 + jne NEAR $L$notskylakex + and ebx,0xfffeffff + +$L$notskylakex: mov DWORD[8+rdi],ebx + mov DWORD[12+rdi],ecx $L$no_extended_info: bt r9d,27 jnc NEAR $L$clear_avx xor ecx,ecx DB 0x0f,0x01,0xd0 + and eax,0xe6 + cmp eax,0xe6 + je NEAR $L$done + and DWORD[8+rdi],0x3fdeffff + + + + and eax,6 cmp eax,6 je NEAR $L$done $L$clear_avx: mov eax,0xefffe7ff and r9d,eax - and DWORD[8+rdi],0xffffffdf + mov eax,0x3fdeffdf + and DWORD[8+rdi],eax $L$done: shl r9,32 mov eax,r10d mov rbx,r8 + or rax,r9 mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_OPENSSL_ia32_cpuid: global OPENSSL_cleanse @@ -236,6 +258,18 @@ CRYPTO_memcmp: xor r10,r10 cmp r8,0 je NEAR $L$no_data + cmp r8,16 + jne NEAR $L$oop_cmp + mov r10,QWORD[rcx] + mov r11,QWORD[8+rcx] + mov r8,1 + xor r10,QWORD[rdx] + xor r11,QWORD[8+rdx] + or r10,r11 + cmovnz rax,r8 + DB 0F3h,0C3h ;repret + +ALIGN 16 $L$oop_cmp: mov r10b,BYTE[rcx] lea rcx,[1+rcx] @@ -347,21 +381,6 @@ $L$done2: sub rax,rcx DB 0F3h,0C3h ;repret -global OPENSSL_ia32_rdrand - -ALIGN 16 -OPENSSL_ia32_rdrand: - mov ecx,8 -$L$oop_rdrand: -DB 72,15,199,240 - jc NEAR $L$break_rdrand - loop $L$oop_rdrand -$L$break_rdrand: - cmp rax,0 - cmove rax,rcx - DB 0F3h,0C3h ;repret - - global OPENSSL_ia32_rdrand_bytes ALIGN 16 @@ -395,28 +414,14 @@ $L$tail_rdrand_bytes: mov BYTE[rcx],r10b lea rcx,[1+rcx] inc rax - shr r8,8 + shr r10,8 dec rdx jnz NEAR $L$tail_rdrand_bytes $L$done_rdrand_bytes: + xor r10,r10 DB 0F3h,0C3h ;repret -global OPENSSL_ia32_rdseed - -ALIGN 16 -OPENSSL_ia32_rdseed: - mov ecx,8 -$L$oop_rdseed: -DB 72,15,199,248 - jc NEAR $L$break_rdseed - loop $L$oop_rdseed -$L$break_rdseed: - cmp rax,0 - cmove rax,rcx - DB 0F3h,0C3h ;repret - - global OPENSSL_ia32_rdseed_bytes ALIGN 16 @@ -450,10 +455,11 @@ $L$tail_rdseed_bytes: mov BYTE[rcx],r10b lea rcx,[1+rcx] inc rax - shr r8,8 + shr r10,8 dec rdx jnz NEAR $L$tail_rdseed_bytes $L$done_rdseed_bytes: + xor r10,r10 DB 0F3h,0C3h ;repret |