diff options
Diffstat (limited to 'deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm')
-rw-r--r-- | deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm | 2172 |
1 files changed, 2172 insertions, 0 deletions
diff --git a/deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm new file mode 100644 index 0000000000..15fde3cba6 --- /dev/null +++ b/deps/openssl/config/archs/VC-WIN64A/asm/crypto/poly1305/poly1305-x86_64.asm @@ -0,0 +1,2172 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN OPENSSL_ia32cap_P + +global poly1305_init + +global poly1305_blocks + +global poly1305_emit + + + +ALIGN 32 +poly1305_init: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_init: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + xor rax,rax + mov QWORD[rdi],rax + mov QWORD[8+rdi],rax + mov QWORD[16+rdi],rax + + cmp rsi,0 + je NEAR $L$no_key + + lea r10,[poly1305_blocks] + lea r11,[poly1305_emit] + mov r9,QWORD[((OPENSSL_ia32cap_P+4))] + lea rax,[poly1305_blocks_avx] + lea rcx,[poly1305_emit_avx] + bt r9,28 + cmovc r10,rax + cmovc r11,rcx + lea rax,[poly1305_blocks_avx2] + bt r9,37 + cmovc r10,rax + mov rax,0x0ffffffc0fffffff + mov rcx,0x0ffffffc0ffffffc + and rax,QWORD[rsi] + and rcx,QWORD[8+rsi] + mov QWORD[24+rdi],rax + mov QWORD[32+rdi],rcx + mov QWORD[rdx],r10 + mov QWORD[8+rdx],r11 + mov eax,1 +$L$no_key: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_init: + + +ALIGN 32 +poly1305_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + +$L$blocks: + shr rdx,4 + jz NEAR $L$no_data + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$blocks_body: + + mov r15,rdx + + mov r11,QWORD[24+rdi] + mov r13,QWORD[32+rdi] + + mov r14,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rbp,QWORD[16+rdi] + + mov r12,r13 + shr r13,2 + mov rax,r12 + add r13,r12 + jmp NEAR $L$oop + +ALIGN 32 +$L$oop: + add r14,QWORD[rsi] + adc rbx,QWORD[8+rsi] + lea rsi,[16+rsi] + adc rbp,rcx + mul r14 + mov r9,rax + mov rax,r11 + mov r10,rdx + + mul r14 + mov r14,rax + mov rax,r11 + mov r8,rdx + + mul rbx + add r9,rax + mov rax,r13 + adc r10,rdx + + mul rbx + mov rbx,rbp + add r14,rax + adc r8,rdx + + imul rbx,r13 + add r9,rbx + mov rbx,r8 + adc r10,0 + + imul rbp,r11 + add rbx,r9 + mov rax,-4 + adc r10,rbp + + and rax,r10 + mov rbp,r10 + shr r10,2 + and rbp,3 + add rax,r10 + add r14,rax + adc rbx,0 + adc rbp,0 + mov rax,r12 + dec r15 + jnz NEAR $L$oop + + mov QWORD[rdi],r14 + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rbp + + mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rsp,[48+rsp] +$L$no_data: +$L$blocks_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_blocks: + + +ALIGN 32 +poly1305_emit: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_emit: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$emit: + mov r8,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + + mov rax,r8 + add r8,5 + mov rcx,r9 + adc r9,0 + adc r10,0 + shr r10,2 + cmovnz rax,r8 + cmovnz rcx,r9 + + add rax,QWORD[rdx] + adc rcx,QWORD[8+rdx] + mov QWORD[rsi],rax + mov QWORD[8+rsi],rcx + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_emit: + +ALIGN 32 +__poly1305_block: + mul r14 + mov r9,rax + mov rax,r11 + mov r10,rdx + + mul r14 + mov r14,rax + mov rax,r11 + mov r8,rdx + + mul rbx + add r9,rax + mov rax,r13 + adc r10,rdx + + mul rbx + mov rbx,rbp + add r14,rax + adc r8,rdx + + imul rbx,r13 + add r9,rbx + mov rbx,r8 + adc r10,0 + + imul rbp,r11 + add rbx,r9 + mov rax,-4 + adc r10,rbp + + and rax,r10 + mov rbp,r10 + shr r10,2 + and rbp,3 + add rax,r10 + add r14,rax + adc rbx,0 + adc rbp,0 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__poly1305_init_avx: + mov r14,r11 + mov rbx,r12 + xor rbp,rbp + + lea rdi,[((48+64))+rdi] + + mov rax,r12 + call __poly1305_block + + mov eax,0x3ffffff + mov edx,0x3ffffff + mov r8,r14 + and eax,r14d + mov r9,r11 + and edx,r11d + mov DWORD[((-64))+rdi],eax + shr r8,26 + mov DWORD[((-60))+rdi],edx + shr r9,26 + + mov eax,0x3ffffff + mov edx,0x3ffffff + and eax,r8d + and edx,r9d + mov DWORD[((-48))+rdi],eax + lea eax,[rax*4+rax] + mov DWORD[((-44))+rdi],edx + lea edx,[rdx*4+rdx] + mov DWORD[((-32))+rdi],eax + shr r8,26 + mov DWORD[((-28))+rdi],edx + shr r9,26 + + mov rax,rbx + mov rdx,r12 + shl rax,12 + shl rdx,12 + or rax,r8 + or rdx,r9 + and eax,0x3ffffff + and edx,0x3ffffff + mov DWORD[((-16))+rdi],eax + lea eax,[rax*4+rax] + mov DWORD[((-12))+rdi],edx + lea edx,[rdx*4+rdx] + mov DWORD[rdi],eax + mov r8,rbx + mov DWORD[4+rdi],edx + mov r9,r12 + + mov eax,0x3ffffff + mov edx,0x3ffffff + shr r8,14 + shr r9,14 + and eax,r8d + and edx,r9d + mov DWORD[16+rdi],eax + lea eax,[rax*4+rax] + mov DWORD[20+rdi],edx + lea edx,[rdx*4+rdx] + mov DWORD[32+rdi],eax + shr r8,26 + mov DWORD[36+rdi],edx + shr r9,26 + + mov rax,rbp + shl rax,24 + or r8,rax + mov DWORD[48+rdi],r8d + lea r8,[r8*4+r8] + mov DWORD[52+rdi],r9d + lea r9,[r9*4+r9] + mov DWORD[64+rdi],r8d + mov DWORD[68+rdi],r9d + + mov rax,r12 + call __poly1305_block + + mov eax,0x3ffffff + mov r8,r14 + and eax,r14d + shr r8,26 + mov DWORD[((-52))+rdi],eax + + mov edx,0x3ffffff + and edx,r8d + mov DWORD[((-36))+rdi],edx + lea edx,[rdx*4+rdx] + shr r8,26 + mov DWORD[((-20))+rdi],edx + + mov rax,rbx + shl rax,12 + or rax,r8 + and eax,0x3ffffff + mov DWORD[((-4))+rdi],eax + lea eax,[rax*4+rax] + mov r8,rbx + mov DWORD[12+rdi],eax + + mov edx,0x3ffffff + shr r8,14 + and edx,r8d + mov DWORD[28+rdi],edx + lea edx,[rdx*4+rdx] + shr r8,26 + mov DWORD[44+rdi],edx + + mov rax,rbp + shl rax,24 + or r8,rax + mov DWORD[60+rdi],r8d + lea r8,[r8*4+r8] + mov DWORD[76+rdi],r8d + + mov rax,r12 + call __poly1305_block + + mov eax,0x3ffffff + mov r8,r14 + and eax,r14d + shr r8,26 + mov DWORD[((-56))+rdi],eax + + mov edx,0x3ffffff + and edx,r8d + mov DWORD[((-40))+rdi],edx + lea edx,[rdx*4+rdx] + shr r8,26 + mov DWORD[((-24))+rdi],edx + + mov rax,rbx + shl rax,12 + or rax,r8 + and eax,0x3ffffff + mov DWORD[((-8))+rdi],eax + lea eax,[rax*4+rax] + mov r8,rbx + mov DWORD[8+rdi],eax + + mov edx,0x3ffffff + shr r8,14 + and edx,r8d + mov DWORD[24+rdi],edx + lea edx,[rdx*4+rdx] + shr r8,26 + mov DWORD[40+rdi],edx + + mov rax,rbp + shl rax,24 + or r8,rax + mov DWORD[56+rdi],r8d + lea r8,[r8*4+r8] + mov DWORD[72+rdi],r8d + + lea rdi,[((-48-64))+rdi] + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +poly1305_blocks_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_blocks_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + mov r8d,DWORD[20+rdi] + cmp rdx,128 + jae NEAR $L$blocks_avx + test r8d,r8d + jz NEAR $L$blocks + +$L$blocks_avx: + and rdx,-16 + jz NEAR $L$no_data_avx + + vzeroupper + + test r8d,r8d + jz NEAR $L$base2_64_avx + + test rdx,31 + jz NEAR $L$even_avx + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$blocks_avx_body: + + mov r15,rdx + + mov r8,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov ebp,DWORD[16+rdi] + + mov r11,QWORD[24+rdi] + mov r13,QWORD[32+rdi] + + + mov r14d,r8d + and r8,-2147483648 + mov r12,r9 + mov ebx,r9d + and r9,-2147483648 + + shr r8,6 + shl r12,52 + add r14,r8 + shr rbx,12 + shr r9,18 + add r14,r12 + adc rbx,r9 + + mov r8,rbp + shl r8,40 + shr rbp,24 + add rbx,r8 + adc rbp,0 + + mov r9,-4 + mov r8,rbp + and r9,rbp + shr r8,2 + and rbp,3 + add r8,r9 + add r14,r8 + adc rbx,0 + adc rbp,0 + + mov r12,r13 + mov rax,r13 + shr r13,2 + add r13,r12 + + add r14,QWORD[rsi] + adc rbx,QWORD[8+rsi] + lea rsi,[16+rsi] + adc rbp,rcx + + call __poly1305_block + + test rcx,rcx + jz NEAR $L$store_base2_64_avx + + + mov rax,r14 + mov rdx,r14 + shr r14,52 + mov r11,rbx + mov r12,rbx + shr rdx,26 + and rax,0x3ffffff + shl r11,12 + and rdx,0x3ffffff + shr rbx,14 + or r14,r11 + shl rbp,24 + and r14,0x3ffffff + shr r12,40 + and rbx,0x3ffffff + or rbp,r12 + + sub r15,16 + jz NEAR $L$store_base2_26_avx + + vmovd xmm0,eax + vmovd xmm1,edx + vmovd xmm2,r14d + vmovd xmm3,ebx + vmovd xmm4,ebp + jmp NEAR $L$proceed_avx + +ALIGN 32 +$L$store_base2_64_avx: + mov QWORD[rdi],r14 + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rbp + jmp NEAR $L$done_avx + +ALIGN 16 +$L$store_base2_26_avx: + mov DWORD[rdi],eax + mov DWORD[4+rdi],edx + mov DWORD[8+rdi],r14d + mov DWORD[12+rdi],ebx + mov DWORD[16+rdi],ebp +ALIGN 16 +$L$done_avx: + mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rsp,[48+rsp] +$L$no_data_avx: +$L$blocks_avx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +ALIGN 32 +$L$base2_64_avx: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$base2_64_avx_body: + + mov r15,rdx + + mov r11,QWORD[24+rdi] + mov r13,QWORD[32+rdi] + + mov r14,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov ebp,DWORD[16+rdi] + + mov r12,r13 + mov rax,r13 + shr r13,2 + add r13,r12 + + test rdx,31 + jz NEAR $L$init_avx + + add r14,QWORD[rsi] + adc rbx,QWORD[8+rsi] + lea rsi,[16+rsi] + adc rbp,rcx + sub r15,16 + + call __poly1305_block + +$L$init_avx: + + mov rax,r14 + mov rdx,r14 + shr r14,52 + mov r8,rbx + mov r9,rbx + shr rdx,26 + and rax,0x3ffffff + shl r8,12 + and rdx,0x3ffffff + shr rbx,14 + or r14,r8 + shl rbp,24 + and r14,0x3ffffff + shr r9,40 + and rbx,0x3ffffff + or rbp,r9 + + vmovd xmm0,eax + vmovd xmm1,edx + vmovd xmm2,r14d + vmovd xmm3,ebx + vmovd xmm4,ebp + mov DWORD[20+rdi],1 + + call __poly1305_init_avx + +$L$proceed_avx: + mov rdx,r15 + + mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rax,[48+rsp] + lea rsp,[48+rsp] +$L$base2_64_avx_epilogue: + jmp NEAR $L$do_avx + +ALIGN 32 +$L$even_avx: + vmovd xmm0,DWORD[rdi] + vmovd xmm1,DWORD[4+rdi] + vmovd xmm2,DWORD[8+rdi] + vmovd xmm3,DWORD[12+rdi] + vmovd xmm4,DWORD[16+rdi] + +$L$do_avx: + lea r11,[((-248))+rsp] + sub rsp,0x218 + vmovdqa XMMWORD[80+r11],xmm6 + vmovdqa XMMWORD[96+r11],xmm7 + vmovdqa XMMWORD[112+r11],xmm8 + vmovdqa XMMWORD[128+r11],xmm9 + vmovdqa XMMWORD[144+r11],xmm10 + vmovdqa XMMWORD[160+r11],xmm11 + vmovdqa XMMWORD[176+r11],xmm12 + vmovdqa XMMWORD[192+r11],xmm13 + vmovdqa XMMWORD[208+r11],xmm14 + vmovdqa XMMWORD[224+r11],xmm15 +$L$do_avx_body: + sub rdx,64 + lea rax,[((-32))+rsi] + cmovc rsi,rax + + vmovdqu xmm14,XMMWORD[48+rdi] + lea rdi,[112+rdi] + lea rcx,[$L$const] + + + + vmovdqu xmm5,XMMWORD[32+rsi] + vmovdqu xmm6,XMMWORD[48+rsi] + vmovdqa xmm15,XMMWORD[64+rcx] + + vpsrldq xmm7,xmm5,6 + vpsrldq xmm8,xmm6,6 + vpunpckhqdq xmm9,xmm5,xmm6 + vpunpcklqdq xmm5,xmm5,xmm6 + vpunpcklqdq xmm8,xmm7,xmm8 + + vpsrlq xmm9,xmm9,40 + vpsrlq xmm6,xmm5,26 + vpand xmm5,xmm5,xmm15 + vpsrlq xmm7,xmm8,4 + vpand xmm6,xmm6,xmm15 + vpsrlq xmm8,xmm8,30 + vpand xmm7,xmm7,xmm15 + vpand xmm8,xmm8,xmm15 + vpor xmm9,xmm9,XMMWORD[32+rcx] + + jbe NEAR $L$skip_loop_avx + + + vmovdqu xmm11,XMMWORD[((-48))+rdi] + vmovdqu xmm12,XMMWORD[((-32))+rdi] + vpshufd xmm13,xmm14,0xEE + vpshufd xmm10,xmm14,0x44 + vmovdqa XMMWORD[(-144)+r11],xmm13 + vmovdqa XMMWORD[rsp],xmm10 + vpshufd xmm14,xmm11,0xEE + vmovdqu xmm10,XMMWORD[((-16))+rdi] + vpshufd xmm11,xmm11,0x44 + vmovdqa XMMWORD[(-128)+r11],xmm14 + vmovdqa XMMWORD[16+rsp],xmm11 + vpshufd xmm13,xmm12,0xEE + vmovdqu xmm11,XMMWORD[rdi] + vpshufd xmm12,xmm12,0x44 + vmovdqa XMMWORD[(-112)+r11],xmm13 + vmovdqa XMMWORD[32+rsp],xmm12 + vpshufd xmm14,xmm10,0xEE + vmovdqu xmm12,XMMWORD[16+rdi] + vpshufd xmm10,xmm10,0x44 + vmovdqa XMMWORD[(-96)+r11],xmm14 + vmovdqa XMMWORD[48+rsp],xmm10 + vpshufd xmm13,xmm11,0xEE + vmovdqu xmm10,XMMWORD[32+rdi] + vpshufd xmm11,xmm11,0x44 + vmovdqa XMMWORD[(-80)+r11],xmm13 + vmovdqa XMMWORD[64+rsp],xmm11 + vpshufd xmm14,xmm12,0xEE + vmovdqu xmm11,XMMWORD[48+rdi] + vpshufd xmm12,xmm12,0x44 + vmovdqa XMMWORD[(-64)+r11],xmm14 + vmovdqa XMMWORD[80+rsp],xmm12 + vpshufd xmm13,xmm10,0xEE + vmovdqu xmm12,XMMWORD[64+rdi] + vpshufd xmm10,xmm10,0x44 + vmovdqa XMMWORD[(-48)+r11],xmm13 + vmovdqa XMMWORD[96+rsp],xmm10 + vpshufd xmm14,xmm11,0xEE + vpshufd xmm11,xmm11,0x44 + vmovdqa XMMWORD[(-32)+r11],xmm14 + vmovdqa XMMWORD[112+rsp],xmm11 + vpshufd xmm13,xmm12,0xEE + vmovdqa xmm14,XMMWORD[rsp] + vpshufd xmm12,xmm12,0x44 + vmovdqa XMMWORD[(-16)+r11],xmm13 + vmovdqa XMMWORD[128+rsp],xmm12 + + jmp NEAR $L$oop_avx + +ALIGN 32 +$L$oop_avx: + + + + + + + + + + + + + + + + + + + + + vpmuludq xmm10,xmm14,xmm5 + vpmuludq xmm11,xmm14,xmm6 + vmovdqa XMMWORD[32+r11],xmm2 + vpmuludq xmm12,xmm14,xmm7 + vmovdqa xmm2,XMMWORD[16+rsp] + vpmuludq xmm13,xmm14,xmm8 + vpmuludq xmm14,xmm14,xmm9 + + vmovdqa XMMWORD[r11],xmm0 + vpmuludq xmm0,xmm9,XMMWORD[32+rsp] + vmovdqa XMMWORD[16+r11],xmm1 + vpmuludq xmm1,xmm2,xmm8 + vpaddq xmm10,xmm10,xmm0 + vpaddq xmm14,xmm14,xmm1 + vmovdqa XMMWORD[48+r11],xmm3 + vpmuludq xmm0,xmm2,xmm7 + vpmuludq xmm1,xmm2,xmm6 + vpaddq xmm13,xmm13,xmm0 + vmovdqa xmm3,XMMWORD[48+rsp] + vpaddq xmm12,xmm12,xmm1 + vmovdqa XMMWORD[64+r11],xmm4 + vpmuludq xmm2,xmm2,xmm5 + vpmuludq xmm0,xmm3,xmm7 + vpaddq xmm11,xmm11,xmm2 + + vmovdqa xmm4,XMMWORD[64+rsp] + vpaddq xmm14,xmm14,xmm0 + vpmuludq xmm1,xmm3,xmm6 + vpmuludq xmm3,xmm3,xmm5 + vpaddq xmm13,xmm13,xmm1 + vmovdqa xmm2,XMMWORD[80+rsp] + vpaddq xmm12,xmm12,xmm3 + vpmuludq xmm0,xmm4,xmm9 + vpmuludq xmm4,xmm4,xmm8 + vpaddq xmm11,xmm11,xmm0 + vmovdqa xmm3,XMMWORD[96+rsp] + vpaddq xmm10,xmm10,xmm4 + + vmovdqa xmm4,XMMWORD[128+rsp] + vpmuludq xmm1,xmm2,xmm6 + vpmuludq xmm2,xmm2,xmm5 + vpaddq xmm14,xmm14,xmm1 + vpaddq xmm13,xmm13,xmm2 + vpmuludq xmm0,xmm3,xmm9 + vpmuludq xmm1,xmm3,xmm8 + vpaddq xmm12,xmm12,xmm0 + vmovdqu xmm0,XMMWORD[rsi] + vpaddq xmm11,xmm11,xmm1 + vpmuludq xmm3,xmm3,xmm7 + vpmuludq xmm7,xmm4,xmm7 + vpaddq xmm10,xmm10,xmm3 + + vmovdqu xmm1,XMMWORD[16+rsi] + vpaddq xmm11,xmm11,xmm7 + vpmuludq xmm8,xmm4,xmm8 + vpmuludq xmm9,xmm4,xmm9 + vpsrldq xmm2,xmm0,6 + vpaddq xmm12,xmm12,xmm8 + vpaddq xmm13,xmm13,xmm9 + vpsrldq xmm3,xmm1,6 + vpmuludq xmm9,xmm5,XMMWORD[112+rsp] + vpmuludq xmm5,xmm4,xmm6 + vpunpckhqdq xmm4,xmm0,xmm1 + vpaddq xmm14,xmm14,xmm9 + vmovdqa xmm9,XMMWORD[((-144))+r11] + vpaddq xmm10,xmm10,xmm5 + + vpunpcklqdq xmm0,xmm0,xmm1 + vpunpcklqdq xmm3,xmm2,xmm3 + + + vpsrldq xmm4,xmm4,5 + vpsrlq xmm1,xmm0,26 + vpand xmm0,xmm0,xmm15 + vpsrlq xmm2,xmm3,4 + vpand xmm1,xmm1,xmm15 + vpand xmm4,xmm4,XMMWORD[rcx] + vpsrlq xmm3,xmm3,30 + vpand xmm2,xmm2,xmm15 + vpand xmm3,xmm3,xmm15 + vpor xmm4,xmm4,XMMWORD[32+rcx] + + vpaddq xmm0,xmm0,XMMWORD[r11] + vpaddq xmm1,xmm1,XMMWORD[16+r11] + vpaddq xmm2,xmm2,XMMWORD[32+r11] + vpaddq xmm3,xmm3,XMMWORD[48+r11] + vpaddq xmm4,xmm4,XMMWORD[64+r11] + + lea rax,[32+rsi] + lea rsi,[64+rsi] + sub rdx,64 + cmovc rsi,rax + + + + + + + + + + + vpmuludq xmm5,xmm9,xmm0 + vpmuludq xmm6,xmm9,xmm1 + vpaddq xmm10,xmm10,xmm5 + vpaddq xmm11,xmm11,xmm6 + vmovdqa xmm7,XMMWORD[((-128))+r11] + vpmuludq xmm5,xmm9,xmm2 + vpmuludq xmm6,xmm9,xmm3 + vpaddq xmm12,xmm12,xmm5 + vpaddq xmm13,xmm13,xmm6 + vpmuludq xmm9,xmm9,xmm4 + vpmuludq xmm5,xmm4,XMMWORD[((-112))+r11] + vpaddq xmm14,xmm14,xmm9 + + vpaddq xmm10,xmm10,xmm5 + vpmuludq xmm6,xmm7,xmm2 + vpmuludq xmm5,xmm7,xmm3 + vpaddq xmm13,xmm13,xmm6 + vmovdqa xmm8,XMMWORD[((-96))+r11] + vpaddq xmm14,xmm14,xmm5 + vpmuludq xmm6,xmm7,xmm1 + vpmuludq xmm7,xmm7,xmm0 + vpaddq xmm12,xmm12,xmm6 + vpaddq xmm11,xmm11,xmm7 + + vmovdqa xmm9,XMMWORD[((-80))+r11] + vpmuludq xmm5,xmm8,xmm2 + vpmuludq xmm6,xmm8,xmm1 + vpaddq xmm14,xmm14,xmm5 + vpaddq xmm13,xmm13,xmm6 + vmovdqa xmm7,XMMWORD[((-64))+r11] + vpmuludq xmm8,xmm8,xmm0 + vpmuludq xmm5,xmm9,xmm4 + vpaddq xmm12,xmm12,xmm8 + vpaddq xmm11,xmm11,xmm5 + vmovdqa xmm8,XMMWORD[((-48))+r11] + vpmuludq xmm9,xmm9,xmm3 + vpmuludq xmm6,xmm7,xmm1 + vpaddq xmm10,xmm10,xmm9 + + vmovdqa xmm9,XMMWORD[((-16))+r11] + vpaddq xmm14,xmm14,xmm6 + vpmuludq xmm7,xmm7,xmm0 + vpmuludq xmm5,xmm8,xmm4 + vpaddq xmm13,xmm13,xmm7 + vpaddq xmm12,xmm12,xmm5 + vmovdqu xmm5,XMMWORD[32+rsi] + vpmuludq xmm7,xmm8,xmm3 + vpmuludq xmm8,xmm8,xmm2 + vpaddq xmm11,xmm11,xmm7 + vmovdqu xmm6,XMMWORD[48+rsi] + vpaddq xmm10,xmm10,xmm8 + + vpmuludq xmm2,xmm9,xmm2 + vpmuludq xmm3,xmm9,xmm3 + vpsrldq xmm7,xmm5,6 + vpaddq xmm11,xmm11,xmm2 + vpmuludq xmm4,xmm9,xmm4 + vpsrldq xmm8,xmm6,6 + vpaddq xmm2,xmm12,xmm3 + vpaddq xmm3,xmm13,xmm4 + vpmuludq xmm4,xmm0,XMMWORD[((-32))+r11] + vpmuludq xmm0,xmm9,xmm1 + vpunpckhqdq xmm9,xmm5,xmm6 + vpaddq xmm4,xmm14,xmm4 + vpaddq xmm0,xmm10,xmm0 + + vpunpcklqdq xmm5,xmm5,xmm6 + vpunpcklqdq xmm8,xmm7,xmm8 + + + vpsrldq xmm9,xmm9,5 + vpsrlq xmm6,xmm5,26 + vmovdqa xmm14,XMMWORD[rsp] + vpand xmm5,xmm5,xmm15 + vpsrlq xmm7,xmm8,4 + vpand xmm6,xmm6,xmm15 + vpand xmm9,xmm9,XMMWORD[rcx] + vpsrlq xmm8,xmm8,30 + vpand xmm7,xmm7,xmm15 + vpand xmm8,xmm8,xmm15 + vpor xmm9,xmm9,XMMWORD[32+rcx] + + + + + + vpsrlq xmm13,xmm3,26 + vpand xmm3,xmm3,xmm15 + vpaddq xmm4,xmm4,xmm13 + + vpsrlq xmm10,xmm0,26 + vpand xmm0,xmm0,xmm15 + vpaddq xmm1,xmm11,xmm10 + + vpsrlq xmm10,xmm4,26 + vpand xmm4,xmm4,xmm15 + + vpsrlq xmm11,xmm1,26 + vpand xmm1,xmm1,xmm15 + vpaddq xmm2,xmm2,xmm11 + + vpaddq xmm0,xmm0,xmm10 + vpsllq xmm10,xmm10,2 + vpaddq xmm0,xmm0,xmm10 + + vpsrlq xmm12,xmm2,26 + vpand xmm2,xmm2,xmm15 + vpaddq xmm3,xmm3,xmm12 + + vpsrlq xmm10,xmm0,26 + vpand xmm0,xmm0,xmm15 + vpaddq xmm1,xmm1,xmm10 + + vpsrlq xmm13,xmm3,26 + vpand xmm3,xmm3,xmm15 + vpaddq xmm4,xmm4,xmm13 + + ja NEAR $L$oop_avx + +$L$skip_loop_avx: + + + + vpshufd xmm14,xmm14,0x10 + add rdx,32 + jnz NEAR $L$ong_tail_avx + + vpaddq xmm7,xmm7,xmm2 + vpaddq xmm5,xmm5,xmm0 + vpaddq xmm6,xmm6,xmm1 + vpaddq xmm8,xmm8,xmm3 + vpaddq xmm9,xmm9,xmm4 + +$L$ong_tail_avx: + vmovdqa XMMWORD[32+r11],xmm2 + vmovdqa XMMWORD[r11],xmm0 + vmovdqa XMMWORD[16+r11],xmm1 + vmovdqa XMMWORD[48+r11],xmm3 + vmovdqa XMMWORD[64+r11],xmm4 + + + + + + + + vpmuludq xmm12,xmm14,xmm7 + vpmuludq xmm10,xmm14,xmm5 + vpshufd xmm2,XMMWORD[((-48))+rdi],0x10 + vpmuludq xmm11,xmm14,xmm6 + vpmuludq xmm13,xmm14,xmm8 + vpmuludq xmm14,xmm14,xmm9 + + vpmuludq xmm0,xmm2,xmm8 + vpaddq xmm14,xmm14,xmm0 + vpshufd xmm3,XMMWORD[((-32))+rdi],0x10 + vpmuludq xmm1,xmm2,xmm7 + vpaddq xmm13,xmm13,xmm1 + vpshufd xmm4,XMMWORD[((-16))+rdi],0x10 + vpmuludq xmm0,xmm2,xmm6 + vpaddq xmm12,xmm12,xmm0 + vpmuludq xmm2,xmm2,xmm5 + vpaddq xmm11,xmm11,xmm2 + vpmuludq xmm3,xmm3,xmm9 + vpaddq xmm10,xmm10,xmm3 + + vpshufd xmm2,XMMWORD[rdi],0x10 + vpmuludq xmm1,xmm4,xmm7 + vpaddq xmm14,xmm14,xmm1 + vpmuludq xmm0,xmm4,xmm6 + vpaddq xmm13,xmm13,xmm0 + vpshufd xmm3,XMMWORD[16+rdi],0x10 + vpmuludq xmm4,xmm4,xmm5 + vpaddq xmm12,xmm12,xmm4 + vpmuludq xmm1,xmm2,xmm9 + vpaddq xmm11,xmm11,xmm1 + vpshufd xmm4,XMMWORD[32+rdi],0x10 + vpmuludq xmm2,xmm2,xmm8 + vpaddq xmm10,xmm10,xmm2 + + vpmuludq xmm0,xmm3,xmm6 + vpaddq xmm14,xmm14,xmm0 + vpmuludq xmm3,xmm3,xmm5 + vpaddq xmm13,xmm13,xmm3 + vpshufd xmm2,XMMWORD[48+rdi],0x10 + vpmuludq xmm1,xmm4,xmm9 + vpaddq xmm12,xmm12,xmm1 + vpshufd xmm3,XMMWORD[64+rdi],0x10 + vpmuludq xmm0,xmm4,xmm8 + vpaddq xmm11,xmm11,xmm0 + vpmuludq xmm4,xmm4,xmm7 + vpaddq xmm10,xmm10,xmm4 + + vpmuludq xmm2,xmm2,xmm5 + vpaddq xmm14,xmm14,xmm2 + vpmuludq xmm1,xmm3,xmm9 + vpaddq xmm13,xmm13,xmm1 + vpmuludq xmm0,xmm3,xmm8 + vpaddq xmm12,xmm12,xmm0 + vpmuludq xmm1,xmm3,xmm7 + vpaddq xmm11,xmm11,xmm1 + vpmuludq xmm3,xmm3,xmm6 + vpaddq xmm10,xmm10,xmm3 + + jz NEAR $L$short_tail_avx + + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + + vpsrldq xmm2,xmm0,6 + vpsrldq xmm3,xmm1,6 + vpunpckhqdq xmm4,xmm0,xmm1 + vpunpcklqdq xmm0,xmm0,xmm1 + vpunpcklqdq xmm3,xmm2,xmm3 + + vpsrlq xmm4,xmm4,40 + vpsrlq xmm1,xmm0,26 + vpand xmm0,xmm0,xmm15 + vpsrlq xmm2,xmm3,4 + vpand xmm1,xmm1,xmm15 + vpsrlq xmm3,xmm3,30 + vpand xmm2,xmm2,xmm15 + vpand xmm3,xmm3,xmm15 + vpor xmm4,xmm4,XMMWORD[32+rcx] + + vpshufd xmm9,XMMWORD[((-64))+rdi],0x32 + vpaddq xmm0,xmm0,XMMWORD[r11] + vpaddq xmm1,xmm1,XMMWORD[16+r11] + vpaddq xmm2,xmm2,XMMWORD[32+r11] + vpaddq xmm3,xmm3,XMMWORD[48+r11] + vpaddq xmm4,xmm4,XMMWORD[64+r11] + + + + + vpmuludq xmm5,xmm9,xmm0 + vpaddq xmm10,xmm10,xmm5 + vpmuludq xmm6,xmm9,xmm1 + vpaddq xmm11,xmm11,xmm6 + vpmuludq xmm5,xmm9,xmm2 + vpaddq xmm12,xmm12,xmm5 + vpshufd xmm7,XMMWORD[((-48))+rdi],0x32 + vpmuludq xmm6,xmm9,xmm3 + vpaddq xmm13,xmm13,xmm6 + vpmuludq xmm9,xmm9,xmm4 + vpaddq xmm14,xmm14,xmm9 + + vpmuludq xmm5,xmm7,xmm3 + vpaddq xmm14,xmm14,xmm5 + vpshufd xmm8,XMMWORD[((-32))+rdi],0x32 + vpmuludq xmm6,xmm7,xmm2 + vpaddq xmm13,xmm13,xmm6 + vpshufd xmm9,XMMWORD[((-16))+rdi],0x32 + vpmuludq xmm5,xmm7,xmm1 + vpaddq xmm12,xmm12,xmm5 + vpmuludq xmm7,xmm7,xmm0 + vpaddq xmm11,xmm11,xmm7 + vpmuludq xmm8,xmm8,xmm4 + vpaddq xmm10,xmm10,xmm8 + + vpshufd xmm7,XMMWORD[rdi],0x32 + vpmuludq xmm6,xmm9,xmm2 + vpaddq xmm14,xmm14,xmm6 + vpmuludq xmm5,xmm9,xmm1 + vpaddq xmm13,xmm13,xmm5 + vpshufd xmm8,XMMWORD[16+rdi],0x32 + vpmuludq xmm9,xmm9,xmm0 + vpaddq xmm12,xmm12,xmm9 + vpmuludq xmm6,xmm7,xmm4 + vpaddq xmm11,xmm11,xmm6 + vpshufd xmm9,XMMWORD[32+rdi],0x32 + vpmuludq xmm7,xmm7,xmm3 + vpaddq xmm10,xmm10,xmm7 + + vpmuludq xmm5,xmm8,xmm1 + vpaddq xmm14,xmm14,xmm5 + vpmuludq xmm8,xmm8,xmm0 + vpaddq xmm13,xmm13,xmm8 + vpshufd xmm7,XMMWORD[48+rdi],0x32 + vpmuludq xmm6,xmm9,xmm4 + vpaddq xmm12,xmm12,xmm6 + vpshufd xmm8,XMMWORD[64+rdi],0x32 + vpmuludq xmm5,xmm9,xmm3 + vpaddq xmm11,xmm11,xmm5 + vpmuludq xmm9,xmm9,xmm2 + vpaddq xmm10,xmm10,xmm9 + + vpmuludq xmm7,xmm7,xmm0 + vpaddq xmm14,xmm14,xmm7 + vpmuludq xmm6,xmm8,xmm4 + vpaddq xmm13,xmm13,xmm6 + vpmuludq xmm5,xmm8,xmm3 + vpaddq xmm12,xmm12,xmm5 + vpmuludq xmm6,xmm8,xmm2 + vpaddq xmm11,xmm11,xmm6 + vpmuludq xmm8,xmm8,xmm1 + vpaddq xmm10,xmm10,xmm8 + +$L$short_tail_avx: + + + + vpsrldq xmm9,xmm14,8 + vpsrldq xmm8,xmm13,8 + vpsrldq xmm6,xmm11,8 + vpsrldq xmm5,xmm10,8 + vpsrldq xmm7,xmm12,8 + vpaddq xmm13,xmm13,xmm8 + vpaddq xmm14,xmm14,xmm9 + vpaddq xmm10,xmm10,xmm5 + vpaddq xmm11,xmm11,xmm6 + vpaddq xmm12,xmm12,xmm7 + + + + + vpsrlq xmm3,xmm13,26 + vpand xmm13,xmm13,xmm15 + vpaddq xmm14,xmm14,xmm3 + + vpsrlq xmm0,xmm10,26 + vpand xmm10,xmm10,xmm15 + vpaddq xmm11,xmm11,xmm0 + + vpsrlq xmm4,xmm14,26 + vpand xmm14,xmm14,xmm15 + + vpsrlq xmm1,xmm11,26 + vpand xmm11,xmm11,xmm15 + vpaddq xmm12,xmm12,xmm1 + + vpaddq xmm10,xmm10,xmm4 + vpsllq xmm4,xmm4,2 + vpaddq xmm10,xmm10,xmm4 + + vpsrlq xmm2,xmm12,26 + vpand xmm12,xmm12,xmm15 + vpaddq xmm13,xmm13,xmm2 + + vpsrlq xmm0,xmm10,26 + vpand xmm10,xmm10,xmm15 + vpaddq xmm11,xmm11,xmm0 + + vpsrlq xmm3,xmm13,26 + vpand xmm13,xmm13,xmm15 + vpaddq xmm14,xmm14,xmm3 + + vmovd DWORD[(-112)+rdi],xmm10 + vmovd DWORD[(-108)+rdi],xmm11 + vmovd DWORD[(-104)+rdi],xmm12 + vmovd DWORD[(-100)+rdi],xmm13 + vmovd DWORD[(-96)+rdi],xmm14 + vmovdqa xmm6,XMMWORD[80+r11] + vmovdqa xmm7,XMMWORD[96+r11] + vmovdqa xmm8,XMMWORD[112+r11] + vmovdqa xmm9,XMMWORD[128+r11] + vmovdqa xmm10,XMMWORD[144+r11] + vmovdqa xmm11,XMMWORD[160+r11] + vmovdqa xmm12,XMMWORD[176+r11] + vmovdqa xmm13,XMMWORD[192+r11] + vmovdqa xmm14,XMMWORD[208+r11] + vmovdqa xmm15,XMMWORD[224+r11] + lea rsp,[248+r11] +$L$do_avx_epilogue: + vzeroupper + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_blocks_avx: + + +ALIGN 32 +poly1305_emit_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_emit_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + cmp DWORD[20+rdi],0 + je NEAR $L$emit + + mov eax,DWORD[rdi] + mov ecx,DWORD[4+rdi] + mov r8d,DWORD[8+rdi] + mov r11d,DWORD[12+rdi] + mov r10d,DWORD[16+rdi] + + shl rcx,26 + mov r9,r8 + shl r8,52 + add rax,rcx + shr r9,12 + add r8,rax + adc r9,0 + + shl r11,14 + mov rax,r10 + shr r10,24 + add r9,r11 + shl rax,40 + add r9,rax + adc r10,0 + + mov rax,r10 + mov rcx,r10 + and r10,3 + shr rax,2 + and rcx,-4 + add rax,rcx + add r8,rax + adc r9,0 + adc r10,0 + + mov rax,r8 + add r8,5 + mov rcx,r9 + adc r9,0 + adc r10,0 + shr r10,2 + cmovnz rax,r8 + cmovnz rcx,r9 + + add rax,QWORD[rdx] + adc rcx,QWORD[8+rdx] + mov QWORD[rsi],rax + mov QWORD[8+rsi],rcx + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_emit_avx: + +ALIGN 32 +poly1305_blocks_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_poly1305_blocks_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + mov r8d,DWORD[20+rdi] + cmp rdx,128 + jae NEAR $L$blocks_avx2 + test r8d,r8d + jz NEAR $L$blocks + +$L$blocks_avx2: + and rdx,-16 + jz NEAR $L$no_data_avx2 + + vzeroupper + + test r8d,r8d + jz NEAR $L$base2_64_avx2 + + test rdx,63 + jz NEAR $L$even_avx2 + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$blocks_avx2_body: + + mov r15,rdx + + mov r8,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov ebp,DWORD[16+rdi] + + mov r11,QWORD[24+rdi] + mov r13,QWORD[32+rdi] + + + mov r14d,r8d + and r8,-2147483648 + mov r12,r9 + mov ebx,r9d + and r9,-2147483648 + + shr r8,6 + shl r12,52 + add r14,r8 + shr rbx,12 + shr r9,18 + add r14,r12 + adc rbx,r9 + + mov r8,rbp + shl r8,40 + shr rbp,24 + add rbx,r8 + adc rbp,0 + + mov r9,-4 + mov r8,rbp + and r9,rbp + shr r8,2 + and rbp,3 + add r8,r9 + add r14,r8 + adc rbx,0 + adc rbp,0 + + mov r12,r13 + mov rax,r13 + shr r13,2 + add r13,r12 + +$L$base2_26_pre_avx2: + add r14,QWORD[rsi] + adc rbx,QWORD[8+rsi] + lea rsi,[16+rsi] + adc rbp,rcx + sub r15,16 + + call __poly1305_block + mov rax,r12 + + test r15,63 + jnz NEAR $L$base2_26_pre_avx2 + + test rcx,rcx + jz NEAR $L$store_base2_64_avx2 + + + mov rax,r14 + mov rdx,r14 + shr r14,52 + mov r11,rbx + mov r12,rbx + shr rdx,26 + and rax,0x3ffffff + shl r11,12 + and rdx,0x3ffffff + shr rbx,14 + or r14,r11 + shl rbp,24 + and r14,0x3ffffff + shr r12,40 + and rbx,0x3ffffff + or rbp,r12 + + test r15,r15 + jz NEAR $L$store_base2_26_avx2 + + vmovd xmm0,eax + vmovd xmm1,edx + vmovd xmm2,r14d + vmovd xmm3,ebx + vmovd xmm4,ebp + jmp NEAR $L$proceed_avx2 + +ALIGN 32 +$L$store_base2_64_avx2: + mov QWORD[rdi],r14 + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rbp + jmp NEAR $L$done_avx2 + +ALIGN 16 +$L$store_base2_26_avx2: + mov DWORD[rdi],eax + mov DWORD[4+rdi],edx + mov DWORD[8+rdi],r14d + mov DWORD[12+rdi],ebx + mov DWORD[16+rdi],ebp +ALIGN 16 +$L$done_avx2: + mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rsp,[48+rsp] +$L$no_data_avx2: +$L$blocks_avx2_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +ALIGN 32 +$L$base2_64_avx2: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$base2_64_avx2_body: + + mov r15,rdx + + mov r11,QWORD[24+rdi] + mov r13,QWORD[32+rdi] + + mov r14,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov ebp,DWORD[16+rdi] + + mov r12,r13 + mov rax,r13 + shr r13,2 + add r13,r12 + + test rdx,63 + jz NEAR $L$init_avx2 + +$L$base2_64_pre_avx2: + add r14,QWORD[rsi] + adc rbx,QWORD[8+rsi] + lea rsi,[16+rsi] + adc rbp,rcx + sub r15,16 + + call __poly1305_block + mov rax,r12 + + test r15,63 + jnz NEAR $L$base2_64_pre_avx2 + +$L$init_avx2: + + mov rax,r14 + mov rdx,r14 + shr r14,52 + mov r8,rbx + mov r9,rbx + shr rdx,26 + and rax,0x3ffffff + shl r8,12 + and rdx,0x3ffffff + shr rbx,14 + or r14,r8 + shl rbp,24 + and r14,0x3ffffff + shr r9,40 + and rbx,0x3ffffff + or rbp,r9 + + vmovd xmm0,eax + vmovd xmm1,edx + vmovd xmm2,r14d + vmovd xmm3,ebx + vmovd xmm4,ebp + mov DWORD[20+rdi],1 + + call __poly1305_init_avx + +$L$proceed_avx2: + mov rdx,r15 + + mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r13,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + mov rbx,QWORD[40+rsp] + lea rax,[48+rsp] + lea rsp,[48+rsp] +$L$base2_64_avx2_epilogue: + jmp NEAR $L$do_avx2 + +ALIGN 32 +$L$even_avx2: + vmovd xmm0,DWORD[rdi] + vmovd xmm1,DWORD[4+rdi] + vmovd xmm2,DWORD[8+rdi] + vmovd xmm3,DWORD[12+rdi] + vmovd xmm4,DWORD[16+rdi] + +$L$do_avx2: + lea r11,[((-248))+rsp] + sub rsp,0x1c8 + vmovdqa XMMWORD[80+r11],xmm6 + vmovdqa XMMWORD[96+r11],xmm7 + vmovdqa XMMWORD[112+r11],xmm8 + vmovdqa XMMWORD[128+r11],xmm9 + vmovdqa XMMWORD[144+r11],xmm10 + vmovdqa XMMWORD[160+r11],xmm11 + vmovdqa XMMWORD[176+r11],xmm12 + vmovdqa XMMWORD[192+r11],xmm13 + vmovdqa XMMWORD[208+r11],xmm14 + vmovdqa XMMWORD[224+r11],xmm15 +$L$do_avx2_body: + lea rdi,[((48+64))+rdi] + lea rcx,[$L$const] + + + vmovdqu xmm9,XMMWORD[((-64))+rdi] + and rsp,-512 + vmovdqu xmm10,XMMWORD[((-48))+rdi] + vmovdqu xmm6,XMMWORD[((-32))+rdi] + vmovdqu xmm11,XMMWORD[((-16))+rdi] + vmovdqu xmm12,XMMWORD[rdi] + vmovdqu xmm13,XMMWORD[16+rdi] + vmovdqu xmm14,XMMWORD[32+rdi] + vpermq ymm9,ymm9,0x15 + vmovdqu xmm15,XMMWORD[48+rdi] + vpermq ymm10,ymm10,0x15 + vpshufd ymm9,ymm9,0xc8 + vmovdqu xmm5,XMMWORD[64+rdi] + vpermq ymm6,ymm6,0x15 + vpshufd ymm10,ymm10,0xc8 + vmovdqa YMMWORD[rsp],ymm9 + vpermq ymm11,ymm11,0x15 + vpshufd ymm6,ymm6,0xc8 + vmovdqa YMMWORD[32+rsp],ymm10 + vpermq ymm12,ymm12,0x15 + vpshufd ymm11,ymm11,0xc8 + vmovdqa YMMWORD[64+rsp],ymm6 + vpermq ymm13,ymm13,0x15 + vpshufd ymm12,ymm12,0xc8 + vmovdqa YMMWORD[96+rsp],ymm11 + vpermq ymm14,ymm14,0x15 + vpshufd ymm13,ymm13,0xc8 + vmovdqa YMMWORD[128+rsp],ymm12 + vpermq ymm15,ymm15,0x15 + vpshufd ymm14,ymm14,0xc8 + vmovdqa YMMWORD[160+rsp],ymm13 + vpermq ymm5,ymm5,0x15 + vpshufd ymm15,ymm15,0xc8 + vmovdqa YMMWORD[192+rsp],ymm14 + vpshufd ymm5,ymm5,0xc8 + vmovdqa YMMWORD[224+rsp],ymm15 + vmovdqa YMMWORD[256+rsp],ymm5 + vmovdqa ymm5,YMMWORD[64+rcx] + + + + vmovdqu xmm7,XMMWORD[rsi] + vmovdqu xmm8,XMMWORD[16+rsi] + vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1 + vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1 + lea rsi,[64+rsi] + + vpsrldq ymm9,ymm7,6 + vpsrldq ymm10,ymm8,6 + vpunpckhqdq ymm6,ymm7,ymm8 + vpunpcklqdq ymm9,ymm9,ymm10 + vpunpcklqdq ymm7,ymm7,ymm8 + + vpsrlq ymm10,ymm9,30 + vpsrlq ymm9,ymm9,4 + vpsrlq ymm8,ymm7,26 + vpsrlq ymm6,ymm6,40 + vpand ymm9,ymm9,ymm5 + vpand ymm7,ymm7,ymm5 + vpand ymm8,ymm8,ymm5 + vpand ymm10,ymm10,ymm5 + vpor ymm6,ymm6,YMMWORD[32+rcx] + + lea rax,[144+rsp] + vpaddq ymm2,ymm9,ymm2 + sub rdx,64 + jz NEAR $L$tail_avx2 + jmp NEAR $L$oop_avx2 + +ALIGN 32 +$L$oop_avx2: + + + + + + + + + vpaddq ymm0,ymm7,ymm0 + vmovdqa ymm7,YMMWORD[rsp] + vpaddq ymm1,ymm8,ymm1 + vmovdqa ymm8,YMMWORD[32+rsp] + vpaddq ymm3,ymm10,ymm3 + vmovdqa ymm9,YMMWORD[96+rsp] + vpaddq ymm4,ymm6,ymm4 + vmovdqa ymm10,YMMWORD[48+rax] + vmovdqa ymm5,YMMWORD[112+rax] + + + + + + + + + + + + + + + + + vpmuludq ymm13,ymm7,ymm2 + vpmuludq ymm14,ymm8,ymm2 + vpmuludq ymm15,ymm9,ymm2 + vpmuludq ymm11,ymm10,ymm2 + vpmuludq ymm12,ymm5,ymm2 + + vpmuludq ymm6,ymm8,ymm0 + vpmuludq ymm2,ymm8,ymm1 + vpaddq ymm12,ymm12,ymm6 + vpaddq ymm13,ymm13,ymm2 + vpmuludq ymm6,ymm8,ymm3 + vpmuludq ymm2,ymm4,YMMWORD[64+rsp] + vpaddq ymm15,ymm15,ymm6 + vpaddq ymm11,ymm11,ymm2 + vmovdqa ymm8,YMMWORD[((-16))+rax] + + vpmuludq ymm6,ymm7,ymm0 + vpmuludq ymm2,ymm7,ymm1 + vpaddq ymm11,ymm11,ymm6 + vpaddq ymm12,ymm12,ymm2 + vpmuludq ymm6,ymm7,ymm3 + vpmuludq ymm2,ymm7,ymm4 + vmovdqu xmm7,XMMWORD[rsi] + vpaddq ymm14,ymm14,ymm6 + vpaddq ymm15,ymm15,ymm2 + vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1 + + vpmuludq ymm6,ymm8,ymm3 + vpmuludq ymm2,ymm8,ymm4 + vmovdqu xmm8,XMMWORD[16+rsi] + vpaddq ymm11,ymm11,ymm6 + vpaddq ymm12,ymm12,ymm2 + vmovdqa ymm2,YMMWORD[16+rax] + vpmuludq ymm6,ymm9,ymm1 + vpmuludq ymm9,ymm9,ymm0 + vpaddq ymm14,ymm14,ymm6 + vpaddq ymm13,ymm13,ymm9 + vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1 + lea rsi,[64+rsi] + + vpmuludq ymm6,ymm2,ymm1 + vpmuludq ymm2,ymm2,ymm0 + vpsrldq ymm9,ymm7,6 + vpaddq ymm15,ymm15,ymm6 + vpaddq ymm14,ymm14,ymm2 + vpmuludq ymm6,ymm10,ymm3 + vpmuludq ymm2,ymm10,ymm4 + vpsrldq ymm10,ymm8,6 + vpaddq ymm12,ymm12,ymm6 + vpaddq ymm13,ymm13,ymm2 + vpunpckhqdq ymm6,ymm7,ymm8 + + vpmuludq ymm3,ymm5,ymm3 + vpmuludq ymm4,ymm5,ymm4 + vpunpcklqdq ymm7,ymm7,ymm8 + vpaddq ymm2,ymm13,ymm3 + vpaddq ymm3,ymm14,ymm4 + vpunpcklqdq ymm10,ymm9,ymm10 + vpmuludq ymm4,ymm0,YMMWORD[80+rax] + vpmuludq ymm0,ymm5,ymm1 + vmovdqa ymm5,YMMWORD[64+rcx] + vpaddq ymm4,ymm15,ymm4 + vpaddq ymm0,ymm11,ymm0 + + + + + vpsrlq ymm14,ymm3,26 + vpand ymm3,ymm3,ymm5 + vpaddq ymm4,ymm4,ymm14 + + vpsrlq ymm11,ymm0,26 + vpand ymm0,ymm0,ymm5 + vpaddq ymm1,ymm12,ymm11 + + vpsrlq ymm15,ymm4,26 + vpand ymm4,ymm4,ymm5 + + vpsrlq ymm9,ymm10,4 + + vpsrlq ymm12,ymm1,26 + vpand ymm1,ymm1,ymm5 + vpaddq ymm2,ymm2,ymm12 + + vpaddq ymm0,ymm0,ymm15 + vpsllq ymm15,ymm15,2 + vpaddq ymm0,ymm0,ymm15 + + vpand ymm9,ymm9,ymm5 + vpsrlq ymm8,ymm7,26 + + vpsrlq ymm13,ymm2,26 + vpand ymm2,ymm2,ymm5 + vpaddq ymm3,ymm3,ymm13 + + vpaddq ymm2,ymm2,ymm9 + vpsrlq ymm10,ymm10,30 + + vpsrlq ymm11,ymm0,26 + vpand ymm0,ymm0,ymm5 + vpaddq ymm1,ymm1,ymm11 + + vpsrlq ymm6,ymm6,40 + + vpsrlq ymm14,ymm3,26 + vpand ymm3,ymm3,ymm5 + vpaddq ymm4,ymm4,ymm14 + + vpand ymm7,ymm7,ymm5 + vpand ymm8,ymm8,ymm5 + vpand ymm10,ymm10,ymm5 + vpor ymm6,ymm6,YMMWORD[32+rcx] + + sub rdx,64 + jnz NEAR $L$oop_avx2 + +DB 0x66,0x90 +$L$tail_avx2: + + + + + + + + vpaddq ymm0,ymm7,ymm0 + vmovdqu ymm7,YMMWORD[4+rsp] + vpaddq ymm1,ymm8,ymm1 + vmovdqu ymm8,YMMWORD[36+rsp] + vpaddq ymm3,ymm10,ymm3 + vmovdqu ymm9,YMMWORD[100+rsp] + vpaddq ymm4,ymm6,ymm4 + vmovdqu ymm10,YMMWORD[52+rax] + vmovdqu ymm5,YMMWORD[116+rax] + + vpmuludq ymm13,ymm7,ymm2 + vpmuludq ymm14,ymm8,ymm2 + vpmuludq ymm15,ymm9,ymm2 + vpmuludq ymm11,ymm10,ymm2 + vpmuludq ymm12,ymm5,ymm2 + + vpmuludq ymm6,ymm8,ymm0 + vpmuludq ymm2,ymm8,ymm1 + vpaddq ymm12,ymm12,ymm6 + vpaddq ymm13,ymm13,ymm2 + vpmuludq ymm6,ymm8,ymm3 + vpmuludq ymm2,ymm4,YMMWORD[68+rsp] + vpaddq ymm15,ymm15,ymm6 + vpaddq ymm11,ymm11,ymm2 + + vpmuludq ymm6,ymm7,ymm0 + vpmuludq ymm2,ymm7,ymm1 + vpaddq ymm11,ymm11,ymm6 + vmovdqu ymm8,YMMWORD[((-12))+rax] + vpaddq ymm12,ymm12,ymm2 + vpmuludq ymm6,ymm7,ymm3 + vpmuludq ymm2,ymm7,ymm4 + vpaddq ymm14,ymm14,ymm6 + vpaddq ymm15,ymm15,ymm2 + + vpmuludq ymm6,ymm8,ymm3 + vpmuludq ymm2,ymm8,ymm4 + vpaddq ymm11,ymm11,ymm6 + vpaddq ymm12,ymm12,ymm2 + vmovdqu ymm2,YMMWORD[20+rax] + vpmuludq ymm6,ymm9,ymm1 + vpmuludq ymm9,ymm9,ymm0 + vpaddq ymm14,ymm14,ymm6 + vpaddq ymm13,ymm13,ymm9 + + vpmuludq ymm6,ymm2,ymm1 + vpmuludq ymm2,ymm2,ymm0 + vpaddq ymm15,ymm15,ymm6 + vpaddq ymm14,ymm14,ymm2 + vpmuludq ymm6,ymm10,ymm3 + vpmuludq ymm2,ymm10,ymm4 + vpaddq ymm12,ymm12,ymm6 + vpaddq ymm13,ymm13,ymm2 + + vpmuludq ymm3,ymm5,ymm3 + vpmuludq ymm4,ymm5,ymm4 + vpaddq ymm2,ymm13,ymm3 + vpaddq ymm3,ymm14,ymm4 + vpmuludq ymm4,ymm0,YMMWORD[84+rax] + vpmuludq ymm0,ymm5,ymm1 + vmovdqa ymm5,YMMWORD[64+rcx] + vpaddq ymm4,ymm15,ymm4 + vpaddq ymm0,ymm11,ymm0 + + + + + vpsrldq ymm8,ymm12,8 + vpsrldq ymm9,ymm2,8 + vpsrldq ymm10,ymm3,8 + vpsrldq ymm6,ymm4,8 + vpsrldq ymm7,ymm0,8 + vpaddq ymm12,ymm12,ymm8 + vpaddq ymm2,ymm2,ymm9 + vpaddq ymm3,ymm3,ymm10 + vpaddq ymm4,ymm4,ymm6 + vpaddq ymm0,ymm0,ymm7 + + vpermq ymm10,ymm3,0x2 + vpermq ymm6,ymm4,0x2 + vpermq ymm7,ymm0,0x2 + vpermq ymm8,ymm12,0x2 + vpermq ymm9,ymm2,0x2 + vpaddq ymm3,ymm3,ymm10 + vpaddq ymm4,ymm4,ymm6 + vpaddq ymm0,ymm0,ymm7 + vpaddq ymm12,ymm12,ymm8 + vpaddq ymm2,ymm2,ymm9 + + + + + vpsrlq ymm14,ymm3,26 + vpand ymm3,ymm3,ymm5 + vpaddq ymm4,ymm4,ymm14 + + vpsrlq ymm11,ymm0,26 + vpand ymm0,ymm0,ymm5 + vpaddq ymm1,ymm12,ymm11 + + vpsrlq ymm15,ymm4,26 + vpand ymm4,ymm4,ymm5 + + vpsrlq ymm12,ymm1,26 + vpand ymm1,ymm1,ymm5 + vpaddq ymm2,ymm2,ymm12 + + vpaddq ymm0,ymm0,ymm15 + vpsllq ymm15,ymm15,2 + vpaddq ymm0,ymm0,ymm15 + + vpsrlq ymm13,ymm2,26 + vpand ymm2,ymm2,ymm5 + vpaddq ymm3,ymm3,ymm13 + + vpsrlq ymm11,ymm0,26 + vpand ymm0,ymm0,ymm5 + vpaddq ymm1,ymm1,ymm11 + + vpsrlq ymm14,ymm3,26 + vpand ymm3,ymm3,ymm5 + vpaddq ymm4,ymm4,ymm14 + + vmovd DWORD[(-112)+rdi],xmm0 + vmovd DWORD[(-108)+rdi],xmm1 + vmovd DWORD[(-104)+rdi],xmm2 + vmovd DWORD[(-100)+rdi],xmm3 + vmovd DWORD[(-96)+rdi],xmm4 + vmovdqa xmm6,XMMWORD[80+r11] + vmovdqa xmm7,XMMWORD[96+r11] + vmovdqa xmm8,XMMWORD[112+r11] + vmovdqa xmm9,XMMWORD[128+r11] + vmovdqa xmm10,XMMWORD[144+r11] + vmovdqa xmm11,XMMWORD[160+r11] + vmovdqa xmm12,XMMWORD[176+r11] + vmovdqa xmm13,XMMWORD[192+r11] + vmovdqa xmm14,XMMWORD[208+r11] + vmovdqa xmm15,XMMWORD[224+r11] + lea rsp,[248+r11] +$L$do_avx2_epilogue: + vzeroupper + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_poly1305_blocks_avx2: +ALIGN 64 +$L$const: +$L$mask24: + DD 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +$L$129: + DD 16777216,0,16777216,0,16777216,0,16777216,0 +$L$mask26: + DD 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +$L$five: + DD 5,0,5,0,5,0,5,0 +DB 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 +DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 +DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 +DB 108,46,111,114,103,62,0 +ALIGN 16 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[48+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +avx_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[208+r8] + + lea rsi,[80+rax] + lea rax,[248+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_poly1305_init wrt ..imagebase + DD $L$SEH_end_poly1305_init wrt ..imagebase + DD $L$SEH_info_poly1305_init wrt ..imagebase + + DD $L$SEH_begin_poly1305_blocks wrt ..imagebase + DD $L$SEH_end_poly1305_blocks wrt ..imagebase + DD $L$SEH_info_poly1305_blocks wrt ..imagebase + + DD $L$SEH_begin_poly1305_emit wrt ..imagebase + DD $L$SEH_end_poly1305_emit wrt ..imagebase + DD $L$SEH_info_poly1305_emit wrt ..imagebase + DD $L$SEH_begin_poly1305_blocks_avx wrt ..imagebase + DD $L$base2_64_avx wrt ..imagebase + DD $L$SEH_info_poly1305_blocks_avx_1 wrt ..imagebase + + DD $L$base2_64_avx wrt ..imagebase + DD $L$even_avx wrt ..imagebase + DD $L$SEH_info_poly1305_blocks_avx_2 wrt ..imagebase + + DD $L$even_avx wrt ..imagebase + DD $L$SEH_end_poly1305_blocks_avx wrt ..imagebase + DD $L$SEH_info_poly1305_blocks_avx_3 wrt ..imagebase + + DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase + DD $L$SEH_end_poly1305_emit_avx wrt ..imagebase + DD $L$SEH_info_poly1305_emit_avx wrt ..imagebase + DD $L$SEH_begin_poly1305_blocks_avx2 wrt ..imagebase + DD $L$base2_64_avx2 wrt ..imagebase + DD $L$SEH_info_poly1305_blocks_avx2_1 wrt ..imagebase + + DD $L$base2_64_avx2 wrt ..imagebase + DD $L$even_avx2 wrt ..imagebase + DD $L$SEH_info_poly1305_blocks_avx2_2 wrt ..imagebase + + DD $L$even_avx2 wrt ..imagebase + DD $L$SEH_end_poly1305_blocks_avx2 wrt ..imagebase + DD $L$SEH_info_poly1305_blocks_avx2_3 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_poly1305_init: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$SEH_begin_poly1305_init wrt ..imagebase,$L$SEH_begin_poly1305_init wrt ..imagebase + +$L$SEH_info_poly1305_blocks: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$blocks_body wrt ..imagebase,$L$blocks_epilogue wrt ..imagebase + +$L$SEH_info_poly1305_emit: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$SEH_begin_poly1305_emit wrt ..imagebase,$L$SEH_begin_poly1305_emit wrt ..imagebase +$L$SEH_info_poly1305_blocks_avx_1: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$blocks_avx_body wrt ..imagebase,$L$blocks_avx_epilogue wrt ..imagebase + +$L$SEH_info_poly1305_blocks_avx_2: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$base2_64_avx_body wrt ..imagebase,$L$base2_64_avx_epilogue wrt ..imagebase + +$L$SEH_info_poly1305_blocks_avx_3: +DB 9,0,0,0 + DD avx_handler wrt ..imagebase + DD $L$do_avx_body wrt ..imagebase,$L$do_avx_epilogue wrt ..imagebase + +$L$SEH_info_poly1305_emit_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase,$L$SEH_begin_poly1305_emit_avx wrt ..imagebase +$L$SEH_info_poly1305_blocks_avx2_1: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$blocks_avx2_body wrt ..imagebase,$L$blocks_avx2_epilogue wrt ..imagebase + +$L$SEH_info_poly1305_blocks_avx2_2: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$base2_64_avx2_body wrt ..imagebase,$L$base2_64_avx2_epilogue wrt ..imagebase + +$L$SEH_info_poly1305_blocks_avx2_3: +DB 9,0,0,0 + DD avx_handler wrt ..imagebase + DD $L$do_avx2_body wrt ..imagebase,$L$do_avx2_epilogue wrt ..imagebase |