diff options
Diffstat (limited to 'deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/chacha/chacha-x86_64.asm')
-rw-r--r-- | deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/chacha/chacha-x86_64.asm | 3917 |
1 file changed, 3917 insertions, 0 deletions
diff --git a/deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/chacha/chacha-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/chacha/chacha-x86_64.asm new file mode 100644 index 0000000000..1a2003ea1f --- /dev/null +++ b/deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/chacha/chacha-x86_64.asm @@ -0,0 +1,3917 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN OPENSSL_ia32cap_P + +ALIGN 64 +$L$zero: + DD 0,0,0,0 +$L$one: + DD 1,0,0,0 +$L$inc: + DD 0,1,2,3 +$L$four: + DD 4,4,4,4 +$L$incy: + DD 0,2,4,6,1,3,5,7 +$L$eight: + DD 8,8,8,8,8,8,8,8 +$L$rot16: +DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd +$L$rot24: +DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe +$L$twoy: + DD 2,0,0,0,2,0,0,0 +ALIGN 64 +$L$zeroz: + DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 +$L$fourz: + DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 +$L$incz: + DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +$L$sixteen: + DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +$L$sigma: +DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 +DB 0 +DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 +DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 +DB 108,46,111,114,103,62,0 +global ChaCha20_ctr32 + +ALIGN 64 +ChaCha20_ctr32: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + cmp rdx,0 + je NEAR $L$no_data + mov r10,QWORD[((OPENSSL_ia32cap_P+4))] + bt r10,48 + jc NEAR $L$ChaCha20_avx512 + test r10,r10 + js NEAR $L$ChaCha20_avx512vl + test r10d,512 + jnz NEAR $L$ChaCha20_ssse3 + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,64+24 + +$L$ctr32_body: + + + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm4,XMMWORD[$L$one] + + 
+ movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov rbp,rdx + jmp NEAR $L$oop_outer + +ALIGN 32 +$L$oop_outer: + mov eax,0x61707865 + mov ebx,0x3320646e + mov ecx,0x79622d32 + mov edx,0x6b206574 + mov r8d,DWORD[16+rsp] + mov r9d,DWORD[20+rsp] + mov r10d,DWORD[24+rsp] + mov r11d,DWORD[28+rsp] + movd r12d,xmm3 + mov r13d,DWORD[52+rsp] + mov r14d,DWORD[56+rsp] + mov r15d,DWORD[60+rsp] + + mov QWORD[((64+0))+rsp],rbp + mov ebp,10 + mov QWORD[((64+8))+rsp],rsi +DB 102,72,15,126,214 + mov QWORD[((64+16))+rsp],rdi + mov rdi,rsi + shr rdi,32 + jmp NEAR $L$oop + +ALIGN 32 +$L$oop: + add eax,r8d + xor r12d,eax + rol r12d,16 + add ebx,r9d + xor r13d,ebx + rol r13d,16 + add esi,r12d + xor r8d,esi + rol r8d,12 + add edi,r13d + xor r9d,edi + rol r9d,12 + add eax,r8d + xor r12d,eax + rol r12d,8 + add ebx,r9d + xor r13d,ebx + rol r13d,8 + add esi,r12d + xor r8d,esi + rol r8d,7 + add edi,r13d + xor r9d,edi + rol r9d,7 + mov DWORD[32+rsp],esi + mov DWORD[36+rsp],edi + mov esi,DWORD[40+rsp] + mov edi,DWORD[44+rsp] + add ecx,r10d + xor r14d,ecx + rol r14d,16 + add edx,r11d + xor r15d,edx + rol r15d,16 + add esi,r14d + xor r10d,esi + rol r10d,12 + add edi,r15d + xor r11d,edi + rol r11d,12 + add ecx,r10d + xor r14d,ecx + rol r14d,8 + add edx,r11d + xor r15d,edx + rol r15d,8 + add esi,r14d + xor r10d,esi + rol r10d,7 + add edi,r15d + xor r11d,edi + rol r11d,7 + add eax,r9d + xor r15d,eax + rol r15d,16 + add ebx,r10d + xor r12d,ebx + rol r12d,16 + add esi,r15d + xor r9d,esi + rol r9d,12 + add edi,r12d + xor r10d,edi + rol r10d,12 + add eax,r9d + xor r15d,eax + rol r15d,8 + add ebx,r10d + xor r12d,ebx + rol r12d,8 + add esi,r15d + xor r9d,esi + rol r9d,7 + add edi,r12d + xor r10d,edi + rol r10d,7 + mov DWORD[40+rsp],esi + mov DWORD[44+rsp],edi + mov esi,DWORD[32+rsp] + mov edi,DWORD[36+rsp] + add ecx,r11d + xor r13d,ecx + rol r13d,16 + add edx,r8d + xor r14d,edx + rol r14d,16 + add esi,r13d + xor r11d,esi + rol r11d,12 + add edi,r14d + xor 
r8d,edi + rol r8d,12 + add ecx,r11d + xor r13d,ecx + rol r13d,8 + add edx,r8d + xor r14d,edx + rol r14d,8 + add esi,r13d + xor r11d,esi + rol r11d,7 + add edi,r14d + xor r8d,edi + rol r8d,7 + dec ebp + jnz NEAR $L$oop + mov DWORD[36+rsp],edi + mov DWORD[32+rsp],esi + mov rbp,QWORD[64+rsp] + movdqa xmm1,xmm2 + mov rsi,QWORD[((64+8))+rsp] + paddd xmm3,xmm4 + mov rdi,QWORD[((64+16))+rsp] + + add eax,0x61707865 + add ebx,0x3320646e + add ecx,0x79622d32 + add edx,0x6b206574 + add r8d,DWORD[16+rsp] + add r9d,DWORD[20+rsp] + add r10d,DWORD[24+rsp] + add r11d,DWORD[28+rsp] + add r12d,DWORD[48+rsp] + add r13d,DWORD[52+rsp] + add r14d,DWORD[56+rsp] + add r15d,DWORD[60+rsp] + paddd xmm1,XMMWORD[32+rsp] + + cmp rbp,64 + jb NEAR $L$tail + + xor eax,DWORD[rsi] + xor ebx,DWORD[4+rsi] + xor ecx,DWORD[8+rsi] + xor edx,DWORD[12+rsi] + xor r8d,DWORD[16+rsi] + xor r9d,DWORD[20+rsi] + xor r10d,DWORD[24+rsi] + xor r11d,DWORD[28+rsi] + movdqu xmm0,XMMWORD[32+rsi] + xor r12d,DWORD[48+rsi] + xor r13d,DWORD[52+rsi] + xor r14d,DWORD[56+rsi] + xor r15d,DWORD[60+rsi] + lea rsi,[64+rsi] + pxor xmm0,xmm1 + + movdqa XMMWORD[32+rsp],xmm2 + movd DWORD[48+rsp],xmm3 + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + movdqu XMMWORD[32+rdi],xmm0 + mov DWORD[48+rdi],r12d + mov DWORD[52+rdi],r13d + mov DWORD[56+rdi],r14d + mov DWORD[60+rdi],r15d + lea rdi,[64+rdi] + + sub rbp,64 + jnz NEAR $L$oop_outer + + jmp NEAR $L$done + +ALIGN 16 +$L$tail: + mov DWORD[rsp],eax + mov DWORD[4+rsp],ebx + xor rbx,rbx + mov DWORD[8+rsp],ecx + mov DWORD[12+rsp],edx + mov DWORD[16+rsp],r8d + mov DWORD[20+rsp],r9d + mov DWORD[24+rsp],r10d + mov DWORD[28+rsp],r11d + movdqa XMMWORD[32+rsp],xmm1 + mov DWORD[48+rsp],r12d + mov DWORD[52+rsp],r13d + mov DWORD[56+rsp],r14d + mov DWORD[60+rsp],r15d + +$L$oop_tail: + movzx eax,BYTE[rbx*1+rsi] + movzx edx,BYTE[rbx*1+rsp] + lea rbx,[1+rbx] + xor 
eax,edx + mov BYTE[((-1))+rbx*1+rdi],al + dec rbp + jnz NEAR $L$oop_tail + +$L$done: + lea rsi,[((64+24+48))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$no_data: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_ctr32: + +ALIGN 32 +ChaCha20_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_ssse3: + mov r9,rsp + + test r10d,2048 + jnz NEAR $L$ChaCha20_4xop + cmp rdx,128 + je NEAR $L$ChaCha20_128 + ja NEAR $L$ChaCha20_4x + +$L$do_sse3_after_all: + sub rsp,64+40 + movaps XMMWORD[(-40)+r9],xmm6 + movaps XMMWORD[(-24)+r9],xmm7 +$L$ssse3_body: + movdqa xmm0,XMMWORD[$L$sigma] + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm6,XMMWORD[$L$rot16] + movdqa xmm7,XMMWORD[$L$rot24] + + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov r8,10 + jmp NEAR $L$oop_ssse3 + +ALIGN 32 +$L$oop_outer_ssse3: + movdqa xmm3,XMMWORD[$L$one] + movdqa xmm0,XMMWORD[rsp] + movdqa xmm1,XMMWORD[16+rsp] + movdqa xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + mov r8,10 + movdqa XMMWORD[48+rsp],xmm3 + jmp NEAR $L$oop_ssse3 + +ALIGN 32 +$L$oop_ssse3: + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,222 + paddd xmm2,xmm3 + 
pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec r8 + jnz NEAR $L$oop_ssse3 + paddd xmm0,XMMWORD[rsp] + paddd xmm1,XMMWORD[16+rsp] + paddd xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + + cmp rdx,64 + jb NEAR $L$tail_ssse3 + + movdqu xmm4,XMMWORD[rsi] + movdqu xmm5,XMMWORD[16+rsi] + pxor xmm0,xmm4 + movdqu xmm4,XMMWORD[32+rsi] + pxor xmm1,xmm5 + movdqu xmm5,XMMWORD[48+rsi] + lea rsi,[64+rsi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + lea rdi,[64+rdi] + + sub rdx,64 + jnz NEAR $L$oop_outer_ssse3 + + jmp NEAR $L$done_ssse3 + +ALIGN 16 +$L$tail_ssse3: + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + xor r8,r8 + +$L$oop_tail_ssse3: + movzx eax,BYTE[r8*1+rsi] + movzx ecx,BYTE[r8*1+rsp] + lea r8,[1+r8] + xor eax,ecx + mov BYTE[((-1))+r8*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail_ssse3 + +$L$done_ssse3: + movaps xmm6,XMMWORD[((-40))+r9] + movaps xmm7,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$ssse3_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_ssse3: + +ALIGN 32 +ChaCha20_128: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_128: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_128: + mov r9,rsp + + sub rsp,64+104 + movaps XMMWORD[(-104)+r9],xmm6 + movaps XMMWORD[(-88)+r9],xmm7 + movaps XMMWORD[(-72)+r9],xmm8 + movaps XMMWORD[(-56)+r9],xmm9 + movaps XMMWORD[(-40)+r9],xmm10 + movaps XMMWORD[(-24)+r9],xmm11 +$L$128_body: + movdqa xmm8,XMMWORD[$L$sigma] + movdqu 
xmm9,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm1,XMMWORD[$L$one] + movdqa xmm6,XMMWORD[$L$rot16] + movdqa xmm7,XMMWORD[$L$rot24] + + movdqa xmm10,xmm8 + movdqa XMMWORD[rsp],xmm8 + movdqa xmm11,xmm9 + movdqa XMMWORD[16+rsp],xmm9 + movdqa xmm0,xmm2 + movdqa XMMWORD[32+rsp],xmm2 + paddd xmm1,xmm3 + movdqa XMMWORD[48+rsp],xmm3 + mov r8,10 + jmp NEAR $L$oop_128 + +ALIGN 32 +$L$oop_128: + paddd xmm8,xmm9 + pxor xmm3,xmm8 + paddd xmm10,xmm11 + pxor xmm1,xmm10 +DB 102,15,56,0,222 +DB 102,15,56,0,206 + paddd xmm2,xmm3 + paddd xmm0,xmm1 + pxor xmm9,xmm2 + pxor xmm11,xmm0 + movdqa xmm4,xmm9 + psrld xmm9,20 + movdqa xmm5,xmm11 + pslld xmm4,12 + psrld xmm11,20 + por xmm9,xmm4 + pslld xmm5,12 + por xmm11,xmm5 + paddd xmm8,xmm9 + pxor xmm3,xmm8 + paddd xmm10,xmm11 + pxor xmm1,xmm10 +DB 102,15,56,0,223 +DB 102,15,56,0,207 + paddd xmm2,xmm3 + paddd xmm0,xmm1 + pxor xmm9,xmm2 + pxor xmm11,xmm0 + movdqa xmm4,xmm9 + psrld xmm9,25 + movdqa xmm5,xmm11 + pslld xmm4,7 + psrld xmm11,25 + por xmm9,xmm4 + pslld xmm5,7 + por xmm11,xmm5 + pshufd xmm2,xmm2,78 + pshufd xmm9,xmm9,57 + pshufd xmm3,xmm3,147 + pshufd xmm0,xmm0,78 + pshufd xmm11,xmm11,57 + pshufd xmm1,xmm1,147 + paddd xmm8,xmm9 + pxor xmm3,xmm8 + paddd xmm10,xmm11 + pxor xmm1,xmm10 +DB 102,15,56,0,222 +DB 102,15,56,0,206 + paddd xmm2,xmm3 + paddd xmm0,xmm1 + pxor xmm9,xmm2 + pxor xmm11,xmm0 + movdqa xmm4,xmm9 + psrld xmm9,20 + movdqa xmm5,xmm11 + pslld xmm4,12 + psrld xmm11,20 + por xmm9,xmm4 + pslld xmm5,12 + por xmm11,xmm5 + paddd xmm8,xmm9 + pxor xmm3,xmm8 + paddd xmm10,xmm11 + pxor xmm1,xmm10 +DB 102,15,56,0,223 +DB 102,15,56,0,207 + paddd xmm2,xmm3 + paddd xmm0,xmm1 + pxor xmm9,xmm2 + pxor xmm11,xmm0 + movdqa xmm4,xmm9 + psrld xmm9,25 + movdqa xmm5,xmm11 + pslld xmm4,7 + psrld xmm11,25 + por xmm9,xmm4 + pslld xmm5,7 + por xmm11,xmm5 + pshufd xmm2,xmm2,78 + pshufd xmm9,xmm9,147 + pshufd xmm3,xmm3,57 + pshufd xmm0,xmm0,78 + pshufd xmm11,xmm11,147 + pshufd xmm1,xmm1,57 + dec r8 + jnz NEAR 
$L$oop_128 + paddd xmm8,XMMWORD[rsp] + paddd xmm9,XMMWORD[16+rsp] + paddd xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + paddd xmm1,XMMWORD[$L$one] + paddd xmm10,XMMWORD[rsp] + paddd xmm11,XMMWORD[16+rsp] + paddd xmm0,XMMWORD[32+rsp] + paddd xmm1,XMMWORD[48+rsp] + + movdqu xmm4,XMMWORD[rsi] + movdqu xmm5,XMMWORD[16+rsi] + pxor xmm8,xmm4 + movdqu xmm4,XMMWORD[32+rsi] + pxor xmm9,xmm5 + movdqu xmm5,XMMWORD[48+rsi] + pxor xmm2,xmm4 + movdqu xmm4,XMMWORD[64+rsi] + pxor xmm3,xmm5 + movdqu xmm5,XMMWORD[80+rsi] + pxor xmm10,xmm4 + movdqu xmm4,XMMWORD[96+rsi] + pxor xmm11,xmm5 + movdqu xmm5,XMMWORD[112+rsi] + pxor xmm0,xmm4 + pxor xmm1,xmm5 + + movdqu XMMWORD[rdi],xmm8 + movdqu XMMWORD[16+rdi],xmm9 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + movdqu XMMWORD[64+rdi],xmm10 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm0 + movdqu XMMWORD[112+rdi],xmm1 + movaps xmm6,XMMWORD[((-104))+r9] + movaps xmm7,XMMWORD[((-88))+r9] + movaps xmm8,XMMWORD[((-72))+r9] + movaps xmm9,XMMWORD[((-56))+r9] + movaps xmm10,XMMWORD[((-40))+r9] + movaps xmm11,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$128_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_128: + +ALIGN 32 +ChaCha20_4x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_4x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_4x: + mov r9,rsp + + mov r11,r10 + shr r10,32 + test r10,32 + jnz NEAR $L$ChaCha20_8x + cmp rdx,192 + ja NEAR $L$proceed4x + + and r11,71303168 + cmp r11,4194304 + je NEAR $L$do_sse3_after_all + +$L$proceed4x: + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + 
movaps XMMWORD[(-24)+r9],xmm15 +$L$4x_body: + movdqa xmm11,XMMWORD[$L$sigma] + movdqu xmm15,XMMWORD[rcx] + movdqu xmm7,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + lea rcx,[256+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + pshufd xmm8,xmm11,0x00 + pshufd xmm9,xmm11,0x55 + movdqa XMMWORD[64+rsp],xmm8 + pshufd xmm10,xmm11,0xaa + movdqa XMMWORD[80+rsp],xmm9 + pshufd xmm11,xmm11,0xff + movdqa XMMWORD[96+rsp],xmm10 + movdqa XMMWORD[112+rsp],xmm11 + + pshufd xmm12,xmm15,0x00 + pshufd xmm13,xmm15,0x55 + movdqa XMMWORD[(128-256)+rcx],xmm12 + pshufd xmm14,xmm15,0xaa + movdqa XMMWORD[(144-256)+rcx],xmm13 + pshufd xmm15,xmm15,0xff + movdqa XMMWORD[(160-256)+rcx],xmm14 + movdqa XMMWORD[(176-256)+rcx],xmm15 + + pshufd xmm4,xmm7,0x00 + pshufd xmm5,xmm7,0x55 + movdqa XMMWORD[(192-256)+rcx],xmm4 + pshufd xmm6,xmm7,0xaa + movdqa XMMWORD[(208-256)+rcx],xmm5 + pshufd xmm7,xmm7,0xff + movdqa XMMWORD[(224-256)+rcx],xmm6 + movdqa XMMWORD[(240-256)+rcx],xmm7 + + pshufd xmm0,xmm3,0x00 + pshufd xmm1,xmm3,0x55 + paddd xmm0,XMMWORD[$L$inc] + pshufd xmm2,xmm3,0xaa + movdqa XMMWORD[(272-256)+rcx],xmm1 + pshufd xmm3,xmm3,0xff + movdqa XMMWORD[(288-256)+rcx],xmm2 + movdqa XMMWORD[(304-256)+rcx],xmm3 + + jmp NEAR $L$oop_enter4x + +ALIGN 32 +$L$oop_outer4x: + movdqa xmm8,XMMWORD[64+rsp] + movdqa xmm9,XMMWORD[80+rsp] + movdqa xmm10,XMMWORD[96+rsp] + movdqa xmm11,XMMWORD[112+rsp] + movdqa xmm12,XMMWORD[((128-256))+rcx] + movdqa xmm13,XMMWORD[((144-256))+rcx] + movdqa xmm14,XMMWORD[((160-256))+rcx] + movdqa xmm15,XMMWORD[((176-256))+rcx] + movdqa xmm4,XMMWORD[((192-256))+rcx] + movdqa xmm5,XMMWORD[((208-256))+rcx] + movdqa xmm6,XMMWORD[((224-256))+rcx] + movdqa xmm7,XMMWORD[((240-256))+rcx] + movdqa xmm0,XMMWORD[((256-256))+rcx] + movdqa xmm1,XMMWORD[((272-256))+rcx] + movdqa xmm2,XMMWORD[((288-256))+rcx] + movdqa xmm3,XMMWORD[((304-256))+rcx] + paddd xmm0,XMMWORD[$L$four] + +$L$oop_enter4x: + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm7 + movdqa xmm7,XMMWORD[r10] + mov eax,10 + 
movdqa XMMWORD[(256-256)+rcx],xmm0 + jmp NEAR $L$oop4x + +ALIGN 32 +$L$oop4x: + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,199 +DB 102,15,56,0,207 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm6,xmm12 + pslld xmm12,12 + psrld xmm6,20 + movdqa xmm7,xmm13 + pslld xmm13,12 + por xmm12,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm13,xmm7 + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,198 +DB 102,15,56,0,206 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm7,xmm12 + pslld xmm12,7 + psrld xmm7,25 + movdqa xmm6,xmm13 + pslld xmm13,7 + por xmm12,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm13,xmm6 + movdqa XMMWORD[rsp],xmm4 + movdqa XMMWORD[16+rsp],xmm5 + movdqa xmm4,XMMWORD[32+rsp] + movdqa xmm5,XMMWORD[48+rsp] + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,215 +DB 102,15,56,0,223 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm6,xmm14 + pslld xmm14,12 + psrld xmm6,20 + movdqa xmm7,xmm15 + pslld xmm15,12 + por xmm14,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm15,xmm7 + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,214 +DB 102,15,56,0,222 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm7,xmm14 + pslld xmm14,7 + psrld xmm7,25 + movdqa xmm6,xmm15 + pslld xmm15,7 + por xmm14,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm15,xmm6 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,223 +DB 102,15,56,0,199 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm6,xmm13 + pslld xmm13,12 + psrld xmm6,20 + movdqa xmm7,xmm14 + pslld xmm14,12 + por xmm13,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm14,xmm7 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor 
xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,222 +DB 102,15,56,0,198 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm7,xmm13 + pslld xmm13,7 + psrld xmm7,25 + movdqa xmm6,xmm14 + pslld xmm14,7 + por xmm13,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm14,xmm6 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm5 + movdqa xmm4,XMMWORD[rsp] + movdqa xmm5,XMMWORD[16+rsp] + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,207 +DB 102,15,56,0,215 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm6,xmm15 + pslld xmm15,12 + psrld xmm6,20 + movdqa xmm7,xmm12 + pslld xmm12,12 + por xmm15,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm12,xmm7 + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,206 +DB 102,15,56,0,214 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm7,xmm15 + pslld xmm15,7 + psrld xmm7,25 + movdqa xmm6,xmm12 + pslld xmm12,7 + por xmm15,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm12,xmm6 + dec eax + jnz NEAR $L$oop4x + + paddd xmm8,XMMWORD[64+rsp] + paddd xmm9,XMMWORD[80+rsp] + paddd xmm10,XMMWORD[96+rsp] + paddd xmm11,XMMWORD[112+rsp] + + movdqa xmm6,xmm8 + punpckldq xmm8,xmm9 + movdqa xmm7,xmm10 + punpckldq xmm10,xmm11 + punpckhdq xmm6,xmm9 + punpckhdq xmm7,xmm11 + movdqa xmm9,xmm8 + punpcklqdq xmm8,xmm10 + movdqa xmm11,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm9,xmm10 + punpckhqdq xmm11,xmm7 + paddd xmm12,XMMWORD[((128-256))+rcx] + paddd xmm13,XMMWORD[((144-256))+rcx] + paddd xmm14,XMMWORD[((160-256))+rcx] + paddd xmm15,XMMWORD[((176-256))+rcx] + + movdqa XMMWORD[rsp],xmm8 + movdqa XMMWORD[16+rsp],xmm9 + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + + movdqa xmm10,xmm12 + punpckldq xmm12,xmm13 + movdqa xmm7,xmm14 + punpckldq xmm14,xmm15 + punpckhdq xmm10,xmm13 + punpckhdq xmm7,xmm15 + movdqa xmm13,xmm12 + punpcklqdq 
xmm12,xmm14 + movdqa xmm15,xmm10 + punpcklqdq xmm10,xmm7 + punpckhqdq xmm13,xmm14 + punpckhqdq xmm15,xmm7 + paddd xmm4,XMMWORD[((192-256))+rcx] + paddd xmm5,XMMWORD[((208-256))+rcx] + paddd xmm8,XMMWORD[((224-256))+rcx] + paddd xmm9,XMMWORD[((240-256))+rcx] + + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm11 + + movdqa xmm14,xmm4 + punpckldq xmm4,xmm5 + movdqa xmm7,xmm8 + punpckldq xmm8,xmm9 + punpckhdq xmm14,xmm5 + punpckhdq xmm7,xmm9 + movdqa xmm5,xmm4 + punpcklqdq xmm4,xmm8 + movdqa xmm9,xmm14 + punpcklqdq xmm14,xmm7 + punpckhqdq xmm5,xmm8 + punpckhqdq xmm9,xmm7 + paddd xmm0,XMMWORD[((256-256))+rcx] + paddd xmm1,XMMWORD[((272-256))+rcx] + paddd xmm2,XMMWORD[((288-256))+rcx] + paddd xmm3,XMMWORD[((304-256))+rcx] + + movdqa xmm8,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm8,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm8 + punpcklqdq xmm8,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + cmp rdx,64*4 + jb NEAR $L$tail4x + + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu 
XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[48+rsp] + pxor xmm11,xmm15 + pxor xmm2,xmm9 + pxor xmm7,xmm3 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + + sub rdx,64*4 + jnz NEAR $L$oop_outer4x + + jmp NEAR $L$done4x + +$L$tail4x: + cmp rdx,192 + jae NEAR $L$192_or_more4x + cmp rdx,128 + jae NEAR $L$128_or_more4x + cmp rdx,64 + jae NEAR $L$64_or_more4x + + + xor r10,r10 + + movdqa XMMWORD[16+rsp],xmm12 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm0 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$64_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[16+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm13 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm5 + sub rdx,64 + movdqa XMMWORD[48+rsp],xmm1 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$128_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu 
XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[32+rsp] + lea rsi,[128+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm10 + lea rdi,[128+rdi] + movdqa XMMWORD[32+rsp],xmm14 + sub rdx,128 + movdqa XMMWORD[48+rsp],xmm8 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$192_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[48+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm15 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm9 + sub rdx,192 + movdqa XMMWORD[48+rsp],xmm3 + +$L$oop_tail4x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail4x + +$L$done4x: + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps 
xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_4x: + +ALIGN 32 +ChaCha20_4xop: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_4xop: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_4xop: + mov r9,rsp + + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4xop_body: + vzeroupper + + vmovdqa xmm11,XMMWORD[$L$sigma] + vmovdqu xmm3,XMMWORD[rcx] + vmovdqu xmm15,XMMWORD[16+rcx] + vmovdqu xmm7,XMMWORD[r8] + lea rcx,[256+rsp] + + vpshufd xmm8,xmm11,0x00 + vpshufd xmm9,xmm11,0x55 + vmovdqa XMMWORD[64+rsp],xmm8 + vpshufd xmm10,xmm11,0xaa + vmovdqa XMMWORD[80+rsp],xmm9 + vpshufd xmm11,xmm11,0xff + vmovdqa XMMWORD[96+rsp],xmm10 + vmovdqa XMMWORD[112+rsp],xmm11 + + vpshufd xmm0,xmm3,0x00 + vpshufd xmm1,xmm3,0x55 + vmovdqa XMMWORD[(128-256)+rcx],xmm0 + vpshufd xmm2,xmm3,0xaa + vmovdqa XMMWORD[(144-256)+rcx],xmm1 + vpshufd xmm3,xmm3,0xff + vmovdqa XMMWORD[(160-256)+rcx],xmm2 + vmovdqa XMMWORD[(176-256)+rcx],xmm3 + + vpshufd xmm12,xmm15,0x00 + vpshufd xmm13,xmm15,0x55 + vmovdqa XMMWORD[(192-256)+rcx],xmm12 + vpshufd xmm14,xmm15,0xaa + vmovdqa XMMWORD[(208-256)+rcx],xmm13 + vpshufd xmm15,xmm15,0xff + vmovdqa XMMWORD[(224-256)+rcx],xmm14 + vmovdqa XMMWORD[(240-256)+rcx],xmm15 + + vpshufd xmm4,xmm7,0x00 + vpshufd xmm5,xmm7,0x55 + vpaddd xmm4,xmm4,XMMWORD[$L$inc] + vpshufd xmm6,xmm7,0xaa + vmovdqa XMMWORD[(272-256)+rcx],xmm5 + vpshufd xmm7,xmm7,0xff + vmovdqa 
XMMWORD[(288-256)+rcx],xmm6 + vmovdqa XMMWORD[(304-256)+rcx],xmm7 + + jmp NEAR $L$oop_enter4xop + +ALIGN 32 +$L$oop_outer4xop: + vmovdqa xmm8,XMMWORD[64+rsp] + vmovdqa xmm9,XMMWORD[80+rsp] + vmovdqa xmm10,XMMWORD[96+rsp] + vmovdqa xmm11,XMMWORD[112+rsp] + vmovdqa xmm0,XMMWORD[((128-256))+rcx] + vmovdqa xmm1,XMMWORD[((144-256))+rcx] + vmovdqa xmm2,XMMWORD[((160-256))+rcx] + vmovdqa xmm3,XMMWORD[((176-256))+rcx] + vmovdqa xmm12,XMMWORD[((192-256))+rcx] + vmovdqa xmm13,XMMWORD[((208-256))+rcx] + vmovdqa xmm14,XMMWORD[((224-256))+rcx] + vmovdqa xmm15,XMMWORD[((240-256))+rcx] + vmovdqa xmm4,XMMWORD[((256-256))+rcx] + vmovdqa xmm5,XMMWORD[((272-256))+rcx] + vmovdqa xmm6,XMMWORD[((288-256))+rcx] + vmovdqa xmm7,XMMWORD[((304-256))+rcx] + vpaddd xmm4,xmm4,XMMWORD[$L$four] + +$L$oop_enter4xop: + mov eax,10 + vmovdqa XMMWORD[(256-256)+rcx],xmm4 + jmp NEAR $L$oop4xop + +ALIGN 32 +$L$oop4xop: + vpaddd xmm8,xmm8,xmm0 + vpaddd xmm9,xmm9,xmm1 + vpaddd xmm10,xmm10,xmm2 + vpaddd xmm11,xmm11,xmm3 + vpxor xmm4,xmm8,xmm4 + vpxor xmm5,xmm9,xmm5 + vpxor xmm6,xmm10,xmm6 + vpxor xmm7,xmm11,xmm7 +DB 143,232,120,194,228,16 +DB 143,232,120,194,237,16 +DB 143,232,120,194,246,16 +DB 143,232,120,194,255,16 + vpaddd xmm12,xmm12,xmm4 + vpaddd xmm13,xmm13,xmm5 + vpaddd xmm14,xmm14,xmm6 + vpaddd xmm15,xmm15,xmm7 + vpxor xmm0,xmm12,xmm0 + vpxor xmm1,xmm13,xmm1 + vpxor xmm2,xmm2,xmm14 + vpxor xmm3,xmm3,xmm15 +DB 143,232,120,194,192,12 +DB 143,232,120,194,201,12 +DB 143,232,120,194,210,12 +DB 143,232,120,194,219,12 + vpaddd xmm8,xmm0,xmm8 + vpaddd xmm9,xmm1,xmm9 + vpaddd xmm10,xmm10,xmm2 + vpaddd xmm11,xmm11,xmm3 + vpxor xmm4,xmm8,xmm4 + vpxor xmm5,xmm9,xmm5 + vpxor xmm6,xmm10,xmm6 + vpxor xmm7,xmm11,xmm7 +DB 143,232,120,194,228,8 +DB 143,232,120,194,237,8 +DB 143,232,120,194,246,8 +DB 143,232,120,194,255,8 + vpaddd xmm12,xmm12,xmm4 + vpaddd xmm13,xmm13,xmm5 + vpaddd xmm14,xmm14,xmm6 + vpaddd xmm15,xmm15,xmm7 + vpxor xmm0,xmm12,xmm0 + vpxor xmm1,xmm13,xmm1 + vpxor xmm2,xmm2,xmm14 + vpxor 
xmm3,xmm3,xmm15 +DB 143,232,120,194,192,7 +DB 143,232,120,194,201,7 +DB 143,232,120,194,210,7 +DB 143,232,120,194,219,7 + vpaddd xmm8,xmm8,xmm1 + vpaddd xmm9,xmm9,xmm2 + vpaddd xmm10,xmm10,xmm3 + vpaddd xmm11,xmm11,xmm0 + vpxor xmm7,xmm8,xmm7 + vpxor xmm4,xmm9,xmm4 + vpxor xmm5,xmm10,xmm5 + vpxor xmm6,xmm11,xmm6 +DB 143,232,120,194,255,16 +DB 143,232,120,194,228,16 +DB 143,232,120,194,237,16 +DB 143,232,120,194,246,16 + vpaddd xmm14,xmm14,xmm7 + vpaddd xmm15,xmm15,xmm4 + vpaddd xmm12,xmm12,xmm5 + vpaddd xmm13,xmm13,xmm6 + vpxor xmm1,xmm14,xmm1 + vpxor xmm2,xmm15,xmm2 + vpxor xmm3,xmm3,xmm12 + vpxor xmm0,xmm0,xmm13 +DB 143,232,120,194,201,12 +DB 143,232,120,194,210,12 +DB 143,232,120,194,219,12 +DB 143,232,120,194,192,12 + vpaddd xmm8,xmm1,xmm8 + vpaddd xmm9,xmm2,xmm9 + vpaddd xmm10,xmm10,xmm3 + vpaddd xmm11,xmm11,xmm0 + vpxor xmm7,xmm8,xmm7 + vpxor xmm4,xmm9,xmm4 + vpxor xmm5,xmm10,xmm5 + vpxor xmm6,xmm11,xmm6 +DB 143,232,120,194,255,8 +DB 143,232,120,194,228,8 +DB 143,232,120,194,237,8 +DB 143,232,120,194,246,8 + vpaddd xmm14,xmm14,xmm7 + vpaddd xmm15,xmm15,xmm4 + vpaddd xmm12,xmm12,xmm5 + vpaddd xmm13,xmm13,xmm6 + vpxor xmm1,xmm14,xmm1 + vpxor xmm2,xmm15,xmm2 + vpxor xmm3,xmm3,xmm12 + vpxor xmm0,xmm0,xmm13 +DB 143,232,120,194,201,7 +DB 143,232,120,194,210,7 +DB 143,232,120,194,219,7 +DB 143,232,120,194,192,7 + dec eax + jnz NEAR $L$oop4xop + + vpaddd xmm8,xmm8,XMMWORD[64+rsp] + vpaddd xmm9,xmm9,XMMWORD[80+rsp] + vpaddd xmm10,xmm10,XMMWORD[96+rsp] + vpaddd xmm11,xmm11,XMMWORD[112+rsp] + + vmovdqa XMMWORD[32+rsp],xmm14 + vmovdqa XMMWORD[48+rsp],xmm15 + + vpunpckldq xmm14,xmm8,xmm9 + vpunpckldq xmm15,xmm10,xmm11 + vpunpckhdq xmm8,xmm8,xmm9 + vpunpckhdq xmm10,xmm10,xmm11 + vpunpcklqdq xmm9,xmm14,xmm15 + vpunpckhqdq xmm14,xmm14,xmm15 + vpunpcklqdq xmm11,xmm8,xmm10 + vpunpckhqdq xmm8,xmm8,xmm10 + vpaddd xmm0,xmm0,XMMWORD[((128-256))+rcx] + vpaddd xmm1,xmm1,XMMWORD[((144-256))+rcx] + vpaddd xmm2,xmm2,XMMWORD[((160-256))+rcx] + vpaddd xmm3,xmm3,XMMWORD[((176-256))+rcx] + 
+ vmovdqa XMMWORD[rsp],xmm9 + vmovdqa XMMWORD[16+rsp],xmm14 + vmovdqa xmm9,XMMWORD[32+rsp] + vmovdqa xmm14,XMMWORD[48+rsp] + + vpunpckldq xmm10,xmm0,xmm1 + vpunpckldq xmm15,xmm2,xmm3 + vpunpckhdq xmm0,xmm0,xmm1 + vpunpckhdq xmm2,xmm2,xmm3 + vpunpcklqdq xmm1,xmm10,xmm15 + vpunpckhqdq xmm10,xmm10,xmm15 + vpunpcklqdq xmm3,xmm0,xmm2 + vpunpckhqdq xmm0,xmm0,xmm2 + vpaddd xmm12,xmm12,XMMWORD[((192-256))+rcx] + vpaddd xmm13,xmm13,XMMWORD[((208-256))+rcx] + vpaddd xmm9,xmm9,XMMWORD[((224-256))+rcx] + vpaddd xmm14,xmm14,XMMWORD[((240-256))+rcx] + + vpunpckldq xmm2,xmm12,xmm13 + vpunpckldq xmm15,xmm9,xmm14 + vpunpckhdq xmm12,xmm12,xmm13 + vpunpckhdq xmm9,xmm9,xmm14 + vpunpcklqdq xmm13,xmm2,xmm15 + vpunpckhqdq xmm2,xmm2,xmm15 + vpunpcklqdq xmm14,xmm12,xmm9 + vpunpckhqdq xmm12,xmm12,xmm9 + vpaddd xmm4,xmm4,XMMWORD[((256-256))+rcx] + vpaddd xmm5,xmm5,XMMWORD[((272-256))+rcx] + vpaddd xmm6,xmm6,XMMWORD[((288-256))+rcx] + vpaddd xmm7,xmm7,XMMWORD[((304-256))+rcx] + + vpunpckldq xmm9,xmm4,xmm5 + vpunpckldq xmm15,xmm6,xmm7 + vpunpckhdq xmm4,xmm4,xmm5 + vpunpckhdq xmm6,xmm6,xmm7 + vpunpcklqdq xmm5,xmm9,xmm15 + vpunpckhqdq xmm9,xmm9,xmm15 + vpunpcklqdq xmm7,xmm4,xmm6 + vpunpckhqdq xmm4,xmm4,xmm6 + vmovdqa xmm6,XMMWORD[rsp] + vmovdqa xmm15,XMMWORD[16+rsp] + + cmp rdx,64*4 + jb NEAR $L$tail4xop + + vpxor xmm6,xmm6,XMMWORD[rsi] + vpxor xmm1,xmm1,XMMWORD[16+rsi] + vpxor xmm13,xmm13,XMMWORD[32+rsi] + vpxor xmm5,xmm5,XMMWORD[48+rsi] + vpxor xmm15,xmm15,XMMWORD[64+rsi] + vpxor xmm10,xmm10,XMMWORD[80+rsi] + vpxor xmm2,xmm2,XMMWORD[96+rsi] + vpxor xmm9,xmm9,XMMWORD[112+rsi] + lea rsi,[128+rsi] + vpxor xmm11,xmm11,XMMWORD[rsi] + vpxor xmm3,xmm3,XMMWORD[16+rsi] + vpxor xmm14,xmm14,XMMWORD[32+rsi] + vpxor xmm7,xmm7,XMMWORD[48+rsi] + vpxor xmm8,xmm8,XMMWORD[64+rsi] + vpxor xmm0,xmm0,XMMWORD[80+rsi] + vpxor xmm12,xmm12,XMMWORD[96+rsi] + vpxor xmm4,xmm4,XMMWORD[112+rsi] + lea rsi,[128+rsi] + + vmovdqu XMMWORD[rdi],xmm6 + vmovdqu XMMWORD[16+rdi],xmm1 + vmovdqu XMMWORD[32+rdi],xmm13 + vmovdqu 
XMMWORD[48+rdi],xmm5 + vmovdqu XMMWORD[64+rdi],xmm15 + vmovdqu XMMWORD[80+rdi],xmm10 + vmovdqu XMMWORD[96+rdi],xmm2 + vmovdqu XMMWORD[112+rdi],xmm9 + lea rdi,[128+rdi] + vmovdqu XMMWORD[rdi],xmm11 + vmovdqu XMMWORD[16+rdi],xmm3 + vmovdqu XMMWORD[32+rdi],xmm14 + vmovdqu XMMWORD[48+rdi],xmm7 + vmovdqu XMMWORD[64+rdi],xmm8 + vmovdqu XMMWORD[80+rdi],xmm0 + vmovdqu XMMWORD[96+rdi],xmm12 + vmovdqu XMMWORD[112+rdi],xmm4 + lea rdi,[128+rdi] + + sub rdx,64*4 + jnz NEAR $L$oop_outer4xop + + jmp NEAR $L$done4xop + +ALIGN 32 +$L$tail4xop: + cmp rdx,192 + jae NEAR $L$192_or_more4xop + cmp rdx,128 + jae NEAR $L$128_or_more4xop + cmp rdx,64 + jae NEAR $L$64_or_more4xop + + xor r10,r10 + vmovdqa XMMWORD[rsp],xmm6 + vmovdqa XMMWORD[16+rsp],xmm1 + vmovdqa XMMWORD[32+rsp],xmm13 + vmovdqa XMMWORD[48+rsp],xmm5 + jmp NEAR $L$oop_tail4xop + +ALIGN 32 +$L$64_or_more4xop: + vpxor xmm6,xmm6,XMMWORD[rsi] + vpxor xmm1,xmm1,XMMWORD[16+rsi] + vpxor xmm13,xmm13,XMMWORD[32+rsi] + vpxor xmm5,xmm5,XMMWORD[48+rsi] + vmovdqu XMMWORD[rdi],xmm6 + vmovdqu XMMWORD[16+rdi],xmm1 + vmovdqu XMMWORD[32+rdi],xmm13 + vmovdqu XMMWORD[48+rdi],xmm5 + je NEAR $L$done4xop + + lea rsi,[64+rsi] + vmovdqa XMMWORD[rsp],xmm15 + xor r10,r10 + vmovdqa XMMWORD[16+rsp],xmm10 + lea rdi,[64+rdi] + vmovdqa XMMWORD[32+rsp],xmm2 + sub rdx,64 + vmovdqa XMMWORD[48+rsp],xmm9 + jmp NEAR $L$oop_tail4xop + +ALIGN 32 +$L$128_or_more4xop: + vpxor xmm6,xmm6,XMMWORD[rsi] + vpxor xmm1,xmm1,XMMWORD[16+rsi] + vpxor xmm13,xmm13,XMMWORD[32+rsi] + vpxor xmm5,xmm5,XMMWORD[48+rsi] + vpxor xmm15,xmm15,XMMWORD[64+rsi] + vpxor xmm10,xmm10,XMMWORD[80+rsi] + vpxor xmm2,xmm2,XMMWORD[96+rsi] + vpxor xmm9,xmm9,XMMWORD[112+rsi] + + vmovdqu XMMWORD[rdi],xmm6 + vmovdqu XMMWORD[16+rdi],xmm1 + vmovdqu XMMWORD[32+rdi],xmm13 + vmovdqu XMMWORD[48+rdi],xmm5 + vmovdqu XMMWORD[64+rdi],xmm15 + vmovdqu XMMWORD[80+rdi],xmm10 + vmovdqu XMMWORD[96+rdi],xmm2 + vmovdqu XMMWORD[112+rdi],xmm9 + je NEAR $L$done4xop + + lea rsi,[128+rsi] + vmovdqa XMMWORD[rsp],xmm11 + xor 
r10,r10
+ vmovdqa XMMWORD[16+rsp],xmm3
+ lea rdi,[128+rdi]
+ vmovdqa XMMWORD[32+rsp],xmm14
+ sub rdx,128
+ vmovdqa XMMWORD[48+rsp],xmm7
+ jmp NEAR $L$oop_tail4xop
+
+ALIGN 32
+$L$192_or_more4xop:  ; tail of ChaCha20_4xop (which begins earlier in the file): 192 <= rdx < 256
+ vpxor xmm6,xmm6,XMMWORD[rsi]  ; XOR three whole 64-byte blocks of input with key stream
+ vpxor xmm1,xmm1,XMMWORD[16+rsi]
+ vpxor xmm13,xmm13,XMMWORD[32+rsi]
+ vpxor xmm5,xmm5,XMMWORD[48+rsi]
+ vpxor xmm15,xmm15,XMMWORD[64+rsi]
+ vpxor xmm10,xmm10,XMMWORD[80+rsi]
+ vpxor xmm2,xmm2,XMMWORD[96+rsi]
+ vpxor xmm9,xmm9,XMMWORD[112+rsi]
+ lea rsi,[128+rsi]
+ vpxor xmm11,xmm11,XMMWORD[rsi]
+ vpxor xmm3,xmm3,XMMWORD[16+rsi]
+ vpxor xmm14,xmm14,XMMWORD[32+rsi]
+ vpxor xmm7,xmm7,XMMWORD[48+rsi]
+
+ vmovdqu XMMWORD[rdi],xmm6
+ vmovdqu XMMWORD[16+rdi],xmm1
+ vmovdqu XMMWORD[32+rdi],xmm13
+ vmovdqu XMMWORD[48+rdi],xmm5
+ vmovdqu XMMWORD[64+rdi],xmm15
+ vmovdqu XMMWORD[80+rdi],xmm10
+ vmovdqu XMMWORD[96+rdi],xmm2
+ vmovdqu XMMWORD[112+rdi],xmm9
+ lea rdi,[128+rdi]
+ vmovdqu XMMWORD[rdi],xmm11
+ vmovdqu XMMWORD[16+rdi],xmm3
+ vmovdqu XMMWORD[32+rdi],xmm14
+ vmovdqu XMMWORD[48+rdi],xmm7
+ je NEAR $L$done4xop  ; ZF is still the result of `cmp rdx,192`: exactly 192 bytes, nothing left
+
+ lea rsi,[64+rsi]  ; rsi/rdi were already advanced by 128 above; step over the third block
+ vmovdqa XMMWORD[rsp],xmm8  ; stage the fourth block's key stream at [rsp..rsp+63] for the byte loop
+ xor r10,r10  ; r10 = byte index for the tail loop
+ vmovdqa XMMWORD[16+rsp],xmm0
+ lea rdi,[64+rdi]
+ vmovdqa XMMWORD[32+rsp],xmm12
+ sub rdx,192  ; rdx = bytes still to process (1..63)
+ vmovdqa XMMWORD[48+rsp],xmm4
+
+$L$oop_tail4xop:  ; byte-wise: dst[i] = src[i] ^ staged_key_stream[i], rdx times
+ movzx eax,BYTE[r10*1+rsi]
+ movzx ecx,BYTE[r10*1+rsp]
+ lea r10,[1+r10]  ; lea keeps flags untouched
+ xor eax,ecx
+ mov BYTE[((-1))+r10*1+rdi],al
+ dec rdx
+ jnz NEAR $L$oop_tail4xop
+
+$L$done4xop:
+ vzeroupper
+ movaps xmm6,XMMWORD[((-168))+r9]  ; reload xmm6..xmm15 from the save area just below r9
+ movaps xmm7,XMMWORD[((-152))+r9]
+ movaps xmm8,XMMWORD[((-136))+r9]
+ movaps xmm9,XMMWORD[((-120))+r9]
+ movaps xmm10,XMMWORD[((-104))+r9]
+ movaps xmm11,XMMWORD[((-88))+r9]
+ movaps xmm12,XMMWORD[((-72))+r9]
+ movaps xmm13,XMMWORD[((-56))+r9]
+ movaps xmm14,XMMWORD[((-40))+r9]
+ movaps xmm15,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+
+$L$4xop_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_ChaCha20_4xop:
+ ; ChaCha20_8x: AVX2 path producing 8 x 64 = 512 bytes of key stream per outer iteration and XORing it over the input.
+ALIGN 32
+ChaCha20_8x:  ; Win64 entry: args arrive in rcx, rdx, r8, r9 and [40+rsp]
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi  ; rdi/rsi are callee-saved on Win64; parked in the caller-provided slots above the return address
+ mov rax,rsp
+$L$SEH_begin_ChaCha20_8x:  ; presumably referenced by unwind data elsewhere in the file
+ mov rdi,rcx  ; rdi = arg1: written with vmovdqu below (output pointer)
+ mov rsi,rdx  ; rsi = arg2: XORed in with vpxor below (input pointer)
+ mov rdx,r8  ; rdx = arg3: byte count
+ mov rcx,r9  ; rcx = arg4: pointer to 32 bytes loaded as two state rows
+ mov r8,QWORD[40+rsp]  ; r8  = arg5 (stack): pointer to 16 bytes loaded as the last state row
+ ; Internal entry: expects the register assignment established above.
+ ; NOTE(review): no jump to this label is visible in this part of the file;
+ ; presumably the dispatcher or another path branches here - confirm.
+$L$ChaCha20_8x:
+ mov r9,rsp  ; r9 = entry rsp, used to address the xmm save area and to restore rsp
+
+ sub rsp,0x280+168  ; 0x280 bytes of state/scratch + 168 bytes covering the xmm6..xmm15 save area
+ and rsp,-32  ; 32-byte alignment for the vmovdqa accesses below
+ movaps XMMWORD[(-168)+r9],xmm6  ; xmm6..xmm15 are callee-saved on Win64
+ movaps XMMWORD[(-152)+r9],xmm7
+ movaps XMMWORD[(-136)+r9],xmm8
+ movaps XMMWORD[(-120)+r9],xmm9
+ movaps XMMWORD[(-104)+r9],xmm10
+ movaps XMMWORD[(-88)+r9],xmm11
+ movaps XMMWORD[(-72)+r9],xmm12
+ movaps XMMWORD[(-56)+r9],xmm13
+ movaps XMMWORD[(-40)+r9],xmm14
+ movaps XMMWORD[(-24)+r9],xmm15
+$L$8x_body:
+ vzeroupper
+ ; Frame layout relative to the aligned rsp, as used by the code below:
+ ;   rsp+0   .. rsp+127  scratch: spill slots for ymm12..ymm15 pairs, transpose temporaries, tail staging
+ ;   rsp+128 .. rsp+255  dwords 0..3 of $L$sigma, one ymm per dword (addressed through rcx = rsp+256)
+ ;   rsp+256 .. rsp+383  dwords 0..3 of the 16 bytes at the incoming rcx
+ ;   rsp+384 .. rsp+511  dwords 0..3 of the 16 bytes at incoming rcx+16 (addressed through rax = rsp+512)
+ ;   rsp+512 .. rsp+639  dwords 0..3 of the 16 bytes at r8; the slot at rsp+512 holds the per-lane counters
+ ; Every saved ymm holds one 32-bit state word replicated into all 8 dword lanes
+ ; (vbroadcasti128 + vpshufd 0x00/0x55/0xaa/0xff); only the counter word differs per lane.
+
+
+ vbroadcasti128 ymm11,XMMWORD[$L$sigma]
+ vbroadcasti128 ymm3,XMMWORD[rcx]
+ vbroadcasti128 ymm15,XMMWORD[16+rcx]
+ vbroadcasti128 ymm7,XMMWORD[r8]
+ lea rcx,[256+rsp]  ; rcx is repurposed as a frame pointer from here on
+ lea rax,[512+rsp]  ; second frame pointer (keeps displacements small)
+ lea r10,[$L$rot16]  ; vpshufb mask: rotate each dword left by 16
+ lea r11,[$L$rot24]  ; vpshufb mask: rotate each dword left by 8
+
+ vpshufd ymm8,ymm11,0x00  ; ymm8..ymm11 = sigma dwords 0..3, splatted
+ vpshufd ymm9,ymm11,0x55
+ vmovdqa YMMWORD[(128-256)+rcx],ymm8
+ vpshufd ymm10,ymm11,0xaa
+ vmovdqa YMMWORD[(160-256)+rcx],ymm9
+ vpshufd ymm11,ymm11,0xff
+ vmovdqa YMMWORD[(192-256)+rcx],ymm10
+ vmovdqa YMMWORD[(224-256)+rcx],ymm11
+
+ vpshufd ymm0,ymm3,0x00  ; ymm0..ymm3 = second state row, splatted
+ vpshufd ymm1,ymm3,0x55
+ vmovdqa YMMWORD[(256-256)+rcx],ymm0
+ vpshufd ymm2,ymm3,0xaa
+ vmovdqa YMMWORD[(288-256)+rcx],ymm1
+ vpshufd ymm3,ymm3,0xff
+ vmovdqa YMMWORD[(320-256)+rcx],ymm2
+ vmovdqa YMMWORD[(352-256)+rcx],ymm3
+
+ vpshufd ymm12,ymm15,0x00  ; ymm12..ymm15 = third state row, splatted
+ vpshufd ymm13,ymm15,0x55
+ vmovdqa YMMWORD[(384-512)+rax],ymm12
+ vpshufd ymm14,ymm15,0xaa
+ vmovdqa YMMWORD[(416-512)+rax],ymm13
+ vpshufd ymm15,ymm15,0xff
+ vmovdqa YMMWORD[(448-512)+rax],ymm14
+ vmovdqa YMMWORD[(480-512)+rax],ymm15
+
+ vpshufd ymm4,ymm7,0x00  ; ymm4..ymm7 = fourth state row, splatted
+ vpshufd ymm5,ymm7,0x55
+ vpaddd ymm4,ymm4,YMMWORD[$L$incy]  ; per-lane counter offsets 0,2,4,6,1,3,5,7; ymm4 itself is stored at $L$oop_enter8x
+ vpshufd ymm6,ymm7,0xaa
+ vmovdqa YMMWORD[(544-512)+rax],ymm5
+ vpshufd ymm7,ymm7,0xff
+ vmovdqa YMMWORD[(576-512)+rax],ymm6
+ vmovdqa YMMWORD[(608-512)+rax],ymm7
+
+ jmp NEAR $L$oop_enter8x
+
+ALIGN 32
+$L$oop_outer8x:  ; next 512 bytes: reload the saved input state into the working registers
+ vmovdqa ymm8,YMMWORD[((128-256))+rcx]
+ vmovdqa ymm9,YMMWORD[((160-256))+rcx]
+ vmovdqa ymm10,YMMWORD[((192-256))+rcx]
+ vmovdqa ymm11,YMMWORD[((224-256))+rcx]
+ vmovdqa ymm0,YMMWORD[((256-256))+rcx]
+ vmovdqa ymm1,YMMWORD[((288-256))+rcx]
+ vmovdqa ymm2,YMMWORD[((320-256))+rcx]
+ vmovdqa ymm3,YMMWORD[((352-256))+rcx]
+ vmovdqa ymm12,YMMWORD[((384-512))+rax]
+ vmovdqa ymm13,YMMWORD[((416-512))+rax]
+ vmovdqa ymm14,YMMWORD[((448-512))+rax]
+ vmovdqa ymm15,YMMWORD[((480-512))+rax]
+ vmovdqa ymm4,YMMWORD[((512-512))+rax]
+ vmovdqa ymm5,YMMWORD[((544-512))+rax]
+ vmovdqa ymm6,YMMWORD[((576-512))+rax]
+ vmovdqa ymm7,YMMWORD[((608-512))+rax]
+ vpaddd ymm4,ymm4,YMMWORD[$L$eight]  ; advance every lane's counter by 8
+
+$L$oop_enter8x:
+ vmovdqa YMMWORD[64+rsp],ymm14  ; spill ymm14/ymm15: both registers double as mask and shift temporaries in the rounds
+ vmovdqa YMMWORD[96+rsp],ymm15
+ vbroadcasti128 ymm15,XMMWORD[r10]  ; ymm15 = rot16 mask
+ vmovdqa YMMWORD[(512-512)+rax],ymm4  ; save the counters used for this batch
+ mov eax,10  ; 10 loop iterations, two rounds each; this overwrites rax (re-derived after the loop)
+ jmp NEAR $L$oop8x
+
+ALIGN 32
+$L$oop8x:  ; first round: word sets (ymm8,ymm0,ymm12,ymm4) and (ymm9,ymm1,ymm13,ymm5), interleaved
+ vpaddd ymm8,ymm8,ymm0
+ vpxor ymm4,ymm8,ymm4
+ vpshufb ymm4,ymm4,ymm15  ; rotate left 16 via the rot16 byte shuffle
+ vpaddd ymm9,ymm9,ymm1
+ vpxor ymm5,ymm9,ymm5
+ vpshufb ymm5,ymm5,ymm15
+ vpaddd ymm12,ymm12,ymm4
+ vpxor ymm0,ymm12,ymm0
+ vpslld ymm14,ymm0,12  ; rotate left 12 = (x << 12) | (x >> 20)
+ vpsrld ymm0,ymm0,20
+ vpor ymm0,ymm14,ymm0
+ vbroadcasti128 ymm14,XMMWORD[r11]  ; ymm14 = rot24 mask (rotate left 8)
+ vpaddd ymm13,ymm13,ymm5
+ vpxor ymm1,ymm13,ymm1
+ vpslld ymm15,ymm1,12
+ vpsrld ymm1,ymm1,20
+ vpor ymm1,ymm15,ymm1
+ vpaddd ymm8,ymm8,ymm0
+ vpxor ymm4,ymm8,ymm4
+ vpshufb ymm4,ymm4,ymm14  ; rotate left 8
+ vpaddd ymm9,ymm9,ymm1
+ vpxor ymm5,ymm9,ymm5
+ vpshufb ymm5,ymm5,ymm14
+ vpaddd ymm12,ymm12,ymm4
+ vpxor ymm0,ymm12,ymm0
+ vpslld ymm15,ymm0,7  ; rotate left 7 = (x << 7) | (x >> 25)
+ vpsrld ymm0,ymm0,25
+ vpor ymm0,ymm15,ymm0
+ vbroadcasti128 ymm15,XMMWORD[r10]  ; reload rot16 mask (ymm15 was used as a temporary)
+ vpaddd ymm13,ymm13,ymm5
+ vpxor ymm1,ymm13,ymm1
+ vpslld ymm14,ymm1,7
+ vpsrld ymm1,ymm1,25
+ vpor ymm1,ymm14,ymm1
+ vmovdqa YMMWORD[rsp],ymm12  ; swap which half of the third row is resident in ymm12/ymm13
+ vmovdqa YMMWORD[32+rsp],ymm13
+ vmovdqa ymm12,YMMWORD[64+rsp]
+ vmovdqa ymm13,YMMWORD[96+rsp]
+ vpaddd ymm10,ymm10,ymm2  ; first round continued: word sets (ymm10,ymm2,ymm12,ymm6) and (ymm11,ymm3,ymm13,ymm7)
+ vpxor ymm6,ymm10,ymm6
+ vpshufb ymm6,ymm6,ymm15
+ vpaddd ymm11,ymm11,ymm3
+ vpxor ymm7,ymm11,ymm7
+ vpshufb ymm7,ymm7,ymm15
+ vpaddd ymm12,ymm12,ymm6
+ vpxor ymm2,ymm12,ymm2
+ vpslld ymm14,ymm2,12
+ vpsrld ymm2,ymm2,20
+ vpor ymm2,ymm14,ymm2
+ vbroadcasti128 ymm14,XMMWORD[r11]
+ vpaddd ymm13,ymm13,ymm7
+ vpxor ymm3,ymm13,ymm3
+ vpslld ymm15,ymm3,12
+ vpsrld ymm3,ymm3,20
+ vpor ymm3,ymm15,ymm3
+ vpaddd ymm10,ymm10,ymm2
+ vpxor ymm6,ymm10,ymm6
+ vpshufb ymm6,ymm6,ymm14
+ vpaddd ymm11,ymm11,ymm3
+ vpxor ymm7,ymm11,ymm7
+ vpshufb ymm7,ymm7,ymm14
+ vpaddd ymm12,ymm12,ymm6
+ vpxor ymm2,ymm12,ymm2
+ vpslld ymm15,ymm2,7
+ vpsrld ymm2,ymm2,25
+ vpor ymm2,ymm15,ymm2
+ vbroadcasti128 ymm15,XMMWORD[r10]
+ vpaddd ymm13,ymm13,ymm7
+ vpxor ymm3,ymm13,ymm3
+ vpslld ymm14,ymm3,7
+ vpsrld ymm3,ymm3,25
+ vpor ymm3,ymm14,ymm3
+ vpaddd ymm8,ymm8,ymm1  ; second round: same operation pattern with shifted pairing, here (ymm8,ymm1,ymm12,ymm7) and (ymm9,ymm2,ymm13,ymm4)
+ vpxor ymm7,ymm8,ymm7
+ vpshufb ymm7,ymm7,ymm15
+ vpaddd ymm9,ymm9,ymm2
+ vpxor ymm4,ymm9,ymm4
+ vpshufb ymm4,ymm4,ymm15
+ vpaddd ymm12,ymm12,ymm7
+ vpxor ymm1,ymm12,ymm1
+ vpslld ymm14,ymm1,12
+ vpsrld ymm1,ymm1,20
+ vpor ymm1,ymm14,ymm1
+ vbroadcasti128 ymm14,XMMWORD[r11]
+ vpaddd ymm13,ymm13,ymm4
+ vpxor ymm2,ymm13,ymm2
+ vpslld ymm15,ymm2,12
+ vpsrld ymm2,ymm2,20
+ vpor ymm2,ymm15,ymm2
+ vpaddd ymm8,ymm8,ymm1
+ vpxor ymm7,ymm8,ymm7
+ vpshufb ymm7,ymm7,ymm14
+ vpaddd ymm9,ymm9,ymm2
+ vpxor ymm4,ymm9,ymm4
+ vpshufb ymm4,ymm4,ymm14
+ vpaddd ymm12,ymm12,ymm7
+ vpxor ymm1,ymm12,ymm1
+ vpslld ymm15,ymm1,7
+ vpsrld ymm1,ymm1,25
+ vpor ymm1,ymm15,ymm1
+ vbroadcasti128 ymm15,XMMWORD[r10]
+ vpaddd ymm13,ymm13,ymm4
+ vpxor ymm2,ymm13,ymm2
+ vpslld ymm14,ymm2,7
+ vpsrld ymm2,ymm2,25
+ vpor ymm2,ymm14,ymm2
+ vmovdqa YMMWORD[64+rsp],ymm12  ; swap the resident half of the third row back
+ vmovdqa YMMWORD[96+rsp],ymm13
+ vmovdqa ymm12,YMMWORD[rsp]
+ vmovdqa ymm13,YMMWORD[32+rsp]
+ vpaddd ymm10,ymm10,ymm3  ; second round continued: (ymm10,ymm3,ymm12,ymm5) and (ymm11,ymm0,ymm13,ymm6)
+ vpxor ymm5,ymm10,ymm5
+ vpshufb ymm5,ymm5,ymm15
+ vpaddd ymm11,ymm11,ymm0
+ vpxor ymm6,ymm11,ymm6
+ vpshufb ymm6,ymm6,ymm15
+ vpaddd ymm12,ymm12,ymm5
+ vpxor ymm3,ymm12,ymm3
+ vpslld ymm14,ymm3,12
+ vpsrld ymm3,ymm3,20
+ vpor ymm3,ymm14,ymm3
+ vbroadcasti128 ymm14,XMMWORD[r11]
+ vpaddd ymm13,ymm13,ymm6
+ vpxor ymm0,ymm13,ymm0
+ vpslld ymm15,ymm0,12
+ vpsrld ymm0,ymm0,20
+ vpor ymm0,ymm15,ymm0
+ vpaddd ymm10,ymm10,ymm3
+ vpxor ymm5,ymm10,ymm5
+ vpshufb ymm5,ymm5,ymm14
+ vpaddd ymm11,ymm11,ymm0
+ vpxor ymm6,ymm11,ymm6
+ vpshufb ymm6,ymm6,ymm14
+ vpaddd ymm12,ymm12,ymm5
+ vpxor ymm3,ymm12,ymm3
+ vpslld ymm15,ymm3,7
+ vpsrld ymm3,ymm3,25
+ vpor ymm3,ymm15,ymm3
+ vbroadcasti128 ymm15,XMMWORD[r10]
+ vpaddd ymm13,ymm13,ymm6
+ vpxor ymm0,ymm13,ymm0
+ vpslld ymm14,ymm0,7
+ vpsrld ymm0,ymm0,25
+ vpor ymm0,ymm14,ymm0
+ dec eax
+ jnz NEAR $L$oop8x
+
+ lea rax,[512+rsp]  ; re-derive the frame pointer that the eax round counter overwrote
+ vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx]  ; add the saved input state back to the permuted words
+ vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx]
+ vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx]
+ vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx]
+
+ vpunpckldq ymm14,ymm8,ymm9  ; interleave dwords, then qwords: first steps of regrouping words per block
+ vpunpckldq ymm15,ymm10,ymm11
+ vpunpckhdq ymm8,ymm8,ymm9
+ vpunpckhdq ymm10,ymm10,ymm11
+ vpunpcklqdq ymm9,ymm14,ymm15
+ vpunpckhqdq ymm14,ymm14,ymm15
+ vpunpcklqdq ymm11,ymm8,ymm10
+ vpunpckhqdq ymm8,ymm8,ymm10
+ vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx]
+ vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx]
+ vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx]
+ vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx]
+
+ vpunpckldq ymm10,ymm0,ymm1
+ vpunpckldq ymm15,ymm2,ymm3
+ vpunpckhdq ymm0,ymm0,ymm1
+ vpunpckhdq ymm2,ymm2,ymm3
+ vpunpcklqdq ymm1,ymm10,ymm15
+ vpunpckhqdq ymm10,ymm10,ymm15
+ vpunpcklqdq ymm3,ymm0,ymm2
+ vpunpckhqdq ymm0,ymm0,ymm2
+ vperm2i128 ymm15,ymm9,ymm1,0x20  ; 0x20 pairs the low 128-bit halves, 0x31 the high halves
+ vperm2i128 ymm1,ymm9,ymm1,0x31
+ vperm2i128 ymm9,ymm14,ymm10,0x20
+ vperm2i128 ymm10,ymm14,ymm10,0x31
+ vperm2i128 ymm14,ymm11,ymm3,0x20
+ vperm2i128 ymm3,ymm11,ymm3,0x31
+ vperm2i128 ymm11,ymm8,ymm0,0x20
+ vperm2i128 ymm0,ymm8,ymm0,0x31
+ vmovdqa YMMWORD[rsp],ymm15  ; park two finished vectors to free registers
+ vmovdqa YMMWORD[32+rsp],ymm9
+ vmovdqa ymm15,YMMWORD[64+rsp]  ; fetch the spilled half of the third row
+ vmovdqa ymm9,YMMWORD[96+rsp]
+
+ vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax]
+ vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax]
+ vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax]
+ vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax]
+
+ vpunpckldq ymm2,ymm12,ymm13
+ vpunpckldq ymm8,ymm15,ymm9
+ vpunpckhdq ymm12,ymm12,ymm13
+ vpunpckhdq ymm15,ymm15,ymm9
+ vpunpcklqdq ymm13,ymm2,ymm8
+ vpunpckhqdq ymm2,ymm2,ymm8
+ vpunpcklqdq ymm9,ymm12,ymm15
+ vpunpckhqdq ymm12,ymm12,ymm15
+ vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax]
+ vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax]
+ vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax]
+ vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax]
+
+ vpunpckldq ymm15,ymm4,ymm5
+ vpunpckldq ymm8,ymm6,ymm7
+ vpunpckhdq ymm4,ymm4,ymm5
+ vpunpckhdq ymm6,ymm6,ymm7
+ vpunpcklqdq ymm5,ymm15,ymm8
+ vpunpckhqdq ymm15,ymm15,ymm8
+ vpunpcklqdq ymm7,ymm4,ymm6
+ vpunpckhqdq ymm4,ymm4,ymm6
+ vperm2i128 ymm8,ymm13,ymm5,0x20
+ vperm2i128 ymm5,ymm13,ymm5,0x31
+ vperm2i128 ymm13,ymm2,ymm15,0x20
+ vperm2i128 ymm15,ymm2,ymm15,0x31
+ vperm2i128 ymm2,ymm9,ymm7,0x20
+ vperm2i128 ymm7,ymm9,ymm7,0x31
+ vperm2i128 ymm9,ymm12,ymm4,0x20
+ vperm2i128 ymm4,ymm12,ymm4,0x31
+ vmovdqa ymm6,YMMWORD[rsp]  ; bring back the two parked vectors
+ vmovdqa ymm12,YMMWORD[32+rsp]
+ ; Key stream is now held, in output order, in ymm6,8,1,5, 12,13,10,15, 14,2,3,7, 11,9,0,4 (32 bytes each).
+ cmp rdx,64*8
+ jb NEAR $L$tail8x  ; fewer than 512 bytes left: partial handling
+
+ vpxor ymm6,ymm6,YMMWORD[rsi]  ; full batch: out = in ^ key stream, 128 bytes at a time
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ lea rsi,[128+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ lea rdi,[128+rdi]
+
+ vpxor ymm12,ymm12,YMMWORD[rsi]
+ vpxor ymm13,ymm13,YMMWORD[32+rsi]
+ vpxor ymm10,ymm10,YMMWORD[64+rsi]
+ vpxor ymm15,ymm15,YMMWORD[96+rsi]
+ lea rsi,[128+rsi]
+ vmovdqu YMMWORD[rdi],ymm12
+ vmovdqu YMMWORD[32+rdi],ymm13
+ vmovdqu YMMWORD[64+rdi],ymm10
+ vmovdqu YMMWORD[96+rdi],ymm15
+ lea rdi,[128+rdi]
+
+ vpxor ymm14,ymm14,YMMWORD[rsi]
+ vpxor ymm2,ymm2,YMMWORD[32+rsi]
+ vpxor ymm3,ymm3,YMMWORD[64+rsi]
+ vpxor ymm7,ymm7,YMMWORD[96+rsi]
+ lea rsi,[128+rsi]
+ vmovdqu YMMWORD[rdi],ymm14
+ vmovdqu YMMWORD[32+rdi],ymm2
+ vmovdqu YMMWORD[64+rdi],ymm3
+ vmovdqu YMMWORD[96+rdi],ymm7
+ lea rdi,[128+rdi]
+
+ vpxor ymm11,ymm11,YMMWORD[rsi]
+ vpxor ymm9,ymm9,YMMWORD[32+rsi]
+ vpxor ymm0,ymm0,YMMWORD[64+rsi]
+ vpxor ymm4,ymm4,YMMWORD[96+rsi]
+ lea rsi,[128+rsi]
+ vmovdqu YMMWORD[rdi],ymm11
+ vmovdqu YMMWORD[32+rdi],ymm9
+ vmovdqu YMMWORD[64+rdi],ymm0
+ vmovdqu YMMWORD[96+rdi],ymm4
+ lea rdi,[128+rdi]
+
+ sub rdx,64*8
+ jnz NEAR $L$oop_outer8x  ; more data: generate the next 8 blocks
+
+ jmp NEAR $L$done8x
+ ; Tail: rdx < 512. Dispatch on how many whole 64-byte blocks remain.
+$L$tail8x:
+ cmp rdx,448
+ jae NEAR $L$448_or_more8x
+ cmp rdx,384
+ jae NEAR $L$384_or_more8x
+ cmp rdx,320
+ jae NEAR $L$320_or_more8x
+ cmp rdx,256
+ jae NEAR $L$256_or_more8x
+ cmp rdx,192
+ jae NEAR $L$192_or_more8x
+ cmp rdx,128
+ jae NEAR $L$128_or_more8x
+ cmp rdx,64
+ jae NEAR $L$64_or_more8x
+ ; rdx < 64: stage the first block's key stream and go straight to the byte loop
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm6
+ vmovdqa YMMWORD[32+rsp],ymm8
+ jmp NEAR $L$oop_tail8x
+ ; Each $L$N_or_more8x case below: XOR N whole bytes, finish if rdx == N (ZF from the cmp that selected the case), else stage the next block at [rsp].
+ALIGN 32
+$L$64_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ je NEAR $L$done8x  ; vector ops leave flags alone, so ZF still reflects `cmp rdx,64`
+
+ lea rsi,[64+rsi]
+ xor r10,r10  ; r10 = byte index for the tail loop
+ vmovdqa YMMWORD[rsp],ymm1
+ lea rdi,[64+rdi]
+ sub rdx,64
+ vmovdqa YMMWORD[32+rsp],ymm5
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$128_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ je NEAR $L$done8x
+
+ lea rsi,[128+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm12
+ lea rdi,[128+rdi]
+ sub rdx,128
+ vmovdqa YMMWORD[32+rsp],ymm13
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$192_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ je NEAR $L$done8x
+
+ lea rsi,[192+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm10
+ lea rdi,[192+rdi]
+ sub rdx,192
+ vmovdqa YMMWORD[32+rsp],ymm15
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$256_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vpxor ymm10,ymm10,YMMWORD[192+rsi]
+ vpxor ymm15,ymm15,YMMWORD[224+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ vmovdqu YMMWORD[192+rdi],ymm10
+ vmovdqu YMMWORD[224+rdi],ymm15
+ je NEAR $L$done8x
+
+ lea rsi,[256+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm14
+ lea rdi,[256+rdi]
+ sub rdx,256
+ vmovdqa YMMWORD[32+rsp],ymm2
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$320_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vpxor ymm10,ymm10,YMMWORD[192+rsi]
+ vpxor ymm15,ymm15,YMMWORD[224+rsi]
+ vpxor ymm14,ymm14,YMMWORD[256+rsi]
+ vpxor ymm2,ymm2,YMMWORD[288+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ vmovdqu YMMWORD[192+rdi],ymm10
+ vmovdqu YMMWORD[224+rdi],ymm15
+ vmovdqu YMMWORD[256+rdi],ymm14
+ vmovdqu YMMWORD[288+rdi],ymm2
+ je NEAR $L$done8x
+
+ lea rsi,[320+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm3
+ lea rdi,[320+rdi]
+ sub rdx,320
+ vmovdqa YMMWORD[32+rsp],ymm7
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$384_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vpxor ymm10,ymm10,YMMWORD[192+rsi]
+ vpxor ymm15,ymm15,YMMWORD[224+rsi]
+ vpxor ymm14,ymm14,YMMWORD[256+rsi]
+ vpxor ymm2,ymm2,YMMWORD[288+rsi]
+ vpxor ymm3,ymm3,YMMWORD[320+rsi]
+ vpxor ymm7,ymm7,YMMWORD[352+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ vmovdqu YMMWORD[192+rdi],ymm10
+ vmovdqu YMMWORD[224+rdi],ymm15
+ vmovdqu YMMWORD[256+rdi],ymm14
+ vmovdqu YMMWORD[288+rdi],ymm2
+ vmovdqu YMMWORD[320+rdi],ymm3
+ vmovdqu YMMWORD[352+rdi],ymm7
+ je NEAR $L$done8x
+
+ lea rsi,[384+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm11
+ lea rdi,[384+rdi]
+ sub rdx,384
+ vmovdqa YMMWORD[32+rsp],ymm9
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$448_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vpxor ymm10,ymm10,YMMWORD[192+rsi]
+ vpxor ymm15,ymm15,YMMWORD[224+rsi]
+ vpxor ymm14,ymm14,YMMWORD[256+rsi]
+ vpxor ymm2,ymm2,YMMWORD[288+rsi]
+ vpxor ymm3,ymm3,YMMWORD[320+rsi]
+ vpxor ymm7,ymm7,YMMWORD[352+rsi]
+ vpxor ymm11,ymm11,YMMWORD[384+rsi]
+ vpxor ymm9,ymm9,YMMWORD[416+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ vmovdqu YMMWORD[192+rdi],ymm10
+ vmovdqu YMMWORD[224+rdi],ymm15
+ vmovdqu YMMWORD[256+rdi],ymm14
+ vmovdqu YMMWORD[288+rdi],ymm2
+ vmovdqu YMMWORD[320+rdi],ymm3
+ vmovdqu YMMWORD[352+rdi],ymm7
+ vmovdqu YMMWORD[384+rdi],ymm11
+ vmovdqu YMMWORD[416+rdi],ymm9
+ je NEAR $L$done8x
+
+ lea rsi,[448+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm0
+ lea rdi,[448+rdi]
+ sub rdx,448
+ vmovdqa YMMWORD[32+rsp],ymm4
+ ; Byte loop for the last 1..63 bytes: dst[i] = src[i] ^ key stream staged at [rsp+i].
+$L$oop_tail8x:
+ movzx eax,BYTE[r10*1+rsi]
+ movzx ecx,BYTE[r10*1+rsp]
+ lea r10,[1+r10]
+ xor eax,ecx
+ mov BYTE[((-1))+r10*1+rdi],al
+ dec rdx
+ jnz NEAR $L$oop_tail8x
+ ; NOTE(review): the key-stream bytes staged at [rsp..rsp+63] are not overwritten on this path, whereas the AVX-512 tail loops in this file store over theirs - confirm this is intended.
+$L$done8x:
+ vzeroall  ; clears all vector registers; the callee-saved xmm6..xmm15 are reloaded next
+ movaps
xmm6,XMMWORD[((-168))+r9]
+ movaps xmm7,XMMWORD[((-152))+r9]  ; (end of ChaCha20_8x) reload the callee-saved xmm registers from the area below r9
+ movaps xmm8,XMMWORD[((-136))+r9]
+ movaps xmm9,XMMWORD[((-120))+r9]
+ movaps xmm10,XMMWORD[((-104))+r9]
+ movaps xmm11,XMMWORD[((-88))+r9]
+ movaps xmm12,XMMWORD[((-72))+r9]
+ movaps xmm13,XMMWORD[((-56))+r9]
+ movaps xmm14,XMMWORD[((-40))+r9]
+ movaps xmm15,XMMWORD[((-24))+r9]
+ lea rsp,[r9]  ; drop the aligned frame: r9 holds the entry rsp
+
+$L$8x_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_ChaCha20_8x:
+ ; ChaCha20_avx512: AVX-512 path for inputs of at most 512 bytes; larger inputs branch to $L$ChaCha20_16x.
+ALIGN 32
+ChaCha20_avx512:  ; Win64 entry: args arrive in rcx, rdx, r8, r9 and [40+rsp]
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi  ; rdi/rsi are callee-saved on Win64; parked above the return address
+ mov rax,rsp
+$L$SEH_begin_ChaCha20_avx512:  ; presumably referenced by unwind data elsewhere in the file
+ mov rdi,rcx  ; rdi = arg1: output pointer (written with vmovdqu)
+ mov rsi,rdx  ; rsi = arg2: input pointer (XORed in with vpxor)
+ mov rdx,r8  ; rdx = arg3: byte count
+ mov rcx,r9  ; rcx = arg4: pointer to 32 bytes loaded as state rows 1 and 2
+ mov r8,QWORD[40+rsp]  ; r8  = arg5 (stack): pointer to 16 bytes loaded as state row 3
+ ; Internal entry: ChaCha20_ctr32 jumps here (`jc NEAR $L$ChaCha20_avx512`) after doing
+ ; the same argument shuffle, so the register assignment above is assumed on arrival.
+
+$L$ChaCha20_avx512:
+ mov r9,rsp  ; r9 = entry rsp, used for the xmm save area and to restore rsp
+
+ cmp rdx,512
+ ja NEAR $L$ChaCha20_16x  ; more than 512 bytes: use the 16-block routine instead
+
+ sub rsp,64+40  ; 64 bytes of tail staging at [rsp] + room for the xmm6/xmm7 saves
+ movaps XMMWORD[(-40)+r9],xmm6  ; only xmm6/xmm7 of the callee-saved set are written below
+ movaps XMMWORD[(-24)+r9],xmm7
+$L$avx512_body:
+ vbroadcasti32x4 zmm0,ZMMWORD[$L$sigma]  ; each 128-bit lane of zmm0..zmm3 holds one row of one block's state
+ vbroadcasti32x4 zmm1,ZMMWORD[rcx]
+ vbroadcasti32x4 zmm2,ZMMWORD[16+rcx]
+ vbroadcasti32x4 zmm3,ZMMWORD[r8]
+
+ vmovdqa32 zmm16,zmm0  ; zmm16..zmm19 keep the input state for the final addition
+ vmovdqa32 zmm17,zmm1
+ vmovdqa32 zmm18,zmm2
+ vpaddd zmm3,zmm3,ZMMWORD[$L$zeroz]  ; lane k gets counter + k (k = 0..3)
+ vmovdqa32 zmm20,ZMMWORD[$L$fourz]  ; per-iteration counter step: +4 in every lane
+ mov r8,10  ; 10 loop iterations, two rounds each
+ vmovdqa32 zmm19,zmm3
+ jmp NEAR $L$oop_avx512
+
+ALIGN 16
+$L$oop_outer_avx512:  ; next 4 blocks: restore rows 0..2, bump the counters by 4
+ vmovdqa32 zmm0,zmm16
+ vmovdqa32 zmm1,zmm17
+ vmovdqa32 zmm2,zmm18
+ vpaddd zmm3,zmm19,zmm20
+ mov r8,10
+ vmovdqa32 zmm19,zmm3
+ jmp NEAR $L$oop_avx512
+
+ALIGN 32
+$L$oop_avx512:  ; first round on rows zmm0..zmm3 as they stand
+ vpaddd zmm0,zmm0,zmm1
+ vpxord zmm3,zmm3,zmm0
+ vprold zmm3,zmm3,16
+ vpaddd zmm2,zmm2,zmm3
+ vpxord zmm1,zmm1,zmm2
+ vprold zmm1,zmm1,12
+ vpaddd zmm0,zmm0,zmm1
+ vpxord zmm3,zmm3,zmm0
+ vprold zmm3,zmm3,8
+ vpaddd zmm2,zmm2,zmm3
+ vpxord zmm1,zmm1,zmm2
+ vprold zmm1,zmm1,7
+ vpshufd zmm2,zmm2,78  ; rotate the dwords within each lane: row 2 by two positions,
+ vpshufd zmm1,zmm1,57  ; row 1 by one,
+ vpshufd zmm3,zmm3,147  ; row 3 by three, so the second round pairs different words
+ vpaddd zmm0,zmm0,zmm1  ; second round on the rotated rows
+ vpxord zmm3,zmm3,zmm0
+ vprold zmm3,zmm3,16
+ vpaddd zmm2,zmm2,zmm3
+ vpxord zmm1,zmm1,zmm2
+ vprold zmm1,zmm1,12
+ vpaddd zmm0,zmm0,zmm1
+ vpxord zmm3,zmm3,zmm0
+ vprold zmm3,zmm3,8
+ vpaddd zmm2,zmm2,zmm3
+ vpxord zmm1,zmm1,zmm2
+ vprold zmm1,zmm1,7
+ vpshufd zmm2,zmm2,78  ; undo the row rotation (57 and 147 swap roles)
+ vpshufd zmm1,zmm1,147
+ vpshufd zmm3,zmm3,57
+ dec r8
+ jnz NEAR $L$oop_avx512  ; r8 is 0 on exit, which the tail loop relies on as its start index
+ vpaddd zmm0,zmm0,zmm16  ; add the input state back: zmm0..zmm3 now hold 4 blocks of key stream
+ vpaddd zmm1,zmm1,zmm17
+ vpaddd zmm2,zmm2,zmm18
+ vpaddd zmm3,zmm3,zmm19
+
+ sub rdx,64
+ jb NEAR $L$tail64_avx512  ; fewer than 64 bytes left: partial first block
+
+ vpxor xmm4,xmm0,XMMWORD[rsi]  ; block 0 = low 128-bit lanes of zmm0..zmm3
+ vpxor xmm5,xmm1,XMMWORD[16+rsi]
+ vpxor xmm6,xmm2,XMMWORD[32+rsi]
+ vpxor xmm7,xmm3,XMMWORD[48+rsi]
+ lea rsi,[64+rsi]
+
+ vmovdqu XMMWORD[rdi],xmm4
+ vmovdqu XMMWORD[16+rdi],xmm5
+ vmovdqu XMMWORD[32+rdi],xmm6
+ vmovdqu XMMWORD[48+rdi],xmm7
+ lea rdi,[64+rdi]
+
+ jz NEAR $L$done_avx512  ; ZF from the `sub rdx,64` above (vector ops and lea leave flags alone)
+
+ vextracti32x4 xmm4,zmm0,1  ; block 1 = lane 1
+ vextracti32x4 xmm5,zmm1,1
+ vextracti32x4 xmm6,zmm2,1
+ vextracti32x4 xmm7,zmm3,1
+
+ sub rdx,64
+ jb NEAR $L$tail_avx512  ; partial block: key stream is already in xmm4..xmm7
+
+ vpxor xmm4,xmm4,XMMWORD[rsi]
+ vpxor xmm5,xmm5,XMMWORD[16+rsi]
+ vpxor xmm6,xmm6,XMMWORD[32+rsi]
+ vpxor xmm7,xmm7,XMMWORD[48+rsi]
+ lea rsi,[64+rsi]
+
+ vmovdqu XMMWORD[rdi],xmm4
+ vmovdqu XMMWORD[16+rdi],xmm5
+ vmovdqu XMMWORD[32+rdi],xmm6
+ vmovdqu XMMWORD[48+rdi],xmm7
+ lea rdi,[64+rdi]
+
+ jz NEAR $L$done_avx512
+
+ vextracti32x4 xmm4,zmm0,2  ; block 2 = lane 2
+ vextracti32x4 xmm5,zmm1,2
+ vextracti32x4 xmm6,zmm2,2
+ vextracti32x4 xmm7,zmm3,2
+
+ sub rdx,64
+ jb NEAR $L$tail_avx512
+
+ vpxor xmm4,xmm4,XMMWORD[rsi]
+ vpxor xmm5,xmm5,XMMWORD[16+rsi]
+ vpxor xmm6,xmm6,XMMWORD[32+rsi]
+ vpxor xmm7,xmm7,XMMWORD[48+rsi]
+ lea rsi,[64+rsi]
+
+ vmovdqu XMMWORD[rdi],xmm4
+ vmovdqu XMMWORD[16+rdi],xmm5
+ vmovdqu XMMWORD[32+rdi],xmm6
+ vmovdqu XMMWORD[48+rdi],xmm7
+ lea rdi,[64+rdi]
+
+ jz NEAR $L$done_avx512
+
+ vextracti32x4 xmm4,zmm0,3  ; block 3 = lane 3
+ vextracti32x4 xmm5,zmm1,3
+ vextracti32x4 xmm6,zmm2,3
+ vextracti32x4 xmm7,zmm3,3
+
+ sub rdx,64
+ jb NEAR $L$tail_avx512
+
+ vpxor xmm4,xmm4,XMMWORD[rsi]
+ vpxor xmm5,xmm5,XMMWORD[16+rsi]
+ vpxor xmm6,xmm6,XMMWORD[32+rsi]
+ vpxor xmm7,xmm7,XMMWORD[48+rsi]
+ lea rsi,[64+rsi]
+
+ vmovdqu XMMWORD[rdi],xmm4
+ vmovdqu XMMWORD[16+rdi],xmm5
+ vmovdqu
XMMWORD[32+rdi],xmm6
+ vmovdqu XMMWORD[48+rdi],xmm7  ; (continuing ChaCha20_avx512) last 16 bytes of the fourth block
+ lea rdi,[64+rdi]
+
+ jnz NEAR $L$oop_outer_avx512  ; ZF from the last `sub rdx,64`: more data means another 4 blocks
+
+ jmp NEAR $L$done_avx512
+
+ALIGN 16
+$L$tail64_avx512:  ; partial first block: stage its key stream (low lanes of zmm0..zmm3) at [rsp]
+ vmovdqa XMMWORD[rsp],xmm0
+ vmovdqa XMMWORD[16+rsp],xmm1
+ vmovdqa XMMWORD[32+rsp],xmm2
+ vmovdqa XMMWORD[48+rsp],xmm3
+ add rdx,64  ; undo the `sub rdx,64` that went below zero: rdx = bytes left (1..63)
+ jmp NEAR $L$oop_tail_avx512
+
+ALIGN 16
+$L$tail_avx512:  ; partial later block: its key stream was already extracted into xmm4..xmm7
+ vmovdqa XMMWORD[rsp],xmm4
+ vmovdqa XMMWORD[16+rsp],xmm5
+ vmovdqa XMMWORD[32+rsp],xmm6
+ vmovdqa XMMWORD[48+rsp],xmm7
+ add rdx,64
+
+$L$oop_tail_avx512:  ; byte loop; r8 is the index and is 0 on entry (left at 0 by the round loop's dec/jnz)
+ movzx eax,BYTE[r8*1+rsi]
+ movzx ecx,BYTE[r8*1+rsp]
+ lea r8,[1+r8]
+ xor eax,ecx
+ mov BYTE[((-1))+r8*1+rdi],al
+ dec rdx
+ jnz NEAR $L$oop_tail_avx512
+
+ vmovdqu32 ZMMWORD[rsp],zmm16  ; overwrite the 64 staged key-stream bytes with zmm16 (the broadcast $L$sigma constant)
+
+$L$done_avx512:
+ vzeroall
+ movaps xmm6,XMMWORD[((-40))+r9]  ; reload the two callee-saved xmm registers this routine wrote
+ movaps xmm7,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+
+$L$avx512_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_ChaCha20_avx512:
+ ; ChaCha20_avx512vl: 256-bit AVX-512VL path, two blocks per pass, for inputs of at most 128 bytes; larger inputs branch to $L$ChaCha20_8xvl.
+ALIGN 32
+ChaCha20_avx512vl:  ; Win64 entry: args arrive in rcx, rdx, r8, r9 and [40+rsp]
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi  ; rdi/rsi are callee-saved on Win64; parked above the return address
+ mov rax,rsp
+$L$SEH_begin_ChaCha20_avx512vl:  ; presumably referenced by unwind data elsewhere in the file
+ mov rdi,rcx  ; rdi = arg1: output pointer
+ mov rsi,rdx  ; rsi = arg2: input pointer
+ mov rdx,r8  ; rdx = arg3: byte count
+ mov rcx,r9  ; rcx = arg4: pointer to 32 bytes loaded as state rows 1 and 2
+ mov r8,QWORD[40+rsp]  ; r8  = arg5 (stack): pointer to 16 bytes loaded as state row 3
+ ; Internal entry: ChaCha20_ctr32 jumps here (`js NEAR $L$ChaCha20_avx512vl`) after doing
+ ; the same argument shuffle, so the register assignment above is assumed on arrival.
+
+$L$ChaCha20_avx512vl:
+ mov r9,rsp  ; r9 = entry rsp
+
+ cmp rdx,128
+ ja NEAR $L$ChaCha20_8xvl  ; more than 128 bytes: handled by a routine outside this part of the file
+
+ sub rsp,64+40  ; 64 bytes of tail staging at [rsp] + room for the xmm6/xmm7 saves
+ movaps XMMWORD[(-40)+r9],xmm6
+ movaps XMMWORD[(-24)+r9],xmm7
+$L$avx512vl_body:
+ vbroadcasti128 ymm0,XMMWORD[$L$sigma]  ; each 128-bit lane of ymm0..ymm3 holds one row of one block's state
+ vbroadcasti128 ymm1,XMMWORD[rcx]
+ vbroadcasti128 ymm2,XMMWORD[16+rcx]
+ vbroadcasti128 ymm3,XMMWORD[r8]
+
+ vmovdqa32 ymm16,ymm0  ; ymm16..ymm19 keep the input state for the final addition
+ vmovdqa32 ymm17,ymm1
+ vmovdqa32 ymm18,ymm2
+ vpaddd ymm3,ymm3,YMMWORD[$L$zeroz]  ; first 32 bytes of $L$zeroz: lane 0 gets +0, lane 1 gets +1
+ vmovdqa32 ymm20,YMMWORD[$L$twoy]  ; per-iteration counter step: +2 in both lanes
+ mov r8,10  ; 10 loop iterations, two rounds each
+ vmovdqa32 ymm19,ymm3
+ jmp NEAR $L$oop_avx512vl
+
+ALIGN 16
+$L$oop_outer_avx512vl:  ; next 2 blocks; ymm0/ymm1 were already restored just before the jnz that lands here
+ vmovdqa32 ymm2,ymm18
+ vpaddd ymm3,ymm19,ymm20
+ mov r8,10
+ vmovdqa32 ymm19,ymm3
+ jmp NEAR $L$oop_avx512vl
+
+ALIGN 32
+$L$oop_avx512vl:  ; first round on rows ymm0..ymm3 as they stand
+ vpaddd ymm0,ymm0,ymm1
+ vpxor ymm3,ymm3,ymm0
+ vprold ymm3,ymm3,16
+ vpaddd ymm2,ymm2,ymm3
+ vpxor ymm1,ymm1,ymm2
+ vprold ymm1,ymm1,12
+ vpaddd ymm0,ymm0,ymm1
+ vpxor ymm3,ymm3,ymm0
+ vprold ymm3,ymm3,8
+ vpaddd ymm2,ymm2,ymm3
+ vpxor ymm1,ymm1,ymm2
+ vprold ymm1,ymm1,7
+ vpshufd ymm2,ymm2,78  ; rotate the dwords within each lane: row 2 by two positions,
+ vpshufd ymm1,ymm1,57  ; row 1 by one,
+ vpshufd ymm3,ymm3,147  ; row 3 by three
+ vpaddd ymm0,ymm0,ymm1  ; second round on the rotated rows
+ vpxor ymm3,ymm3,ymm0
+ vprold ymm3,ymm3,16
+ vpaddd ymm2,ymm2,ymm3
+ vpxor ymm1,ymm1,ymm2
+ vprold ymm1,ymm1,12
+ vpaddd ymm0,ymm0,ymm1
+ vpxor ymm3,ymm3,ymm0
+ vprold ymm3,ymm3,8
+ vpaddd ymm2,ymm2,ymm3
+ vpxor ymm1,ymm1,ymm2
+ vprold ymm1,ymm1,7
+ vpshufd ymm2,ymm2,78  ; undo the row rotation (57 and 147 swap roles)
+ vpshufd ymm1,ymm1,147
+ vpshufd ymm3,ymm3,57
+ dec r8
+ jnz NEAR $L$oop_avx512vl  ; r8 is 0 on exit, which the tail loop relies on as its start index
+ vpaddd ymm0,ymm0,ymm16  ; add the input state back: ymm0..ymm3 now hold 2 blocks of key stream
+ vpaddd ymm1,ymm1,ymm17
+ vpaddd ymm2,ymm2,ymm18
+ vpaddd ymm3,ymm3,ymm19
+
+ sub rdx,64
+ jb NEAR $L$tail64_avx512vl  ; fewer than 64 bytes left: partial first block
+
+ vpxor xmm4,xmm0,XMMWORD[rsi]  ; block 0 = low 128-bit lanes of ymm0..ymm3
+ vpxor xmm5,xmm1,XMMWORD[16+rsi]
+ vpxor xmm6,xmm2,XMMWORD[32+rsi]
+ vpxor xmm7,xmm3,XMMWORD[48+rsi]
+ lea rsi,[64+rsi]
+
+ vmovdqu XMMWORD[rdi],xmm4
+ vmovdqu XMMWORD[16+rdi],xmm5
+ vmovdqu XMMWORD[32+rdi],xmm6
+ vmovdqu XMMWORD[48+rdi],xmm7
+ lea rdi,[64+rdi]
+
+ jz NEAR $L$done_avx512vl  ; ZF from the `sub rdx,64` above
+
+ vextracti128 xmm4,ymm0,1  ; block 1 = high 128-bit lanes
+ vextracti128 xmm5,ymm1,1
+ vextracti128 xmm6,ymm2,1
+ vextracti128 xmm7,ymm3,1
+
+ sub rdx,64
+ jb NEAR $L$tail_avx512vl  ; partial block: key stream is already in xmm4..xmm7
+
+ vpxor xmm4,xmm4,XMMWORD[rsi]
+ vpxor xmm5,xmm5,XMMWORD[16+rsi]
+ vpxor xmm6,xmm6,XMMWORD[32+rsi]
+ vpxor xmm7,xmm7,XMMWORD[48+rsi]
+ lea rsi,[64+rsi]
+
+ vmovdqu XMMWORD[rdi],xmm4
+ vmovdqu XMMWORD[16+rdi],xmm5
+ vmovdqu XMMWORD[32+rdi],xmm6
+ vmovdqu XMMWORD[48+rdi],xmm7
+ lea rdi,[64+rdi]
+
+ vmovdqa32 ymm0,ymm16  ; restore rows 0 and 1 for a possible next pass (register moves do not disturb ZF)
+ vmovdqa32 ymm1,ymm17
+ jnz NEAR $L$oop_outer_avx512vl
+
+ jmp NEAR $L$done_avx512vl
+
+ALIGN 16
+$L$tail64_avx512vl:  ; partial first block: stage its key stream at [rsp]
+ vmovdqa XMMWORD[rsp],xmm0
+ vmovdqa XMMWORD[16+rsp],xmm1
+ vmovdqa XMMWORD[32+rsp],xmm2
+ vmovdqa XMMWORD[48+rsp],xmm3
+ add rdx,64  ; undo the `sub rdx,64` that went below zero: rdx = bytes left (1..63)
+ jmp NEAR $L$oop_tail_avx512vl
+
+ALIGN 16
+$L$tail_avx512vl:  ; partial second block: key stream already extracted into xmm4..xmm7
+ vmovdqa XMMWORD[rsp],xmm4
+ vmovdqa XMMWORD[16+rsp],xmm5
+ vmovdqa XMMWORD[32+rsp],xmm6
+ vmovdqa XMMWORD[48+rsp],xmm7
+ add rdx,64
+
+$L$oop_tail_avx512vl:  ; byte loop: dst[i] = src[i] ^ staged key stream; r8 starts at 0
+ movzx eax,BYTE[r8*1+rsi]
+ movzx ecx,BYTE[r8*1+rsp]
+ lea r8,[1+r8]
+ xor eax,ecx
+ mov BYTE[((-1))+r8*1+rdi],al
+ dec rdx
+ jnz NEAR $L$oop_tail_avx512vl
+
+ vmovdqu32 YMMWORD[rsp],ymm16  ; overwrite the 64 staged key-stream bytes with ymm16 (the broadcast $L$sigma constant)
+ vmovdqu32 YMMWORD[32+rsp],ymm16
+
+$L$done_avx512vl:
+ vzeroall
+ movaps xmm6,XMMWORD[((-40))+r9]  ; reload the two callee-saved xmm registers this routine wrote
+ movaps xmm7,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+
+$L$avx512vl_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_ChaCha20_avx512vl:
+
+ALIGN 32
+ChaCha20_16x:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ChaCha20_16x:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+$L$ChaCha20_16x:
+ mov r9,rsp
+
+ sub rsp,64+168
+ and rsp,-64
+ movaps XMMWORD[(-168)+r9],xmm6
+ movaps XMMWORD[(-152)+r9],xmm7
+ movaps XMMWORD[(-136)+r9],xmm8
+ movaps XMMWORD[(-120)+r9],xmm9
+ movaps XMMWORD[(-104)+r9],xmm10
+ movaps XMMWORD[(-88)+r9],xmm11
+ movaps XMMWORD[(-72)+r9],xmm12
+ movaps XMMWORD[(-56)+r9],xmm13
+ movaps XMMWORD[(-40)+r9],xmm14
+ movaps XMMWORD[(-24)+r9],xmm15
+$L$16x_body:
+ vzeroupper
+
+ lea r10,[$L$sigma]
+ vbroadcasti32x4 zmm3,ZMMWORD[r10]
+ vbroadcasti32x4 zmm7,ZMMWORD[rcx]
+ vbroadcasti32x4 zmm11,ZMMWORD[16+rcx]
+ vbroadcasti32x4 zmm15,ZMMWORD[r8]
+
+ vpshufd zmm0,zmm3,0x00
+ vpshufd zmm1,zmm3,0x55
+ vpshufd zmm2,zmm3,0xaa
+ vpshufd zmm3,zmm3,0xff
+ vmovdqa64 zmm16,zmm0
+ vmovdqa64 zmm17,zmm1
+ vmovdqa64 zmm18,zmm2
+ vmovdqa64 zmm19,zmm3
+
+ vpshufd zmm4,zmm7,0x00
+ vpshufd zmm5,zmm7,0x55
+ vpshufd zmm6,zmm7,0xaa
+ vpshufd zmm7,zmm7,0xff
+ vmovdqa64 zmm20,zmm4
+ vmovdqa64 zmm21,zmm5
+ vmovdqa64 zmm22,zmm6
+ vmovdqa64 zmm23,zmm7
+
+ vpshufd zmm8,zmm11,0x00
+ vpshufd zmm9,zmm11,0x55
+ vpshufd zmm10,zmm11,0xaa
+ vpshufd zmm11,zmm11,0xff
+ vmovdqa64 zmm24,zmm8
+ vmovdqa64 zmm25,zmm9
+ vmovdqa64 zmm26,zmm10
+ vmovdqa64 zmm27,zmm11
+
+ vpshufd zmm12,zmm15,0x00
+ vpshufd zmm13,zmm15,0x55
+
vpshufd zmm14,zmm15,0xaa + vpshufd zmm15,zmm15,0xff + vpaddd zmm12,zmm12,ZMMWORD[$L$incz] + vmovdqa64 zmm28,zmm12 + vmovdqa64 zmm29,zmm13 + vmovdqa64 zmm30,zmm14 + vmovdqa64 zmm31,zmm15 + + mov eax,10 + jmp NEAR $L$oop16x + +ALIGN 32 +$L$oop_outer16x: + vpbroadcastd zmm0,DWORD[r10] + vpbroadcastd zmm1,DWORD[4+r10] + vpbroadcastd zmm2,DWORD[8+r10] + vpbroadcastd zmm3,DWORD[12+r10] + vpaddd zmm28,zmm28,ZMMWORD[$L$sixteen] + vmovdqa64 zmm4,zmm20 + vmovdqa64 zmm5,zmm21 + vmovdqa64 zmm6,zmm22 + vmovdqa64 zmm7,zmm23 + vmovdqa64 zmm8,zmm24 + vmovdqa64 zmm9,zmm25 + vmovdqa64 zmm10,zmm26 + vmovdqa64 zmm11,zmm27 + vmovdqa64 zmm12,zmm28 + vmovdqa64 zmm13,zmm29 + vmovdqa64 zmm14,zmm30 + vmovdqa64 zmm15,zmm31 + + vmovdqa64 zmm16,zmm0 + vmovdqa64 zmm17,zmm1 + vmovdqa64 zmm18,zmm2 + vmovdqa64 zmm19,zmm3 + + mov eax,10 + jmp NEAR $L$oop16x + +ALIGN 32 +$L$oop16x: + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprold zmm12,zmm12,16 + vprold zmm13,zmm13,16 + vprold zmm14,zmm14,16 + vprold zmm15,zmm15,16 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprold zmm4,zmm4,12 + vprold zmm5,zmm5,12 + vprold zmm6,zmm6,12 + vprold zmm7,zmm7,12 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprold zmm12,zmm12,8 + vprold zmm13,zmm13,8 + vprold zmm14,zmm14,8 + vprold zmm15,zmm15,8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprold zmm4,zmm4,7 + vprold zmm5,zmm5,7 + vprold 
zmm6,zmm6,7 + vprold zmm7,zmm7,7 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprold zmm15,zmm15,16 + vprold zmm12,zmm12,16 + vprold zmm13,zmm13,16 + vprold zmm14,zmm14,16 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprold zmm5,zmm5,12 + vprold zmm6,zmm6,12 + vprold zmm7,zmm7,12 + vprold zmm4,zmm4,12 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprold zmm15,zmm15,8 + vprold zmm12,zmm12,8 + vprold zmm13,zmm13,8 + vprold zmm14,zmm14,8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprold zmm5,zmm5,7 + vprold zmm6,zmm6,7 + vprold zmm7,zmm7,7 + vprold zmm4,zmm4,7 + dec eax + jnz NEAR $L$oop16x + + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm17 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm19 + + vpunpckldq zmm18,zmm0,zmm1 + vpunpckldq zmm19,zmm2,zmm3 + vpunpckhdq zmm0,zmm0,zmm1 + vpunpckhdq zmm2,zmm2,zmm3 + vpunpcklqdq zmm1,zmm18,zmm19 + vpunpckhqdq zmm18,zmm18,zmm19 + vpunpcklqdq zmm3,zmm0,zmm2 + vpunpckhqdq zmm0,zmm0,zmm2 + vpaddd zmm4,zmm4,zmm20 + vpaddd zmm5,zmm5,zmm21 + vpaddd zmm6,zmm6,zmm22 + vpaddd zmm7,zmm7,zmm23 + + vpunpckldq zmm2,zmm4,zmm5 + vpunpckldq zmm19,zmm6,zmm7 + vpunpckhdq zmm4,zmm4,zmm5 + vpunpckhdq zmm6,zmm6,zmm7 + vpunpcklqdq zmm5,zmm2,zmm19 + vpunpckhqdq zmm2,zmm2,zmm19 + vpunpcklqdq zmm7,zmm4,zmm6 + vpunpckhqdq zmm4,zmm4,zmm6 + vshufi32x4 zmm19,zmm1,zmm5,0x44 + vshufi32x4 zmm5,zmm1,zmm5,0xee + vshufi32x4 
zmm1,zmm18,zmm2,0x44 + vshufi32x4 zmm2,zmm18,zmm2,0xee + vshufi32x4 zmm18,zmm3,zmm7,0x44 + vshufi32x4 zmm7,zmm3,zmm7,0xee + vshufi32x4 zmm3,zmm0,zmm4,0x44 + vshufi32x4 zmm4,zmm0,zmm4,0xee + vpaddd zmm8,zmm8,zmm24 + vpaddd zmm9,zmm9,zmm25 + vpaddd zmm10,zmm10,zmm26 + vpaddd zmm11,zmm11,zmm27 + + vpunpckldq zmm6,zmm8,zmm9 + vpunpckldq zmm0,zmm10,zmm11 + vpunpckhdq zmm8,zmm8,zmm9 + vpunpckhdq zmm10,zmm10,zmm11 + vpunpcklqdq zmm9,zmm6,zmm0 + vpunpckhqdq zmm6,zmm6,zmm0 + vpunpcklqdq zmm11,zmm8,zmm10 + vpunpckhqdq zmm8,zmm8,zmm10 + vpaddd zmm12,zmm12,zmm28 + vpaddd zmm13,zmm13,zmm29 + vpaddd zmm14,zmm14,zmm30 + vpaddd zmm15,zmm15,zmm31 + + vpunpckldq zmm10,zmm12,zmm13 + vpunpckldq zmm0,zmm14,zmm15 + vpunpckhdq zmm12,zmm12,zmm13 + vpunpckhdq zmm14,zmm14,zmm15 + vpunpcklqdq zmm13,zmm10,zmm0 + vpunpckhqdq zmm10,zmm10,zmm0 + vpunpcklqdq zmm15,zmm12,zmm14 + vpunpckhqdq zmm12,zmm12,zmm14 + vshufi32x4 zmm0,zmm9,zmm13,0x44 + vshufi32x4 zmm13,zmm9,zmm13,0xee + vshufi32x4 zmm9,zmm6,zmm10,0x44 + vshufi32x4 zmm10,zmm6,zmm10,0xee + vshufi32x4 zmm6,zmm11,zmm15,0x44 + vshufi32x4 zmm15,zmm11,zmm15,0xee + vshufi32x4 zmm11,zmm8,zmm12,0x44 + vshufi32x4 zmm12,zmm8,zmm12,0xee + vshufi32x4 zmm16,zmm19,zmm0,0x88 + vshufi32x4 zmm19,zmm19,zmm0,0xdd + vshufi32x4 zmm0,zmm5,zmm13,0x88 + vshufi32x4 zmm13,zmm5,zmm13,0xdd + vshufi32x4 zmm17,zmm1,zmm9,0x88 + vshufi32x4 zmm1,zmm1,zmm9,0xdd + vshufi32x4 zmm9,zmm2,zmm10,0x88 + vshufi32x4 zmm10,zmm2,zmm10,0xdd + vshufi32x4 zmm14,zmm18,zmm6,0x88 + vshufi32x4 zmm18,zmm18,zmm6,0xdd + vshufi32x4 zmm6,zmm7,zmm15,0x88 + vshufi32x4 zmm15,zmm7,zmm15,0xdd + vshufi32x4 zmm8,zmm3,zmm11,0x88 + vshufi32x4 zmm3,zmm3,zmm11,0xdd + vshufi32x4 zmm11,zmm4,zmm12,0x88 + vshufi32x4 zmm12,zmm4,zmm12,0xdd + cmp rdx,64*16 + jb NEAR $L$tail16x + + vpxord zmm16,zmm16,ZMMWORD[rsi] + vpxord zmm17,zmm17,ZMMWORD[64+rsi] + vpxord zmm14,zmm14,ZMMWORD[128+rsi] + vpxord zmm8,zmm8,ZMMWORD[192+rsi] + vmovdqu32 ZMMWORD[rdi],zmm16 + vmovdqu32 ZMMWORD[64+rdi],zmm17 + vmovdqu32 
ZMMWORD[128+rdi],zmm14 + vmovdqu32 ZMMWORD[192+rdi],zmm8 + + vpxord zmm19,zmm19,ZMMWORD[256+rsi] + vpxord zmm1,zmm1,ZMMWORD[320+rsi] + vpxord zmm18,zmm18,ZMMWORD[384+rsi] + vpxord zmm3,zmm3,ZMMWORD[448+rsi] + vmovdqu32 ZMMWORD[256+rdi],zmm19 + vmovdqu32 ZMMWORD[320+rdi],zmm1 + vmovdqu32 ZMMWORD[384+rdi],zmm18 + vmovdqu32 ZMMWORD[448+rdi],zmm3 + + vpxord zmm0,zmm0,ZMMWORD[512+rsi] + vpxord zmm9,zmm9,ZMMWORD[576+rsi] + vpxord zmm6,zmm6,ZMMWORD[640+rsi] + vpxord zmm11,zmm11,ZMMWORD[704+rsi] + vmovdqu32 ZMMWORD[512+rdi],zmm0 + vmovdqu32 ZMMWORD[576+rdi],zmm9 + vmovdqu32 ZMMWORD[640+rdi],zmm6 + vmovdqu32 ZMMWORD[704+rdi],zmm11 + + vpxord zmm13,zmm13,ZMMWORD[768+rsi] + vpxord zmm10,zmm10,ZMMWORD[832+rsi] + vpxord zmm15,zmm15,ZMMWORD[896+rsi] + vpxord zmm12,zmm12,ZMMWORD[960+rsi] + lea rsi,[1024+rsi] + vmovdqu32 ZMMWORD[768+rdi],zmm13 + vmovdqu32 ZMMWORD[832+rdi],zmm10 + vmovdqu32 ZMMWORD[896+rdi],zmm15 + vmovdqu32 ZMMWORD[960+rdi],zmm12 + lea rdi,[1024+rdi] + + sub rdx,64*16 + jnz NEAR $L$oop_outer16x + + jmp NEAR $L$done16x + +ALIGN 32 +$L$tail16x: + xor r10,r10 + sub rdi,rsi + cmp rdx,64*1 + jb NEAR $L$ess_than_64_16x + vpxord zmm16,zmm16,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm16 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm17 + lea rsi,[64+rsi] + + cmp rdx,64*2 + jb NEAR $L$ess_than_64_16x + vpxord zmm17,zmm17,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm17 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm14 + lea rsi,[64+rsi] + + cmp rdx,64*3 + jb NEAR $L$ess_than_64_16x + vpxord zmm14,zmm14,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm14 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm8 + lea rsi,[64+rsi] + + cmp rdx,64*4 + jb NEAR $L$ess_than_64_16x + vpxord zmm8,zmm8,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm8 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm19 + lea rsi,[64+rsi] + + cmp rdx,64*5 + jb NEAR $L$ess_than_64_16x + vpxord zmm19,zmm19,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm19 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm1 + lea rsi,[64+rsi] + + cmp 
rdx,64*6 + jb NEAR $L$ess_than_64_16x + vpxord zmm1,zmm1,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm1 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm18 + lea rsi,[64+rsi] + + cmp rdx,64*7 + jb NEAR $L$ess_than_64_16x + vpxord zmm18,zmm18,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm18 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm3 + lea rsi,[64+rsi] + + cmp rdx,64*8 + jb NEAR $L$ess_than_64_16x + vpxord zmm3,zmm3,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm3 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm0 + lea rsi,[64+rsi] + + cmp rdx,64*9 + jb NEAR $L$ess_than_64_16x + vpxord zmm0,zmm0,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm0 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm9 + lea rsi,[64+rsi] + + cmp rdx,64*10 + jb NEAR $L$ess_than_64_16x + vpxord zmm9,zmm9,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm9 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm6 + lea rsi,[64+rsi] + + cmp rdx,64*11 + jb NEAR $L$ess_than_64_16x + vpxord zmm6,zmm6,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm6 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm11 + lea rsi,[64+rsi] + + cmp rdx,64*12 + jb NEAR $L$ess_than_64_16x + vpxord zmm11,zmm11,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm11 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm13 + lea rsi,[64+rsi] + + cmp rdx,64*13 + jb NEAR $L$ess_than_64_16x + vpxord zmm13,zmm13,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm13 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm10 + lea rsi,[64+rsi] + + cmp rdx,64*14 + jb NEAR $L$ess_than_64_16x + vpxord zmm10,zmm10,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm10 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm15 + lea rsi,[64+rsi] + + cmp rdx,64*15 + jb NEAR $L$ess_than_64_16x + vpxord zmm15,zmm15,ZMMWORD[rsi] + vmovdqu32 ZMMWORD[rsi*1+rdi],zmm15 + je NEAR $L$done16x + vmovdqa32 zmm16,zmm12 + lea rsi,[64+rsi] + +$L$ess_than_64_16x: + vmovdqa32 ZMMWORD[rsp],zmm16 + lea rdi,[rsi*1+rdi] + and rdx,63 + +$L$oop_tail16x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov 
BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail16x + + vpxord zmm16,zmm16,zmm16 + vmovdqa32 ZMMWORD[rsp],zmm16 + +$L$done16x: + vzeroall + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$16x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_16x: + +ALIGN 32 +ChaCha20_8xvl: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_8xvl: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +$L$ChaCha20_8xvl: + mov r9,rsp + + sub rsp,64+168 + and rsp,-64 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8xvl_body: + vzeroupper + + lea r10,[$L$sigma] + vbroadcasti128 ymm3,XMMWORD[r10] + vbroadcasti128 ymm7,XMMWORD[rcx] + vbroadcasti128 ymm11,XMMWORD[16+rcx] + vbroadcasti128 ymm15,XMMWORD[r8] + + vpshufd ymm0,ymm3,0x00 + vpshufd ymm1,ymm3,0x55 + vpshufd ymm2,ymm3,0xaa + vpshufd ymm3,ymm3,0xff + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm1 + vmovdqa64 ymm18,ymm2 + vmovdqa64 ymm19,ymm3 + + vpshufd ymm4,ymm7,0x00 + vpshufd ymm5,ymm7,0x55 + vpshufd ymm6,ymm7,0xaa + vpshufd ymm7,ymm7,0xff + vmovdqa64 ymm20,ymm4 + vmovdqa64 ymm21,ymm5 + vmovdqa64 ymm22,ymm6 + vmovdqa64 ymm23,ymm7 + + vpshufd ymm8,ymm11,0x00 + vpshufd ymm9,ymm11,0x55 + vpshufd ymm10,ymm11,0xaa + vpshufd ymm11,ymm11,0xff + vmovdqa64 ymm24,ymm8 + vmovdqa64 ymm25,ymm9 + 
vmovdqa64 ymm26,ymm10	; (ChaCha20_8xvl state setup, continued) ymm24..27 = saved words 8..11
+	vmovdqa64 ymm27,ymm11
+
+	vpshufd ymm12,ymm15,0x00	; splat the four dwords of the 16 bytes loaded from r8
+	vpshufd ymm13,ymm15,0x55
+	vpshufd ymm14,ymm15,0xaa
+	vpshufd ymm15,ymm15,0xff
+	vpaddd ymm12,ymm12,YMMWORD[$L$incy]	; word 12 += per-lane offsets taken from $L$incy
+	vmovdqa64 ymm28,ymm12	; ymm28..31 = saved words 12..15 (ymm28 is advanced every pass)
+	vmovdqa64 ymm29,ymm13
+	vmovdqa64 ymm30,ymm14
+	vmovdqa64 ymm31,ymm15
+
+	mov eax,10		; 10 iterations of the two-pass round loop
+	jmp NEAR $L$oop8xvl
+
+ALIGN 32
+; Re-entry for every pass after the first.  Words 0 and 1 (ymm0/ymm1) are NOT
+; reloaded here: that is done at the bottom of the previous pass, just before
+; the "sub rdx,64*8" that branches back to this label.
+$L$oop_outer8xvl:
+
+
+	vpbroadcastd ymm2,DWORD[8+r10]	; r10 still points at $L$sigma on this path
+	vpbroadcastd ymm3,DWORD[12+r10]
+	vpaddd ymm28,ymm28,YMMWORD[$L$eight]	; advance word 12 by 8 in every lane
+	vmovdqa64 ymm4,ymm20	; working words 4..15 <- saved copies
+	vmovdqa64 ymm5,ymm21
+	vmovdqa64 ymm6,ymm22
+	vmovdqa64 ymm7,ymm23
+	vmovdqa64 ymm8,ymm24
+	vmovdqa64 ymm9,ymm25
+	vmovdqa64 ymm10,ymm26
+	vmovdqa64 ymm11,ymm27
+	vmovdqa64 ymm12,ymm28
+	vmovdqa64 ymm13,ymm29
+	vmovdqa64 ymm14,ymm30
+	vmovdqa64 ymm15,ymm31
+
+	vmovdqa64 ymm16,ymm0	; refresh saved words 0..3 (ymm16..19 were clobbered by the transpose)
+	vmovdqa64 ymm17,ymm1
+	vmovdqa64 ymm18,ymm2
+	vmovdqa64 ymm19,ymm3
+
+	mov eax,10
+	jmp NEAR $L$oop8xvl
+
+ALIGN 32
+; One iteration = a pass over word groups (0,4,8,12) (1,5,9,13) (2,6,10,14)
+; (3,7,11,15), then a pass over (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14);
+; add/xor/rotate with rotate counts 16, 12, 8, 7.
+$L$oop8xvl:
+	vpaddd ymm0,ymm0,ymm4
+	vpaddd ymm1,ymm1,ymm5
+	vpaddd ymm2,ymm2,ymm6
+	vpaddd ymm3,ymm3,ymm7
+	vpxor ymm12,ymm12,ymm0
+	vpxor ymm13,ymm13,ymm1
+	vpxor ymm14,ymm14,ymm2
+	vpxor ymm15,ymm15,ymm3
+	vprold ymm12,ymm12,16
+	vprold ymm13,ymm13,16
+	vprold ymm14,ymm14,16
+	vprold ymm15,ymm15,16
+	vpaddd ymm8,ymm8,ymm12
+	vpaddd ymm9,ymm9,ymm13
+	vpaddd ymm10,ymm10,ymm14
+	vpaddd ymm11,ymm11,ymm15
+	vpxor ymm4,ymm4,ymm8
+	vpxor ymm5,ymm5,ymm9
+	vpxor ymm6,ymm6,ymm10
+	vpxor ymm7,ymm7,ymm11
+	vprold ymm4,ymm4,12
+	vprold ymm5,ymm5,12
+	vprold ymm6,ymm6,12
+	vprold ymm7,ymm7,12
+	vpaddd ymm0,ymm0,ymm4
+	vpaddd ymm1,ymm1,ymm5
+	vpaddd ymm2,ymm2,ymm6
+	vpaddd ymm3,ymm3,ymm7
+	vpxor ymm12,ymm12,ymm0
+	vpxor ymm13,ymm13,ymm1
+	vpxor ymm14,ymm14,ymm2
+	vpxor ymm15,ymm15,ymm3
+	vprold ymm12,ymm12,8
+	vprold ymm13,ymm13,8
+	vprold ymm14,ymm14,8
+	vprold ymm15,ymm15,8
+	vpaddd ymm8,ymm8,ymm12
+	vpaddd ymm9,ymm9,ymm13
+	vpaddd ymm10,ymm10,ymm14
+	vpaddd ymm11,ymm11,ymm15
+	vpxor ymm4,ymm4,ymm8
+	vpxor ymm5,ymm5,ymm9
+	vpxor ymm6,ymm6,ymm10
+	vpxor ymm7,ymm7,ymm11
+	vprold ymm4,ymm4,7
+	vprold ymm5,ymm5,7
+	vprold ymm6,ymm6,7
+	vprold ymm7,ymm7,7
+	vpaddd ymm0,ymm0,ymm5	; second pass starts here (shifted word grouping)
+	vpaddd ymm1,ymm1,ymm6
+	vpaddd ymm2,ymm2,ymm7
+	vpaddd ymm3,ymm3,ymm4
+	vpxor ymm15,ymm15,ymm0
+	vpxor ymm12,ymm12,ymm1
+	vpxor ymm13,ymm13,ymm2
+	vpxor ymm14,ymm14,ymm3
+	vprold ymm15,ymm15,16
+	vprold ymm12,ymm12,16
+	vprold ymm13,ymm13,16
+	vprold ymm14,ymm14,16
+	vpaddd ymm10,ymm10,ymm15
+	vpaddd ymm11,ymm11,ymm12
+	vpaddd ymm8,ymm8,ymm13
+	vpaddd ymm9,ymm9,ymm14
+	vpxor ymm5,ymm5,ymm10
+	vpxor ymm6,ymm6,ymm11
+	vpxor ymm7,ymm7,ymm8
+	vpxor ymm4,ymm4,ymm9
+	vprold ymm5,ymm5,12
+	vprold ymm6,ymm6,12
+	vprold ymm7,ymm7,12
+	vprold ymm4,ymm4,12
+	vpaddd ymm0,ymm0,ymm5
+	vpaddd ymm1,ymm1,ymm6
+	vpaddd ymm2,ymm2,ymm7
+	vpaddd ymm3,ymm3,ymm4
+	vpxor ymm15,ymm15,ymm0
+	vpxor ymm12,ymm12,ymm1
+	vpxor ymm13,ymm13,ymm2
+	vpxor ymm14,ymm14,ymm3
+	vprold ymm15,ymm15,8
+	vprold ymm12,ymm12,8
+	vprold ymm13,ymm13,8
+	vprold ymm14,ymm14,8
+	vpaddd ymm10,ymm10,ymm15
+	vpaddd ymm11,ymm11,ymm12
+	vpaddd ymm8,ymm8,ymm13
+	vpaddd ymm9,ymm9,ymm14
+	vpxor ymm5,ymm5,ymm10
+	vpxor ymm6,ymm6,ymm11
+	vpxor ymm7,ymm7,ymm8
+	vpxor ymm4,ymm4,ymm9
+	vprold ymm5,ymm5,7
+	vprold ymm6,ymm6,7
+	vprold ymm7,ymm7,7
+	vprold ymm4,ymm4,7
+	dec eax
+	jnz NEAR $L$oop8xvl
+
+	vpaddd ymm0,ymm0,ymm16	; feed-forward: add the saved input words 0..3
+	vpaddd ymm1,ymm1,ymm17
+	vpaddd ymm2,ymm2,ymm18
+	vpaddd ymm3,ymm3,ymm19
+
+	vpunpckldq ymm18,ymm0,ymm1	; transpose begins; ymm18/ymm19 are used as temporaries from here on
+	vpunpckldq ymm19,ymm2,ymm3
+	vpunpckhdq ymm0,ymm0,ymm1
+	vpunpckhdq ymm2,ymm2,ymm3
+	vpunpcklqdq ymm1,ymm18,ymm19
+	vpunpckhqdq ymm18,ymm18,ymm19
+	vpunpcklqdq ymm3,ymm0,ymm2
+	vpunpckhqdq ymm0,ymm0,ymm2
+	vpaddd ymm4,ymm4,ymm20	; feed-forward for words 4..7
+	vpaddd ymm5,ymm5,ymm21
+	vpaddd ymm6,ymm6,ymm22
+	vpaddd ymm7,ymm7,ymm23
+
+	vpunpckldq ymm2,ymm4,ymm5
+	vpunpckldq ymm19,ymm6,ymm7
+	vpunpckhdq ymm4,ymm4,ymm5
+	vpunpckhdq ymm6,ymm6,ymm7
+	vpunpcklqdq ymm5,ymm2,ymm19
+	vpunpckhqdq ymm2,ymm2,ymm19
+	vpunpcklqdq ymm7,ymm4,ymm6
+	vpunpckhqdq ymm4,ymm4,ymm6
+	vshufi32x4 ymm19,ymm1,ymm5,0	; 128-bit lane combine; the EVEX form is used because ymm18/ymm19 are involved
+	vshufi32x4 ymm5,ymm1,ymm5,3
+	vshufi32x4 ymm1,ymm18,ymm2,0
+	vshufi32x4 ymm2,ymm18,ymm2,3
+	vshufi32x4 ymm18,ymm3,ymm7,0
+	vshufi32x4 ymm7,ymm3,ymm7,3
+	vshufi32x4 ymm3,ymm0,ymm4,0
+	vshufi32x4 ymm4,ymm0,ymm4,3
+	vpaddd ymm8,ymm8,ymm24	; feed-forward for words 8..11
+	vpaddd ymm9,ymm9,ymm25
+	vpaddd ymm10,ymm10,ymm26
+	vpaddd ymm11,ymm11,ymm27
+
+	vpunpckldq ymm6,ymm8,ymm9
+	vpunpckldq ymm0,ymm10,ymm11
+	vpunpckhdq ymm8,ymm8,ymm9
+	vpunpckhdq ymm10,ymm10,ymm11
+	vpunpcklqdq ymm9,ymm6,ymm0
+	vpunpckhqdq ymm6,ymm6,ymm0
+	vpunpcklqdq ymm11,ymm8,ymm10
+	vpunpckhqdq ymm8,ymm8,ymm10
+	vpaddd ymm12,ymm12,ymm28	; feed-forward for words 12..15
+	vpaddd ymm13,ymm13,ymm29
+	vpaddd ymm14,ymm14,ymm30
+	vpaddd ymm15,ymm15,ymm31
+
+	vpunpckldq ymm10,ymm12,ymm13
+	vpunpckldq ymm0,ymm14,ymm15
+	vpunpckhdq ymm12,ymm12,ymm13
+	vpunpckhdq ymm14,ymm14,ymm15
+	vpunpcklqdq ymm13,ymm10,ymm0
+	vpunpckhqdq ymm10,ymm10,ymm0
+	vpunpcklqdq ymm15,ymm12,ymm14
+	vpunpckhqdq ymm12,ymm12,ymm14
+	vperm2i128 ymm0,ymm9,ymm13,0x20	; same lane combine for the upper half; all operands are ymm0..15
+	vperm2i128 ymm13,ymm9,ymm13,0x31
+	vperm2i128 ymm9,ymm6,ymm10,0x20
+	vperm2i128 ymm10,ymm6,ymm10,0x31
+	vperm2i128 ymm6,ymm11,ymm15,0x20
+	vperm2i128 ymm15,ymm11,ymm15,0x31
+	vperm2i128 ymm11,ymm8,ymm12,0x20
+	vperm2i128 ymm12,ymm8,ymm12,0x31
+	cmp rdx,64*8		; fewer than 512 bytes left -> partial handling
+	jb NEAR $L$tail8xvl
+
+	mov eax,0x80		; pointer stride: both pointers advance 128 bytes per group of four registers
+	vpxord ymm19,ymm19,YMMWORD[rsi]	; full pass: XOR 512 bytes in register order 19,0,5,13,1,9,2,10,18,6,7,15,3,11,4,12
+	vpxor ymm0,ymm0,YMMWORD[32+rsi]
+	vpxor ymm5,ymm5,YMMWORD[64+rsi]
+	vpxor ymm13,ymm13,YMMWORD[96+rsi]
+	lea rsi,[rax*1+rsi]
+	vmovdqu32 YMMWORD[rdi],ymm19
+	vmovdqu YMMWORD[32+rdi],ymm0
+	vmovdqu YMMWORD[64+rdi],ymm5
+	vmovdqu YMMWORD[96+rdi],ymm13
+	lea rdi,[rax*1+rdi]
+
+	vpxor ymm1,ymm1,YMMWORD[rsi]
+	vpxor ymm9,ymm9,YMMWORD[32+rsi]
+	vpxor ymm2,ymm2,YMMWORD[64+rsi]
+	vpxor ymm10,ymm10,YMMWORD[96+rsi]
+	lea rsi,[rax*1+rsi]
+	vmovdqu YMMWORD[rdi],ymm1
+	vmovdqu YMMWORD[32+rdi],ymm9
+	vmovdqu YMMWORD[64+rdi],ymm2
+	vmovdqu YMMWORD[96+rdi],ymm10
+	lea rdi,[rax*1+rdi]
+
+	vpxord ymm18,ymm18,YMMWORD[rsi]
+	vpxor ymm6,ymm6,YMMWORD[32+rsi]
+	vpxor ymm7,ymm7,YMMWORD[64+rsi]
+	vpxor ymm15,ymm15,YMMWORD[96+rsi]
+	lea rsi,[rax*1+rsi]
+	vmovdqu32 YMMWORD[rdi],ymm18
+	vmovdqu YMMWORD[32+rdi],ymm6
+	vmovdqu YMMWORD[64+rdi],ymm7
+	vmovdqu YMMWORD[96+rdi],ymm15
+	lea rdi,[rax*1+rdi]
+
+	vpxor ymm3,ymm3,YMMWORD[rsi]
+	vpxor ymm11,ymm11,YMMWORD[32+rsi]
+	vpxor ymm4,ymm4,YMMWORD[64+rsi]
+	vpxor ymm12,ymm12,YMMWORD[96+rsi]
+	lea rsi,[rax*1+rsi]
+	vmovdqu YMMWORD[rdi],ymm3
+	vmovdqu YMMWORD[32+rdi],ymm11
+	vmovdqu YMMWORD[64+rdi],ymm4
+	vmovdqu YMMWORD[96+rdi],ymm12
+	lea rdi,[rax*1+rdi]
+
+	vpbroadcastd ymm0,DWORD[r10]	; pre-load words 0 and 1 for the next pass ($L$oop_outer8xvl expects them)
+	vpbroadcastd ymm1,DWORD[4+r10]
+
+	sub rdx,64*8
+	jnz NEAR $L$oop_outer8xvl	; more data: run another 512-byte pass
+
+	jmp NEAR $L$done8xvl
+
+ALIGN 32
+; Tail: fewer than 512 bytes remain.  Each 64-byte chunk is a register pair;
+; after a chunk is written the next pair is copied into ymm8/ymm0 so that
+; $L$ess_than_64_8xvl always spills ymm8/ymm0.  Each "je" tests the flags of
+; the preceding "cmp"; the vector instructions in between leave flags alone.
+$L$tail8xvl:
+	vmovdqa64 ymm8,ymm19	; first half of chunk 0 is copied down into ymm8
+	xor r10,r10		; r10 becomes the byte index for $L$oop_tail8xvl
+	sub rdi,rsi		; rdi = dst - src, so [rsi+rdi] addresses the output
+	cmp rdx,64*1
+	jb NEAR $L$ess_than_64_8xvl
+	vpxor ymm8,ymm8,YMMWORD[rsi]
+	vpxor ymm0,ymm0,YMMWORD[32+rsi]
+	vmovdqu YMMWORD[rsi*1+rdi],ymm8
+	vmovdqu YMMWORD[32+rsi*1+rdi],ymm0
+	je NEAR $L$done8xvl	; exactly 64 bytes were left
+	vmovdqa ymm8,ymm5
+	vmovdqa ymm0,ymm13
+	lea rsi,[64+rsi]
+
+	cmp rdx,64*2
+	jb NEAR $L$ess_than_64_8xvl
+	vpxor ymm5,ymm5,YMMWORD[rsi]
+	vpxor ymm13,ymm13,YMMWORD[32+rsi]
+	vmovdqu YMMWORD[rsi*1+rdi],ymm5
+	vmovdqu YMMWORD[32+rsi*1+rdi],ymm13
+	je NEAR $L$done8xvl
+	vmovdqa ymm8,ymm1
+	vmovdqa ymm0,ymm9
+	lea rsi,[64+rsi]
+
+	cmp rdx,64*3
+	jb NEAR $L$ess_than_64_8xvl
+	vpxor ymm1,ymm1,YMMWORD[rsi]
+	vpxor ymm9,ymm9,YMMWORD[32+rsi]
+	vmovdqu YMMWORD[rsi*1+rdi],ymm1
+	vmovdqu YMMWORD[32+rsi*1+rdi],ymm9
+	je NEAR $L$done8xvl
+	vmovdqa ymm8,ymm2
+	vmovdqa ymm0,ymm10
+	lea rsi,[64+rsi]
+
+	cmp rdx,64*4
+	jb NEAR $L$ess_than_64_8xvl
+	vpxor ymm2,ymm2,YMMWORD[rsi]
+	vpxor ymm10,ymm10,YMMWORD[32+rsi]
+	vmovdqu YMMWORD[rsi*1+rdi],ymm2
+	vmovdqu YMMWORD[32+rsi*1+rdi],ymm10
+	je NEAR $L$done8xvl
+	vmovdqa32 ymm8,ymm18
+	vmovdqa ymm0,ymm6
+	lea rsi,[64+rsi]
+
+	cmp rdx,64*5
+	jb NEAR $L$ess_than_64_8xvl
+	vpxord ymm18,ymm18,YMMWORD[rsi]
+	vpxor ymm6,ymm6,YMMWORD[32+rsi]
+	vmovdqu32 YMMWORD[rsi*1+rdi],ymm18
+	vmovdqu YMMWORD[32+rsi*1+rdi],ymm6
+	je NEAR $L$done8xvl
+	vmovdqa ymm8,ymm7
+	vmovdqa ymm0,ymm15
+	lea rsi,[64+rsi]
+
+	cmp rdx,64*6
+	jb NEAR $L$ess_than_64_8xvl
+	vpxor ymm7,ymm7,YMMWORD[rsi]
+	vpxor ymm15,ymm15,YMMWORD[32+rsi]
+	vmovdqu YMMWORD[rsi*1+rdi],ymm7
+	vmovdqu YMMWORD[32+rsi*1+rdi],ymm15
+	je NEAR $L$done8xvl
+	vmovdqa ymm8,ymm3
+	vmovdqa ymm0,ymm11
+	lea rsi,[64+rsi]
+
+	cmp rdx,64*7
+	jb NEAR $L$ess_than_64_8xvl
+	vpxor ymm3,ymm3,YMMWORD[rsi]
+	vpxor ymm11,ymm11,YMMWORD[32+rsi]
+	vmovdqu YMMWORD[rsi*1+rdi],ymm3
+	vmovdqu YMMWORD[32+rsi*1+rdi],ymm11
+	je NEAR $L$done8xvl
+	vmovdqa ymm8,ymm4
+	vmovdqa ymm0,ymm12
+	lea rsi,[64+rsi]
+
+$L$ess_than_64_8xvl:
+	vmovdqa YMMWORD[rsp],ymm8	; spill 64 keystream bytes to the aligned scratch area
+	vmovdqa YMMWORD[32+rsp],ymm0
+	lea rdi,[rsi*1+rdi]	; undo "sub rdi,rsi": rdi is an absolute output pointer again
+	and rdx,63		; rdx = bytes left in the final partial chunk
+
+$L$oop_tail8xvl:		; byte-wise XOR of the partial chunk
+	movzx eax,BYTE[r10*1+rsi]
+	movzx ecx,BYTE[r10*1+rsp]
+	lea r10,[1+r10]
+	xor eax,ecx
+	mov BYTE[((-1))+r10*1+rdi],al
+	dec rdx
+	jnz NEAR $L$oop_tail8xvl
+
+	vpxor ymm8,ymm8,ymm8
+	vmovdqa YMMWORD[rsp],ymm8	; overwrite the spilled keystream on the stack with zeros
+	vmovdqa YMMWORD[32+rsp],ymm8
+
+$L$done8xvl:
+	vzeroall
+	movaps xmm6,XMMWORD[((-168))+r9]	; restore the callee-saved xmm registers
+	movaps xmm7,XMMWORD[((-152))+r9]
+	movaps xmm8,XMMWORD[((-136))+r9]
+	movaps xmm9,XMMWORD[((-120))+r9]
+	movaps xmm10,XMMWORD[((-104))+r9]
+	movaps xmm11,XMMWORD[((-88))+r9]
+	movaps xmm12,XMMWORD[((-72))+r9]
+	movaps xmm13,XMMWORD[((-56))+r9]
+	movaps xmm14,XMMWORD[((-40))+r9]
+	movaps xmm15,XMMWORD[((-24))+r9]
+	lea rsp,[r9]		; back to the entry stack pointer
+
+$L$8xvl_epilogue:
+	mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+	mov rsi,QWORD[16+rsp]
+	DB 0F3h,0C3h ;repret
+
+$L$SEH_end_ChaCha20_8xvl:
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+; ---------------------------------------------------------------------
+; se_handler -- exception handler named by $L$SEH_info_ChaCha20_ctr32.
+; As used below, r8 points at the context record of the interrupted code
+; and r9 at the dispatcher record.  The handler repairs the saved
+; non-volatile registers in that context according to where the fault
+; address lies, then continues in $L$common_seh_tail.
+; NOTE(review): the numeric offsets are raw CONTEXT/DISPATCHER_CONTEXT field
+; offsets; the field names in the comments are inferred from how each slot
+; is used here -- confirm against winnt.h.
+; ---------------------------------------------------------------------
+se_handler:
+	push rsi
+	push rdi
+	push rbx
+	push rbp
+	push r12
+	push r13
+	push r14
+	push r15
+	pushfq
+	sub rsp,64		; outgoing-argument area for the RtlVirtualUnwind call
+
+	mov rax,QWORD[120+r8]	; context Rax slot: the guarded routine copies its entry rsp into rax first
+	mov rbx,QWORD[248+r8]	; context Rip slot: address that faulted
+
+	mov rsi,QWORD[8+r9]	; dispatcher field at +8 (image base -- unused on this path)
+	mov r11,QWORD[56+r9]	; dispatcher field at +56 (handler data -- unused on this path)
+
+	lea r10,[$L$ctr32_body]
+	cmp rbx,r10		; fault before the frame was built?
+	jb NEAR $L$common_seh_tail	; yes: rax already equals the entry rsp
+
+	mov rax,QWORD[152+r8]	; context Rsp slot
+
+	lea r10,[$L$no_data]
+	cmp rbx,r10		; fault at or past $L$no_data?
+	jae NEAR $L$common_seh_tail
+
+	lea rax,[((64+24+48))+rax]	; skip the 64+24 bytes of locals and the six pushed registers (48 bytes)
+
+	mov rbx,QWORD[((-8))+rax]	; reload the pushed registers in push order: rbx, rbp, r12, r13, ...
+	mov rbp,QWORD[((-16))+rax]
+	mov r12,QWORD[((-24))+rax]
+	mov r13,QWORD[((-32))+rax]
+	mov
r14,QWORD[((-40))+rax]	; (se_handler, continued) last two pushed registers
+	mov r15,QWORD[((-48))+rax]
+	mov QWORD[144+r8],rbx	; write the recovered values into the context record:
+	mov QWORD[160+r8],rbp	;   +144 <- rbx, +160 <- rbp,
+	mov QWORD[216+r8],r12	;   +216/+224/+232/+240 <- r12/r13/r14/r15
+	mov QWORD[224+r8],r13
+	mov QWORD[232+r8],r14
+	mov QWORD[240+r8],r15
+
+; Shared tail of both handlers.  On entry rax = stack pointer value of the
+; interrupted routine at its entry, r8 = context record, r9 = dispatcher record.
+; NOTE(review): structure field names in the comments below are inferred from
+; usage; confirm the offsets against winnt.h.
+$L$common_seh_tail:
+	mov rdi,QWORD[8+rax]	; rdi/rsi as parked by the ";WIN64 prologue" stores at [8+rsp]/[16+rsp]
+	mov rsi,QWORD[16+rax]
+	mov QWORD[152+r8],rax	; context +152 <- recovered stack pointer
+	mov QWORD[168+r8],rsi	; context +168 <- rsi
+	mov QWORD[176+r8],rdi	; context +176 <- rdi
+
+	mov rdi,QWORD[40+r9]	; destination: record pointed to by dispatcher field +40
+	mov rsi,r8		; source: the context record just patched
+	mov ecx,154		; 154 qwords = 1232 bytes
+	DD 0xa548f3fc		; bytes FC F3 48 A5 = cld ; rep movsq
+
+	mov rsi,r9		; rsi = dispatcher record (r9 is about to be reused as an argument)
+	xor rcx,rcx		; arg 1 = 0
+	mov rdx,QWORD[8+rsi]	; arg 2 = dispatcher field +8
+	mov r8,QWORD[rsi]	; arg 3 = dispatcher field +0
+	mov r9,QWORD[16+rsi]	; arg 4 = dispatcher field +16
+	mov r10,QWORD[40+rsi]	; arg 5 = dispatcher field +40 (the record filled by the copy above)
+	lea r11,[56+rsi]	; arg 6 = address of dispatcher field +56
+	lea r12,[24+rsi]	; arg 7 = address of dispatcher field +24
+	mov QWORD[32+rsp],r10	; stack arguments start above the 32-byte shadow space
+	mov QWORD[40+rsp],r11
+	mov QWORD[48+rsp],r12
+	mov QWORD[56+rsp],rcx	; arg 8 = 0
+	call QWORD[__imp_RtlVirtualUnwind]
+
+	mov eax,1		; handler result 1 (presumably ExceptionContinueSearch -- confirm)
+	add rsp,64
+	popfq
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rbp
+	pop rbx
+	pop rdi
+	pop rsi
+	DB 0F3h,0C3h ;repret
+
+
+
+ALIGN 16
+; ---------------------------------------------------------------------
+; simd_handler -- exception handler shared by the routines whose .xdata
+; record carries three extra dwords: { body RVA, epilogue RVA, size in bytes
+; of the xmm save area }.  If the fault lies between body and epilogue, the
+; saved xmm registers are copied from the routine's frame into the context
+; record (starting at context +512) before joining $L$common_seh_tail.
+; r8 = context record, r9 = dispatcher record, as in se_handler.
+; ---------------------------------------------------------------------
+simd_handler:
+	push rsi
+	push rdi
+	push rbx
+	push rbp
+	push r12
+	push r13
+	push r14
+	push r15
+	pushfq
+	sub rsp,64		; outgoing-argument area used by $L$common_seh_tail
+
+	mov rax,QWORD[120+r8]	; context Rax slot: entry rsp while still in the prologue
+	mov rbx,QWORD[248+r8]	; context Rip slot: address that faulted
+
+	mov rsi,QWORD[8+r9]	; image base (added to the RVAs below)
+	mov r11,QWORD[56+r9]	; pointer to this routine's handler data dwords
+
+	mov r10d,DWORD[r11]	; HandlerData[0] = body label RVA
+	lea r10,[r10*1+rsi]
+	cmp rbx,r10		; fault before the body label?
+	jb NEAR $L$common_seh_tail	; yes: nothing saved yet, rax is the entry rsp
+
+	mov rax,QWORD[192+r8]	; context R9 slot: the guarded routines keep their entry rsp in r9
+
+	mov r10d,DWORD[4+r11]	; HandlerData[1] = epilogue label RVA
+	mov ecx,DWORD[8+r11]	; HandlerData[2] = xmm save area size in bytes
+	lea r10,[r10*1+rsi]
+	cmp rbx,r10		; fault at or past the epilogue label?
+	jae NEAR $L$common_seh_tail	; yes: xmm registers and rsp were already restored
+
+	neg rcx
+	lea rsi,[((-8))+rcx*1+rax]	; rsi = entry rsp - 8 - size = lowest saved xmm slot (e.g. -168 for size 0xa0)
+	lea rdi,[512+r8]	; destination inside the context record
+	neg ecx
+	shr ecx,3		; byte count -> qword count
+	DD 0xa548f3fc		; bytes FC F3 48 A5 = cld ; rep movsq
+
+	jmp NEAR $L$common_seh_tail
+
+
+section .pdata rdata align=4
+ALIGN 4
+; One { begin RVA, end RVA, unwind-info RVA } triple per routine.
+	DD $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase
+
+	DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase
+
+	DD $L$SEH_begin_ChaCha20_128 wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_128 wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_128 wrt ..imagebase
+
+	DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_4x wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_4x wrt ..imagebase
+	DD $L$SEH_begin_ChaCha20_4xop wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_4xop wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_4xop wrt ..imagebase
+	DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_8x wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_8x wrt ..imagebase
+	DD $L$SEH_begin_ChaCha20_avx512 wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_avx512 wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_avx512 wrt ..imagebase
+
+	DD $L$SEH_begin_ChaCha20_avx512vl wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_avx512vl wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_avx512vl wrt ..imagebase
+
+	DD $L$SEH_begin_ChaCha20_16x wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_16x wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_16x wrt ..imagebase
+
+	DD $L$SEH_begin_ChaCha20_8xvl wrt ..imagebase
+	DD $L$SEH_end_ChaCha20_8xvl wrt ..imagebase
+	DD $L$SEH_info_ChaCha20_8xvl wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+; Unwind-info records.  Each starts with the header bytes 9,0,0,0 followed by
+; the handler RVA; records that name simd_handler add the three dwords it
+; reads (body RVA, epilogue RVA, xmm save area size) and a trailing 0.
+; NOTE(review): header byte 9 is presumably version 1 with the
+; exception-handler flag set -- confirm against the UNWIND_INFO definition.
+$L$SEH_info_ChaCha20_ctr32:
+DB 9,0,0,0
+	DD se_handler wrt ..imagebase
+
+$L$SEH_info_ChaCha20_ssse3:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
+	DD 0x20,0
+
+$L$SEH_info_ChaCha20_128:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$128_body wrt ..imagebase,$L$128_epilogue wrt ..imagebase
+	DD 0x60,0
+
+$L$SEH_info_ChaCha20_4x:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
+	DD 0xa0,0
+$L$SEH_info_ChaCha20_4xop:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$4xop_body wrt ..imagebase,$L$4xop_epilogue wrt ..imagebase
+	DD 0xa0,0
+$L$SEH_info_ChaCha20_8x:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
+	DD 0xa0,0
+$L$SEH_info_ChaCha20_avx512:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$avx512_body wrt ..imagebase,$L$avx512_epilogue wrt ..imagebase
+	DD 0x20,0
+
+$L$SEH_info_ChaCha20_avx512vl:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$avx512vl_body wrt ..imagebase,$L$avx512vl_epilogue wrt ..imagebase
+	DD 0x20,0		; 0x20 bytes: two xmm registers (xmm6, xmm7) saved
+
+$L$SEH_info_ChaCha20_16x:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$16x_body wrt ..imagebase,$L$16x_epilogue wrt ..imagebase
+	DD 0xa0,0		; 0xa0 bytes: ten xmm registers (xmm6..xmm15) saved
+
+$L$SEH_info_ChaCha20_8xvl:
+DB 9,0,0,0
+	DD simd_handler wrt ..imagebase
+	DD $L$8xvl_body wrt ..imagebase,$L$8xvl_epilogue wrt ..imagebase
+	DD 0xa0,0 |