Diffstat (limited to 'deps/openssl/config/archs/darwin-i386-cc/asm_avx2/crypto/poly1305/poly1305-x86.s')
-rw-r--r-- | deps/openssl/config/archs/darwin-i386-cc/asm_avx2/crypto/poly1305/poly1305-x86.s | 1898 |
1 file changed, 1898 insertions, 0 deletions
diff --git a/deps/openssl/config/archs/darwin-i386-cc/asm_avx2/crypto/poly1305/poly1305-x86.s b/deps/openssl/config/archs/darwin-i386-cc/asm_avx2/crypto/poly1305/poly1305-x86.s
new file mode 100644
index 0000000000..cf6f78b2eb
--- /dev/null
+++ b/deps/openssl/config/archs/darwin-i386-cc/asm_avx2/crypto/poly1305/poly1305-x86.s
@@ -0,0 +1,1898 @@
+.text
+.align 6,0x90
+.globl _poly1305_init
+.align 4
+_poly1305_init:
+L_poly1305_init_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ebp
+ xorl %eax,%eax
+ movl %eax,(%edi)
+ movl %eax,4(%edi)
+ movl %eax,8(%edi)
+ movl %eax,12(%edi)
+ movl %eax,16(%edi)
+ movl %eax,20(%edi)
+ cmpl $0,%esi
+ je L000nokey
+ call L001pic_point
+L001pic_point:
+ popl %ebx
+ leal _poly1305_blocks-L001pic_point(%ebx),%eax
+ leal _poly1305_emit-L001pic_point(%ebx),%edx
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001pic_point(%ebx),%edi
+ movl (%edi),%ecx
+ andl $83886080,%ecx
+ cmpl $83886080,%ecx
+ jne L002no_sse2
+ leal __poly1305_blocks_sse2-L001pic_point(%ebx),%eax
+ leal __poly1305_emit_sse2-L001pic_point(%ebx),%edx
+ movl 8(%edi),%ecx
+ testl $32,%ecx
+ jz L002no_sse2
+ leal __poly1305_blocks_avx2-L001pic_point(%ebx),%eax
+L002no_sse2:
+ movl 20(%esp),%edi
+ movl %eax,(%ebp)
+ movl %edx,4(%ebp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ andl $268435455,%eax
+ andl $268435452,%ebx
+ andl $268435452,%ecx
+ andl $268435452,%edx
+ movl %eax,24(%edi)
+ movl %ebx,28(%edi)
+ movl %ecx,32(%edi)
+ movl %edx,36(%edi)
+ movl $1,%eax
+L000nokey:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _poly1305_blocks
+.align 4
+_poly1305_blocks:
+L_poly1305_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+Lenter_blocks:
+ andl $-15,%ecx
+ jz L003nodata
+ subl $64,%esp
+ movl 24(%edi),%eax
+ movl 28(%edi),%ebx
+ leal (%esi,%ecx,1),%ebp
+ movl 32(%edi),%ecx
+ movl 36(%edi),%edx
+ movl %ebp,92(%esp)
+ movl %esi,%ebp
+ movl %eax,36(%esp)
+ movl %ebx,%eax
+ shrl $2,%eax
+ movl %ebx,40(%esp)
+ addl %ebx,%eax
+ movl %ecx,%ebx
+ shrl $2,%ebx
+ movl %ecx,44(%esp)
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrl $2,%ecx
+ movl %edx,48(%esp)
+ addl %edx,%ecx
+ movl %eax,52(%esp)
+ movl %ebx,56(%esp)
+ movl %ecx,60(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%esi
+ movl 16(%edi),%edi
+ jmp L004loop
+.align 5,0x90
+L004loop:
+ addl (%ebp),%eax
+ adcl 4(%ebp),%ebx
+ adcl 8(%ebp),%ecx
+ adcl 12(%ebp),%esi
+ leal 16(%ebp),%ebp
+ adcl 96(%esp),%edi
+ movl %eax,(%esp)
+ movl %esi,12(%esp)
+ mull 36(%esp)
+ movl %edi,16(%esp)
+ movl %eax,%edi
+ movl %ebx,%eax
+ movl %edx,%esi
+ mull 60(%esp)
+ addl %eax,%edi
+ movl %ecx,%eax
+ adcl %edx,%esi
+ mull 56(%esp)
+ addl %eax,%edi
+ movl 12(%esp),%eax
+ adcl %edx,%esi
+ mull 52(%esp)
+ addl %eax,%edi
+ movl (%esp),%eax
+ adcl %edx,%esi
+ mull 40(%esp)
+ movl %edi,20(%esp)
+ xorl %edi,%edi
+ addl %eax,%esi
+ movl %ebx,%eax
+ adcl %edx,%edi
+ mull 36(%esp)
+ addl %eax,%esi
+ movl %ecx,%eax
+ adcl %edx,%edi
+ mull 60(%esp)
+ addl %eax,%esi
+ movl 12(%esp),%eax
+ adcl %edx,%edi
+ mull 56(%esp)
+ addl %eax,%esi
+ movl 16(%esp),%eax
+ adcl %edx,%edi
+ imull 52(%esp),%eax
+ addl %eax,%esi
+ movl (%esp),%eax
+ adcl $0,%edi
+ mull 44(%esp)
+ movl %esi,24(%esp)
+ xorl %esi,%esi
+ addl %eax,%edi
+ movl %ebx,%eax
+ adcl %edx,%esi
+ mull 40(%esp)
+ addl %eax,%edi
+ movl %ecx,%eax
+ adcl %edx,%esi
+ mull 36(%esp)
+ addl %eax,%edi
+ movl 12(%esp),%eax
+ adcl %edx,%esi
+ mull 60(%esp)
+ addl %eax,%edi
+ movl 16(%esp),%eax
+ adcl %edx,%esi
+ imull 56(%esp),%eax
+ addl %eax,%edi
+ movl (%esp),%eax
+ adcl $0,%esi
+ mull 48(%esp)
+ movl %edi,28(%esp)
+ xorl %edi,%edi
+ addl %eax,%esi
+ movl %ebx,%eax
+ adcl %edx,%edi
+ mull 44(%esp)
+ addl %eax,%esi
+ movl %ecx,%eax
+ adcl %edx,%edi
+ mull 40(%esp)
+ addl %eax,%esi
+ movl 12(%esp),%eax
+ adcl %edx,%edi
+ mull 36(%esp)
+ addl %eax,%esi
+ movl 16(%esp),%ecx
+ adcl %edx,%edi
+ movl %ecx,%edx
+ imull 60(%esp),%ecx
+ addl %ecx,%esi
+ movl 20(%esp),%eax
+ adcl $0,%edi
+ imull 36(%esp),%edx
+ addl %edi,%edx
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ andl $3,%edi
+ leal (%edx,%edx,4),%edx
+ addl %edx,%eax
+ adcl $0,%ebx
+ adcl $0,%ecx
+ adcl $0,%esi
+ adcl $0,%edi
+ cmpl 92(%esp),%ebp
+ jne L004loop
+ movl 84(%esp),%edx
+ addl $64,%esp
+ movl %eax,(%edx)
+ movl %ebx,4(%edx)
+ movl %ecx,8(%edx)
+ movl %esi,12(%edx)
+ movl %edi,16(%edx)
+L003nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _poly1305_emit
+.align 4
+_poly1305_emit:
+L_poly1305_emit_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+Lenter_emit:
+ movl 24(%esp),%edi
+ movl (%ebp),%eax
+ movl 4(%ebp),%ebx
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+ movl 16(%ebp),%esi
+ addl $5,%eax
+ adcl $0,%ebx
+ adcl $0,%ecx
+ adcl $0,%edx
+ adcl $0,%esi
+ shrl $2,%esi
+ negl %esi
+ andl %esi,%eax
+ andl %esi,%ebx
+ andl %esi,%ecx
+ andl %esi,%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ notl %esi
+ movl (%ebp),%eax
+ movl 4(%ebp),%ebx
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+ movl 28(%esp),%ebp
+ andl %esi,%eax
+ andl %esi,%ebx
+ andl %esi,%ecx
+ andl %esi,%edx
+ orl (%edi),%eax
+ orl 4(%edi),%ebx
+ orl 8(%edi),%ecx
+ orl 12(%edi),%edx
+ addl (%ebp),%eax
+ adcl 4(%ebp),%ebx
+ adcl 8(%ebp),%ecx
+ adcl 12(%ebp),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 5,0x90
+.align 4
+__poly1305_init_sse2:
+ movdqu 24(%edi),%xmm4
+ leal 48(%edi),%edi
+ movl %esp,%ebp
+ subl $224,%esp
+ andl $-16,%esp
+ movq 64(%ebx),%xmm7
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ movdqa %xmm4,%xmm2
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm1
+ psrldq $6,%xmm2
+ pand %xmm7,%xmm1
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ psrldq $13,%xmm4
+ leal 144(%esp),%edx
+ movl $2,%ecx
+L005square:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm1,%xmm6
+ movdqa %xmm2,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm1,%xmm6
+ paddd %xmm2,%xmm5
+ movdqa %xmm6,80(%esp)
+ movdqa %xmm5,96(%esp)
+ movdqa %xmm3,%xmm6
+ movdqa %xmm4,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm3,%xmm6
+ paddd %xmm4,%xmm5
+ movdqa %xmm6,112(%esp)
+ movdqa %xmm5,128(%esp)
+ pshufd $68,%xmm0,%xmm6
+ movdqa %xmm1,%xmm5
+ pshufd $68,%xmm1,%xmm1
+ pshufd $68,%xmm2,%xmm2
+ pshufd $68,%xmm3,%xmm3
+ pshufd $68,%xmm4,%xmm4
+ movdqa %xmm6,(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa %xmm3,48(%edx)
+ movdqa %xmm4,64(%edx)
+ pmuludq %xmm0,%xmm4
+ pmuludq %xmm0,%xmm3
+ pmuludq %xmm0,%xmm2
+ pmuludq %xmm0,%xmm1
+ pmuludq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%edx),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa 80(%esp),%xmm6
+ pmuludq (%edx),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%edx),%xmm6
+ movdqa 32(%esp),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa 96(%esp),%xmm7
+ pmuludq (%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%edx),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%edx),%xmm5
+ movdqa 48(%esp),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa 112(%esp),%xmm5
+ pmuludq (%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%edx),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%edx),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%edx),%xmm7
+ movdqa 64(%esp),%xmm5
+ paddq %xmm6,%xmm1
+ movdqa 128(%esp),%xmm6
+ pmuludq (%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%edx),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ decl %ecx
+ jz L006square_break
+ punpcklqdq (%esp),%xmm0
+ punpcklqdq 16(%esp),%xmm1
+ punpcklqdq 32(%esp),%xmm2
+ punpcklqdq 48(%esp),%xmm3
+ punpcklqdq 64(%esp),%xmm4
+ jmp L005square
+L006square_break:
+ psllq $32,%xmm0
+ psllq $32,%xmm1
+ psllq $32,%xmm2
+ psllq $32,%xmm3
+ psllq $32,%xmm4
+ por (%esp),%xmm0
+ por 16(%esp),%xmm1
+ por 32(%esp),%xmm2
+ por 48(%esp),%xmm3
+ por 64(%esp),%xmm4
+ pshufd $141,%xmm0,%xmm0
+ pshufd $141,%xmm1,%xmm1
+ pshufd $141,%xmm2,%xmm2
+ pshufd $141,%xmm3,%xmm3
+ pshufd $141,%xmm4,%xmm4
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ movdqu %xmm4,64(%edi)
+ movdqa %xmm1,%xmm6
+ movdqa %xmm2,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm1,%xmm6
+ paddd %xmm2,%xmm5
+ movdqu %xmm6,80(%edi)
+ movdqu %xmm5,96(%edi)
+ movdqa %xmm3,%xmm6
+ movdqa %xmm4,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm3,%xmm6
+ paddd %xmm4,%xmm5
+ movdqu %xmm6,112(%edi)
+ movdqu %xmm5,128(%edi)
+ movl %ebp,%esp
+ leal -48(%edi),%edi
+ ret
+.align 5,0x90
+.align 4
+__poly1305_blocks_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 20(%edi),%eax
+ andl $-16,%ecx
+ jz L007nodata
+ cmpl $64,%ecx
+ jae L008enter_sse2
+ testl %eax,%eax
+ jz Lenter_blocks
+.align 4,0x90
+L008enter_sse2:
+ call L009pic_point
+L009pic_point:
+ popl %ebx
+ leal Lconst_sse2-L009pic_point(%ebx),%ebx
+ testl %eax,%eax
+ jnz L010base2_26
+ call __poly1305_init_sse2
+ movl (%edi),%eax
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ movl $1,20(%edi)
+ shrl $2,%ecx
+ andl $67108863,%eax
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movd %eax,%xmm0
+ movd %ecx,%xmm1
+ movd %edx,%xmm2
+ movd %esi,%xmm3
+ movd %ebp,%xmm4
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ jmp L011base2_32
+.align 4,0x90
+L010base2_26:
+ movd (%edi),%xmm0
+ movd 4(%edi),%xmm1
+ movd 8(%edi),%xmm2
+ movd 12(%edi),%xmm3
+ movd 16(%edi),%xmm4
+ movdqa 64(%ebx),%xmm7
+L011base2_32:
+ movl 32(%esp),%eax
+ movl %esp,%ebp
+ subl $528,%esp
+ andl $-16,%esp
+ leal 48(%edi),%edi
+ shll $24,%eax
+ testl $31,%ecx
+ jz L012even
+ movdqu (%esi),%xmm6
+ leal 16(%esi),%esi
+ movdqa %xmm6,%xmm5
+ pand %xmm7,%xmm6
+ paddd %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ psrlq $26,%xmm5
+ psrldq $6,%xmm6
+ pand %xmm7,%xmm5
+ paddd %xmm5,%xmm1
+ movdqa %xmm6,%xmm5
+ psrlq $4,%xmm6
+ pand %xmm7,%xmm6
+ paddd %xmm6,%xmm2
+ movdqa %xmm5,%xmm6
+ psrlq $30,%xmm5
+ pand %xmm7,%xmm5
+ psrldq $7,%xmm6
+ paddd %xmm5,%xmm3
+ movd %eax,%xmm5
+ paddd %xmm6,%xmm4
+ movd 12(%edi),%xmm6
+ paddd %xmm5,%xmm4
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ pmuludq %xmm6,%xmm0
+ pmuludq %xmm6,%xmm1
+ pmuludq %xmm6,%xmm2
+ movd 28(%edi),%xmm5
+ pmuludq %xmm6,%xmm3
+ pmuludq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%esp),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movd 92(%edi),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%esp),%xmm6
+ movd 44(%edi),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%esp),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%esp),%xmm5
+ paddq %xmm7,%xmm4
+ movd 108(%edi),%xmm7
+ pmuludq (%esp),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%esp),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%esp),%xmm5
+ movd 60(%edi),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%esp),%xmm6
+ paddq %xmm5,%xmm0
+ movd 124(%edi),%xmm5
+ pmuludq (%esp),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%esp),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%esp),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%esp),%xmm7
+ movd 76(%edi),%xmm5
+ paddq %xmm6,%xmm1
+ movd 140(%edi),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%esp),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%esp),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ subl $16,%ecx
+ jz L013done
+L012even:
+ leal 384(%esp),%edx
+ leal -32(%esi),%eax
+ subl $64,%ecx
+ movdqu (%edi),%xmm5
+ pshufd $68,%xmm5,%xmm6
+ cmovbl %eax,%esi
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,(%edx)
+ leal 160(%esp),%eax
+ movdqu 16(%edi),%xmm6
+ movdqa %xmm5,-144(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,16(%edx)
+ movdqu 32(%edi),%xmm5
+ movdqa %xmm6,-128(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,32(%edx)
+ movdqu 48(%edi),%xmm6
+ movdqa %xmm5,-112(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,48(%edx)
+ movdqu 64(%edi),%xmm5
+ movdqa %xmm6,-96(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,64(%edx)
+ movdqu 80(%edi),%xmm6
+ movdqa %xmm5,-80(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,80(%edx)
+ movdqu 96(%edi),%xmm5
+ movdqa %xmm6,-64(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,96(%edx)
+ movdqu 112(%edi),%xmm6
+ movdqa %xmm5,-48(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,112(%edx)
+ movdqu 128(%edi),%xmm5
+ movdqa %xmm6,-32(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,128(%edx)
+ movdqa %xmm5,-16(%edx)
+ movdqu 32(%esi),%xmm5
+ movdqu 48(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,112(%esp)
+ movdqa %xmm3,128(%esp)
+ movdqa %xmm4,144(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ movdqa %xmm0,80(%esp)
+ movdqa %xmm1,96(%esp)
+ jbe L014skip_loop
+ jmp L015loop
+.align 5,0x90
+L015loop:
+ movdqa -144(%edx),%xmm7
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ movdqa %xmm5,%xmm1
+ pmuludq %xmm7,%xmm5
+ movdqa %xmm6,%xmm0
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ pmuludq %xmm7,%xmm3
+ pmuludq %xmm7,%xmm4
+ pmuludq -16(%edx),%xmm0
+ movdqa %xmm1,%xmm7
+ pmuludq -128(%edx),%xmm1
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq -112(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa %xmm5,%xmm6
+ pmuludq -96(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 16(%eax),%xmm7
+ pmuludq -80(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq -128(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq -112(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 32(%eax),%xmm7
+ pmuludq -96(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq -32(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq -16(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq -128(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ movdqa 48(%eax),%xmm5
+ pmuludq -112(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq -48(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa %xmm6,%xmm7
+ pmuludq -32(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq -16(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa 64(%eax),%xmm6
+ pmuludq -128(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa %xmm6,%xmm7
+ pmuludq -16(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq -64(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq -48(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pmuludq -32(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqu -32(%esi),%xmm5
+ movdqu -16(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ leal -32(%esi),%eax
+ subl $64,%ecx
+ paddd 80(%esp),%xmm5
+ paddd 96(%esp),%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+ cmovbl %eax,%esi
+ leal 160(%esp),%eax
+ movdqa (%edx),%xmm7
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ movdqa %xmm5,%xmm1
+ pmuludq %xmm7,%xmm5
+ paddq %xmm0,%xmm5
+ movdqa %xmm6,%xmm0
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ pmuludq %xmm7,%xmm3
+ pmuludq %xmm7,%xmm4
+ paddq 16(%esp),%xmm6
+ paddq 32(%esp),%xmm2
+ paddq 48(%esp),%xmm3
+ paddq 64(%esp),%xmm4
+ pmuludq 128(%edx),%xmm0
+ movdqa %xmm1,%xmm7
+ pmuludq 16(%edx),%xmm1
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 16(%eax),%xmm7
+ pmuludq 64(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 32(%eax),%xmm7
+ pmuludq 48(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 112(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 128(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ movdqa 48(%eax),%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 96(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa %xmm6,%xmm7
+ pmuludq 112(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq 128(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa 64(%eax),%xmm6
+ pmuludq 16(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa %xmm6,%xmm7
+ pmuludq 128(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 80(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 96(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pmuludq 112(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ movdqu 32(%esi),%xmm5
+ movdqu 48(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,112(%esp)
+ movdqa %xmm3,128(%esp)
+ movdqa %xmm4,144(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ movdqa %xmm0,80(%esp)
+ movdqa %xmm1,96(%esp)
+ ja L015loop
+L014skip_loop:
+ pshufd $16,-144(%edx),%xmm7
+ addl $32,%ecx
+ jnz L016long_tail
+ paddd %xmm0,%xmm5
+ paddd %xmm1,%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+L016long_tail:
+ movdqa %xmm5,(%eax)
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ pmuludq %xmm7,%xmm5
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ movdqa %xmm5,%xmm0
+ pshufd $16,-128(%edx),%xmm5
+ pmuludq %xmm7,%xmm3
+ movdqa %xmm6,%xmm1
+ pmuludq %xmm7,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%eax),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%eax),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%eax),%xmm7
+ paddq %xmm6,%xmm3
+ pshufd $16,-64(%edx),%xmm6
+ pmuludq (%eax),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%eax),%xmm6
+ pshufd $16,-112(%edx),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%eax),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%eax),%xmm5
+ paddq %xmm7,%xmm4
+ pshufd $16,-48(%edx),%xmm7
+ pmuludq (%eax),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%eax),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%eax),%xmm5
+ pshufd $16,-96(%edx),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%eax),%xmm6
+ paddq %xmm5,%xmm0
+ pshufd $16,-32(%edx),%xmm5
+ pmuludq (%eax),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%eax),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%eax),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%eax),%xmm7
+ pshufd $16,-80(%edx),%xmm5
+ paddq %xmm6,%xmm1
+ pshufd $16,-16(%edx),%xmm6
+ pmuludq (%eax),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%eax),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%eax),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%eax),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%eax),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ jz L017short_tail
+ movdqu -32(%esi),%xmm5
+ movdqu -16(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ pshufd $16,(%edx),%xmm7
+ paddd 80(%esp),%xmm5
+ paddd 96(%esp),%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+ movdqa %xmm5,(%esp)
+ pmuludq %xmm7,%xmm5
+ movdqa %xmm6,16(%esp)
+ pmuludq %xmm7,%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm2,%xmm5
+ pmuludq %xmm7,%xmm2
+ paddq %xmm6,%xmm1
+ movdqa %xmm3,%xmm6
+ pmuludq %xmm7,%xmm3
+ paddq 32(%esp),%xmm2
+ movdqa %xmm5,32(%esp)
+ pshufd $16,16(%edx),%xmm5
+ paddq 48(%esp),%xmm3
+ movdqa %xmm6,48(%esp)
+ movdqa %xmm4,%xmm6
+ pmuludq %xmm7,%xmm4
+ paddq 64(%esp),%xmm4
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%esp),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ pshufd $16,80(%edx),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%esp),%xmm6
+ pshufd $16,32(%edx),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%esp),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%esp),%xmm5
+ paddq %xmm7,%xmm4
+ pshufd $16,96(%edx),%xmm7
+ pmuludq (%esp),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%esp),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%esp),%xmm5
+ pshufd $16,48(%edx),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%esp),%xmm6
+ paddq %xmm5,%xmm0
+ pshufd $16,112(%edx),%xmm5
+ pmuludq (%esp),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%esp),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%esp),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%esp),%xmm7
+ pshufd $16,64(%edx),%xmm5
+ paddq %xmm6,%xmm1
+ pshufd $16,128(%edx),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%esp),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%esp),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+L017short_tail:
+ pshufd $78,%xmm4,%xmm6
+ pshufd $78,%xmm3,%xmm5
+ paddq %xmm6,%xmm4
+ paddq %xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm6
+ pshufd $78,%xmm1,%xmm5
+ paddq %xmm6,%xmm0
+ paddq %xmm5,%xmm1
+ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm6,%xmm2
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+L013done:
+ movd %xmm0,-48(%edi)
+ movd %xmm1,-44(%edi)
+ movd %xmm2,-40(%edi)
+ movd %xmm3,-36(%edi)
+ movd %xmm4,-32(%edi)
+ movl %ebp,%esp
+L007nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 5,0x90
+.align 4
+__poly1305_emit_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ cmpl $0,20(%ebp)
+ je Lenter_emit
+ movl (%ebp),%eax
+ movl 4(%ebp),%edi
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+ movl 16(%ebp),%esi
+ movl %edi,%ebx
+ shll $26,%edi
+ shrl $6,%ebx
+ addl %edi,%eax
+ movl %ecx,%edi
+ adcl $0,%ebx
+ shll $20,%edi
+ shrl $12,%ecx
+ addl %edi,%ebx
+ movl %edx,%edi
+ adcl $0,%ecx
+ shll $14,%edi
+ shrl $18,%edx
+ addl %edi,%ecx
+ movl %esi,%edi
+ adcl $0,%edx
+ shll $8,%edi
+ shrl $24,%esi
+ addl %edi,%edx
+ adcl $0,%esi
+ movl %esi,%edi
+ andl $3,%esi
+ shrl $2,%edi
+ leal (%edi,%edi,4),%ebp
+ movl 24(%esp),%edi
+ addl %ebp,%eax
+ movl 28(%esp),%ebp
+ adcl $0,%ebx
+ adcl $0,%ecx
+ adcl $0,%edx
+ adcl $0,%esi
+ movd %eax,%xmm0
+ addl $5,%eax
+ movd %ebx,%xmm1
+ adcl $0,%ebx
+ movd %ecx,%xmm2
+ adcl $0,%ecx
+ movd %edx,%xmm3
+ adcl $0,%edx
+ adcl $0,%esi
+ shrl $2,%esi
+ negl %esi
+ andl %esi,%eax
+ andl %esi,%ebx
+ andl %esi,%ecx
+ andl %esi,%edx
+ movl %eax,(%edi)
+ movd %xmm0,%eax
+ movl %ebx,4(%edi)
+ movd %xmm1,%ebx
+ movl %ecx,8(%edi)
+ movd %xmm2,%ecx
+ movl %edx,12(%edi)
+ movd %xmm3,%edx
+ notl %esi
+ andl %esi,%eax
+ andl %esi,%ebx
+ orl (%edi),%eax
+ andl %esi,%ecx
+ orl 4(%edi),%ebx
+ andl %esi,%edx
+ orl 8(%edi),%ecx
+ orl 12(%edi),%edx
+ addl (%ebp),%eax
+ adcl 4(%ebp),%ebx
+ movl %eax,(%edi)
+ adcl 8(%ebp),%ecx
+ movl %ebx,4(%edi)
+ adcl 12(%ebp),%edx
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 5,0x90
+.align 4
+__poly1305_init_avx2:
+ vmovdqu 24(%edi),%xmm4
+ leal 48(%edi),%edi
+ movl %esp,%ebp
+ subl $224,%esp
+ andl $-16,%esp
+ vmovdqa 64(%ebx),%xmm7
+ vpand %xmm7,%xmm4,%xmm0
+ vpsrlq $26,%xmm4,%xmm1
+ vpsrldq $6,%xmm4,%xmm3
+ vpand %xmm7,%xmm1,%xmm1
+ vpsrlq $4,%xmm3,%xmm2
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm7,%xmm2,%xmm2
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrldq $13,%xmm4,%xmm4
+ leal 144(%esp),%edx
+ movl $2,%ecx
+L018square:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ vmovdqa %xmm4,64(%esp)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqa %xmm6,80(%esp)
+ vmovdqa %xmm5,96(%esp)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqa %xmm6,112(%esp)
+ vmovdqa %xmm5,128(%esp)
+ vpshufd $68,%xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vpshufd $68,%xmm1,%xmm1
+ vpshufd $68,%xmm2,%xmm2
+ vpshufd $68,%xmm3,%xmm3
+ vpshufd $68,%xmm4,%xmm4
+ vmovdqa %xmm5,(%edx)
+ vmovdqa %xmm1,16(%edx)
+ vmovdqa %xmm2,32(%edx)
+ vmovdqa %xmm3,48(%edx)
+ vmovdqa %xmm4,64(%edx)
+ vpmuludq %xmm0,%xmm4,%xmm4
+ vpmuludq %xmm0,%xmm3,%xmm3
+ vpmuludq %xmm0,%xmm2,%xmm2
+ vpmuludq %xmm0,%xmm1,%xmm1
+ vpmuludq %xmm0,%xmm5,%xmm0
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpmuludq 32(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vmovdqa 80(%esp),%xmm7
+ vpmuludq (%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 32(%esp),%xmm5
+ vpmuludq 64(%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm4,%xmm4
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vmovdqa 96(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm1,%xmm1
+ vmovdqa 48(%esp),%xmm5
+ vpmuludq 48(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vmovdqa 112(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm3,%xmm3
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm2,%xmm2
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm1,%xmm1
+ vmovdqa 64(%esp),%xmm7
+ vpmuludq 32(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vmovdqa 128(%esp),%xmm5
+ vpmuludq (%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vpmuludq 64(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm7
+ vpmuludq 48(%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpsrlq $26,%xmm3,%xmm5
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrlq $26,%xmm0,%xmm6
+ vpand %xmm7,%xmm0,%xmm0
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm6,%xmm1,%xmm1
+ vpsrlq $26,%xmm4,%xmm5
+ vpand %xmm7,%xmm4,%xmm4
+ vpsrlq $26,%xmm1,%xmm6
+ vpand %xmm7,%xmm1,%xmm1
+ vpaddq %xmm6,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpsllq $2,%xmm5,%xmm5
+ vpsrlq $26,%xmm2,%xmm6
+ vpand %xmm7,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpaddd %xmm6,%xmm3,%xmm3
+ vpsrlq $26,%xmm3,%xmm6
+ vpsrlq $26,%xmm0,%xmm5
+ vpand %xmm7,%xmm0,%xmm0
+ vpand %xmm7,%xmm3,%xmm3
+ vpaddd %xmm5,%xmm1,%xmm1
+ vpaddd %xmm6,%xmm4,%xmm4
+ decl %ecx
+ jz L019square_break
+ vpunpcklqdq (%esp),%xmm0,%xmm0
+ vpunpcklqdq 16(%esp),%xmm1,%xmm1
+ vpunpcklqdq 32(%esp),%xmm2,%xmm2
+ vpunpcklqdq 48(%esp),%xmm3,%xmm3
+ vpunpcklqdq 64(%esp),%xmm4,%xmm4
+ jmp L018square
+L019square_break:
+ vpsllq $32,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm1
+ vpsllq $32,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm3
+ vpsllq $32,%xmm4,%xmm4
+ vpor (%esp),%xmm0,%xmm0
+ vpor 16(%esp),%xmm1,%xmm1
+ vpor 32(%esp),%xmm2,%xmm2
+ vpor 48(%esp),%xmm3,%xmm3
+ vpor 64(%esp),%xmm4,%xmm4
+ vpshufd $141,%xmm0,%xmm0
+ vpshufd $141,%xmm1,%xmm1
+ vpshufd $141,%xmm2,%xmm2
+ vpshufd $141,%xmm3,%xmm3
+ vpshufd $141,%xmm4,%xmm4
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ vmovdqu %xmm4,64(%edi)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqu %xmm6,80(%edi)
+ vmovdqu %xmm5,96(%edi)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm6,112(%edi)
+ vmovdqu %xmm5,128(%edi)
+ movl %ebp,%esp
+ leal -48(%edi),%edi
+ ret
+.align 5,0x90
+.align 4
+__poly1305_blocks_avx2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 20(%edi),%eax
+ andl $-16,%ecx
+ jz L020nodata
+ cmpl $64,%ecx
+ jae L021enter_avx2
+ testl %eax,%eax
+ jz Lenter_blocks
+L021enter_avx2:
+ vzeroupper
+ call L022pic_point
+L022pic_point:
+ popl %ebx
+ leal Lconst_sse2-L022pic_point(%ebx),%ebx
+ testl %eax,%eax
+ jnz L023base2_26
+ call __poly1305_init_avx2
+ movl (%edi),%eax
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ shrl $2,%ecx
+ andl $67108863,%eax
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movl %eax,(%edi)
+ movl %ecx,4(%edi)
+ movl %edx,8(%edi)
+ movl %esi,12(%edi)
+ movl %ebp,16(%edi)
+ movl $1,20(%edi)
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+L023base2_26:
+ movl 32(%esp),%eax
+ movl %esp,%ebp
+ subl $448,%esp
+ andl $-512,%esp
+ vmovdqu 48(%edi),%xmm0
+ leal 288(%esp),%edx
+ vmovdqu 64(%edi),%xmm1
+ vmovdqu 80(%edi),%xmm2
+ vmovdqu 96(%edi),%xmm3
+ vmovdqu 112(%edi),%xmm4
+ leal 48(%edi),%edi
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpermq $64,%ymm4,%ymm4
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vpshufd $200,%ymm4,%ymm4
+ vmovdqa %ymm0,-128(%edx)
+ vmovdqu 80(%edi),%xmm0
+ vmovdqa %ymm1,-96(%edx)
+ vmovdqu 96(%edi),%xmm1
+ vmovdqa %ymm2,-64(%edx)
+ vmovdqu 112(%edi),%xmm2
+ vmovdqa %ymm3,-32(%edx)
+ vmovdqu 128(%edi),%xmm3
+ vmovdqa %ymm4,(%edx)
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vmovdqa %ymm0,32(%edx)
+ vmovd -48(%edi),%xmm0
+ vmovdqa %ymm1,64(%edx)
+ vmovd -44(%edi),%xmm1
+ vmovdqa %ymm2,96(%edx)
+ vmovd -40(%edi),%xmm2
+ vmovdqa %ymm3,128(%edx)
+ vmovd -36(%edi),%xmm3
+ vmovd -32(%edi),%xmm4
+ vmovdqa 64(%ebx),%ymm7
+ negl %eax
+ testl $63,%ecx
+ jz L024even
+ movl %ecx,%edx
+ andl $-64,%ecx
+ andl $63,%edx
+ vmovdqu (%esi),%xmm5
+ cmpl $32,%edx
+ jb L025one
+ vmovdqu 16(%esi),%xmm6
+ je L026two
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ leal 48(%esi),%esi
+ leal 8(%ebx),%ebx
+ leal 296(%esp),%edx
+ jmp L027tail
+L026two:
+ leal 32(%esi),%esi
+ leal 16(%ebx),%ebx
+ leal 304(%esp),%edx
+ jmp L027tail
+L025one:
+ leal 16(%esi),%esi
+ vpxor %ymm6,%ymm6,%ymm6
+ leal 32(%ebx,%eax,8),%ebx
+ leal 312(%esp),%edx
+ jmp L027tail
+.align 5,0x90
+L024even:
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jz L027tail
+L028loop:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -64(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 96(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 128(%edx),%ymm2,%ymm1
+ vpmuludq -128(%edx),%ymm2,%ymm2
+ vpmuludq -32(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq (%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -96(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -64(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -64(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -32(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 128(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -128(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -128(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 64(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 128(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 32(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 64(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 96(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jnz L028loop
+L027tail:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ andl $-64,%ebx
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -60(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 100(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 132(%edx),%ymm2,%ymm1
+ vpmuludq -124(%edx),%ymm2,%ymm2
+ vpmuludq -28(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 4(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -92(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -60(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -60(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -28(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 132(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -124(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -124(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -92(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 68(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 100(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 132(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 132(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 36(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 68(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 100(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrldq $8,%ymm4,%ymm5
+ vpsrldq $8,%ymm3,%ymm6
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpsrldq $8,%ymm0,%ymm5
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrldq $8,%ymm1,%ymm6
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsrldq $8,%ymm2,%ymm5
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpermq $2,%ymm4,%ymm6
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpermq $2,%ymm3,%ymm5
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpermq $2,%ymm0,%ymm6
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpermq $2,%ymm1,%ymm5
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpermq $2,%ymm2,%ymm6
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ cmpl $0,%ecx
+ je L029done
+ vpshufd $252,%xmm0,%xmm0
+ leal 288(%esp),%edx
+ vpshufd $252,%xmm1,%xmm1
+ vpshufd $252,%xmm2,%xmm2
+ vpshufd $252,%xmm3,%xmm3
+ vpshufd $252,%xmm4,%xmm4
+ jmp L024even
+.align 4,0x90
+L029done:
+ vmovd %xmm0,-48(%edi)
+ vmovd %xmm1,-44(%edi)
+ vmovd %xmm2,-40(%edi)
+ vmovd %xmm3,-36(%edi)
+ vmovd %xmm4,-32(%edi)
+ vzeroupper
+ movl %ebp,%esp
+L020nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 6,0x90
+Lconst_sse2:
+.long 16777216,0,16777216,0,16777216,0,16777216,0
+.long 0,0,0,0,0,0,0,0
+.long 67108863,0,67108863,0,67108863,0,67108863,0
+.long 268435455,268435452,268435452,268435452
+.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
+.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+.byte 114,103,62,0
+.align 2,0x90
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
+.comm _OPENSSL_ia32cap_P,16,2