Diffstat (limited to 'deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha')
-rw-r--r--  deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/keccak1600-x86_64.s  |  522
-rw-r--r--  deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha1-mb-x86_64.s     | 7267
-rw-r--r--  deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha1-x86_64.s        | 5450
-rw-r--r--  deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha256-mb-x86_64.s   | 7948
-rw-r--r--  deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha256-x86_64.s      | 5430
-rw-r--r--  deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha512-x86_64.s      | 5437
6 files changed, 32054 insertions(+), 0 deletions(-)
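
The first file below, keccak1600-x86_64.s, exports KeccakF1600, SHA3_absorb and
SHA3_squeeze. As a reading aid for the assembly, here is a minimal C sketch of
how the absorb/squeeze entry points can be driven; the prototypes and calling
convention (state in %rdi, buffer in %rsi, length in %rdx, rate in bytes in
%rcx, leftover length returned in %rax) are inferred from the register usage in
the code, so treat sha3_256() below as an illustration rather than OpenSSL's
own wrapper:

    /* Sketch only: prototypes inferred from the assembly's register usage. */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
                       size_t len, size_t bsz);
    void   SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
                        size_t len, size_t bsz);

    /* One-shot SHA3-256 (rate 136 bytes, domain-separation byte 0x06) built
     * on the two assembly entry points.  SHA3_absorb consumes whole
     * rate-sized blocks and returns the number of unprocessed bytes, so the
     * caller pads the tail itself and feeds it back as one final block. */
    static void sha3_256(unsigned char md[32], const unsigned char *in, size_t n)
    {
        uint64_t A[5][5];
        unsigned char blk[136];
        const size_t bsz = 136;
        size_t rem;

        memset(A, 0, sizeof(A));
        rem = SHA3_absorb(A, in, n, bsz);      /* absorb all whole blocks    */

        memset(blk, 0, bsz);                   /* pad10*1 with SHA-3 bits    */
        memcpy(blk, in + (n - rem), rem);
        blk[rem]      = 0x06;
        blk[bsz - 1] |= 0x80;
        SHA3_absorb(A, blk, bsz, bsz);         /* absorb final padded block  */

        SHA3_squeeze(A, md, 32, bsz);          /* extract the 256-bit digest */
    }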
diff --git a/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/keccak1600-x86_64.s b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/keccak1600-x86_64.s
new file mode 100644
index 0000000000..e511f25035
--- /dev/null
+++ b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/keccak1600-x86_64.s
@@ -0,0 +1,522 @@
+.text
+
+.type __KeccakF1600,@function
+.align 32
+__KeccakF1600:
+ movq 60(%rdi),%rax
+ movq 68(%rdi),%rbx
+ movq 76(%rdi),%rcx
+ movq 84(%rdi),%rdx
+ movq 92(%rdi),%rbp
+ jmp .Loop
+
+.align 32
+.Loop:
+ movq -100(%rdi),%r8
+ movq -52(%rdi),%r9
+ movq -4(%rdi),%r10
+ movq 44(%rdi),%r11
+
+ xorq -84(%rdi),%rcx
+ xorq -76(%rdi),%rdx
+ xorq %r8,%rax
+ xorq -92(%rdi),%rbx
+ xorq -44(%rdi),%rcx
+ xorq -60(%rdi),%rax
+ movq %rbp,%r12
+ xorq -68(%rdi),%rbp
+
+ xorq %r10,%rcx
+ xorq -20(%rdi),%rax
+ xorq -36(%rdi),%rdx
+ xorq %r9,%rbx
+ xorq -28(%rdi),%rbp
+
+ xorq 36(%rdi),%rcx
+ xorq 20(%rdi),%rax
+ xorq 4(%rdi),%rdx
+ xorq -12(%rdi),%rbx
+ xorq 12(%rdi),%rbp
+
+ movq %rcx,%r13
+ rolq $1,%rcx
+ xorq %rax,%rcx
+ xorq %r11,%rdx
+
+ rolq $1,%rax
+ xorq %rdx,%rax
+ xorq 28(%rdi),%rbx
+
+ rolq $1,%rdx
+ xorq %rbx,%rdx
+ xorq 52(%rdi),%rbp
+
+ rolq $1,%rbx
+ xorq %rbp,%rbx
+
+ rolq $1,%rbp
+ xorq %r13,%rbp
+ xorq %rcx,%r9
+ xorq %rdx,%r10
+ rolq $44,%r9
+ xorq %rbp,%r11
+ xorq %rax,%r12
+ rolq $43,%r10
+ xorq %rbx,%r8
+ movq %r9,%r13
+ rolq $21,%r11
+ orq %r10,%r9
+ xorq %r8,%r9
+ rolq $14,%r12
+
+ xorq (%r15),%r9
+ leaq 8(%r15),%r15
+
+ movq %r12,%r14
+ andq %r11,%r12
+ movq %r9,-100(%rsi)
+ xorq %r10,%r12
+ notq %r10
+ movq %r12,-84(%rsi)
+
+ orq %r11,%r10
+ movq 76(%rdi),%r12
+ xorq %r13,%r10
+ movq %r10,-92(%rsi)
+
+ andq %r8,%r13
+ movq -28(%rdi),%r9
+ xorq %r14,%r13
+ movq -20(%rdi),%r10
+ movq %r13,-68(%rsi)
+
+ orq %r8,%r14
+ movq -76(%rdi),%r8
+ xorq %r11,%r14
+ movq 28(%rdi),%r11
+ movq %r14,-76(%rsi)
+
+
+ xorq %rbp,%r8
+ xorq %rdx,%r12
+ rolq $28,%r8
+ xorq %rcx,%r11
+ xorq %rax,%r9
+ rolq $61,%r12
+ rolq $45,%r11
+ xorq %rbx,%r10
+ rolq $20,%r9
+ movq %r8,%r13
+ orq %r12,%r8
+ rolq $3,%r10
+
+ xorq %r11,%r8
+ movq %r8,-36(%rsi)
+
+ movq %r9,%r14
+ andq %r13,%r9
+ movq -92(%rdi),%r8
+ xorq %r12,%r9
+ notq %r12
+ movq %r9,-28(%rsi)
+
+ orq %r11,%r12
+ movq -44(%rdi),%r9
+ xorq %r10,%r12
+ movq %r12,-44(%rsi)
+
+ andq %r10,%r11
+ movq 60(%rdi),%r12
+ xorq %r14,%r11
+ movq %r11,-52(%rsi)
+
+ orq %r10,%r14
+ movq 4(%rdi),%r10
+ xorq %r13,%r14
+ movq 52(%rdi),%r11
+ movq %r14,-60(%rsi)
+
+
+ xorq %rbp,%r10
+ xorq %rax,%r11
+ rolq $25,%r10
+ xorq %rdx,%r9
+ rolq $8,%r11
+ xorq %rbx,%r12
+ rolq $6,%r9
+ xorq %rcx,%r8
+ rolq $18,%r12
+ movq %r10,%r13
+ andq %r11,%r10
+ rolq $1,%r8
+
+ notq %r11
+ xorq %r9,%r10
+ movq %r10,-12(%rsi)
+
+ movq %r12,%r14
+ andq %r11,%r12
+ movq -12(%rdi),%r10
+ xorq %r13,%r12
+ movq %r12,-4(%rsi)
+
+ orq %r9,%r13
+ movq 84(%rdi),%r12
+ xorq %r8,%r13
+ movq %r13,-20(%rsi)
+
+ andq %r8,%r9
+ xorq %r14,%r9
+ movq %r9,12(%rsi)
+
+ orq %r8,%r14
+ movq -60(%rdi),%r9
+ xorq %r11,%r14
+ movq 36(%rdi),%r11
+ movq %r14,4(%rsi)
+
+
+ movq -68(%rdi),%r8
+
+ xorq %rcx,%r10
+ xorq %rdx,%r11
+ rolq $10,%r10
+ xorq %rbx,%r9
+ rolq $15,%r11
+ xorq %rbp,%r12
+ rolq $36,%r9
+ xorq %rax,%r8
+ rolq $56,%r12
+ movq %r10,%r13
+ orq %r11,%r10
+ rolq $27,%r8
+
+ notq %r11
+ xorq %r9,%r10
+ movq %r10,28(%rsi)
+
+ movq %r12,%r14
+ orq %r11,%r12
+ xorq %r13,%r12
+ movq %r12,36(%rsi)
+
+ andq %r9,%r13
+ xorq %r8,%r13
+ movq %r13,20(%rsi)
+
+ orq %r8,%r9
+ xorq %r14,%r9
+ movq %r9,52(%rsi)
+
+ andq %r14,%r8
+ xorq %r11,%r8
+ movq %r8,44(%rsi)
+
+
+ xorq -84(%rdi),%rdx
+ xorq -36(%rdi),%rbp
+ rolq $62,%rdx
+ xorq 68(%rdi),%rcx
+ rolq $55,%rbp
+ xorq 12(%rdi),%rax
+ rolq $2,%rcx
+ xorq 20(%rdi),%rbx
+ xchgq %rsi,%rdi
+ rolq $39,%rax
+ rolq $41,%rbx
+ movq %rdx,%r13
+ andq %rbp,%rdx
+ notq %rbp
+ xorq %rcx,%rdx
+ movq %rdx,92(%rdi)
+
+ movq %rax,%r14
+ andq %rbp,%rax
+ xorq %r13,%rax
+ movq %rax,60(%rdi)
+
+ orq %rcx,%r13
+ xorq %rbx,%r13
+ movq %r13,84(%rdi)
+
+ andq %rbx,%rcx
+ xorq %r14,%rcx
+ movq %rcx,76(%rdi)
+
+ orq %r14,%rbx
+ xorq %rbp,%rbx
+ movq %rbx,68(%rdi)
+
+ movq %rdx,%rbp
+ movq %r13,%rdx
+
+ testq $255,%r15
+ jnz .Loop
+
+ leaq -192(%r15),%r15
+ .byte 0xf3,0xc3
+.size __KeccakF1600,.-__KeccakF1600
+
+.type KeccakF1600,@function
+.align 32
+KeccakF1600:
+.cfi_startproc
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+ leaq 100(%rdi),%rdi
+ subq $200,%rsp
+.cfi_adjust_cfa_offset 200
+
+ notq -92(%rdi)
+ notq -84(%rdi)
+ notq -36(%rdi)
+ notq -4(%rdi)
+ notq 36(%rdi)
+ notq 60(%rdi)
+
+ leaq iotas(%rip),%r15
+ leaq 100(%rsp),%rsi
+
+ call __KeccakF1600
+
+ notq -92(%rdi)
+ notq -84(%rdi)
+ notq -36(%rdi)
+ notq -4(%rdi)
+ notq 36(%rdi)
+ notq 60(%rdi)
+ leaq -100(%rdi),%rdi
+
+ addq $200,%rsp
+.cfi_adjust_cfa_offset -200
+
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size KeccakF1600,.-KeccakF1600
+.globl SHA3_absorb
+.type SHA3_absorb,@function
+.align 32
+SHA3_absorb:
+.cfi_startproc
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+ leaq 100(%rdi),%rdi
+ subq $232,%rsp
+.cfi_adjust_cfa_offset 232
+
+ movq %rsi,%r9
+ leaq 100(%rsp),%rsi
+
+ notq -92(%rdi)
+ notq -84(%rdi)
+ notq -36(%rdi)
+ notq -4(%rdi)
+ notq 36(%rdi)
+ notq 60(%rdi)
+ leaq iotas(%rip),%r15
+
+ movq %rcx,216-100(%rsi)
+
+.Loop_absorb:
+ cmpq %rcx,%rdx
+ jc .Ldone_absorb
+
+ shrq $3,%rcx
+ leaq -100(%rdi),%r8
+
+.Lblock_absorb:
+ movq (%r9),%rax
+ leaq 8(%r9),%r9
+ xorq (%r8),%rax
+ leaq 8(%r8),%r8
+ subq $8,%rdx
+ movq %rax,-8(%r8)
+ subq $1,%rcx
+ jnz .Lblock_absorb
+
+ movq %r9,200-100(%rsi)
+ movq %rdx,208-100(%rsi)
+ call __KeccakF1600
+ movq 200-100(%rsi),%r9
+ movq 208-100(%rsi),%rdx
+ movq 216-100(%rsi),%rcx
+ jmp .Loop_absorb
+
+.align 32
+.Ldone_absorb:
+ movq %rdx,%rax
+
+ notq -92(%rdi)
+ notq -84(%rdi)
+ notq -36(%rdi)
+ notq -4(%rdi)
+ notq 36(%rdi)
+ notq 60(%rdi)
+
+ addq $232,%rsp
+.cfi_adjust_cfa_offset -232
+
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size SHA3_absorb,.-SHA3_absorb
+.globl SHA3_squeeze
+.type SHA3_squeeze,@function
+.align 32
+SHA3_squeeze:
+.cfi_startproc
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-24
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-32
+
+ shrq $3,%rcx
+ movq %rdi,%r8
+ movq %rsi,%r12
+ movq %rdx,%r13
+ movq %rcx,%r14
+ jmp .Loop_squeeze
+
+.align 32
+.Loop_squeeze:
+ cmpq $8,%r13
+ jb .Ltail_squeeze
+
+ movq (%r8),%rax
+ leaq 8(%r8),%r8
+ movq %rax,(%r12)
+ leaq 8(%r12),%r12
+ subq $8,%r13
+ jz .Ldone_squeeze
+
+ subq $1,%rcx
+ jnz .Loop_squeeze
+
+ call KeccakF1600
+ movq %rdi,%r8
+ movq %r14,%rcx
+ jmp .Loop_squeeze
+
+.Ltail_squeeze:
+ movq %r8,%rsi
+ movq %r12,%rdi
+ movq %r13,%rcx
+.byte 0xf3,0xa4
+
+.Ldone_squeeze:
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size SHA3_squeeze,.-SHA3_squeeze
+.align 256
+.quad 0,0,0,0,0,0,0,0
+.type iotas,@object
+iotas:
+.quad 0x0000000000000001
+.quad 0x0000000000008082
+.quad 0x800000000000808a
+.quad 0x8000000080008000
+.quad 0x000000000000808b
+.quad 0x0000000080000001
+.quad 0x8000000080008081
+.quad 0x8000000000008009
+.quad 0x000000000000008a
+.quad 0x0000000000000088
+.quad 0x0000000080008009
+.quad 0x000000008000000a
+.quad 0x000000008000808b
+.quad 0x800000000000008b
+.quad 0x8000000000008089
+.quad 0x8000000000008003
+.quad 0x8000000000008002
+.quad 0x8000000000000080
+.quad 0x000000000000800a
+.quad 0x800000008000000a
+.quad 0x8000000080008081
+.quad 0x8000000000008080
+.quad 0x0000000080000001
+.quad 0x8000000080008008
+.size iotas,.-iotas
+.byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
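
The next file, sha1-mb-x86_64.s, provides sha1_multi_block (plus the SHAEXT and
AVX variants reached through _shaext_shortcut/_avx_shortcut), which hashes
several independent messages in parallel, one SIMD lane per message.  A hedged
sketch of the data layout the routine appears to expect, reconstructed from the
loads at the top of the function (descriptor pairs read at a 16-byte stride
from %rsi, five 32-byte state rows read from %rdi); the struct and field names
below are illustrative, not OpenSSL's headers:

    /* Sketch only: layout inferred from the assembly; names are made up. */
    struct hash_desc {
        const unsigned char *ptr;    /* message data, whole 64-byte blocks */
        unsigned int         blocks; /* number of 64-byte blocks to hash   */
    };

    struct sha1_mb_ctx {
        /* Lane-interleaved state: each row holds one 32-bit SHA-1 state
         * word for up to eight lanes. */
        unsigned int A[8], B[8], C[8], D[8], E[8];
    };

    void sha1_multi_block(struct sha1_mb_ctx *ctx,
                          const struct hash_desc *inp, int num);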
diff --git a/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha1-mb-x86_64.s b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha1-mb-x86_64.s
new file mode 100644
index 0000000000..1a0de0f100
--- /dev/null
+++ b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha1-mb-x86_64.s
@@ -0,0 +1,7267 @@
+.text
+
+
+
+.globl sha1_multi_block
+.type sha1_multi_block,@function
+.align 32
+sha1_multi_block:
+.cfi_startproc
+ movq OPENSSL_ia32cap_P+4(%rip),%rcx
+ btq $61,%rcx
+ jc _shaext_shortcut
+ testl $268435456,%ecx
+ jnz _avx_shortcut
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ andq $-256,%rsp
+ movq %rax,272(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
+.Lbody:
+ leaq K_XX_XX(%rip),%rbp
+ leaq 256(%rsp),%rbx
+
+.Loop_grande:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r9
+ movq 32(%rsi),%r10
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r10
+ movq 48(%rsi),%r11
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r11
+ testl %edx,%edx
+ jz .Ldone
+
+ movdqu 0(%rdi),%xmm10
+ leaq 128(%rsp),%rax
+ movdqu 32(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 96(%rdi),%xmm13
+ movdqu 128(%rdi),%xmm14
+ movdqa 96(%rbp),%xmm5
+ movdqa -32(%rbp),%xmm15
+ jmp .Loop
+
+.align 32
+.Loop:
+ movd (%r8),%xmm0
+ leaq 64(%r8),%r8
+ movd (%r9),%xmm2
+ leaq 64(%r9),%r9
+ movd (%r10),%xmm3
+ leaq 64(%r10),%r10
+ movd (%r11),%xmm4
+ leaq 64(%r11),%r11
+ punpckldq %xmm3,%xmm0
+ movd -60(%r8),%xmm1
+ punpckldq %xmm4,%xmm2
+ movd -60(%r9),%xmm9
+ punpckldq %xmm2,%xmm0
+ movd -60(%r10),%xmm8
+.byte 102,15,56,0,197
+ movd -60(%r11),%xmm7
+ punpckldq %xmm8,%xmm1
+ movdqa %xmm10,%xmm8
+ paddd %xmm15,%xmm14
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm11,%xmm7
+ movdqa %xmm11,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm13,%xmm7
+ pand %xmm12,%xmm6
+ punpckldq %xmm9,%xmm1
+ movdqa %xmm10,%xmm9
+
+ movdqa %xmm0,0-128(%rax)
+ paddd %xmm0,%xmm14
+ movd -56(%r8),%xmm2
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm11,%xmm7
+
+ por %xmm9,%xmm8
+ movd -56(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm14
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+.byte 102,15,56,0,205
+ movd -56(%r10),%xmm8
+ por %xmm7,%xmm11
+ movd -56(%r11),%xmm7
+ punpckldq %xmm8,%xmm2
+ movdqa %xmm14,%xmm8
+ paddd %xmm15,%xmm13
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm10,%xmm7
+ movdqa %xmm10,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm12,%xmm7
+ pand %xmm11,%xmm6
+ punpckldq %xmm9,%xmm2
+ movdqa %xmm14,%xmm9
+
+ movdqa %xmm1,16-128(%rax)
+ paddd %xmm1,%xmm13
+ movd -52(%r8),%xmm3
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm10,%xmm7
+
+ por %xmm9,%xmm8
+ movd -52(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm13
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+.byte 102,15,56,0,213
+ movd -52(%r10),%xmm8
+ por %xmm7,%xmm10
+ movd -52(%r11),%xmm7
+ punpckldq %xmm8,%xmm3
+ movdqa %xmm13,%xmm8
+ paddd %xmm15,%xmm12
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm14,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm11,%xmm7
+ pand %xmm10,%xmm6
+ punpckldq %xmm9,%xmm3
+ movdqa %xmm13,%xmm9
+
+ movdqa %xmm2,32-128(%rax)
+ paddd %xmm2,%xmm12
+ movd -48(%r8),%xmm4
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm14,%xmm7
+
+ por %xmm9,%xmm8
+ movd -48(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm12
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+.byte 102,15,56,0,221
+ movd -48(%r10),%xmm8
+ por %xmm7,%xmm14
+ movd -48(%r11),%xmm7
+ punpckldq %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ paddd %xmm15,%xmm11
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm13,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm10,%xmm7
+ pand %xmm14,%xmm6
+ punpckldq %xmm9,%xmm4
+ movdqa %xmm12,%xmm9
+
+ movdqa %xmm3,48-128(%rax)
+ paddd %xmm3,%xmm11
+ movd -44(%r8),%xmm0
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm13,%xmm7
+
+ por %xmm9,%xmm8
+ movd -44(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm11
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+.byte 102,15,56,0,229
+ movd -44(%r10),%xmm8
+ por %xmm7,%xmm13
+ movd -44(%r11),%xmm7
+ punpckldq %xmm8,%xmm0
+ movdqa %xmm11,%xmm8
+ paddd %xmm15,%xmm10
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm12,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm14,%xmm7
+ pand %xmm13,%xmm6
+ punpckldq %xmm9,%xmm0
+ movdqa %xmm11,%xmm9
+
+ movdqa %xmm4,64-128(%rax)
+ paddd %xmm4,%xmm10
+ movd -40(%r8),%xmm1
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm12,%xmm7
+
+ por %xmm9,%xmm8
+ movd -40(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm10
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+.byte 102,15,56,0,197
+ movd -40(%r10),%xmm8
+ por %xmm7,%xmm12
+ movd -40(%r11),%xmm7
+ punpckldq %xmm8,%xmm1
+ movdqa %xmm10,%xmm8
+ paddd %xmm15,%xmm14
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm11,%xmm7
+ movdqa %xmm11,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm13,%xmm7
+ pand %xmm12,%xmm6
+ punpckldq %xmm9,%xmm1
+ movdqa %xmm10,%xmm9
+
+ movdqa %xmm0,80-128(%rax)
+ paddd %xmm0,%xmm14
+ movd -36(%r8),%xmm2
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm11,%xmm7
+
+ por %xmm9,%xmm8
+ movd -36(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm14
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+.byte 102,15,56,0,205
+ movd -36(%r10),%xmm8
+ por %xmm7,%xmm11
+ movd -36(%r11),%xmm7
+ punpckldq %xmm8,%xmm2
+ movdqa %xmm14,%xmm8
+ paddd %xmm15,%xmm13
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm10,%xmm7
+ movdqa %xmm10,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm12,%xmm7
+ pand %xmm11,%xmm6
+ punpckldq %xmm9,%xmm2
+ movdqa %xmm14,%xmm9
+
+ movdqa %xmm1,96-128(%rax)
+ paddd %xmm1,%xmm13
+ movd -32(%r8),%xmm3
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm10,%xmm7
+
+ por %xmm9,%xmm8
+ movd -32(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm13
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+.byte 102,15,56,0,213
+ movd -32(%r10),%xmm8
+ por %xmm7,%xmm10
+ movd -32(%r11),%xmm7
+ punpckldq %xmm8,%xmm3
+ movdqa %xmm13,%xmm8
+ paddd %xmm15,%xmm12
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm14,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm11,%xmm7
+ pand %xmm10,%xmm6
+ punpckldq %xmm9,%xmm3
+ movdqa %xmm13,%xmm9
+
+ movdqa %xmm2,112-128(%rax)
+ paddd %xmm2,%xmm12
+ movd -28(%r8),%xmm4
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm14,%xmm7
+
+ por %xmm9,%xmm8
+ movd -28(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm12
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+.byte 102,15,56,0,221
+ movd -28(%r10),%xmm8
+ por %xmm7,%xmm14
+ movd -28(%r11),%xmm7
+ punpckldq %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ paddd %xmm15,%xmm11
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm13,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm10,%xmm7
+ pand %xmm14,%xmm6
+ punpckldq %xmm9,%xmm4
+ movdqa %xmm12,%xmm9
+
+ movdqa %xmm3,128-128(%rax)
+ paddd %xmm3,%xmm11
+ movd -24(%r8),%xmm0
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm13,%xmm7
+
+ por %xmm9,%xmm8
+ movd -24(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm11
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+.byte 102,15,56,0,229
+ movd -24(%r10),%xmm8
+ por %xmm7,%xmm13
+ movd -24(%r11),%xmm7
+ punpckldq %xmm8,%xmm0
+ movdqa %xmm11,%xmm8
+ paddd %xmm15,%xmm10
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm12,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm14,%xmm7
+ pand %xmm13,%xmm6
+ punpckldq %xmm9,%xmm0
+ movdqa %xmm11,%xmm9
+
+ movdqa %xmm4,144-128(%rax)
+ paddd %xmm4,%xmm10
+ movd -20(%r8),%xmm1
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm12,%xmm7
+
+ por %xmm9,%xmm8
+ movd -20(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm10
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+.byte 102,15,56,0,197
+ movd -20(%r10),%xmm8
+ por %xmm7,%xmm12
+ movd -20(%r11),%xmm7
+ punpckldq %xmm8,%xmm1
+ movdqa %xmm10,%xmm8
+ paddd %xmm15,%xmm14
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm11,%xmm7
+ movdqa %xmm11,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm13,%xmm7
+ pand %xmm12,%xmm6
+ punpckldq %xmm9,%xmm1
+ movdqa %xmm10,%xmm9
+
+ movdqa %xmm0,160-128(%rax)
+ paddd %xmm0,%xmm14
+ movd -16(%r8),%xmm2
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm11,%xmm7
+
+ por %xmm9,%xmm8
+ movd -16(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm14
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+.byte 102,15,56,0,205
+ movd -16(%r10),%xmm8
+ por %xmm7,%xmm11
+ movd -16(%r11),%xmm7
+ punpckldq %xmm8,%xmm2
+ movdqa %xmm14,%xmm8
+ paddd %xmm15,%xmm13
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm10,%xmm7
+ movdqa %xmm10,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm12,%xmm7
+ pand %xmm11,%xmm6
+ punpckldq %xmm9,%xmm2
+ movdqa %xmm14,%xmm9
+
+ movdqa %xmm1,176-128(%rax)
+ paddd %xmm1,%xmm13
+ movd -12(%r8),%xmm3
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm10,%xmm7
+
+ por %xmm9,%xmm8
+ movd -12(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm13
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+.byte 102,15,56,0,213
+ movd -12(%r10),%xmm8
+ por %xmm7,%xmm10
+ movd -12(%r11),%xmm7
+ punpckldq %xmm8,%xmm3
+ movdqa %xmm13,%xmm8
+ paddd %xmm15,%xmm12
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm14,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm11,%xmm7
+ pand %xmm10,%xmm6
+ punpckldq %xmm9,%xmm3
+ movdqa %xmm13,%xmm9
+
+ movdqa %xmm2,192-128(%rax)
+ paddd %xmm2,%xmm12
+ movd -8(%r8),%xmm4
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm14,%xmm7
+
+ por %xmm9,%xmm8
+ movd -8(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm12
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+.byte 102,15,56,0,221
+ movd -8(%r10),%xmm8
+ por %xmm7,%xmm14
+ movd -8(%r11),%xmm7
+ punpckldq %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ paddd %xmm15,%xmm11
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm13,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $5,%xmm8
+ pandn %xmm10,%xmm7
+ pand %xmm14,%xmm6
+ punpckldq %xmm9,%xmm4
+ movdqa %xmm12,%xmm9
+
+ movdqa %xmm3,208-128(%rax)
+ paddd %xmm3,%xmm11
+ movd -4(%r8),%xmm0
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm13,%xmm7
+
+ por %xmm9,%xmm8
+ movd -4(%r9),%xmm9
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm11
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+.byte 102,15,56,0,229
+ movd -4(%r10),%xmm8
+ por %xmm7,%xmm13
+ movdqa 0-128(%rax),%xmm1
+ movd -4(%r11),%xmm7
+ punpckldq %xmm8,%xmm0
+ movdqa %xmm11,%xmm8
+ paddd %xmm15,%xmm10
+ punpckldq %xmm7,%xmm9
+ movdqa %xmm12,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $5,%xmm8
+ prefetcht0 63(%r8)
+ pandn %xmm14,%xmm7
+ pand %xmm13,%xmm6
+ punpckldq %xmm9,%xmm0
+ movdqa %xmm11,%xmm9
+
+ movdqa %xmm4,224-128(%rax)
+ paddd %xmm4,%xmm10
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+ movdqa %xmm12,%xmm7
+ prefetcht0 63(%r9)
+
+ por %xmm9,%xmm8
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm10
+ prefetcht0 63(%r10)
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+.byte 102,15,56,0,197
+ prefetcht0 63(%r11)
+ por %xmm7,%xmm12
+ movdqa 16-128(%rax),%xmm2
+ pxor %xmm3,%xmm1
+ movdqa 32-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ pxor 128-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ movdqa %xmm11,%xmm7
+ pslld $5,%xmm8
+ pxor %xmm3,%xmm1
+ movdqa %xmm11,%xmm6
+ pandn %xmm13,%xmm7
+ movdqa %xmm1,%xmm5
+ pand %xmm12,%xmm6
+ movdqa %xmm10,%xmm9
+ psrld $31,%xmm5
+ paddd %xmm1,%xmm1
+
+ movdqa %xmm0,240-128(%rax)
+ paddd %xmm0,%xmm14
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+
+ movdqa %xmm11,%xmm7
+ por %xmm9,%xmm8
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm14
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 48-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ pxor 144-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ movdqa %xmm10,%xmm7
+ pslld $5,%xmm8
+ pxor %xmm4,%xmm2
+ movdqa %xmm10,%xmm6
+ pandn %xmm12,%xmm7
+ movdqa %xmm2,%xmm5
+ pand %xmm11,%xmm6
+ movdqa %xmm14,%xmm9
+ psrld $31,%xmm5
+ paddd %xmm2,%xmm2
+
+ movdqa %xmm1,0-128(%rax)
+ paddd %xmm1,%xmm13
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+
+ movdqa %xmm10,%xmm7
+ por %xmm9,%xmm8
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm13
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 64-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ pxor 160-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ movdqa %xmm14,%xmm7
+ pslld $5,%xmm8
+ pxor %xmm0,%xmm3
+ movdqa %xmm14,%xmm6
+ pandn %xmm11,%xmm7
+ movdqa %xmm3,%xmm5
+ pand %xmm10,%xmm6
+ movdqa %xmm13,%xmm9
+ psrld $31,%xmm5
+ paddd %xmm3,%xmm3
+
+ movdqa %xmm2,16-128(%rax)
+ paddd %xmm2,%xmm12
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+
+ movdqa %xmm14,%xmm7
+ por %xmm9,%xmm8
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm12
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 80-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ pxor 176-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ movdqa %xmm13,%xmm7
+ pslld $5,%xmm8
+ pxor %xmm1,%xmm4
+ movdqa %xmm13,%xmm6
+ pandn %xmm10,%xmm7
+ movdqa %xmm4,%xmm5
+ pand %xmm14,%xmm6
+ movdqa %xmm12,%xmm9
+ psrld $31,%xmm5
+ paddd %xmm4,%xmm4
+
+ movdqa %xmm3,32-128(%rax)
+ paddd %xmm3,%xmm11
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+
+ movdqa %xmm13,%xmm7
+ por %xmm9,%xmm8
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm11
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 96-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ pxor 192-128(%rax),%xmm0
+ paddd %xmm15,%xmm10
+ movdqa %xmm12,%xmm7
+ pslld $5,%xmm8
+ pxor %xmm2,%xmm0
+ movdqa %xmm12,%xmm6
+ pandn %xmm14,%xmm7
+ movdqa %xmm0,%xmm5
+ pand %xmm13,%xmm6
+ movdqa %xmm11,%xmm9
+ psrld $31,%xmm5
+ paddd %xmm0,%xmm0
+
+ movdqa %xmm4,48-128(%rax)
+ paddd %xmm4,%xmm10
+ psrld $27,%xmm9
+ pxor %xmm7,%xmm6
+
+ movdqa %xmm12,%xmm7
+ por %xmm9,%xmm8
+ pslld $30,%xmm7
+ paddd %xmm6,%xmm10
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ movdqa 0(%rbp),%xmm15
+ pxor %xmm3,%xmm1
+ movdqa 112-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm6
+ pxor 208-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm10,%xmm9
+ movdqa %xmm0,64-128(%rax)
+ paddd %xmm0,%xmm14
+ pxor %xmm3,%xmm1
+ psrld $27,%xmm9
+ pxor %xmm12,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm1,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm14
+ paddd %xmm1,%xmm1
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 128-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm6
+ pxor 224-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm14,%xmm9
+ movdqa %xmm1,80-128(%rax)
+ paddd %xmm1,%xmm13
+ pxor %xmm4,%xmm2
+ psrld $27,%xmm9
+ pxor %xmm11,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm2,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm13
+ paddd %xmm2,%xmm2
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 144-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm6
+ pxor 240-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,96-128(%rax)
+ paddd %xmm2,%xmm12
+ pxor %xmm0,%xmm3
+ psrld $27,%xmm9
+ pxor %xmm10,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm3,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm12
+ paddd %xmm3,%xmm3
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 160-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm6
+ pxor 0-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm12,%xmm9
+ movdqa %xmm3,112-128(%rax)
+ paddd %xmm3,%xmm11
+ pxor %xmm1,%xmm4
+ psrld $27,%xmm9
+ pxor %xmm14,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm4,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm11
+ paddd %xmm4,%xmm4
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 176-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm6
+ pxor 16-128(%rax),%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm11,%xmm9
+ movdqa %xmm4,128-128(%rax)
+ paddd %xmm4,%xmm10
+ pxor %xmm2,%xmm0
+ psrld $27,%xmm9
+ pxor %xmm13,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm0,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm10
+ paddd %xmm0,%xmm0
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 192-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm6
+ pxor 32-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm10,%xmm9
+ movdqa %xmm0,144-128(%rax)
+ paddd %xmm0,%xmm14
+ pxor %xmm3,%xmm1
+ psrld $27,%xmm9
+ pxor %xmm12,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm1,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm14
+ paddd %xmm1,%xmm1
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 208-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm6
+ pxor 48-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm14,%xmm9
+ movdqa %xmm1,160-128(%rax)
+ paddd %xmm1,%xmm13
+ pxor %xmm4,%xmm2
+ psrld $27,%xmm9
+ pxor %xmm11,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm2,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm13
+ paddd %xmm2,%xmm2
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 224-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm6
+ pxor 64-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,176-128(%rax)
+ paddd %xmm2,%xmm12
+ pxor %xmm0,%xmm3
+ psrld $27,%xmm9
+ pxor %xmm10,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm3,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm12
+ paddd %xmm3,%xmm3
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 240-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm6
+ pxor 80-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm12,%xmm9
+ movdqa %xmm3,192-128(%rax)
+ paddd %xmm3,%xmm11
+ pxor %xmm1,%xmm4
+ psrld $27,%xmm9
+ pxor %xmm14,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm4,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm11
+ paddd %xmm4,%xmm4
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 0-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm6
+ pxor 96-128(%rax),%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm11,%xmm9
+ movdqa %xmm4,208-128(%rax)
+ paddd %xmm4,%xmm10
+ pxor %xmm2,%xmm0
+ psrld $27,%xmm9
+ pxor %xmm13,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm0,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm10
+ paddd %xmm0,%xmm0
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 16-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm6
+ pxor 112-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm10,%xmm9
+ movdqa %xmm0,224-128(%rax)
+ paddd %xmm0,%xmm14
+ pxor %xmm3,%xmm1
+ psrld $27,%xmm9
+ pxor %xmm12,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm1,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm14
+ paddd %xmm1,%xmm1
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 32-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm6
+ pxor 128-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm14,%xmm9
+ movdqa %xmm1,240-128(%rax)
+ paddd %xmm1,%xmm13
+ pxor %xmm4,%xmm2
+ psrld $27,%xmm9
+ pxor %xmm11,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm2,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm13
+ paddd %xmm2,%xmm2
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 48-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm6
+ pxor 144-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,0-128(%rax)
+ paddd %xmm2,%xmm12
+ pxor %xmm0,%xmm3
+ psrld $27,%xmm9
+ pxor %xmm10,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm3,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm12
+ paddd %xmm3,%xmm3
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 64-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm6
+ pxor 160-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm12,%xmm9
+ movdqa %xmm3,16-128(%rax)
+ paddd %xmm3,%xmm11
+ pxor %xmm1,%xmm4
+ psrld $27,%xmm9
+ pxor %xmm14,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm4,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm11
+ paddd %xmm4,%xmm4
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 80-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm6
+ pxor 176-128(%rax),%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm11,%xmm9
+ movdqa %xmm4,32-128(%rax)
+ paddd %xmm4,%xmm10
+ pxor %xmm2,%xmm0
+ psrld $27,%xmm9
+ pxor %xmm13,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm0,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm10
+ paddd %xmm0,%xmm0
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 96-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm6
+ pxor 192-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm10,%xmm9
+ movdqa %xmm0,48-128(%rax)
+ paddd %xmm0,%xmm14
+ pxor %xmm3,%xmm1
+ psrld $27,%xmm9
+ pxor %xmm12,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm1,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm14
+ paddd %xmm1,%xmm1
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 112-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm6
+ pxor 208-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm14,%xmm9
+ movdqa %xmm1,64-128(%rax)
+ paddd %xmm1,%xmm13
+ pxor %xmm4,%xmm2
+ psrld $27,%xmm9
+ pxor %xmm11,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm2,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm13
+ paddd %xmm2,%xmm2
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 128-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm6
+ pxor 224-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,80-128(%rax)
+ paddd %xmm2,%xmm12
+ pxor %xmm0,%xmm3
+ psrld $27,%xmm9
+ pxor %xmm10,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm3,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm12
+ paddd %xmm3,%xmm3
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 144-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm6
+ pxor 240-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm12,%xmm9
+ movdqa %xmm3,96-128(%rax)
+ paddd %xmm3,%xmm11
+ pxor %xmm1,%xmm4
+ psrld $27,%xmm9
+ pxor %xmm14,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm4,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm11
+ paddd %xmm4,%xmm4
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 160-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm6
+ pxor 0-128(%rax),%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm11,%xmm9
+ movdqa %xmm4,112-128(%rax)
+ paddd %xmm4,%xmm10
+ pxor %xmm2,%xmm0
+ psrld $27,%xmm9
+ pxor %xmm13,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm0,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm10
+ paddd %xmm0,%xmm0
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ movdqa 32(%rbp),%xmm15
+ pxor %xmm3,%xmm1
+ movdqa 176-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm7
+ pxor 16-128(%rax),%xmm1
+ pxor %xmm3,%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ movdqa %xmm10,%xmm9
+ pand %xmm12,%xmm7
+
+ movdqa %xmm13,%xmm6
+ movdqa %xmm1,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm14
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm0,128-128(%rax)
+ paddd %xmm0,%xmm14
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm11,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm1,%xmm1
+ paddd %xmm6,%xmm14
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 192-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm7
+ pxor 32-128(%rax),%xmm2
+ pxor %xmm4,%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ movdqa %xmm14,%xmm9
+ pand %xmm11,%xmm7
+
+ movdqa %xmm12,%xmm6
+ movdqa %xmm2,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm13
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm1,144-128(%rax)
+ paddd %xmm1,%xmm13
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm10,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm2,%xmm2
+ paddd %xmm6,%xmm13
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 208-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm7
+ pxor 48-128(%rax),%xmm3
+ pxor %xmm0,%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ movdqa %xmm13,%xmm9
+ pand %xmm10,%xmm7
+
+ movdqa %xmm11,%xmm6
+ movdqa %xmm3,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm12
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm2,160-128(%rax)
+ paddd %xmm2,%xmm12
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm14,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm3,%xmm3
+ paddd %xmm6,%xmm12
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 224-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm7
+ pxor 64-128(%rax),%xmm4
+ pxor %xmm1,%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ movdqa %xmm12,%xmm9
+ pand %xmm14,%xmm7
+
+ movdqa %xmm10,%xmm6
+ movdqa %xmm4,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm11
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm3,176-128(%rax)
+ paddd %xmm3,%xmm11
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm13,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm4,%xmm4
+ paddd %xmm6,%xmm11
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 240-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm7
+ pxor 80-128(%rax),%xmm0
+ pxor %xmm2,%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ movdqa %xmm11,%xmm9
+ pand %xmm13,%xmm7
+
+ movdqa %xmm14,%xmm6
+ movdqa %xmm0,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm10
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm4,192-128(%rax)
+ paddd %xmm4,%xmm10
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm12,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm0,%xmm0
+ paddd %xmm6,%xmm10
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 0-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm7
+ pxor 96-128(%rax),%xmm1
+ pxor %xmm3,%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ movdqa %xmm10,%xmm9
+ pand %xmm12,%xmm7
+
+ movdqa %xmm13,%xmm6
+ movdqa %xmm1,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm14
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm0,208-128(%rax)
+ paddd %xmm0,%xmm14
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm11,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm1,%xmm1
+ paddd %xmm6,%xmm14
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 16-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm7
+ pxor 112-128(%rax),%xmm2
+ pxor %xmm4,%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ movdqa %xmm14,%xmm9
+ pand %xmm11,%xmm7
+
+ movdqa %xmm12,%xmm6
+ movdqa %xmm2,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm13
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm1,224-128(%rax)
+ paddd %xmm1,%xmm13
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm10,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm2,%xmm2
+ paddd %xmm6,%xmm13
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 32-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm7
+ pxor 128-128(%rax),%xmm3
+ pxor %xmm0,%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ movdqa %xmm13,%xmm9
+ pand %xmm10,%xmm7
+
+ movdqa %xmm11,%xmm6
+ movdqa %xmm3,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm12
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm2,240-128(%rax)
+ paddd %xmm2,%xmm12
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm14,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm3,%xmm3
+ paddd %xmm6,%xmm12
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 48-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm7
+ pxor 144-128(%rax),%xmm4
+ pxor %xmm1,%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ movdqa %xmm12,%xmm9
+ pand %xmm14,%xmm7
+
+ movdqa %xmm10,%xmm6
+ movdqa %xmm4,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm11
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm3,0-128(%rax)
+ paddd %xmm3,%xmm11
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm13,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm4,%xmm4
+ paddd %xmm6,%xmm11
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 64-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm7
+ pxor 160-128(%rax),%xmm0
+ pxor %xmm2,%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ movdqa %xmm11,%xmm9
+ pand %xmm13,%xmm7
+
+ movdqa %xmm14,%xmm6
+ movdqa %xmm0,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm10
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm4,16-128(%rax)
+ paddd %xmm4,%xmm10
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm12,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm0,%xmm0
+ paddd %xmm6,%xmm10
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 80-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm7
+ pxor 176-128(%rax),%xmm1
+ pxor %xmm3,%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ movdqa %xmm10,%xmm9
+ pand %xmm12,%xmm7
+
+ movdqa %xmm13,%xmm6
+ movdqa %xmm1,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm14
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm0,32-128(%rax)
+ paddd %xmm0,%xmm14
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm11,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm1,%xmm1
+ paddd %xmm6,%xmm14
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 96-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm7
+ pxor 192-128(%rax),%xmm2
+ pxor %xmm4,%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ movdqa %xmm14,%xmm9
+ pand %xmm11,%xmm7
+
+ movdqa %xmm12,%xmm6
+ movdqa %xmm2,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm13
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm1,48-128(%rax)
+ paddd %xmm1,%xmm13
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm10,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm2,%xmm2
+ paddd %xmm6,%xmm13
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 112-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm7
+ pxor 208-128(%rax),%xmm3
+ pxor %xmm0,%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ movdqa %xmm13,%xmm9
+ pand %xmm10,%xmm7
+
+ movdqa %xmm11,%xmm6
+ movdqa %xmm3,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm12
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm2,64-128(%rax)
+ paddd %xmm2,%xmm12
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm14,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm3,%xmm3
+ paddd %xmm6,%xmm12
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 128-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm7
+ pxor 224-128(%rax),%xmm4
+ pxor %xmm1,%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ movdqa %xmm12,%xmm9
+ pand %xmm14,%xmm7
+
+ movdqa %xmm10,%xmm6
+ movdqa %xmm4,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm11
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm3,80-128(%rax)
+ paddd %xmm3,%xmm11
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm13,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm4,%xmm4
+ paddd %xmm6,%xmm11
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 144-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm7
+ pxor 240-128(%rax),%xmm0
+ pxor %xmm2,%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ movdqa %xmm11,%xmm9
+ pand %xmm13,%xmm7
+
+ movdqa %xmm14,%xmm6
+ movdqa %xmm0,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm10
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm4,96-128(%rax)
+ paddd %xmm4,%xmm10
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm12,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm0,%xmm0
+ paddd %xmm6,%xmm10
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 160-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm7
+ pxor 0-128(%rax),%xmm1
+ pxor %xmm3,%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ movdqa %xmm10,%xmm9
+ pand %xmm12,%xmm7
+
+ movdqa %xmm13,%xmm6
+ movdqa %xmm1,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm14
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm0,112-128(%rax)
+ paddd %xmm0,%xmm14
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm11,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm1,%xmm1
+ paddd %xmm6,%xmm14
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 176-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm7
+ pxor 16-128(%rax),%xmm2
+ pxor %xmm4,%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ movdqa %xmm14,%xmm9
+ pand %xmm11,%xmm7
+
+ movdqa %xmm12,%xmm6
+ movdqa %xmm2,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm13
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm1,128-128(%rax)
+ paddd %xmm1,%xmm13
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm10,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm2,%xmm2
+ paddd %xmm6,%xmm13
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 192-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm7
+ pxor 32-128(%rax),%xmm3
+ pxor %xmm0,%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ movdqa %xmm13,%xmm9
+ pand %xmm10,%xmm7
+
+ movdqa %xmm11,%xmm6
+ movdqa %xmm3,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm12
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm2,144-128(%rax)
+ paddd %xmm2,%xmm12
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm14,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm3,%xmm3
+ paddd %xmm6,%xmm12
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 208-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm7
+ pxor 48-128(%rax),%xmm4
+ pxor %xmm1,%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ movdqa %xmm12,%xmm9
+ pand %xmm14,%xmm7
+
+ movdqa %xmm10,%xmm6
+ movdqa %xmm4,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm11
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm3,160-128(%rax)
+ paddd %xmm3,%xmm11
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm13,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm4,%xmm4
+ paddd %xmm6,%xmm11
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 224-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm7
+ pxor 64-128(%rax),%xmm0
+ pxor %xmm2,%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ movdqa %xmm11,%xmm9
+ pand %xmm13,%xmm7
+
+ movdqa %xmm14,%xmm6
+ movdqa %xmm0,%xmm5
+ psrld $27,%xmm9
+ paddd %xmm7,%xmm10
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm4,176-128(%rax)
+ paddd %xmm4,%xmm10
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ pand %xmm12,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ paddd %xmm0,%xmm0
+ paddd %xmm6,%xmm10
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ movdqa 64(%rbp),%xmm15
+ pxor %xmm3,%xmm1
+ movdqa 240-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm6
+ pxor 80-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm10,%xmm9
+ movdqa %xmm0,192-128(%rax)
+ paddd %xmm0,%xmm14
+ pxor %xmm3,%xmm1
+ psrld $27,%xmm9
+ pxor %xmm12,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm1,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm14
+ paddd %xmm1,%xmm1
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 0-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm6
+ pxor 96-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm14,%xmm9
+ movdqa %xmm1,208-128(%rax)
+ paddd %xmm1,%xmm13
+ pxor %xmm4,%xmm2
+ psrld $27,%xmm9
+ pxor %xmm11,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm2,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm13
+ paddd %xmm2,%xmm2
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 16-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm6
+ pxor 112-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,224-128(%rax)
+ paddd %xmm2,%xmm12
+ pxor %xmm0,%xmm3
+ psrld $27,%xmm9
+ pxor %xmm10,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm3,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm12
+ paddd %xmm3,%xmm3
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 32-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm6
+ pxor 128-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm12,%xmm9
+ movdqa %xmm3,240-128(%rax)
+ paddd %xmm3,%xmm11
+ pxor %xmm1,%xmm4
+ psrld $27,%xmm9
+ pxor %xmm14,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm4,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm11
+ paddd %xmm4,%xmm4
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 48-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm6
+ pxor 144-128(%rax),%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm11,%xmm9
+ movdqa %xmm4,0-128(%rax)
+ paddd %xmm4,%xmm10
+ pxor %xmm2,%xmm0
+ psrld $27,%xmm9
+ pxor %xmm13,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm0,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm10
+ paddd %xmm0,%xmm0
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 64-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm6
+ pxor 160-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm10,%xmm9
+ movdqa %xmm0,16-128(%rax)
+ paddd %xmm0,%xmm14
+ pxor %xmm3,%xmm1
+ psrld $27,%xmm9
+ pxor %xmm12,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm1,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm14
+ paddd %xmm1,%xmm1
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 80-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm6
+ pxor 176-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm14,%xmm9
+ movdqa %xmm1,32-128(%rax)
+ paddd %xmm1,%xmm13
+ pxor %xmm4,%xmm2
+ psrld $27,%xmm9
+ pxor %xmm11,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm2,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm13
+ paddd %xmm2,%xmm2
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 96-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm6
+ pxor 192-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,48-128(%rax)
+ paddd %xmm2,%xmm12
+ pxor %xmm0,%xmm3
+ psrld $27,%xmm9
+ pxor %xmm10,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm3,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm12
+ paddd %xmm3,%xmm3
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 112-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm6
+ pxor 208-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm12,%xmm9
+ movdqa %xmm3,64-128(%rax)
+ paddd %xmm3,%xmm11
+ pxor %xmm1,%xmm4
+ psrld $27,%xmm9
+ pxor %xmm14,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm4,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm11
+ paddd %xmm4,%xmm4
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 128-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm6
+ pxor 224-128(%rax),%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm11,%xmm9
+ movdqa %xmm4,80-128(%rax)
+ paddd %xmm4,%xmm10
+ pxor %xmm2,%xmm0
+ psrld $27,%xmm9
+ pxor %xmm13,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm0,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm10
+ paddd %xmm0,%xmm0
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 144-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm6
+ pxor 240-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm10,%xmm9
+ movdqa %xmm0,96-128(%rax)
+ paddd %xmm0,%xmm14
+ pxor %xmm3,%xmm1
+ psrld $27,%xmm9
+ pxor %xmm12,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm1,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm14
+ paddd %xmm1,%xmm1
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 160-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm6
+ pxor 0-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm14,%xmm9
+ movdqa %xmm1,112-128(%rax)
+ paddd %xmm1,%xmm13
+ pxor %xmm4,%xmm2
+ psrld $27,%xmm9
+ pxor %xmm11,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm2,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm13
+ paddd %xmm2,%xmm2
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 176-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm6
+ pxor 16-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm13,%xmm9
+ paddd %xmm2,%xmm12
+ pxor %xmm0,%xmm3
+ psrld $27,%xmm9
+ pxor %xmm10,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm3,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm12
+ paddd %xmm3,%xmm3
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 192-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm6
+ pxor 32-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm12,%xmm9
+ paddd %xmm3,%xmm11
+ pxor %xmm1,%xmm4
+ psrld $27,%xmm9
+ pxor %xmm14,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm4,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm11
+ paddd %xmm4,%xmm4
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ pxor %xmm2,%xmm0
+ movdqa 208-128(%rax),%xmm2
+
+ movdqa %xmm11,%xmm8
+ movdqa %xmm14,%xmm6
+ pxor 48-128(%rax),%xmm0
+ paddd %xmm15,%xmm10
+ pslld $5,%xmm8
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm11,%xmm9
+ paddd %xmm4,%xmm10
+ pxor %xmm2,%xmm0
+ psrld $27,%xmm9
+ pxor %xmm13,%xmm6
+ movdqa %xmm12,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm0,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm10
+ paddd %xmm0,%xmm0
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm5,%xmm0
+ por %xmm7,%xmm12
+ pxor %xmm3,%xmm1
+ movdqa 224-128(%rax),%xmm3
+
+ movdqa %xmm10,%xmm8
+ movdqa %xmm13,%xmm6
+ pxor 64-128(%rax),%xmm1
+ paddd %xmm15,%xmm14
+ pslld $5,%xmm8
+ pxor %xmm11,%xmm6
+
+ movdqa %xmm10,%xmm9
+ paddd %xmm0,%xmm14
+ pxor %xmm3,%xmm1
+ psrld $27,%xmm9
+ pxor %xmm12,%xmm6
+ movdqa %xmm11,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm1,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm14
+ paddd %xmm1,%xmm1
+
+ psrld $2,%xmm11
+ paddd %xmm8,%xmm14
+ por %xmm5,%xmm1
+ por %xmm7,%xmm11
+ pxor %xmm4,%xmm2
+ movdqa 240-128(%rax),%xmm4
+
+ movdqa %xmm14,%xmm8
+ movdqa %xmm12,%xmm6
+ pxor 80-128(%rax),%xmm2
+ paddd %xmm15,%xmm13
+ pslld $5,%xmm8
+ pxor %xmm10,%xmm6
+
+ movdqa %xmm14,%xmm9
+ paddd %xmm1,%xmm13
+ pxor %xmm4,%xmm2
+ psrld $27,%xmm9
+ pxor %xmm11,%xmm6
+ movdqa %xmm10,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm2,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm13
+ paddd %xmm2,%xmm2
+
+ psrld $2,%xmm10
+ paddd %xmm8,%xmm13
+ por %xmm5,%xmm2
+ por %xmm7,%xmm10
+ pxor %xmm0,%xmm3
+ movdqa 0-128(%rax),%xmm0
+
+ movdqa %xmm13,%xmm8
+ movdqa %xmm11,%xmm6
+ pxor 96-128(%rax),%xmm3
+ paddd %xmm15,%xmm12
+ pslld $5,%xmm8
+ pxor %xmm14,%xmm6
+
+ movdqa %xmm13,%xmm9
+ paddd %xmm2,%xmm12
+ pxor %xmm0,%xmm3
+ psrld $27,%xmm9
+ pxor %xmm10,%xmm6
+ movdqa %xmm14,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm3,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm12
+ paddd %xmm3,%xmm3
+
+ psrld $2,%xmm14
+ paddd %xmm8,%xmm12
+ por %xmm5,%xmm3
+ por %xmm7,%xmm14
+ pxor %xmm1,%xmm4
+ movdqa 16-128(%rax),%xmm1
+
+ movdqa %xmm12,%xmm8
+ movdqa %xmm10,%xmm6
+ pxor 112-128(%rax),%xmm4
+ paddd %xmm15,%xmm11
+ pslld $5,%xmm8
+ pxor %xmm13,%xmm6
+
+ movdqa %xmm12,%xmm9
+ paddd %xmm3,%xmm11
+ pxor %xmm1,%xmm4
+ psrld $27,%xmm9
+ pxor %xmm14,%xmm6
+ movdqa %xmm13,%xmm7
+
+ pslld $30,%xmm7
+ movdqa %xmm4,%xmm5
+ por %xmm9,%xmm8
+ psrld $31,%xmm5
+ paddd %xmm6,%xmm11
+ paddd %xmm4,%xmm4
+
+ psrld $2,%xmm13
+ paddd %xmm8,%xmm11
+ por %xmm5,%xmm4
+ por %xmm7,%xmm13
+ movdqa %xmm11,%xmm8
+ paddd %xmm15,%xmm10
+ movdqa %xmm14,%xmm6
+ pslld $5,%xmm8
+ pxor %xmm12,%xmm6
+
+ movdqa %xmm11,%xmm9
+ paddd %xmm4,%xmm10
+ psrld $27,%xmm9
+ movdqa %xmm12,%xmm7
+ pxor %xmm13,%xmm6
+
+ pslld $30,%xmm7
+ por %xmm9,%xmm8
+ paddd %xmm6,%xmm10
+
+ psrld $2,%xmm12
+ paddd %xmm8,%xmm10
+ por %xmm7,%xmm12
+ movdqa (%rbx),%xmm0
+ movl $1,%ecx
+ cmpl 0(%rbx),%ecx
+ pxor %xmm8,%xmm8
+ cmovgeq %rbp,%r8
+ cmpl 4(%rbx),%ecx
+ movdqa %xmm0,%xmm1
+ cmovgeq %rbp,%r9
+ cmpl 8(%rbx),%ecx
+ pcmpgtd %xmm8,%xmm1
+ cmovgeq %rbp,%r10
+ cmpl 12(%rbx),%ecx
+ paddd %xmm1,%xmm0
+ cmovgeq %rbp,%r11
+
+ movdqu 0(%rdi),%xmm6
+ pand %xmm1,%xmm10
+ movdqu 32(%rdi),%xmm7
+ pand %xmm1,%xmm11
+ paddd %xmm6,%xmm10
+ movdqu 64(%rdi),%xmm8
+ pand %xmm1,%xmm12
+ paddd %xmm7,%xmm11
+ movdqu 96(%rdi),%xmm9
+ pand %xmm1,%xmm13
+ paddd %xmm8,%xmm12
+ movdqu 128(%rdi),%xmm5
+ pand %xmm1,%xmm14
+ movdqu %xmm10,0(%rdi)
+ paddd %xmm9,%xmm13
+ movdqu %xmm11,32(%rdi)
+ paddd %xmm5,%xmm14
+ movdqu %xmm12,64(%rdi)
+ movdqu %xmm13,96(%rdi)
+ movdqu %xmm14,128(%rdi)
+
+ movdqa %xmm0,(%rbx)
+ movdqa 96(%rbp),%xmm5
+ movdqa -32(%rbp),%xmm15
+ decl %edx
+ jnz .Loop
+
+ movl 280(%rsp),%edx
+ leaq 16(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+ decl %edx
+ jnz .Loop_grande
+
+.Ldone:
+ movq 272(%rsp),%rax
+.cfi_def_cfa %rax,8
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_multi_block,.-sha1_multi_block
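+# sha1_multi_block_shaext: SHA-NI flavour, processing the input contexts two
+# at a time (pointers in %r8/%r9, interleaved A-E state around %rdi). The
+# .byte sequences below hand-assemble sha1rnds4 (0f 3a cc), sha1nexte
+# (0f 38 c8), sha1msg1 (0f 38 c9), sha1msg2 (0f 38 ca) and pshufb for
+# toolchains that predate the SHA extensions.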
+.type sha1_multi_block_shaext,@function
+.align 32
+sha1_multi_block_shaext:
+.cfi_startproc
+_shaext_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ shll $1,%edx
+ andq $-256,%rsp
+ leaq 64(%rdi),%rdi
+ movq %rax,272(%rsp)
+.Lbody_shaext:
+ leaq 256(%rsp),%rbx
+ movdqa K_XX_XX+128(%rip),%xmm3
+
+.Loop_grande_shaext:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rsp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rsp,%r9
+ testl %edx,%edx
+ jz .Ldone_shaext
+
+ movq 0-64(%rdi),%xmm0
+ movq 32-64(%rdi),%xmm4
+ movq 64-64(%rdi),%xmm5
+ movq 96-64(%rdi),%xmm6
+ movq 128-64(%rdi),%xmm7
+
+ punpckldq %xmm4,%xmm0
+ punpckldq %xmm6,%xmm5
+
+ movdqa %xmm0,%xmm8
+ punpcklqdq %xmm5,%xmm0
+ punpckhqdq %xmm5,%xmm8
+
+ pshufd $63,%xmm7,%xmm1
+ pshufd $127,%xmm7,%xmm9
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm8,%xmm8
+ jmp .Loop_shaext
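+# Each .Loop_shaext iteration hashes one 64-byte block per lane: xmm0/xmm8
+# carry the ABCD state of lanes 0/1, xmm1/xmm9 the E word, xmm4-xmm7 and
+# xmm11-xmm14 the message schedule, and xmm3 the byte-shuffle mask loaded
+# from K_XX_XX+128.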
+
+.align 32
+.Loop_shaext:
+ movdqu 0(%r8),%xmm4
+ movdqu 0(%r9),%xmm11
+ movdqu 16(%r8),%xmm5
+ movdqu 16(%r9),%xmm12
+ movdqu 32(%r8),%xmm6
+.byte 102,15,56,0,227
+ movdqu 32(%r9),%xmm13
+.byte 102,68,15,56,0,219
+ movdqu 48(%r8),%xmm7
+ leaq 64(%r8),%r8
+.byte 102,15,56,0,235
+ movdqu 48(%r9),%xmm14
+ leaq 64(%r9),%r9
+.byte 102,68,15,56,0,227
+
+ movdqa %xmm1,80(%rsp)
+ paddd %xmm4,%xmm1
+ movdqa %xmm9,112(%rsp)
+ paddd %xmm11,%xmm9
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,96(%rsp)
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+.byte 69,15,58,204,193,0
+.byte 69,15,56,200,212
+.byte 102,15,56,0,243
+ prefetcht0 127(%r8)
+.byte 15,56,201,229
+.byte 102,68,15,56,0,235
+ prefetcht0 127(%r9)
+.byte 69,15,56,201,220
+
+.byte 102,15,56,0,251
+ movdqa %xmm0,%xmm1
+.byte 102,68,15,56,0,243
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+.byte 69,15,58,204,194,0
+.byte 69,15,56,200,205
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+ pxor %xmm13,%xmm11
+.byte 69,15,56,201,229
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+.byte 69,15,58,204,193,0
+.byte 69,15,56,200,214
+.byte 15,56,202,231
+.byte 69,15,56,202,222
+ pxor %xmm7,%xmm5
+.byte 15,56,201,247
+ pxor %xmm14,%xmm12
+.byte 69,15,56,201,238
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+.byte 69,15,58,204,194,0
+.byte 69,15,56,200,203
+.byte 15,56,202,236
+.byte 69,15,56,202,227
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+ pxor %xmm11,%xmm13
+.byte 69,15,56,201,243
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+.byte 69,15,58,204,193,0
+.byte 69,15,56,200,212
+.byte 15,56,202,245
+.byte 69,15,56,202,236
+ pxor %xmm5,%xmm7
+.byte 15,56,201,229
+ pxor %xmm12,%xmm14
+.byte 69,15,56,201,220
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+.byte 69,15,58,204,194,1
+.byte 69,15,56,200,205
+.byte 15,56,202,254
+.byte 69,15,56,202,245
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+ pxor %xmm13,%xmm11
+.byte 69,15,56,201,229
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+.byte 69,15,58,204,193,1
+.byte 69,15,56,200,214
+.byte 15,56,202,231
+.byte 69,15,56,202,222
+ pxor %xmm7,%xmm5
+.byte 15,56,201,247
+ pxor %xmm14,%xmm12
+.byte 69,15,56,201,238
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+.byte 69,15,58,204,194,1
+.byte 69,15,56,200,203
+.byte 15,56,202,236
+.byte 69,15,56,202,227
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+ pxor %xmm11,%xmm13
+.byte 69,15,56,201,243
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+.byte 69,15,58,204,193,1
+.byte 69,15,56,200,212
+.byte 15,56,202,245
+.byte 69,15,56,202,236
+ pxor %xmm5,%xmm7
+.byte 15,56,201,229
+ pxor %xmm12,%xmm14
+.byte 69,15,56,201,220
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+.byte 69,15,58,204,194,1
+.byte 69,15,56,200,205
+.byte 15,56,202,254
+.byte 69,15,56,202,245
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+ pxor %xmm13,%xmm11
+.byte 69,15,56,201,229
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+.byte 69,15,58,204,193,2
+.byte 69,15,56,200,214
+.byte 15,56,202,231
+.byte 69,15,56,202,222
+ pxor %xmm7,%xmm5
+.byte 15,56,201,247
+ pxor %xmm14,%xmm12
+.byte 69,15,56,201,238
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+.byte 69,15,58,204,194,2
+.byte 69,15,56,200,203
+.byte 15,56,202,236
+.byte 69,15,56,202,227
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+ pxor %xmm11,%xmm13
+.byte 69,15,56,201,243
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+.byte 69,15,58,204,193,2
+.byte 69,15,56,200,212
+.byte 15,56,202,245
+.byte 69,15,56,202,236
+ pxor %xmm5,%xmm7
+.byte 15,56,201,229
+ pxor %xmm12,%xmm14
+.byte 69,15,56,201,220
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+.byte 69,15,58,204,194,2
+.byte 69,15,56,200,205
+.byte 15,56,202,254
+.byte 69,15,56,202,245
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+ pxor %xmm13,%xmm11
+.byte 69,15,56,201,229
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+.byte 69,15,58,204,193,2
+.byte 69,15,56,200,214
+.byte 15,56,202,231
+.byte 69,15,56,202,222
+ pxor %xmm7,%xmm5
+.byte 15,56,201,247
+ pxor %xmm14,%xmm12
+.byte 69,15,56,201,238
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+.byte 69,15,58,204,194,3
+.byte 69,15,56,200,203
+.byte 15,56,202,236
+.byte 69,15,56,202,227
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+ pxor %xmm11,%xmm13
+.byte 69,15,56,201,243
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+.byte 69,15,58,204,193,3
+.byte 69,15,56,200,212
+.byte 15,56,202,245
+.byte 69,15,56,202,236
+ pxor %xmm5,%xmm7
+ pxor %xmm12,%xmm14
+
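+# The final rounds are interleaved with the per-lane bookkeeping: build
+# greater-than-zero masks from the counters at (%rbx), redirect input
+# pointers of lanes that just finished to %rsp, mask the feed-forward
+# against the state saved at 64(%rsp) and decrement the counters.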
+ movl $1,%ecx
+ pxor %xmm4,%xmm4
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rsp,%r8
+
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+.byte 69,15,58,204,194,3
+.byte 69,15,56,200,205
+.byte 15,56,202,254
+.byte 69,15,56,202,245
+
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rsp,%r9
+ movq (%rbx),%xmm6
+
+ movdqa %xmm0,%xmm2
+ movdqa %xmm8,%xmm10
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+.byte 69,15,58,204,193,3
+.byte 69,15,56,200,214
+
+ pshufd $0x00,%xmm6,%xmm11
+ pshufd $0x55,%xmm6,%xmm12
+ movdqa %xmm6,%xmm7
+ pcmpgtd %xmm4,%xmm11
+ pcmpgtd %xmm4,%xmm12
+
+ movdqa %xmm0,%xmm1
+ movdqa %xmm8,%xmm9
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+.byte 69,15,58,204,194,3
+.byte 68,15,56,200,204
+
+ pcmpgtd %xmm4,%xmm7
+ pand %xmm11,%xmm0
+ pand %xmm11,%xmm1
+ pand %xmm12,%xmm8
+ pand %xmm12,%xmm9
+ paddd %xmm7,%xmm6
+
+ paddd 64(%rsp),%xmm0
+ paddd 80(%rsp),%xmm1
+ paddd 96(%rsp),%xmm8
+ paddd 112(%rsp),%xmm9
+
+ movq %xmm6,(%rbx)
+ decl %edx
+ jnz .Loop_shaext
+
+ movl 280(%rsp),%edx
+
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm8,%xmm8
+
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm8,%xmm0
+ punpckhdq %xmm8,%xmm6
+ punpckhdq %xmm9,%xmm1
+ movq %xmm0,0-64(%rdi)
+ psrldq $8,%xmm0
+ movq %xmm6,64-64(%rdi)
+ psrldq $8,%xmm6
+ movq %xmm0,32-64(%rdi)
+ psrldq $8,%xmm1
+ movq %xmm6,96-64(%rdi)
+ movq %xmm1,128-64(%rdi)
+
+ leaq 8(%rdi),%rdi
+ leaq 32(%rsi),%rsi
+ decl %edx
+ jnz .Loop_grande_shaext
+
+.Ldone_shaext:
+
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_shaext:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
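+# sha1_multi_block_avx: 4-lane AVX flavour. The shortcut below re-examines
+# the capability bits in %rcx and, when at least two inputs are queued and
+# the AVX2 bit is set, branches to _avx2_shortcut instead of the XMM path.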
+.type sha1_multi_block_avx,@function
+.align 32
+sha1_multi_block_avx:
+.cfi_startproc
+_avx_shortcut:
+ shrq $32,%rcx
+ cmpl $2,%edx
+ jb .Lavx
+ testl $32,%ecx
+ jnz _avx2_shortcut
+ jmp .Lavx
+.align 32
+.Lavx:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ andq $-256,%rsp
+ movq %rax,272(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
+.Lbody_avx:
+ leaq K_XX_XX(%rip),%rbp
+ leaq 256(%rsp),%rbx
+
+ vzeroupper
+.Loop_grande_avx:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r9
+ movq 32(%rsi),%r10
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r10
+ movq 48(%rsi),%r11
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r11
+ testl %edx,%edx
+ jz .Ldone_avx
+
+ vmovdqu 0(%rdi),%xmm10
+ leaq 128(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm11
+ vmovdqu 64(%rdi),%xmm12
+ vmovdqu 96(%rdi),%xmm13
+ vmovdqu 128(%rdi),%xmm14
+ vmovdqu 96(%rbp),%xmm5
+ jmp .Loop_avx
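+# .Loop_avx runs four SHA-1 instances side by side, one 32-bit lane per
+# message: xmm10-xmm14 are the A-E state vectors, xmm15 the current round
+# constant (-32/0/32/64(%rbp)), xmm5 the byte-swap mask from 96(%rbp), and
+# the 16-entry message schedule lives in the 256-byte scratch area at the
+# bottom of the frame, addressed as NN-128(%rax).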
+
+.align 32
+.Loop_avx:
+ vmovdqa -32(%rbp),%xmm15
+ vmovd (%r8),%xmm0
+ leaq 64(%r8),%r8
+ vmovd (%r9),%xmm2
+ leaq 64(%r9),%r9
+ vpinsrd $1,(%r10),%xmm0,%xmm0
+ leaq 64(%r10),%r10
+ vpinsrd $1,(%r11),%xmm2,%xmm2
+ leaq 64(%r11),%r11
+ vmovd -60(%r8),%xmm1
+ vpunpckldq %xmm2,%xmm0,%xmm0
+ vmovd -60(%r9),%xmm9
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpinsrd $1,-60(%r10),%xmm1,%xmm1
+ vpinsrd $1,-60(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,0-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -56(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -56(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-56(%r10),%xmm2,%xmm2
+ vpinsrd $1,-56(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,16-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -52(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -52(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-52(%r10),%xmm3,%xmm3
+ vpinsrd $1,-52(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,32-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -48(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -48(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-48(%r10),%xmm4,%xmm4
+ vpinsrd $1,-48(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,48-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -44(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -44(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpinsrd $1,-44(%r10),%xmm0,%xmm0
+ vpinsrd $1,-44(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,64-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -40(%r8),%xmm1
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -40(%r9),%xmm9
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpinsrd $1,-40(%r10),%xmm1,%xmm1
+ vpinsrd $1,-40(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,80-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -36(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -36(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-36(%r10),%xmm2,%xmm2
+ vpinsrd $1,-36(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,96-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -32(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -32(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-32(%r10),%xmm3,%xmm3
+ vpinsrd $1,-32(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,112-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -28(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -28(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-28(%r10),%xmm4,%xmm4
+ vpinsrd $1,-28(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,128-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -24(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -24(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpinsrd $1,-24(%r10),%xmm0,%xmm0
+ vpinsrd $1,-24(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,144-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -20(%r8),%xmm1
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -20(%r9),%xmm9
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpinsrd $1,-20(%r10),%xmm1,%xmm1
+ vpinsrd $1,-20(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,160-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -16(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -16(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-16(%r10),%xmm2,%xmm2
+ vpinsrd $1,-16(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,176-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -12(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -12(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-12(%r10),%xmm3,%xmm3
+ vpinsrd $1,-12(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,192-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -8(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -8(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-8(%r10),%xmm4,%xmm4
+ vpinsrd $1,-8(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,208-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -4(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -4(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vmovdqa 0-128(%rax),%xmm1
+ vpinsrd $1,-4(%r10),%xmm0,%xmm0
+ vpinsrd $1,-4(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ prefetcht0 63(%r8)
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,224-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ prefetcht0 63(%r9)
+ vpxor %xmm7,%xmm6,%xmm6
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ prefetcht0 63(%r10)
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ prefetcht0 63(%r11)
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 16-128(%rax),%xmm2
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 32-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,240-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 128-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 48-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,0-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 144-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 64-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,16-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 160-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 80-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,32-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 176-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 96-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,48-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 192-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 0(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 112-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,64-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 208-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 128-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,80-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 224-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 144-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,96-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 240-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 160-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,112-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 0-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 176-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,128-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 16-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 192-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,144-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 32-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 208-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,160-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 48-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 224-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,176-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 64-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 240-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,192-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 80-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 0-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,208-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 96-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 16-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,224-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 112-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 32-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,240-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 128-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 48-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,0-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 144-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 64-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,16-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 160-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 80-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,32-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 176-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 96-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,48-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 192-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 112-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,64-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 208-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 128-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,80-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 224-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 144-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,96-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 240-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 160-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,112-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 0-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 32(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 176-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 16-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,128-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 192-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 32-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,144-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 208-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 48-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,160-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 224-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 64-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,176-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 240-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 80-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,192-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 0-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 96-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,208-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 16-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 112-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,224-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 32-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 128-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,240-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 48-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 144-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,0-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 64-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 160-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,16-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 80-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 176-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,32-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 96-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 192-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,48-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 112-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 208-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,64-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 128-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 224-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,80-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 144-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 240-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,96-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 160-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 0-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,112-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 176-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 16-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,128-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 192-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 32-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,144-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 208-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 48-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,160-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 224-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 64-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,176-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 64(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 240-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,192-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 80-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 0-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,208-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 96-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 16-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,224-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 112-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 32-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,240-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 128-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 48-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,0-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 144-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 64-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,16-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 160-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 80-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,32-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 176-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 96-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,48-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 192-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 112-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,64-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 208-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 128-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,80-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 224-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 144-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,96-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 240-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 160-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,112-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 0-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 176-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 16-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 192-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 32-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 208-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 48-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 224-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 64-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 240-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 80-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 0-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 96-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 16-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 112-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+
+ vpsrld $27,%xmm11,%xmm9
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor %xmm13,%xmm6,%xmm6
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm7,%xmm12,%xmm12
+ movl $1,%ecx
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqu (%rbx),%xmm6
+ vpxor %xmm8,%xmm8,%xmm8
+ vmovdqa %xmm6,%xmm7
+ vpcmpgtd %xmm8,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpand %xmm7,%xmm10,%xmm10
+ vpand %xmm7,%xmm11,%xmm11
+ vpaddd 0(%rdi),%xmm10,%xmm10
+ vpand %xmm7,%xmm12,%xmm12
+ vpaddd 32(%rdi),%xmm11,%xmm11
+ vpand %xmm7,%xmm13,%xmm13
+ vpaddd 64(%rdi),%xmm12,%xmm12
+ vpand %xmm7,%xmm14,%xmm14
+ vpaddd 96(%rdi),%xmm13,%xmm13
+ vpaddd 128(%rdi),%xmm14,%xmm14
+ vmovdqu %xmm10,0(%rdi)
+ vmovdqu %xmm11,32(%rdi)
+ vmovdqu %xmm12,64(%rdi)
+ vmovdqu %xmm13,96(%rdi)
+ vmovdqu %xmm14,128(%rdi)
+
+ vmovdqu %xmm6,(%rbx)
+ vmovdqu 96(%rbp),%xmm5
+ decl %edx
+ jnz .Loop_avx
+
+ movl 280(%rsp),%edx
+ leaq 16(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+ decl %edx
+ jnz .Loop_grande_avx
+
+.Ldone_avx:
+ movq 272(%rsp),%rax
+.cfi_def_cfa %rax,8
+ vzeroupper
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_multi_block_avx,.-sha1_multi_block_avx
+.type sha1_multi_block_avx2,@function
+.align 32
+sha1_multi_block_avx2:
+.cfi_startproc
+_avx2_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $576,%rsp
+ andq $-256,%rsp
+ movq %rax,544(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08
+.Lbody_avx2:
+ leaq K_XX_XX(%rip),%rbp
+ shrl $1,%edx
+
+ vzeroupper
+.Loop_grande_avx2:
+ movl %edx,552(%rsp)
+ xorl %edx,%edx
+ leaq 512(%rsp),%rbx
+ movq 0(%rsi),%r12
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r12
+ movq 16(%rsi),%r13
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r13
+ movq 32(%rsi),%r14
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r14
+ movq 48(%rsi),%r15
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r15
+ movq 64(%rsi),%r8
+ movl 72(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,16(%rbx)
+ cmovleq %rbp,%r8
+ movq 80(%rsi),%r9
+ movl 88(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,20(%rbx)
+ cmovleq %rbp,%r9
+ movq 96(%rsi),%r10
+ movl 104(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,24(%rbx)
+ cmovleq %rbp,%r10
+ movq 112(%rsi),%r11
+ movl 120(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,28(%rbx)
+ cmovleq %rbp,%r11
+ vmovdqu 0(%rdi),%ymm0
+ leaq 128(%rsp),%rax
+ vmovdqu 32(%rdi),%ymm1
+ leaq 256+128(%rsp),%rbx
+ vmovdqu 64(%rdi),%ymm2
+ vmovdqu 96(%rdi),%ymm3
+ vmovdqu 128(%rdi),%ymm4
+ vmovdqu 96(%rbp),%ymm9
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+ vmovdqa -32(%rbp),%ymm15
+ vmovd (%r12),%xmm10
+ leaq 64(%r12),%r12
+ vmovd (%r8),%xmm12
+ leaq 64(%r8),%r8
+ vmovd (%r13),%xmm7
+ leaq 64(%r13),%r13
+ vmovd (%r9),%xmm6
+ leaq 64(%r9),%r9
+ vpinsrd $1,(%r14),%xmm10,%xmm10
+ leaq 64(%r14),%r14
+ vpinsrd $1,(%r10),%xmm12,%xmm12
+ leaq 64(%r10),%r10
+ vpinsrd $1,(%r15),%xmm7,%xmm7
+ leaq 64(%r15),%r15
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,(%r11),%xmm6,%xmm6
+ leaq 64(%r11),%r11
+ vpunpckldq %ymm6,%ymm12,%ymm12
+ vmovd -60(%r12),%xmm11
+ vinserti128 $1,%xmm12,%ymm10,%ymm10
+ vmovd -60(%r8),%xmm8
+ vpshufb %ymm9,%ymm10,%ymm10
+ vmovd -60(%r13),%xmm7
+ vmovd -60(%r9),%xmm6
+ vpinsrd $1,-60(%r14),%xmm11,%xmm11
+ vpinsrd $1,-60(%r10),%xmm8,%xmm8
+ vpinsrd $1,-60(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-60(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,0-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -56(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -56(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -56(%r13),%xmm7
+ vmovd -56(%r9),%xmm6
+ vpinsrd $1,-56(%r14),%xmm12,%xmm12
+ vpinsrd $1,-56(%r10),%xmm8,%xmm8
+ vpinsrd $1,-56(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-56(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,32-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -52(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -52(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -52(%r13),%xmm7
+ vmovd -52(%r9),%xmm6
+ vpinsrd $1,-52(%r14),%xmm13,%xmm13
+ vpinsrd $1,-52(%r10),%xmm8,%xmm8
+ vpinsrd $1,-52(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-52(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,64-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -48(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -48(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -48(%r13),%xmm7
+ vmovd -48(%r9),%xmm6
+ vpinsrd $1,-48(%r14),%xmm14,%xmm14
+ vpinsrd $1,-48(%r10),%xmm8,%xmm8
+ vpinsrd $1,-48(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-48(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,96-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -44(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -44(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovd -44(%r13),%xmm7
+ vmovd -44(%r9),%xmm6
+ vpinsrd $1,-44(%r14),%xmm10,%xmm10
+ vpinsrd $1,-44(%r10),%xmm8,%xmm8
+ vpinsrd $1,-44(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-44(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,128-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -40(%r12),%xmm11
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -40(%r8),%xmm8
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovd -40(%r13),%xmm7
+ vmovd -40(%r9),%xmm6
+ vpinsrd $1,-40(%r14),%xmm11,%xmm11
+ vpinsrd $1,-40(%r10),%xmm8,%xmm8
+ vpinsrd $1,-40(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-40(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,160-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -36(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -36(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -36(%r13),%xmm7
+ vmovd -36(%r9),%xmm6
+ vpinsrd $1,-36(%r14),%xmm12,%xmm12
+ vpinsrd $1,-36(%r10),%xmm8,%xmm8
+ vpinsrd $1,-36(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-36(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,192-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -32(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -32(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -32(%r13),%xmm7
+ vmovd -32(%r9),%xmm6
+ vpinsrd $1,-32(%r14),%xmm13,%xmm13
+ vpinsrd $1,-32(%r10),%xmm8,%xmm8
+ vpinsrd $1,-32(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-32(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,224-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -28(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -28(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -28(%r13),%xmm7
+ vmovd -28(%r9),%xmm6
+ vpinsrd $1,-28(%r14),%xmm14,%xmm14
+ vpinsrd $1,-28(%r10),%xmm8,%xmm8
+ vpinsrd $1,-28(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-28(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,256-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -24(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -24(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovd -24(%r13),%xmm7
+ vmovd -24(%r9),%xmm6
+ vpinsrd $1,-24(%r14),%xmm10,%xmm10
+ vpinsrd $1,-24(%r10),%xmm8,%xmm8
+ vpinsrd $1,-24(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-24(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,288-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -20(%r12),%xmm11
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -20(%r8),%xmm8
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovd -20(%r13),%xmm7
+ vmovd -20(%r9),%xmm6
+ vpinsrd $1,-20(%r14),%xmm11,%xmm11
+ vpinsrd $1,-20(%r10),%xmm8,%xmm8
+ vpinsrd $1,-20(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-20(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,320-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -16(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -16(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -16(%r13),%xmm7
+ vmovd -16(%r9),%xmm6
+ vpinsrd $1,-16(%r14),%xmm12,%xmm12
+ vpinsrd $1,-16(%r10),%xmm8,%xmm8
+ vpinsrd $1,-16(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-16(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,352-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -12(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -12(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -12(%r13),%xmm7
+ vmovd -12(%r9),%xmm6
+ vpinsrd $1,-12(%r14),%xmm13,%xmm13
+ vpinsrd $1,-12(%r10),%xmm8,%xmm8
+ vpinsrd $1,-12(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-12(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,384-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -8(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -8(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -8(%r13),%xmm7
+ vmovd -8(%r9),%xmm6
+ vpinsrd $1,-8(%r14),%xmm14,%xmm14
+ vpinsrd $1,-8(%r10),%xmm8,%xmm8
+ vpinsrd $1,-8(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-8(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,416-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -4(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -4(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovdqa 0-128(%rax),%ymm11
+ vmovd -4(%r13),%xmm7
+ vmovd -4(%r9),%xmm6
+ vpinsrd $1,-4(%r14),%xmm10,%xmm10
+ vpinsrd $1,-4(%r10),%xmm8,%xmm8
+ vpinsrd $1,-4(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-4(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ prefetcht0 63(%r12)
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,448-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ prefetcht0 63(%r13)
+ vpxor %ymm6,%ymm5,%ymm5
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ prefetcht0 63(%r14)
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ prefetcht0 63(%r15)
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 32-128(%rax),%ymm12
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 64-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ prefetcht0 63(%r8)
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,480-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 256-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+ prefetcht0 63(%r9)
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ prefetcht0 63(%r10)
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ prefetcht0 63(%r11)
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 96-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,0-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 288-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 128-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,32-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 320-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 160-128(%rax),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,64-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 352-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 192-128(%rax),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,96-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 384-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 0(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 224-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,128-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 416-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 256-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,160-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 448-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 288-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,192-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 480-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 320-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,224-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 0-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 352-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,256-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 32-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 384-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,288-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 64-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 416-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,320-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 96-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 448-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,352-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 128-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 480-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,384-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 160-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 0-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,416-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 192-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 32-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,448-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 224-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 64-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,480-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 256-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 96-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,0-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 288-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 128-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,32-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 320-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 160-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,64-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 352-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 192-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,96-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 384-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 224-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,128-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 416-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 256-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,160-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 448-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 288-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,192-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 480-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 320-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,224-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 0-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 32(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 352-256-128(%rbx),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 32-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,256-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 384-256-128(%rbx),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 64-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,288-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 416-256-128(%rbx),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 96-128(%rax),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,320-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 448-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 128-128(%rax),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,352-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 480-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 160-128(%rax),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,384-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 0-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 192-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,416-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 32-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 224-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,448-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 64-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 256-256-128(%rbx),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,480-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 96-128(%rax),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 288-256-128(%rbx),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,0-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 128-128(%rax),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 320-256-128(%rbx),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,32-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 160-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 352-256-128(%rbx),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,64-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 192-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 384-256-128(%rbx),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,96-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 224-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 416-256-128(%rbx),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,128-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 256-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 448-256-128(%rbx),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,160-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 288-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 480-256-128(%rbx),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,192-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 320-256-128(%rbx),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 0-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,224-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 352-256-128(%rbx),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 32-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,256-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 384-256-128(%rbx),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 64-128(%rax),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,288-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 416-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 96-128(%rax),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,320-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 448-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 128-128(%rax),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,352-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 64(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 480-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,384-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 160-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 0-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,416-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 192-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 32-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,448-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 224-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 64-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,480-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 256-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 96-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,0-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 288-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 128-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,32-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 320-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 160-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,64-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 352-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 192-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,96-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 384-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 224-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,128-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 416-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 256-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,160-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 448-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 288-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,192-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 480-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 320-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,224-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 0-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 352-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 32-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 384-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 64-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 416-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 96-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 448-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 128-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 480-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 160-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 0-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 192-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 32-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 224-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+
+ vpsrld $27,%ymm1,%ymm8
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor %ymm3,%ymm5,%ymm5
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm6,%ymm2,%ymm2
+ movl $1,%ecx
+ leaq 512(%rsp),%rbx
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r12
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r13
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r14
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r15
+ cmpl 16(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 20(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 24(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 28(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqu (%rbx),%ymm5
+ vpxor %ymm7,%ymm7,%ymm7
+ vmovdqa %ymm5,%ymm6
+ vpcmpgtd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm5,%ymm5
+
+ vpand %ymm6,%ymm0,%ymm0
+ vpand %ymm6,%ymm1,%ymm1
+ vpaddd 0(%rdi),%ymm0,%ymm0
+ vpand %ymm6,%ymm2,%ymm2
+ vpaddd 32(%rdi),%ymm1,%ymm1
+ vpand %ymm6,%ymm3,%ymm3
+ vpaddd 64(%rdi),%ymm2,%ymm2
+ vpand %ymm6,%ymm4,%ymm4
+ vpaddd 96(%rdi),%ymm3,%ymm3
+ vpaddd 128(%rdi),%ymm4,%ymm4
+ vmovdqu %ymm0,0(%rdi)
+ vmovdqu %ymm1,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm3,96(%rdi)
+ vmovdqu %ymm4,128(%rdi)
+
+ vmovdqu %ymm5,(%rbx)
+ leaq 256+128(%rsp),%rbx
+ vmovdqu 96(%rbp),%ymm9
+ decl %edx
+ jnz .Loop_avx2
+
+
+
+
+
+
+
+.Ldone_avx2:
+ movq 544(%rsp),%rax
+.cfi_def_cfa %rax,8
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
+
+.align 256
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+K_XX_XX:
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha1-x86_64.s b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha1-x86_64.s
new file mode 100644
index 0000000000..e436521a04
--- /dev/null
+++ b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha1-x86_64.s
@@ -0,0 +1,5450 @@
+.text
+
+
+.globl sha1_block_data_order
+.type sha1_block_data_order,@function
+.align 16
+sha1_block_data_order:
+.cfi_startproc
+ movl OPENSSL_ia32cap_P+0(%rip),%r9d
+ movl OPENSSL_ia32cap_P+4(%rip),%r8d
+ movl OPENSSL_ia32cap_P+8(%rip),%r10d
+ testl $512,%r8d
+ jz .Lialu
+ testl $536870912,%r10d
+ jnz _shaext_shortcut
+ andl $296,%r10d
+ cmpl $296,%r10d
+ je _avx2_shortcut
+ andl $268435456,%r8d
+ andl $1073741824,%r9d
+ orl %r9d,%r8d
+ cmpl $1342177280,%r8d
+ je _avx_shortcut
+ jmp _ssse3_shortcut
+
+.align 16
+.Lialu:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ movq %rdi,%r8
+ subq $72,%rsp
+ movq %rsi,%r9
+ andq $-64,%rsp
+ movq %rdx,%r10
+ movq %rax,64(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08
+.Lprologue:
+
+ movl 0(%r8),%esi
+ movl 4(%r8),%edi
+ movl 8(%r8),%r11d
+ movl 12(%r8),%r12d
+ movl 16(%r8),%r13d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movl 0(%r9),%edx
+ bswapl %edx
+ movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ecx
+ bswapl %ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ecx
+ bswapl %r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ecx
+ bswapl %edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ecx
+ bswapl %ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ecx
+ bswapl %r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ecx
+ bswapl %edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%r14,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ecx
+ bswapl %ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rdx,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ecx
+ bswapl %r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rbp,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ecx
+ bswapl %edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%r14,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ecx
+ bswapl %ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rdx,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ecx
+ bswapl %r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rbp,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ecx
+ bswapl %edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%r14,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %r12d,%ecx
+ bswapl %ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rdx,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r11d,%ecx
+ bswapl %r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rbp,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %edi,%ecx
+ bswapl %edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%r14,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
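+# Rounds 16..19: same F and constant, but from here on each message word is
+# expanded on the fly as W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1).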
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %esi,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ roll $1,%ebp
+ addl %eax,%r13d
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %r13d,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ roll $1,%r14d
+ addl %eax,%r12d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %r12d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ roll $1,%edx
+ addl %eax,%r11d
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r11d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ roll $30,%r12d
+ xorl %esi,%eax
+ addl %ecx,%edi
+ roll $1,%ebp
+ addl %eax,%edi
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %edi,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ roll $30,%r11d
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ roll $1,%r14d
+ addl %eax,%esi
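+# Rounds 20..39: F becomes the parity function b ^ c ^ d and the round constant
+# switches to 0x6ed9eba1 (1859775393).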
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
+ movl %esi,%ecx
+ xorl 28(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
+ movl %r13d,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
+ movl %r12d,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
+ movl %r11d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,32(%rsp)
+ movl %edi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,36(%rsp)
+ movl %esi,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
+ movl %r13d,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
+ movl %r12d,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
+ movl %r11d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
+ movl %edi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,56(%rsp)
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,60(%rsp)
+ movl %r13d,%ecx
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%r14d
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
+ movl %r12d,%ecx
+ xorl 12(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
+ movl %r11d,%ecx
+ xorl 16(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
+ movl %edi,%ecx
+ xorl 20(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,16(%rsp)
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,20(%rsp)
+ movl %r12d,%ecx
+ xorl 32(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
+ movl %r11d,%ecx
+ xorl 36(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
+ movl %edi,%ecx
+ xorl 40(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
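+# Rounds 40..59: F is the majority function Maj(b,c,d) and the round constant is
+# 0x8f1bbcdc (encoded as the signed immediate -1894007588).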
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
+ xorl 48(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,40(%rsp)
+ movl %edi,%ebx
+ xorl 52(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,44(%rsp)
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 56(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,52(%rsp)
+ movl %r12d,%ebx
+ xorl 0(%rsp),%ebp
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%ebp
+ leal -1894007588(%rdx,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%ebp
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 60(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,56(%rsp)
+ movl %r11d,%ebx
+ xorl 4(%rsp),%r14d
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%r14d
+ leal -1894007588(%rbp,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%r14d
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,60(%rsp)
+ movl %edi,%ebx
+ xorl 8(%rsp),%edx
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ leal -1894007588(%r14,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%edx
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 4(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ebx
+ xorl 12(%rsp),%ebp
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ leal -1894007588(%rdx,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%ebp
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 8(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ebx
+ xorl 16(%rsp),%r14d
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%r14d
+ leal -1894007588(%rbp,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%r14d
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 12(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ebx
+ xorl 20(%rsp),%edx
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 44(%rsp),%edx
+ leal -1894007588(%r14,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%edx
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 16(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ebx
+ xorl 24(%rsp),%ebp
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 48(%rsp),%ebp
+ leal -1894007588(%rdx,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%ebp
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 20(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ebx
+ xorl 28(%rsp),%r14d
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 52(%rsp),%r14d
+ leal -1894007588(%rbp,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%r14d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ebx
+ xorl 32(%rsp),%edx
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 56(%rsp),%edx
+ leal -1894007588(%r14,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%edx
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 28(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ebx
+ xorl 36(%rsp),%ebp
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 60(%rsp),%ebp
+ leal -1894007588(%rdx,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%ebp
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 32(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ebx
+ xorl 40(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 0(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 36(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ebx
+ xorl 44(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 4(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 40(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ebx
+ xorl 48(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 8(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 44(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ebx
+ xorl 52(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 12(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ebx
+ xorl 56(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 16(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
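+# Rounds 60..79: parity function again, with round constant 0xca62c1d6 (encoded
+# as the signed immediate -899497514).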
+ xorl 52(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %esi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 56(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r13d,%ecx
+ xorl 0(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 60(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %r12d,%ecx
+ xorl 4(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %r11d,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %edi,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %esi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ leal -899497514(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r13d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %r12d,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 20(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,16(%rsp)
+ movl %r11d,%ecx
+ xorl 28(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal -899497514(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,20(%rsp)
+ movl %edi,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,24(%rsp)
+ movl %esi,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,28(%rsp)
+ movl %r13d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal -899497514(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %r11d,%eax
+
+ movl %edi,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal -899497514(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %edi,%eax
+
+ movl %esi,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %esi,%eax
+
+ movl %r13d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %r11d,%eax
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ leal -899497514(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
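+# End of the 80 rounds: add the working variables back into the hash state,
+# advance the input pointer and loop while whole blocks remain.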
+ addl 0(%r8),%esi
+ addl 4(%r8),%edi
+ addl 8(%r8),%r11d
+ addl 12(%r8),%r12d
+ addl 16(%r8),%r13d
+ movl %esi,0(%r8)
+ movl %edi,4(%r8)
+ movl %r11d,8(%r8)
+ movl %r12d,12(%r8)
+ movl %r13d,16(%r8)
+
+ subq $1,%r10
+ leaq 64(%r9),%r9
+ jnz .Lloop
+
+ movq 64(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order,.-sha1_block_data_order
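+# SHA-NI path: the SHA extension instructions (sha1rnds4, sha1nexte, sha1msg1,
+# sha1msg2) and pshufb appear below as raw .byte sequences so that assemblers
+# without those mnemonics can still build this file.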
+.type sha1_block_data_order_shaext,@function
+.align 32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+.cfi_startproc
+ movdqu (%rdi),%xmm0
+ movd 16(%rdi),%xmm1
+ movdqa K_XX_XX+160(%rip),%xmm3
+
+ movdqu (%rsi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%rsi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+ movdqa %xmm1,%xmm9
+.byte 102,15,56,0,251
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
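+# One 64-byte block per iteration; cmovne only advances the input pointer while
+# blocks remain, so the loads of the next block near the bottom of the loop
+# re-read the current one on the final pass instead of running past the input.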
+ decq %rdx
+ leaq 64(%rsi),%r8
+ paddd %xmm4,%xmm1
+ cmovneq %r8,%rsi
+ movdqa %xmm0,%xmm8
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%rsi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%rsi),%xmm5
+.byte 102,15,56,0,227
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,235
+
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,243
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 65,15,56,200,201
+.byte 102,15,56,0,251
+
+ paddd %xmm8,%xmm0
+ movdqa %xmm1,%xmm9
+
+ jnz .Loop_shaext
+
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%rdi)
+ movd %xmm1,16(%rdi)
+.cfi_endproc
+ .byte 0xf3,0xc3
+.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
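+# SSSE3 path: the byte-swap mask and round constants come from the K_XX_XX table
+# addressed through %r14; message-schedule updates are carried out four words at
+# a time in XMM registers, interleaved with the scalar rounds.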
+.type sha1_block_data_order_ssse3,@function
+.align 16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ leaq -64(%rsp),%rsp
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ addq $64,%r9
+ paddd %xmm9,%xmm0
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ rorl $2,%ebx
+ pshufd $238,%xmm0,%xmm4
+ xorl %edx,%esi
+ movdqa %xmm3,%xmm8
+ paddd %xmm3,%xmm9
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ punpcklqdq %xmm1,%xmm4
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ psrldq $4,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm2,%xmm8
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ pxor %xmm8,%xmm4
+ xorl %ebx,%eax
+ roll $5,%ebp
+ movdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm10
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ movdqa %xmm4,%xmm8
+ xorl %ebx,%esi
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ psrld $31,%xmm8
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm9
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ psrld $30,%xmm10
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm8,%xmm4
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ xorl %ebp,%edx
+ movdqa -64(%r14),%xmm10
+ roll $5,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ pxor %xmm9,%xmm4
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pshufd $238,%xmm1,%xmm5
+ xorl %ebp,%esi
+ movdqa %xmm4,%xmm9
+ paddd %xmm4,%xmm10
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ punpcklqdq %xmm2,%xmm5
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm9
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ pxor %xmm9,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm10,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ movdqa %xmm5,%xmm9
+ xorl %ecx,%esi
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ psrld $31,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm10
+ andl %eax,%edi
+ xorl %ebx,%eax
+ psrld $30,%xmm8
+ addl %ebp,%edx
+ rorl $7,%ebp
+ por %xmm9,%xmm5
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ xorl %eax,%ebp
+ movdqa -32(%r14),%xmm8
+ roll $5,%edx
+ addl %edi,%ecx
+ andl %ebp,%esi
+ pxor %xmm10,%xmm5
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ rorl $7,%edx
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%esi
+ movdqa %xmm5,%xmm10
+ paddd %xmm5,%xmm8
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ punpcklqdq %xmm3,%xmm6
+ xorl %ebp,%edx
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm10
+ andl %edx,%edi
+ xorl %ebp,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm10
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ pxor %xmm10,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm8,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm9
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm10
+ xorl %edx,%esi
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ psrld $31,%xmm10
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm9,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ psrld $30,%xmm9
+ addl %eax,%ebp
+ rorl $7,%eax
+ por %xmm10,%xmm6
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ xorl %ebx,%eax
+ movdqa -32(%r14),%xmm9
+ roll $5,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ pxor %xmm8,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%esi
+ movdqa %xmm6,%xmm8
+ paddd %xmm6,%xmm9
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ punpcklqdq %xmm4,%xmm7
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm8
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm8
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ pxor %xmm8,%xmm7
+ xorl %ebp,%edx
+ roll $5,%ecx
+ movdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm10
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm8
+ xorl %ebp,%esi
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ psrld $31,%xmm8
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa %xmm10,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ psrld $30,%xmm10
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm8,%xmm7
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ xorl %ecx,%ebx
+ movdqa -32(%r14),%xmm10
+ roll $5,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ pxor %xmm9,%xmm7
+ pshufd $238,%xmm6,%xmm9
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ punpcklqdq %xmm7,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%edi
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%eax
+ paddd %xmm7,%xmm10
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ movdqa %xmm0,%xmm9
+ xorl %eax,%ebp
+ roll $5,%edx
+ movdqa %xmm10,48(%rsp)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ pslld $2,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ psrld $30,%xmm9
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ por %xmm9,%xmm0
+ xorl %ebp,%edx
+ roll $5,%ecx
+ pshufd $238,%xmm7,%xmm10
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm0,%xmm10
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebx
+ paddd %xmm0,%xmm8
+ addl %eax,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm8,0(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 24(%rsp),%ecx
+ pslld $2,%xmm1
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm10
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm10,%xmm1
+ addl %edx,%ecx
+ addl 28(%rsp),%ebx
+ pshufd $238,%xmm0,%xmm8
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ punpcklqdq %xmm1,%xmm8
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa 0(%r14),%xmm10
+ rorl $7,%ecx
+ paddd %xmm1,%xmm9
+ addl %ebx,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ movdqa %xmm9,16(%rsp)
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 40(%rsp),%edx
+ pslld $2,%xmm2
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ psrld $30,%xmm8
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ por %xmm8,%xmm2
+ addl %ebp,%edx
+ addl 44(%rsp),%ecx
+ pshufd $238,%xmm1,%xmm9
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ punpcklqdq %xmm2,%xmm9
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ movdqa %xmm10,%xmm8
+ rorl $7,%edx
+ paddd %xmm2,%xmm10
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ addl %edi,%eax
+ xorl %edx,%esi
+ movdqa %xmm10,32(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 56(%rsp),%ebp
+ pslld $2,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%edi
+ psrld $30,%xmm9
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ por %xmm9,%xmm3
+ addl %eax,%ebp
+ addl 60(%rsp),%edx
+ pshufd $238,%xmm2,%xmm10
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ pxor %xmm0,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ punpcklqdq %xmm3,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebp
+ paddd %xmm3,%xmm8
+ addl %edx,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ movdqa %xmm8,48(%rsp)
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 8(%rsp),%eax
+ pslld $2,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%edi
+ psrld $30,%xmm10
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ por %xmm10,%xmm4
+ addl %ebx,%eax
+ addl 12(%rsp),%ebp
+ pshufd $238,%xmm3,%xmm8
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ pxor %xmm1,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ punpcklqdq %xmm4,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%eax
+ paddd %xmm4,%xmm9
+ addl %ebp,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ addl %edi,%ecx
+ xorl %eax,%esi
+ movdqa %xmm9,0(%rsp)
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 24(%rsp),%ebx
+ pslld $2,%xmm5
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ psrld $30,%xmm8
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ por %xmm8,%xmm5
+ addl %ecx,%ebx
+ addl 28(%rsp),%eax
+ pshufd $238,%xmm4,%xmm9
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pxor %xmm2,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ punpcklqdq %xmm5,%xmm9
+ movl %eax,%edi
+ xorl %ecx,%esi
+ pxor %xmm7,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%edi
+ paddd %xmm5,%xmm10
+ xorl %ecx,%ebx
+ pxor %xmm9,%xmm6
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movdqa %xmm6,%xmm9
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ movdqa %xmm10,16(%rsp)
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ pslld $2,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ psrld $30,%xmm9
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ por %xmm9,%xmm6
+ rorl $7,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ roll $5,%edx
+ pshufd $238,%xmm5,%xmm10
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ pxor %xmm3,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ punpcklqdq %xmm6,%xmm10
+ movl %ebx,%edi
+ xorl %edx,%esi
+ pxor %xmm0,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa 32(%r14),%xmm9
+ xorl %ecx,%edi
+ paddd %xmm6,%xmm8
+ xorl %edx,%ecx
+ pxor %xmm10,%xmm7
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movdqa %xmm7,%xmm10
+ movl %eax,%esi
+ xorl %ecx,%edi
+ movdqa %xmm8,32(%rsp)
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ pslld $2,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ psrld $30,%xmm10
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ por %xmm10,%xmm7
+ rorl $7,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ roll $5,%ebp
+ pshufd $238,%xmm6,%xmm8
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ pxor %xmm4,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ punpcklqdq %xmm7,%xmm8
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ pxor %xmm1,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ movdqa %xmm9,%xmm10
+ xorl %edx,%edi
+ paddd %xmm7,%xmm9
+ xorl %ebp,%edx
+ pxor %xmm8,%xmm0
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movdqa %xmm0,%xmm8
+ movl %ebx,%esi
+ xorl %edx,%edi
+ movdqa %xmm9,48(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ pslld $2,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ psrld $30,%xmm8
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ por %xmm8,%xmm0
+ rorl $7,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ roll $5,%eax
+ pshufd $238,%xmm7,%xmm9
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ punpcklqdq %xmm0,%xmm9
+ movl %edx,%edi
+ xorl %eax,%esi
+ pxor %xmm2,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm8
+ xorl %ebp,%edi
+ paddd %xmm0,%xmm10
+ xorl %eax,%ebp
+ pxor %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movdqa %xmm1,%xmm9
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ movdqa %xmm10,0(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ pslld $2,%xmm1
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ psrld $30,%xmm9
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ por %xmm9,%xmm1
+ rorl $7,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ roll $5,%ebx
+ pshufd $238,%xmm0,%xmm10
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ punpcklqdq %xmm1,%xmm10
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ pxor %xmm3,%xmm2
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm9
+ xorl %eax,%edi
+ paddd %xmm1,%xmm8
+ xorl %ebx,%eax
+ pxor %xmm10,%xmm2
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movdqa %xmm2,%xmm10
+ movl %edx,%esi
+ xorl %eax,%edi
+ movdqa %xmm8,16(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ pslld $2,%xmm2
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ psrld $30,%xmm10
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ por %xmm10,%xmm2
+ rorl $7,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ roll $5,%ecx
+ pshufd $238,%xmm1,%xmm8
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm2,%xmm8
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%ebx
+ paddd %xmm2,%xmm9
+ addl %eax,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm9,32(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 56(%rsp),%ecx
+ pslld $2,%xmm3
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm8
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm8,%xmm3
+ addl %edx,%ecx
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm3,%xmm10
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa %xmm10,48(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je .Ldone_ssse3
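+# More input remains: load and byte-swap the next block while the remaining
+# rounds of the current block are still being computed.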
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+.byte 102,15,56,0,206
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ paddd %xmm9,%xmm0
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ movdqa %xmm0,0(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ psubd %xmm9,%xmm0
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+.byte 102,15,56,0,214
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ paddd %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ movdqa %xmm1,16(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ psubd %xmm9,%xmm1
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+.byte 102,15,56,0,222
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ paddd %xmm9,%xmm2
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ movdqa %xmm2,32(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ psubd %xmm9,%xmm2
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
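+# No more input: finish the remaining rounds and store the updated hash state
+# before restoring the saved registers and returning.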
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
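+# AVX path: structurally the same as the SSSE3 code, but written with
+# non-destructive VEX-encoded instructions (vpshufb, vpalignr, vpaddd, ...);
+# vzeroupper is issued on entry.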
+.type sha1_block_data_order_avx,@function
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ leaq -64(%rsp),%rsp
+ vzeroupper
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm11,%xmm0,%xmm4
+ vpaddd %xmm11,%xmm1,%xmm5
+ vpaddd %xmm11,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp .Loop_avx
+.align 16
+.Loop_avx:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vpxor %xmm10,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm11,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm10
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm10,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa -32(%r14),%xmm11
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vpaddd %xmm5,%xmm11,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm10
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm6,%xmm6
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm10,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm11,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm10,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm11,%xmm9
+ addl %esi,%edx
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm11,%xmm9
+ vmovdqa 0(%r14),%xmm11
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm11,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm11,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm11,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm11,%xmm9
+ vmovdqa 32(%r14),%xmm11
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm11,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm11,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je .Ldone_avx
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm11,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm11,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm11,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm6,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroupper
+
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.type sha1_block_data_order_avx2,@function
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ vzeroupper
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ leaq -640(%rsp),%rsp
+ shlq $6,%r10
+ leaq 64(%r9),%r13
+ andq $-128,%rsp
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+ movl 4(%r8),%ebp
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl 16(%r8),%esi
+ vmovdqu 64(%r14),%ymm6
+
+ vmovdqu (%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ leaq 64(%r9),%r9
+ vinserti128 $1,(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vpshufb %ymm6,%ymm0,%ymm0
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vpshufb %ymm6,%ymm1,%ymm1
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ vpshufb %ymm6,%ymm2,%ymm2
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm3,%ymm3
+
+ vpaddd %ymm11,%ymm0,%ymm4
+ vpaddd %ymm11,%ymm1,%ymm5
+ vmovdqu %ymm4,0(%rsp)
+ vpaddd %ymm11,%ymm2,%ymm6
+ vmovdqu %ymm5,32(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ vmovdqu %ymm6,64(%rsp)
+ vmovdqu %ymm7,96(%rsp)
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ vpsrldq $4,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $31,%ymm4,%ymm8
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ vpxor %ymm10,%ymm4,%ymm4
+ vpaddd %ymm11,%ymm4,%ymm9
+ vmovdqu %ymm9,128(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ vpsrldq $4,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm5,%ymm9
+ vmovdqu %ymm9,160(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ vpsrldq $4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $31,%ymm6,%ymm8
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ vpxor %ymm10,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm6,%ymm9
+ vmovdqu %ymm9,192(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ vpsrldq $4,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm7,%ymm8
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ vpxor %ymm10,%ymm7,%ymm7
+ vpaddd %ymm11,%ymm7,%ymm9
+ vmovdqu %ymm9,224(%rsp)
+ leaq 128(%rsp),%r13
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ jmp .Lalign32_1
+.align 32
+.Lalign32_1:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpxor %ymm1,%ymm0,%ymm0
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vpor %ymm8,%ymm0,%ymm0
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ vmovdqu %ymm9,256(%rsp)
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpxor %ymm2,%ymm1,%ymm1
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vpor %ymm8,%ymm1,%ymm1
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vmovdqu %ymm9,288(%rsp)
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ vpxor %ymm3,%ymm2,%ymm2
+ vmovdqu 0(%r14),%ymm11
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vpor %ymm8,%ymm2,%ymm2
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vmovdqu %ymm9,320(%rsp)
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ vpxor %ymm4,%ymm3,%ymm3
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ vpor %ymm8,%ymm3,%ymm3
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vmovdqu %ymm9,352(%rsp)
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ vpsrld $30,%ymm4,%ymm8
+ vpslld $2,%ymm4,%ymm4
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpor %ymm8,%ymm4,%ymm4
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpaddd %ymm11,%ymm4,%ymm9
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ vmovdqu %ymm9,384(%rsp)
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm6,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpxor %ymm8,%ymm5,%ymm5
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ vpsrld $30,%ymm5,%ymm8
+ vpslld $2,%ymm5,%ymm5
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vpor %ymm8,%ymm5,%ymm5
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ vmovdqu %ymm9,416(%rsp)
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ vpxor %ymm8,%ymm6,%ymm6
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ vpsrld $30,%ymm6,%ymm8
+ vpslld $2,%ymm6,%ymm6
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpor %ymm8,%ymm6,%ymm6
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ vmovdqu %ymm9,448(%rsp)
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm0,%ymm7,%ymm7
+ vmovdqu 32(%r14),%ymm11
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpxor %ymm8,%ymm7,%ymm7
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ vpsrld $30,%ymm7,%ymm8
+ vpslld $2,%ymm7,%ymm7
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpor %ymm8,%ymm7,%ymm7
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ vmovdqu %ymm9,480(%rsp)
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ jmp .Lalign32_2
+.align 32
+.Lalign32_2:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ vpxor %ymm1,%ymm0,%ymm0
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ vpor %ymm8,%ymm0,%ymm0
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ vmovdqu %ymm9,512(%rsp)
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm2,%ymm1,%ymm1
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ vpor %ymm8,%ymm1,%ymm1
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ vmovdqu %ymm9,544(%rsp)
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ vpxor %ymm3,%ymm2,%ymm2
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ vpor %ymm8,%ymm2,%ymm2
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ vmovdqu %ymm9,576(%rsp)
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ vpxor %ymm4,%ymm3,%ymm3
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ vpor %ymm8,%ymm3,%ymm3
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vmovdqu %ymm9,608(%rsp)
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%r9),%r13
+ leaq 128(%r9),%rdi
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ je .Ldone_avx2
+ vmovdqu 64(%r14),%ymm6
+ cmpq %r10,%rdi
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 $1,0(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ leaq 128+16(%rsp),%r13
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ subq $-128,%r9
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm0,%ymm0
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpshufb %ymm6,%ymm1,%ymm1
+ vpaddd %ymm11,%ymm0,%ymm8
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vmovdqu %ymm8,0(%rsp)
+ vpshufb %ymm6,%ymm2,%ymm2
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ vmovdqu %ymm9,32(%rsp)
+ vpshufb %ymm6,%ymm3,%ymm3
+ vpaddd %ymm11,%ymm2,%ymm6
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ jmp .Lalign32_3
+.align 32
+.Lalign32_3:
+ vmovdqu %ymm6,64(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vmovdqu %ymm7,96(%rsp)
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ vpsrldq $4,%ymm3,%ymm8
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ vpsrld $31,%ymm4,%ymm8
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm10,%ymm4,%ymm4
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpaddd %ymm11,%ymm4,%ymm9
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vmovdqu %ymm9,128(%rsp)
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrldq $4,%ymm4,%ymm8
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ vpxor %ymm10,%ymm5,%ymm5
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vmovdqu %ymm9,160(%rsp)
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpsrldq $4,%ymm5,%ymm8
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm8,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpsrld $31,%ymm6,%ymm8
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ vpxor %ymm10,%ymm6,%ymm6
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vmovdqu %ymm9,192(%rsp)
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpsrldq $4,%ymm6,%ymm8
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm8,%ymm7,%ymm7
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ vpsrld $31,%ymm7,%ymm8
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ vpxor %ymm10,%ymm7,%ymm7
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vmovdqu %ymm9,224(%rsp)
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%rsp),%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ jbe .Loop_avx2
+
+.Ldone_avx2:
+ vzeroupper
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha256-mb-x86_64.s b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha256-mb-x86_64.s
new file mode 100644
index 0000000000..59cf9c984e
--- /dev/null
+++ b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha256-mb-x86_64.s
@@ -0,0 +1,7948 @@
+.text
+
+
+
+.globl sha256_multi_block
+.type sha256_multi_block,@function
+.align 32
+sha256_multi_block:
+.cfi_startproc
+ movq OPENSSL_ia32cap_P+4(%rip),%rcx
+ btq $61,%rcx
+ jc _shaext_shortcut
+ testl $268435456,%ecx
+ jnz _avx_shortcut
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ andq $-256,%rsp
+ movq %rax,272(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
+.Lbody:
+ leaq K256+128(%rip),%rbp
+ leaq 256(%rsp),%rbx
+ leaq 128(%rdi),%rdi
+
+.Loop_grande:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r9
+ movq 32(%rsi),%r10
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r10
+ movq 48(%rsi),%r11
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r11
+ testl %edx,%edx
+ jz .Ldone
+
+ movdqu 0-128(%rdi),%xmm8
+ leaq 128(%rsp),%rax
+ movdqu 32-128(%rdi),%xmm9
+ movdqu 64-128(%rdi),%xmm10
+ movdqu 96-128(%rdi),%xmm11
+ movdqu 128-128(%rdi),%xmm12
+ movdqu 160-128(%rdi),%xmm13
+ movdqu 192-128(%rdi),%xmm14
+ movdqu 224-128(%rdi),%xmm15
+ movdqu .Lpbswap(%rip),%xmm6
+ jmp .Loop
+
+.align 32
+.Loop:
+ movdqa %xmm10,%xmm4
+ pxor %xmm9,%xmm4
+ movd 0(%r8),%xmm5
+ movd 0(%r9),%xmm0
+ movd 0(%r10),%xmm1
+ movd 0(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm12,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm12,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm12,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,0-128(%rax)
+ paddd %xmm15,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -128(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm12,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm12,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm14,%xmm0
+ pand %xmm13,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm8,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm8,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm9,%xmm3
+ movdqa %xmm8,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm8,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm9,%xmm15
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm15
+ paddd %xmm5,%xmm11
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm15
+ paddd %xmm7,%xmm15
+ movd 4(%r8),%xmm5
+ movd 4(%r9),%xmm0
+ movd 4(%r10),%xmm1
+ movd 4(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm11,%xmm7
+
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,238
+ psrld $6,%xmm7
+ movdqa %xmm11,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,16-128(%rax)
+ paddd %xmm14,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -96(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm11,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm11,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm13,%xmm0
+ pand %xmm12,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm15,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm15,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm15,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm8,%xmm14
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm14
+ paddd %xmm5,%xmm10
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm14
+ paddd %xmm7,%xmm14
+ movd 8(%r8),%xmm5
+ movd 8(%r9),%xmm0
+ movd 8(%r10),%xmm1
+ movd 8(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm10,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm10,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm10,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,32-128(%rax)
+ paddd %xmm13,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -64(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm10,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm10,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm12,%xmm0
+ pand %xmm11,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm14,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm14,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm15,%xmm3
+ movdqa %xmm14,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm14,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm15,%xmm13
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm13
+ paddd %xmm5,%xmm9
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm13
+ paddd %xmm7,%xmm13
+ movd 12(%r8),%xmm5
+ movd 12(%r9),%xmm0
+ movd 12(%r10),%xmm1
+ movd 12(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm9,%xmm7
+
+ movdqa %xmm9,%xmm2
+.byte 102,15,56,0,238
+ psrld $6,%xmm7
+ movdqa %xmm9,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,48-128(%rax)
+ paddd %xmm12,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -32(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm9,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm9,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm11,%xmm0
+ pand %xmm10,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm13,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm13,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa %xmm14,%xmm4
+ movdqa %xmm13,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm13,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm14,%xmm12
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm12
+ paddd %xmm5,%xmm8
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm12
+ paddd %xmm7,%xmm12
+ movd 16(%r8),%xmm5
+ movd 16(%r9),%xmm0
+ movd 16(%r10),%xmm1
+ movd 16(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm8,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm8,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm8,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,64-128(%rax)
+ paddd %xmm11,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 0(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm8,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm8,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm10,%xmm0
+ pand %xmm9,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm12,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm12,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm13,%xmm3
+ movdqa %xmm12,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm12,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm13,%xmm11
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm11
+ paddd %xmm5,%xmm15
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm11
+ paddd %xmm7,%xmm11
+ movd 20(%r8),%xmm5
+ movd 20(%r9),%xmm0
+ movd 20(%r10),%xmm1
+ movd 20(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm15,%xmm7
+
+ movdqa %xmm15,%xmm2
+.byte 102,15,56,0,238
+ psrld $6,%xmm7
+ movdqa %xmm15,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,80-128(%rax)
+ paddd %xmm10,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 32(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm15,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm9,%xmm0
+ pand %xmm8,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm11,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm11,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa %xmm12,%xmm4
+ movdqa %xmm11,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm11,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm12,%xmm10
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm10
+ paddd %xmm5,%xmm14
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm10
+ paddd %xmm7,%xmm10
+ movd 24(%r8),%xmm5
+ movd 24(%r9),%xmm0
+ movd 24(%r10),%xmm1
+ movd 24(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm14,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm14,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm14,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,96-128(%rax)
+ paddd %xmm9,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 64(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm14,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm14,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm8,%xmm0
+ pand %xmm15,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm10,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm10,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm11,%xmm3
+ movdqa %xmm10,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm10,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm11,%xmm9
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm9
+ paddd %xmm5,%xmm13
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm9
+ paddd %xmm7,%xmm9
+ movd 28(%r8),%xmm5
+ movd 28(%r9),%xmm0
+ movd 28(%r10),%xmm1
+ movd 28(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm13,%xmm7
+
+ movdqa %xmm13,%xmm2
+.byte 102,15,56,0,238
+ psrld $6,%xmm7
+ movdqa %xmm13,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,112-128(%rax)
+ paddd %xmm8,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 96(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm13,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm13,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm15,%xmm0
+ pand %xmm14,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm9,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa %xmm10,%xmm4
+ movdqa %xmm9,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm9,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm10,%xmm8
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm8
+ paddd %xmm5,%xmm12
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm8
+ paddd %xmm7,%xmm8
+ leaq 256(%rbp),%rbp
+ movd 32(%r8),%xmm5
+ movd 32(%r9),%xmm0
+ movd 32(%r10),%xmm1
+ movd 32(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm12,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm12,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm12,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,128-128(%rax)
+ paddd %xmm15,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -128(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm12,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm12,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm14,%xmm0
+ pand %xmm13,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm8,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm8,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm9,%xmm3
+ movdqa %xmm8,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm8,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm9,%xmm15
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm15
+ paddd %xmm5,%xmm11
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm15
+ paddd %xmm7,%xmm15
+ movd 36(%r8),%xmm5
+ movd 36(%r9),%xmm0
+ movd 36(%r10),%xmm1
+ movd 36(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm11,%xmm7
+
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,238
+ psrld $6,%xmm7
+ movdqa %xmm11,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,144-128(%rax)
+ paddd %xmm14,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -96(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm11,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm11,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm13,%xmm0
+ pand %xmm12,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm15,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm15,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm15,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm8,%xmm14
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm14
+ paddd %xmm5,%xmm10
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm14
+ paddd %xmm7,%xmm14
+ movd 40(%r8),%xmm5
+ movd 40(%r9),%xmm0
+ movd 40(%r10),%xmm1
+ movd 40(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm10,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm10,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm10,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,160-128(%rax)
+ paddd %xmm13,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -64(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm10,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm10,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm12,%xmm0
+ pand %xmm11,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm14,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm14,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm15,%xmm3
+ movdqa %xmm14,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm14,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm15,%xmm13
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm13
+ paddd %xmm5,%xmm9
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm13
+ paddd %xmm7,%xmm13
+ movd 44(%r8),%xmm5
+ movd 44(%r9),%xmm0
+ movd 44(%r10),%xmm1
+ movd 44(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm9,%xmm7
+
+ movdqa %xmm9,%xmm2
+.byte 102,15,56,0,238
+ psrld $6,%xmm7
+ movdqa %xmm9,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,176-128(%rax)
+ paddd %xmm12,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -32(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm9,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm9,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm11,%xmm0
+ pand %xmm10,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm13,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm13,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa %xmm14,%xmm4
+ movdqa %xmm13,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm13,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm14,%xmm12
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm12
+ paddd %xmm5,%xmm8
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm12
+ paddd %xmm7,%xmm12
+ movd 48(%r8),%xmm5
+ movd 48(%r9),%xmm0
+ movd 48(%r10),%xmm1
+ movd 48(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm8,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm8,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm8,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,192-128(%rax)
+ paddd %xmm11,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 0(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm8,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm8,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm10,%xmm0
+ pand %xmm9,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm12,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm12,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm13,%xmm3
+ movdqa %xmm12,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm12,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm13,%xmm11
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm11
+ paddd %xmm5,%xmm15
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm11
+ paddd %xmm7,%xmm11
+ movd 52(%r8),%xmm5
+ movd 52(%r9),%xmm0
+ movd 52(%r10),%xmm1
+ movd 52(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm15,%xmm7
+
+ movdqa %xmm15,%xmm2
+.byte 102,15,56,0,238
+ psrld $6,%xmm7
+ movdqa %xmm15,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,208-128(%rax)
+ paddd %xmm10,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 32(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm15,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm9,%xmm0
+ pand %xmm8,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm11,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm11,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa %xmm12,%xmm4
+ movdqa %xmm11,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm11,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm12,%xmm10
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm10
+ paddd %xmm5,%xmm14
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm10
+ paddd %xmm7,%xmm10
+ movd 56(%r8),%xmm5
+ movd 56(%r9),%xmm0
+ movd 56(%r10),%xmm1
+ movd 56(%r11),%xmm2
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm14,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm14,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm14,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,224-128(%rax)
+ paddd %xmm9,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 64(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm14,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm14,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm8,%xmm0
+ pand %xmm15,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm10,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm10,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm11,%xmm3
+ movdqa %xmm10,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm10,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm11,%xmm9
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm9
+ paddd %xmm5,%xmm13
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm9
+ paddd %xmm7,%xmm9
+ movd 60(%r8),%xmm5
+ leaq 64(%r8),%r8
+ movd 60(%r9),%xmm0
+ leaq 64(%r9),%r9
+ movd 60(%r10),%xmm1
+ leaq 64(%r10),%r10
+ movd 60(%r11),%xmm2
+ leaq 64(%r11),%r11
+ punpckldq %xmm1,%xmm5
+ punpckldq %xmm2,%xmm0
+ punpckldq %xmm0,%xmm5
+ movdqa %xmm13,%xmm7
+
+ movdqa %xmm13,%xmm2
+.byte 102,15,56,0,238
+ psrld $6,%xmm7
+ movdqa %xmm13,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,240-128(%rax)
+ paddd %xmm8,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 96(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm13,%xmm0
+ prefetcht0 63(%r8)
+ pxor %xmm2,%xmm7
+ movdqa %xmm13,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm15,%xmm0
+ pand %xmm14,%xmm4
+ pxor %xmm1,%xmm7
+
+ prefetcht0 63(%r9)
+ movdqa %xmm9,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm9,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa %xmm10,%xmm4
+ movdqa %xmm9,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm9,%xmm4
+
+ prefetcht0 63(%r10)
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+ prefetcht0 63(%r11)
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm10,%xmm8
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm8
+ paddd %xmm5,%xmm12
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm8
+ paddd %xmm7,%xmm8
+ leaq 256(%rbp),%rbp
+ movdqu 0-128(%rax),%xmm5
+ movl $3,%ecx
+ jmp .Loop_16_xx
+.align 32
+.Loop_16_xx:
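+# Rounds 16-63 for four interleaved SHA-256 streams, one 32-bit lane per
+# stream: each pass expands sixteen message words in place in the schedule
+# at (%rax) using the sigma0/sigma1 shift-and-xor sequences below (SSE2 has
+# no rotate, so each rotation is a psrld/pslld pair), then runs sixteen
+# rounds; %ecx counts the three passes.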
+ movdqa 16-128(%rax),%xmm6
+ paddd 144-128(%rax),%xmm5
+
+ movdqa %xmm6,%xmm7
+ movdqa %xmm6,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm6,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 224-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm3,%xmm1
+
+ psrld $17,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ psrld $19-17,%xmm3
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm3,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm5
+ movdqa %xmm12,%xmm7
+
+ movdqa %xmm12,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm12,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,0-128(%rax)
+ paddd %xmm15,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -128(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm12,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm12,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm14,%xmm0
+ pand %xmm13,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm8,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm8,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm9,%xmm3
+ movdqa %xmm8,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm8,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm9,%xmm15
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm15
+ paddd %xmm5,%xmm11
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm15
+ paddd %xmm7,%xmm15
+ movdqa 32-128(%rax),%xmm5
+ paddd 160-128(%rax),%xmm6
+
+ movdqa %xmm5,%xmm7
+ movdqa %xmm5,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm5,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 240-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm4,%xmm1
+
+ psrld $17,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ psrld $19-17,%xmm4
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm6
+ movdqa %xmm11,%xmm7
+
+ movdqa %xmm11,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm11,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm6,16-128(%rax)
+ paddd %xmm14,%xmm6
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -96(%rbp),%xmm6
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm11,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm11,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm13,%xmm0
+ pand %xmm12,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm15,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm15,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm15,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm6
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm8,%xmm14
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm14
+ paddd %xmm6,%xmm10
+ pxor %xmm2,%xmm7
+
+ paddd %xmm6,%xmm14
+ paddd %xmm7,%xmm14
+ movdqa 48-128(%rax),%xmm6
+ paddd 176-128(%rax),%xmm5
+
+ movdqa %xmm6,%xmm7
+ movdqa %xmm6,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm6,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 0-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm3,%xmm1
+
+ psrld $17,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ psrld $19-17,%xmm3
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm3,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm5
+ movdqa %xmm10,%xmm7
+
+ movdqa %xmm10,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm10,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,32-128(%rax)
+ paddd %xmm13,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -64(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm10,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm10,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm12,%xmm0
+ pand %xmm11,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm14,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm14,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm15,%xmm3
+ movdqa %xmm14,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm14,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm15,%xmm13
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm13
+ paddd %xmm5,%xmm9
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm13
+ paddd %xmm7,%xmm13
+ movdqa 64-128(%rax),%xmm5
+ paddd 192-128(%rax),%xmm6
+
+ movdqa %xmm5,%xmm7
+ movdqa %xmm5,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm5,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 16-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm4,%xmm1
+
+ psrld $17,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ psrld $19-17,%xmm4
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm6
+ movdqa %xmm9,%xmm7
+
+ movdqa %xmm9,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm9,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm6,48-128(%rax)
+ paddd %xmm12,%xmm6
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -32(%rbp),%xmm6
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm9,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm9,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm11,%xmm0
+ pand %xmm10,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm13,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm13,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ movdqa %xmm14,%xmm4
+ movdqa %xmm13,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm13,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm6
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm14,%xmm12
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm12
+ paddd %xmm6,%xmm8
+ pxor %xmm2,%xmm7
+
+ paddd %xmm6,%xmm12
+ paddd %xmm7,%xmm12
+ movdqa 80-128(%rax),%xmm6
+ paddd 208-128(%rax),%xmm5
+
+ movdqa %xmm6,%xmm7
+ movdqa %xmm6,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm6,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 32-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm3,%xmm1
+
+ psrld $17,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ psrld $19-17,%xmm3
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm3,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm5
+ movdqa %xmm8,%xmm7
+
+ movdqa %xmm8,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm8,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,64-128(%rax)
+ paddd %xmm11,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 0(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm8,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm8,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm10,%xmm0
+ pand %xmm9,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm12,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm12,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm13,%xmm3
+ movdqa %xmm12,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm12,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm13,%xmm11
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm11
+ paddd %xmm5,%xmm15
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm11
+ paddd %xmm7,%xmm11
+ movdqa 96-128(%rax),%xmm5
+ paddd 224-128(%rax),%xmm6
+
+ movdqa %xmm5,%xmm7
+ movdqa %xmm5,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm5,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 48-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm4,%xmm1
+
+ psrld $17,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ psrld $19-17,%xmm4
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm6
+ movdqa %xmm15,%xmm7
+
+ movdqa %xmm15,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm15,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm6,80-128(%rax)
+ paddd %xmm10,%xmm6
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 32(%rbp),%xmm6
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm15,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm9,%xmm0
+ pand %xmm8,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm11,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm11,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ movdqa %xmm12,%xmm4
+ movdqa %xmm11,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm11,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm6
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm12,%xmm10
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm10
+ paddd %xmm6,%xmm14
+ pxor %xmm2,%xmm7
+
+ paddd %xmm6,%xmm10
+ paddd %xmm7,%xmm10
+ movdqa 112-128(%rax),%xmm6
+ paddd 240-128(%rax),%xmm5
+
+ movdqa %xmm6,%xmm7
+ movdqa %xmm6,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm6,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 64-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm3,%xmm1
+
+ psrld $17,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ psrld $19-17,%xmm3
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm3,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm5
+ movdqa %xmm14,%xmm7
+
+ movdqa %xmm14,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm14,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,96-128(%rax)
+ paddd %xmm9,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 64(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm14,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm14,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm8,%xmm0
+ pand %xmm15,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm10,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm10,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm11,%xmm3
+ movdqa %xmm10,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm10,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm11,%xmm9
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm9
+ paddd %xmm5,%xmm13
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm9
+ paddd %xmm7,%xmm9
+ movdqa 128-128(%rax),%xmm5
+ paddd 0-128(%rax),%xmm6
+
+ movdqa %xmm5,%xmm7
+ movdqa %xmm5,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm5,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 80-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm4,%xmm1
+
+ psrld $17,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ psrld $19-17,%xmm4
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm6
+ movdqa %xmm13,%xmm7
+
+ movdqa %xmm13,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm13,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm6,112-128(%rax)
+ paddd %xmm8,%xmm6
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 96(%rbp),%xmm6
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm13,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm13,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm15,%xmm0
+ pand %xmm14,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm9,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ movdqa %xmm10,%xmm4
+ movdqa %xmm9,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm9,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm6
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm10,%xmm8
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm8
+ paddd %xmm6,%xmm12
+ pxor %xmm2,%xmm7
+
+ paddd %xmm6,%xmm8
+ paddd %xmm7,%xmm8
+ leaq 256(%rbp),%rbp
+ movdqa 144-128(%rax),%xmm6
+ paddd 16-128(%rax),%xmm5
+
+ movdqa %xmm6,%xmm7
+ movdqa %xmm6,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm6,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 96-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm3,%xmm1
+
+ psrld $17,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ psrld $19-17,%xmm3
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm3,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm5
+ movdqa %xmm12,%xmm7
+
+ movdqa %xmm12,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm12,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,128-128(%rax)
+ paddd %xmm15,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -128(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm12,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm12,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm14,%xmm0
+ pand %xmm13,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm8,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm8,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm9,%xmm3
+ movdqa %xmm8,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm8,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm9,%xmm15
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm15
+ paddd %xmm5,%xmm11
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm15
+ paddd %xmm7,%xmm15
+ movdqa 160-128(%rax),%xmm5
+ paddd 32-128(%rax),%xmm6
+
+ movdqa %xmm5,%xmm7
+ movdqa %xmm5,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm5,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 112-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm4,%xmm1
+
+ psrld $17,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ psrld $19-17,%xmm4
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm6
+ movdqa %xmm11,%xmm7
+
+ movdqa %xmm11,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm11,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm6,144-128(%rax)
+ paddd %xmm14,%xmm6
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -96(%rbp),%xmm6
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm11,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm11,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm13,%xmm0
+ pand %xmm12,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm15,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm15,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm15,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm6
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm8,%xmm14
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm14
+ paddd %xmm6,%xmm10
+ pxor %xmm2,%xmm7
+
+ paddd %xmm6,%xmm14
+ paddd %xmm7,%xmm14
+ movdqa 176-128(%rax),%xmm6
+ paddd 48-128(%rax),%xmm5
+
+ movdqa %xmm6,%xmm7
+ movdqa %xmm6,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm6,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 128-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm3,%xmm1
+
+ psrld $17,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ psrld $19-17,%xmm3
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm3,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm5
+ movdqa %xmm10,%xmm7
+
+ movdqa %xmm10,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm10,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,160-128(%rax)
+ paddd %xmm13,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -64(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm10,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm10,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm12,%xmm0
+ pand %xmm11,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm14,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm14,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm15,%xmm3
+ movdqa %xmm14,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm14,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm15,%xmm13
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm13
+ paddd %xmm5,%xmm9
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm13
+ paddd %xmm7,%xmm13
+ movdqa 192-128(%rax),%xmm5
+ paddd 64-128(%rax),%xmm6
+
+ movdqa %xmm5,%xmm7
+ movdqa %xmm5,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm5,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 144-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm4,%xmm1
+
+ psrld $17,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ psrld $19-17,%xmm4
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm6
+ movdqa %xmm9,%xmm7
+
+ movdqa %xmm9,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm9,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm6,176-128(%rax)
+ paddd %xmm12,%xmm6
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd -32(%rbp),%xmm6
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm9,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm9,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm11,%xmm0
+ pand %xmm10,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm13,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm13,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ movdqa %xmm14,%xmm4
+ movdqa %xmm13,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm13,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm6
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm14,%xmm12
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm12
+ paddd %xmm6,%xmm8
+ pxor %xmm2,%xmm7
+
+ paddd %xmm6,%xmm12
+ paddd %xmm7,%xmm12
+ movdqa 208-128(%rax),%xmm6
+ paddd 80-128(%rax),%xmm5
+
+ movdqa %xmm6,%xmm7
+ movdqa %xmm6,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm6,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 160-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm3,%xmm1
+
+ psrld $17,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ psrld $19-17,%xmm3
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm3,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm5
+ movdqa %xmm8,%xmm7
+
+ movdqa %xmm8,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm8,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,192-128(%rax)
+ paddd %xmm11,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 0(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm8,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm8,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm10,%xmm0
+ pand %xmm9,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm12,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm12,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm13,%xmm3
+ movdqa %xmm12,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm12,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm13,%xmm11
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm11
+ paddd %xmm5,%xmm15
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm11
+ paddd %xmm7,%xmm11
+ movdqa 224-128(%rax),%xmm5
+ paddd 96-128(%rax),%xmm6
+
+ movdqa %xmm5,%xmm7
+ movdqa %xmm5,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm5,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 176-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm4,%xmm1
+
+ psrld $17,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ psrld $19-17,%xmm4
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm6
+ movdqa %xmm15,%xmm7
+
+ movdqa %xmm15,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm15,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm6,208-128(%rax)
+ paddd %xmm10,%xmm6
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 32(%rbp),%xmm6
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm15,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm9,%xmm0
+ pand %xmm8,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm11,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm11,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ movdqa %xmm12,%xmm4
+ movdqa %xmm11,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm11,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm6
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm12,%xmm10
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm10
+ paddd %xmm6,%xmm14
+ pxor %xmm2,%xmm7
+
+ paddd %xmm6,%xmm10
+ paddd %xmm7,%xmm10
+ movdqa 240-128(%rax),%xmm6
+ paddd 112-128(%rax),%xmm5
+
+ movdqa %xmm6,%xmm7
+ movdqa %xmm6,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm6,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 192-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm3,%xmm1
+
+ psrld $17,%xmm3
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ psrld $19-17,%xmm3
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm3,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm5
+ movdqa %xmm14,%xmm7
+
+ movdqa %xmm14,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm14,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm5,224-128(%rax)
+ paddd %xmm9,%xmm5
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 64(%rbp),%xmm5
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm14,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm14,%xmm3
+ pslld $26-21,%xmm2
+ pandn %xmm8,%xmm0
+ pand %xmm15,%xmm3
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm10,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm10,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm5
+ pxor %xmm3,%xmm0
+ movdqa %xmm11,%xmm3
+ movdqa %xmm10,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm10,%xmm3
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm5
+ pslld $19-10,%xmm2
+ pand %xmm3,%xmm4
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm11,%xmm9
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm4,%xmm9
+ paddd %xmm5,%xmm13
+ pxor %xmm2,%xmm7
+
+ paddd %xmm5,%xmm9
+ paddd %xmm7,%xmm9
+ movdqa 0-128(%rax),%xmm5
+ paddd 128-128(%rax),%xmm6
+
+ movdqa %xmm5,%xmm7
+ movdqa %xmm5,%xmm1
+ psrld $3,%xmm7
+ movdqa %xmm5,%xmm2
+
+ psrld $7,%xmm1
+ movdqa 208-128(%rax),%xmm0
+ pslld $14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $18-7,%xmm1
+ movdqa %xmm0,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $25-14,%xmm2
+ pxor %xmm1,%xmm7
+ psrld $10,%xmm0
+ movdqa %xmm4,%xmm1
+
+ psrld $17,%xmm4
+ pxor %xmm2,%xmm7
+ pslld $13,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ psrld $19-17,%xmm4
+ pxor %xmm1,%xmm0
+ pslld $15-13,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ paddd %xmm0,%xmm6
+ movdqa %xmm13,%xmm7
+
+ movdqa %xmm13,%xmm2
+
+ psrld $6,%xmm7
+ movdqa %xmm13,%xmm1
+ pslld $7,%xmm2
+ movdqa %xmm6,240-128(%rax)
+ paddd %xmm8,%xmm6
+
+ psrld $11,%xmm1
+ pxor %xmm2,%xmm7
+ pslld $21-7,%xmm2
+ paddd 96(%rbp),%xmm6
+ pxor %xmm1,%xmm7
+
+ psrld $25-11,%xmm1
+ movdqa %xmm13,%xmm0
+
+ pxor %xmm2,%xmm7
+ movdqa %xmm13,%xmm4
+ pslld $26-21,%xmm2
+ pandn %xmm15,%xmm0
+ pand %xmm14,%xmm4
+ pxor %xmm1,%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pxor %xmm2,%xmm7
+ movdqa %xmm9,%xmm2
+ psrld $2,%xmm1
+ paddd %xmm7,%xmm6
+ pxor %xmm4,%xmm0
+ movdqa %xmm10,%xmm4
+ movdqa %xmm9,%xmm7
+ pslld $10,%xmm2
+ pxor %xmm9,%xmm4
+
+
+ psrld $13,%xmm7
+ pxor %xmm2,%xmm1
+ paddd %xmm0,%xmm6
+ pslld $19-10,%xmm2
+ pand %xmm4,%xmm3
+ pxor %xmm7,%xmm1
+
+
+ psrld $22-13,%xmm7
+ pxor %xmm2,%xmm1
+ movdqa %xmm10,%xmm8
+ pslld $30-19,%xmm2
+ pxor %xmm1,%xmm7
+ pxor %xmm3,%xmm8
+ paddd %xmm6,%xmm12
+ pxor %xmm2,%xmm7
+
+ paddd %xmm6,%xmm8
+ paddd %xmm7,%xmm8
+ leaq 256(%rbp),%rbp
+ decl %ecx
+ jnz .Loop_16_xx
+
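+# Per-lane block bookkeeping: (%rbx) holds the remaining-block counters.
+# Lanes that are out of data get their input pointer redirected to the K256
+# table as harmless dummy input and their state update masked off through
+# %xmm6, so only still-active lanes receive the feed-forward add into the
+# context at %rdi before the next 64-byte block is processed.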
+ movl $1,%ecx
+ leaq K256+128(%rip),%rbp
+
+ movdqa (%rbx),%xmm7
+ cmpl 0(%rbx),%ecx
+ pxor %xmm0,%xmm0
+ cmovgeq %rbp,%r8
+ cmpl 4(%rbx),%ecx
+ movdqa %xmm7,%xmm6
+ cmovgeq %rbp,%r9
+ cmpl 8(%rbx),%ecx
+ pcmpgtd %xmm0,%xmm6
+ cmovgeq %rbp,%r10
+ cmpl 12(%rbx),%ecx
+ paddd %xmm6,%xmm7
+ cmovgeq %rbp,%r11
+
+ movdqu 0-128(%rdi),%xmm0
+ pand %xmm6,%xmm8
+ movdqu 32-128(%rdi),%xmm1
+ pand %xmm6,%xmm9
+ movdqu 64-128(%rdi),%xmm2
+ pand %xmm6,%xmm10
+ movdqu 96-128(%rdi),%xmm5
+ pand %xmm6,%xmm11
+ paddd %xmm0,%xmm8
+ movdqu 128-128(%rdi),%xmm0
+ pand %xmm6,%xmm12
+ paddd %xmm1,%xmm9
+ movdqu 160-128(%rdi),%xmm1
+ pand %xmm6,%xmm13
+ paddd %xmm2,%xmm10
+ movdqu 192-128(%rdi),%xmm2
+ pand %xmm6,%xmm14
+ paddd %xmm5,%xmm11
+ movdqu 224-128(%rdi),%xmm5
+ pand %xmm6,%xmm15
+ paddd %xmm0,%xmm12
+ paddd %xmm1,%xmm13
+ movdqu %xmm8,0-128(%rdi)
+ paddd %xmm2,%xmm14
+ movdqu %xmm9,32-128(%rdi)
+ paddd %xmm5,%xmm15
+ movdqu %xmm10,64-128(%rdi)
+ movdqu %xmm11,96-128(%rdi)
+ movdqu %xmm12,128-128(%rdi)
+ movdqu %xmm13,160-128(%rdi)
+ movdqu %xmm14,192-128(%rdi)
+ movdqu %xmm15,224-128(%rdi)
+
+ movdqa %xmm7,(%rbx)
+ movdqa .Lpbswap(%rip),%xmm6
+ decl %edx
+ jnz .Loop
+
+ movl 280(%rsp),%edx
+ leaq 16(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+ decl %edx
+ jnz .Loop_grande
+
+.Ldone:
+ movq 272(%rsp),%rax
+.cfi_def_cfa %rax,8
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_multi_block,.-sha256_multi_block
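+# SHA-NI variant: processes two streams per call.  The SHA extension
+# instructions appear as raw .byte sequences rather than mnemonics:
+# 15,56,203 decodes to sha256rnds2, 15,56,204 to sha256msg1, 15,56,205 to
+# sha256msg2; 102,15,56,0 is pshufb and 102,15,58,15 is palignr.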
+.type sha256_multi_block_shaext,@function
+.align 32
+sha256_multi_block_shaext:
+.cfi_startproc
+_shaext_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ shll $1,%edx
+ andq $-256,%rsp
+ leaq 128(%rdi),%rdi
+ movq %rax,272(%rsp)
+.Lbody_shaext:
+ leaq 256(%rsp),%rbx
+ leaq K256_shaext+128(%rip),%rbp
+
+.Loop_grande_shaext:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rsp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rsp,%r9
+ testl %edx,%edx
+ jz .Ldone_shaext
+
+ movq 0-128(%rdi),%xmm12
+ movq 32-128(%rdi),%xmm4
+ movq 64-128(%rdi),%xmm13
+ movq 96-128(%rdi),%xmm5
+ movq 128-128(%rdi),%xmm8
+ movq 160-128(%rdi),%xmm9
+ movq 192-128(%rdi),%xmm10
+ movq 224-128(%rdi),%xmm11
+
+ punpckldq %xmm4,%xmm12
+ punpckldq %xmm5,%xmm13
+ punpckldq %xmm9,%xmm8
+ punpckldq %xmm11,%xmm10
+ movdqa K256_shaext-16(%rip),%xmm3
+
+ movdqa %xmm12,%xmm14
+ movdqa %xmm13,%xmm15
+ punpcklqdq %xmm8,%xmm12
+ punpcklqdq %xmm10,%xmm13
+ punpckhqdq %xmm8,%xmm14
+ punpckhqdq %xmm10,%xmm15
+
+ pshufd $27,%xmm12,%xmm12
+ pshufd $27,%xmm13,%xmm13
+ pshufd $27,%xmm14,%xmm14
+ pshufd $27,%xmm15,%xmm15
+ jmp .Loop_shaext
+
+.align 32
+.Loop_shaext:
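+# Each iteration loads one 64-byte block per stream, byte-swaps it with
+# pshufb (the mask sits just below K256_shaext), and runs the 64 rounds
+# with sha256rnds2 while sha256msg1/sha256msg2 expand the message schedule;
+# the two hash states, shuffled into the ABEF/CDGH order sha256rnds2
+# expects, live in %xmm12-%xmm15.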
+ movdqu 0(%r8),%xmm4
+ movdqu 0(%r9),%xmm8
+ movdqu 16(%r8),%xmm5
+ movdqu 16(%r9),%xmm9
+ movdqu 32(%r8),%xmm6
+.byte 102,15,56,0,227
+ movdqu 32(%r9),%xmm10
+.byte 102,68,15,56,0,195
+ movdqu 48(%r8),%xmm7
+ leaq 64(%r8),%r8
+ movdqu 48(%r9),%xmm11
+ leaq 64(%r9),%r9
+
+ movdqa 0-128(%rbp),%xmm0
+.byte 102,15,56,0,235
+ paddd %xmm4,%xmm0
+ pxor %xmm12,%xmm4
+ movdqa %xmm0,%xmm1
+ movdqa 0-128(%rbp),%xmm2
+.byte 102,68,15,56,0,203
+ paddd %xmm8,%xmm2
+ movdqa %xmm13,80(%rsp)
+.byte 69,15,56,203,236
+ pxor %xmm14,%xmm8
+ movdqa %xmm2,%xmm0
+ movdqa %xmm15,112(%rsp)
+.byte 69,15,56,203,254
+ pshufd $0x0e,%xmm1,%xmm0
+ pxor %xmm12,%xmm4
+ movdqa %xmm12,64(%rsp)
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ pxor %xmm14,%xmm8
+ movdqa %xmm14,96(%rsp)
+ movdqa 16-128(%rbp),%xmm1
+ paddd %xmm5,%xmm1
+.byte 102,15,56,0,243
+.byte 69,15,56,203,247
+
+ movdqa %xmm1,%xmm0
+ movdqa 16-128(%rbp),%xmm2
+ paddd %xmm9,%xmm2
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ prefetcht0 127(%r8)
+.byte 102,15,56,0,251
+.byte 102,68,15,56,0,211
+ prefetcht0 127(%r9)
+.byte 69,15,56,203,254
+ pshufd $0x0e,%xmm1,%xmm0
+.byte 102,68,15,56,0,219
+.byte 15,56,204,229
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 32-128(%rbp),%xmm1
+ paddd %xmm6,%xmm1
+.byte 69,15,56,203,247
+
+ movdqa %xmm1,%xmm0
+ movdqa 32-128(%rbp),%xmm2
+ paddd %xmm10,%xmm2
+.byte 69,15,56,203,236
+.byte 69,15,56,204,193
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 69,15,56,203,254
+ pshufd $0x0e,%xmm1,%xmm0
+.byte 102,15,58,15,222,4
+ paddd %xmm3,%xmm4
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+.byte 15,56,204,238
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 48-128(%rbp),%xmm1
+ paddd %xmm7,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,202
+
+ movdqa %xmm1,%xmm0
+ movdqa 48-128(%rbp),%xmm2
+ paddd %xmm3,%xmm8
+ paddd %xmm11,%xmm2
+.byte 15,56,205,231
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,223,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,195
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm5
+ movdqa %xmm8,%xmm3
+.byte 102,65,15,58,15,219,4
+.byte 15,56,204,247
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 64-128(%rbp),%xmm1
+ paddd %xmm4,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,211
+ movdqa %xmm1,%xmm0
+ movdqa 64-128(%rbp),%xmm2
+ paddd %xmm3,%xmm9
+ paddd %xmm8,%xmm2
+.byte 15,56,205,236
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,58,15,220,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,200
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm6
+ movdqa %xmm9,%xmm3
+.byte 102,65,15,58,15,216,4
+.byte 15,56,204,252
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 80-128(%rbp),%xmm1
+ paddd %xmm5,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,216
+ movdqa %xmm1,%xmm0
+ movdqa 80-128(%rbp),%xmm2
+ paddd %xmm3,%xmm10
+ paddd %xmm9,%xmm2
+.byte 15,56,205,245
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+.byte 102,15,58,15,221,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,209
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm7
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,217,4
+.byte 15,56,204,229
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 96-128(%rbp),%xmm1
+ paddd %xmm6,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,193
+ movdqa %xmm1,%xmm0
+ movdqa 96-128(%rbp),%xmm2
+ paddd %xmm3,%xmm11
+ paddd %xmm10,%xmm2
+.byte 15,56,205,254
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,58,15,222,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,218
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm4
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+.byte 15,56,204,238
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 112-128(%rbp),%xmm1
+ paddd %xmm7,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,202
+ movdqa %xmm1,%xmm0
+ movdqa 112-128(%rbp),%xmm2
+ paddd %xmm3,%xmm8
+ paddd %xmm11,%xmm2
+.byte 15,56,205,231
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,223,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,195
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm5
+ movdqa %xmm8,%xmm3
+.byte 102,65,15,58,15,219,4
+.byte 15,56,204,247
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 128-128(%rbp),%xmm1
+ paddd %xmm4,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,211
+ movdqa %xmm1,%xmm0
+ movdqa 128-128(%rbp),%xmm2
+ paddd %xmm3,%xmm9
+ paddd %xmm8,%xmm2
+.byte 15,56,205,236
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,58,15,220,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,200
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm6
+ movdqa %xmm9,%xmm3
+.byte 102,65,15,58,15,216,4
+.byte 15,56,204,252
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 144-128(%rbp),%xmm1
+ paddd %xmm5,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,216
+ movdqa %xmm1,%xmm0
+ movdqa 144-128(%rbp),%xmm2
+ paddd %xmm3,%xmm10
+ paddd %xmm9,%xmm2
+.byte 15,56,205,245
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+.byte 102,15,58,15,221,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,209
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm7
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,217,4
+.byte 15,56,204,229
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 160-128(%rbp),%xmm1
+ paddd %xmm6,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,193
+ movdqa %xmm1,%xmm0
+ movdqa 160-128(%rbp),%xmm2
+ paddd %xmm3,%xmm11
+ paddd %xmm10,%xmm2
+.byte 15,56,205,254
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,58,15,222,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,218
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm4
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+.byte 15,56,204,238
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 176-128(%rbp),%xmm1
+ paddd %xmm7,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,202
+ movdqa %xmm1,%xmm0
+ movdqa 176-128(%rbp),%xmm2
+ paddd %xmm3,%xmm8
+ paddd %xmm11,%xmm2
+.byte 15,56,205,231
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,223,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,195
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm5
+ movdqa %xmm8,%xmm3
+.byte 102,65,15,58,15,219,4
+.byte 15,56,204,247
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 192-128(%rbp),%xmm1
+ paddd %xmm4,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,211
+ movdqa %xmm1,%xmm0
+ movdqa 192-128(%rbp),%xmm2
+ paddd %xmm3,%xmm9
+ paddd %xmm8,%xmm2
+.byte 15,56,205,236
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,58,15,220,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,200
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm6
+ movdqa %xmm9,%xmm3
+.byte 102,65,15,58,15,216,4
+.byte 15,56,204,252
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 208-128(%rbp),%xmm1
+ paddd %xmm5,%xmm1
+.byte 69,15,56,203,247
+.byte 69,15,56,204,216
+ movdqa %xmm1,%xmm0
+ movdqa 208-128(%rbp),%xmm2
+ paddd %xmm3,%xmm10
+ paddd %xmm9,%xmm2
+.byte 15,56,205,245
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+.byte 102,15,58,15,221,4
+.byte 69,15,56,203,254
+.byte 69,15,56,205,209
+ pshufd $0x0e,%xmm1,%xmm0
+ paddd %xmm3,%xmm7
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,217,4
+ nop
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 224-128(%rbp),%xmm1
+ paddd %xmm6,%xmm1
+.byte 69,15,56,203,247
+
+ movdqa %xmm1,%xmm0
+ movdqa 224-128(%rbp),%xmm2
+ paddd %xmm3,%xmm11
+ paddd %xmm10,%xmm2
+.byte 15,56,205,254
+ nop
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ movl $1,%ecx
+ pxor %xmm6,%xmm6
+.byte 69,15,56,203,254
+.byte 69,15,56,205,218
+ pshufd $0x0e,%xmm1,%xmm0
+ movdqa 240-128(%rbp),%xmm1
+ paddd %xmm7,%xmm1
+ movq (%rbx),%xmm7
+ nop
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ movdqa 240-128(%rbp),%xmm2
+ paddd %xmm11,%xmm2
+.byte 69,15,56,203,247
+
+ movdqa %xmm1,%xmm0
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rsp,%r8
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rsp,%r9
+ pshufd $0x00,%xmm7,%xmm9
+.byte 69,15,56,203,236
+ movdqa %xmm2,%xmm0
+ pshufd $0x55,%xmm7,%xmm10
+ movdqa %xmm7,%xmm11
+.byte 69,15,56,203,254
+ pshufd $0x0e,%xmm1,%xmm0
+ pcmpgtd %xmm6,%xmm9
+ pcmpgtd %xmm6,%xmm10
+.byte 69,15,56,203,229
+ pshufd $0x0e,%xmm2,%xmm0
+ pcmpgtd %xmm6,%xmm11
+ movdqa K256_shaext-16(%rip),%xmm3
+.byte 69,15,56,203,247
+
+ pand %xmm9,%xmm13
+ pand %xmm10,%xmm15
+ pand %xmm9,%xmm12
+ pand %xmm10,%xmm14
+ paddd %xmm7,%xmm11
+
+ paddd 80(%rsp),%xmm13
+ paddd 112(%rsp),%xmm15
+ paddd 64(%rsp),%xmm12
+ paddd 96(%rsp),%xmm14
+
+ movq %xmm11,(%rbx)
+ decl %edx
+ jnz .Loop_shaext
+
+ movl 280(%rsp),%edx
+
+ pshufd $27,%xmm12,%xmm12
+ pshufd $27,%xmm13,%xmm13
+ pshufd $27,%xmm14,%xmm14
+ pshufd $27,%xmm15,%xmm15
+
+ movdqa %xmm12,%xmm5
+ movdqa %xmm13,%xmm6
+ punpckldq %xmm14,%xmm12
+ punpckhdq %xmm14,%xmm5
+ punpckldq %xmm15,%xmm13
+ punpckhdq %xmm15,%xmm6
+
+ movq %xmm12,0-128(%rdi)
+ psrldq $8,%xmm12
+ movq %xmm5,128-128(%rdi)
+ psrldq $8,%xmm5
+ movq %xmm12,32-128(%rdi)
+ movq %xmm5,160-128(%rdi)
+
+ movq %xmm13,64-128(%rdi)
+ psrldq $8,%xmm13
+ movq %xmm6,192-128(%rdi)
+ psrldq $8,%xmm6
+ movq %xmm13,96-128(%rdi)
+ movq %xmm6,224-128(%rdi)
+
+ leaq 8(%rdi),%rdi
+ leaq 32(%rsi),%rsi
+ decl %edx
+ jnz .Loop_grande_shaext
+
+.Ldone_shaext:
+
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_shaext:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
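+# AVX variant: the same four-lane round structure as the SSE code above,
+# expressed with three-operand VEX instructions (vpsrld/vpslld/vpxor) and
+# vpshufb for the byte swap.  _avx_shortcut tail-jumps to _avx2_shortcut
+# (an eight-lane %ymm implementation) when at least two inputs are queued
+# and the capability bit tested in %ecx indicates AVX2; otherwise it falls
+# through to this %xmm path.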
+.type sha256_multi_block_avx,@function
+.align 32
+sha256_multi_block_avx:
+.cfi_startproc
+_avx_shortcut:
+ shrq $32,%rcx
+ cmpl $2,%edx
+ jb .Lavx
+ testl $32,%ecx
+ jnz _avx2_shortcut
+ jmp .Lavx
+.align 32
+.Lavx:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ andq $-256,%rsp
+ movq %rax,272(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
+.Lbody_avx:
+ leaq K256+128(%rip),%rbp
+ leaq 256(%rsp),%rbx
+ leaq 128(%rdi),%rdi
+
+.Loop_grande_avx:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r9
+ movq 32(%rsi),%r10
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r10
+ movq 48(%rsi),%r11
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r11
+ testl %edx,%edx
+ jz .Ldone_avx
+
+ vmovdqu 0-128(%rdi),%xmm8
+ leaq 128(%rsp),%rax
+ vmovdqu 32-128(%rdi),%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ vmovdqu 96-128(%rdi),%xmm11
+ vmovdqu 128-128(%rdi),%xmm12
+ vmovdqu 160-128(%rdi),%xmm13
+ vmovdqu 192-128(%rdi),%xmm14
+ vmovdqu 224-128(%rdi),%xmm15
+ vmovdqu .Lpbswap(%rip),%xmm6
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ vpxor %xmm9,%xmm10,%xmm4
+ vmovd 0(%r8),%xmm5
+ vmovd 0(%r9),%xmm0
+ vpinsrd $1,0(%r10),%xmm5,%xmm5
+ vpinsrd $1,0(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,0-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovd 4(%r8),%xmm5
+ vmovd 4(%r9),%xmm0
+ vpinsrd $1,4(%r10),%xmm5,%xmm5
+ vpinsrd $1,4(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm5,16-128(%rax)
+ vpaddd %xmm14,%xmm5,%xmm5
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm5,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovd 8(%r8),%xmm5
+ vmovd 8(%r9),%xmm0
+ vpinsrd $1,8(%r10),%xmm5,%xmm5
+ vpinsrd $1,8(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,32-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovd 12(%r8),%xmm5
+ vmovd 12(%r9),%xmm0
+ vpinsrd $1,12(%r10),%xmm5,%xmm5
+ vpinsrd $1,12(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm5,48-128(%rax)
+ vpaddd %xmm12,%xmm5,%xmm5
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+
+ vpsrld $22,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm13,%xmm2
+ vpxor %xmm3,%xmm14,%xmm12
+ vpaddd %xmm5,%xmm8,%xmm8
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm12,%xmm12
+ vmovd 16(%r8),%xmm5
+ vmovd 16(%r9),%xmm0
+ vpinsrd $1,16(%r10),%xmm5,%xmm5
+ vpinsrd $1,16(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm8,%xmm7
+ vpslld $26,%xmm8,%xmm2
+ vmovdqu %xmm5,64-128(%rax)
+ vpaddd %xmm11,%xmm5,%xmm5
+
+ vpsrld $11,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm8,%xmm2
+ vpaddd 0(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm8,%xmm2
+ vpandn %xmm10,%xmm8,%xmm0
+ vpand %xmm9,%xmm8,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm12,%xmm11
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm12,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm12,%xmm13,%xmm3
+
+ vpxor %xmm1,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm12,%xmm1
+
+ vpslld $19,%xmm12,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm11,%xmm7
+
+ vpsrld $22,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm12,%xmm2
+ vpxor %xmm4,%xmm13,%xmm11
+ vpaddd %xmm5,%xmm15,%xmm15
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm11,%xmm11
+ vmovd 20(%r8),%xmm5
+ vmovd 20(%r9),%xmm0
+ vpinsrd $1,20(%r10),%xmm5,%xmm5
+ vpinsrd $1,20(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm15,%xmm7
+ vpslld $26,%xmm15,%xmm2
+ vmovdqu %xmm5,80-128(%rax)
+ vpaddd %xmm10,%xmm5,%xmm5
+
+ vpsrld $11,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm15,%xmm2
+ vpaddd 32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm15,%xmm2
+ vpandn %xmm9,%xmm15,%xmm0
+ vpand %xmm8,%xmm15,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm11,%xmm10
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm11,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm11,%xmm12,%xmm4
+
+ vpxor %xmm1,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm11,%xmm1
+
+ vpslld $19,%xmm11,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm10,%xmm7
+
+ vpsrld $22,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm11,%xmm2
+ vpxor %xmm3,%xmm12,%xmm10
+ vpaddd %xmm5,%xmm14,%xmm14
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm10,%xmm10
+ vmovd 24(%r8),%xmm5
+ vmovd 24(%r9),%xmm0
+ vpinsrd $1,24(%r10),%xmm5,%xmm5
+ vpinsrd $1,24(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm14,%xmm7
+ vpslld $26,%xmm14,%xmm2
+ vmovdqu %xmm5,96-128(%rax)
+ vpaddd %xmm9,%xmm5,%xmm5
+
+ vpsrld $11,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm14,%xmm2
+ vpaddd 64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm14,%xmm2
+ vpandn %xmm8,%xmm14,%xmm0
+ vpand %xmm15,%xmm14,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm10,%xmm9
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm10,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm10,%xmm11,%xmm3
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm10,%xmm1
+
+ vpslld $19,%xmm10,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm9,%xmm7
+
+ vpsrld $22,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm10,%xmm2
+ vpxor %xmm4,%xmm11,%xmm9
+ vpaddd %xmm5,%xmm13,%xmm13
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm9,%xmm9
+ vmovd 28(%r8),%xmm5
+ vmovd 28(%r9),%xmm0
+ vpinsrd $1,28(%r10),%xmm5,%xmm5
+ vpinsrd $1,28(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm13,%xmm7
+ vpslld $26,%xmm13,%xmm2
+ vmovdqu %xmm5,112-128(%rax)
+ vpaddd %xmm8,%xmm5,%xmm5
+
+ vpsrld $11,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm13,%xmm2
+ vpaddd 96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm13,%xmm2
+ vpandn %xmm15,%xmm13,%xmm0
+ vpand %xmm14,%xmm13,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm9,%xmm8
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm9,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm9,%xmm10,%xmm4
+
+ vpxor %xmm1,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm9,%xmm1
+
+ vpslld $19,%xmm9,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm8,%xmm7
+
+ vpsrld $22,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm9,%xmm2
+ vpxor %xmm3,%xmm10,%xmm8
+ vpaddd %xmm5,%xmm12,%xmm12
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm8,%xmm8
+ addq $256,%rbp
+ vmovd 32(%r8),%xmm5
+ vmovd 32(%r9),%xmm0
+ vpinsrd $1,32(%r10),%xmm5,%xmm5
+ vpinsrd $1,32(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,128-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovd 36(%r8),%xmm5
+ vmovd 36(%r9),%xmm0
+ vpinsrd $1,36(%r10),%xmm5,%xmm5
+ vpinsrd $1,36(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm5,144-128(%rax)
+ vpaddd %xmm14,%xmm5,%xmm5
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm5,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovd 40(%r8),%xmm5
+ vmovd 40(%r9),%xmm0
+ vpinsrd $1,40(%r10),%xmm5,%xmm5
+ vpinsrd $1,40(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,160-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovd 44(%r8),%xmm5
+ vmovd 44(%r9),%xmm0
+ vpinsrd $1,44(%r10),%xmm5,%xmm5
+ vpinsrd $1,44(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm5,176-128(%rax)
+ vpaddd %xmm12,%xmm5,%xmm5
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+
+ vpsrld $22,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm13,%xmm2
+ vpxor %xmm3,%xmm14,%xmm12
+ vpaddd %xmm5,%xmm8,%xmm8
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm12,%xmm12
+ vmovd 48(%r8),%xmm5
+ vmovd 48(%r9),%xmm0
+ vpinsrd $1,48(%r10),%xmm5,%xmm5
+ vpinsrd $1,48(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm8,%xmm7
+ vpslld $26,%xmm8,%xmm2
+ vmovdqu %xmm5,192-128(%rax)
+ vpaddd %xmm11,%xmm5,%xmm5
+
+ vpsrld $11,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm8,%xmm2
+ vpaddd 0(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm8,%xmm2
+ vpandn %xmm10,%xmm8,%xmm0
+ vpand %xmm9,%xmm8,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm12,%xmm11
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm12,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm12,%xmm13,%xmm3
+
+ vpxor %xmm1,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm12,%xmm1
+
+ vpslld $19,%xmm12,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm11,%xmm7
+
+ vpsrld $22,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm12,%xmm2
+ vpxor %xmm4,%xmm13,%xmm11
+ vpaddd %xmm5,%xmm15,%xmm15
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm11,%xmm11
+ vmovd 52(%r8),%xmm5
+ vmovd 52(%r9),%xmm0
+ vpinsrd $1,52(%r10),%xmm5,%xmm5
+ vpinsrd $1,52(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm15,%xmm7
+ vpslld $26,%xmm15,%xmm2
+ vmovdqu %xmm5,208-128(%rax)
+ vpaddd %xmm10,%xmm5,%xmm5
+
+ vpsrld $11,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm15,%xmm2
+ vpaddd 32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm15,%xmm2
+ vpandn %xmm9,%xmm15,%xmm0
+ vpand %xmm8,%xmm15,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm11,%xmm10
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm11,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm11,%xmm12,%xmm4
+
+ vpxor %xmm1,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm11,%xmm1
+
+ vpslld $19,%xmm11,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm10,%xmm7
+
+ vpsrld $22,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm11,%xmm2
+ vpxor %xmm3,%xmm12,%xmm10
+ vpaddd %xmm5,%xmm14,%xmm14
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm10,%xmm10
+ vmovd 56(%r8),%xmm5
+ vmovd 56(%r9),%xmm0
+ vpinsrd $1,56(%r10),%xmm5,%xmm5
+ vpinsrd $1,56(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm14,%xmm7
+ vpslld $26,%xmm14,%xmm2
+ vmovdqu %xmm5,224-128(%rax)
+ vpaddd %xmm9,%xmm5,%xmm5
+
+ vpsrld $11,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm14,%xmm2
+ vpaddd 64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm14,%xmm2
+ vpandn %xmm8,%xmm14,%xmm0
+ vpand %xmm15,%xmm14,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm10,%xmm9
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm10,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm10,%xmm11,%xmm3
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm10,%xmm1
+
+ vpslld $19,%xmm10,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm9,%xmm7
+
+ vpsrld $22,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm10,%xmm2
+ vpxor %xmm4,%xmm11,%xmm9
+ vpaddd %xmm5,%xmm13,%xmm13
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm9,%xmm9
+ vmovd 60(%r8),%xmm5
+ leaq 64(%r8),%r8
+ vmovd 60(%r9),%xmm0
+ leaq 64(%r9),%r9
+ vpinsrd $1,60(%r10),%xmm5,%xmm5
+ leaq 64(%r10),%r10
+ vpinsrd $1,60(%r11),%xmm0,%xmm0
+ leaq 64(%r11),%r11
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm13,%xmm7
+ vpslld $26,%xmm13,%xmm2
+ vmovdqu %xmm5,240-128(%rax)
+ vpaddd %xmm8,%xmm5,%xmm5
+
+ vpsrld $11,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm13,%xmm2
+ vpaddd 96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ prefetcht0 63(%r8)
+ vpslld $7,%xmm13,%xmm2
+ vpandn %xmm15,%xmm13,%xmm0
+ vpand %xmm14,%xmm13,%xmm4
+ prefetcht0 63(%r9)
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm9,%xmm8
+ vpxor %xmm2,%xmm7,%xmm7
+ prefetcht0 63(%r10)
+ vpslld $30,%xmm9,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm9,%xmm10,%xmm4
+ prefetcht0 63(%r11)
+ vpxor %xmm1,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm9,%xmm1
+
+ vpslld $19,%xmm9,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm8,%xmm7
+
+ vpsrld $22,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm9,%xmm2
+ vpxor %xmm3,%xmm10,%xmm8
+ vpaddd %xmm5,%xmm12,%xmm12
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm8,%xmm8
+ addq $256,%rbp
+ vmovdqu 0-128(%rax),%xmm5
+ movl $3,%ecx
+ jmp .Loop_16_xx_avx
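+# .Loop_16_xx_avx expands the message schedule on the fly: each iteration
+# derives 16 more W[t] words from W[t-16], W[t-15], W[t-7] and W[t-2]
+# (sigma0/sigma1 via the shift/xor sequences below) and runs the matching
+# 16 rounds for all four lanes in parallel.  %ecx counts the three remaining
+# 16-round groups (rounds 16-63) and %rbp walks the K256 constant table.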
+.align 32
+.Loop_16_xx_avx:
+ vmovdqu 16-128(%rax),%xmm6
+ vpaddd 144-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 224-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,0-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovdqu 32-128(%rax),%xmm5
+ vpaddd 160-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 240-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm6,16-128(%rax)
+ vpaddd %xmm14,%xmm6,%xmm6
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovdqu 48-128(%rax),%xmm6
+ vpaddd 176-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 0-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,32-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovdqu 64-128(%rax),%xmm5
+ vpaddd 192-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 16-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm6,48-128(%rax)
+ vpaddd %xmm12,%xmm6,%xmm6
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+
+ vpsrld $22,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm13,%xmm2
+ vpxor %xmm3,%xmm14,%xmm12
+ vpaddd %xmm6,%xmm8,%xmm8
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm12,%xmm12
+ vmovdqu 80-128(%rax),%xmm6
+ vpaddd 208-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 32-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm8,%xmm7
+ vpslld $26,%xmm8,%xmm2
+ vmovdqu %xmm5,64-128(%rax)
+ vpaddd %xmm11,%xmm5,%xmm5
+
+ vpsrld $11,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm8,%xmm2
+ vpaddd 0(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm8,%xmm2
+ vpandn %xmm10,%xmm8,%xmm0
+ vpand %xmm9,%xmm8,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm12,%xmm11
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm12,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm12,%xmm13,%xmm3
+
+ vpxor %xmm1,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm12,%xmm1
+
+ vpslld $19,%xmm12,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm11,%xmm7
+
+ vpsrld $22,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm12,%xmm2
+ vpxor %xmm4,%xmm13,%xmm11
+ vpaddd %xmm5,%xmm15,%xmm15
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm11,%xmm11
+ vmovdqu 96-128(%rax),%xmm5
+ vpaddd 224-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 48-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm15,%xmm7
+ vpslld $26,%xmm15,%xmm2
+ vmovdqu %xmm6,80-128(%rax)
+ vpaddd %xmm10,%xmm6,%xmm6
+
+ vpsrld $11,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm15,%xmm2
+ vpaddd 32(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm15,%xmm2
+ vpandn %xmm9,%xmm15,%xmm0
+ vpand %xmm8,%xmm15,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm11,%xmm10
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm11,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm11,%xmm12,%xmm4
+
+ vpxor %xmm1,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm11,%xmm1
+
+ vpslld $19,%xmm11,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm10,%xmm7
+
+ vpsrld $22,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm11,%xmm2
+ vpxor %xmm3,%xmm12,%xmm10
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm10,%xmm10
+ vmovdqu 112-128(%rax),%xmm6
+ vpaddd 240-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 64-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm14,%xmm7
+ vpslld $26,%xmm14,%xmm2
+ vmovdqu %xmm5,96-128(%rax)
+ vpaddd %xmm9,%xmm5,%xmm5
+
+ vpsrld $11,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm14,%xmm2
+ vpaddd 64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm14,%xmm2
+ vpandn %xmm8,%xmm14,%xmm0
+ vpand %xmm15,%xmm14,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm10,%xmm9
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm10,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm10,%xmm11,%xmm3
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm10,%xmm1
+
+ vpslld $19,%xmm10,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm9,%xmm7
+
+ vpsrld $22,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm10,%xmm2
+ vpxor %xmm4,%xmm11,%xmm9
+ vpaddd %xmm5,%xmm13,%xmm13
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm9,%xmm9
+ vmovdqu 128-128(%rax),%xmm5
+ vpaddd 0-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 80-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm13,%xmm7
+ vpslld $26,%xmm13,%xmm2
+ vmovdqu %xmm6,112-128(%rax)
+ vpaddd %xmm8,%xmm6,%xmm6
+
+ vpsrld $11,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm13,%xmm2
+ vpaddd 96(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm13,%xmm2
+ vpandn %xmm15,%xmm13,%xmm0
+ vpand %xmm14,%xmm13,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm9,%xmm8
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm9,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm9,%xmm10,%xmm4
+
+ vpxor %xmm1,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm9,%xmm1
+
+ vpslld $19,%xmm9,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm8,%xmm7
+
+ vpsrld $22,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm9,%xmm2
+ vpxor %xmm3,%xmm10,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm8,%xmm8
+ addq $256,%rbp
+ vmovdqu 144-128(%rax),%xmm6
+ vpaddd 16-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 96-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,128-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovdqu 160-128(%rax),%xmm5
+ vpaddd 32-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 112-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm6,144-128(%rax)
+ vpaddd %xmm14,%xmm6,%xmm6
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovdqu 176-128(%rax),%xmm6
+ vpaddd 48-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 128-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,160-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovdqu 192-128(%rax),%xmm5
+ vpaddd 64-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 144-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm6,176-128(%rax)
+ vpaddd %xmm12,%xmm6,%xmm6
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+
+ vpsrld $22,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm13,%xmm2
+ vpxor %xmm3,%xmm14,%xmm12
+ vpaddd %xmm6,%xmm8,%xmm8
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm12,%xmm12
+ vmovdqu 208-128(%rax),%xmm6
+ vpaddd 80-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 160-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm8,%xmm7
+ vpslld $26,%xmm8,%xmm2
+ vmovdqu %xmm5,192-128(%rax)
+ vpaddd %xmm11,%xmm5,%xmm5
+
+ vpsrld $11,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm8,%xmm2
+ vpaddd 0(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm8,%xmm2
+ vpandn %xmm10,%xmm8,%xmm0
+ vpand %xmm9,%xmm8,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm12,%xmm11
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm12,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm12,%xmm13,%xmm3
+
+ vpxor %xmm1,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm12,%xmm1
+
+ vpslld $19,%xmm12,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm11,%xmm7
+
+ vpsrld $22,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm12,%xmm2
+ vpxor %xmm4,%xmm13,%xmm11
+ vpaddd %xmm5,%xmm15,%xmm15
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm11,%xmm11
+ vmovdqu 224-128(%rax),%xmm5
+ vpaddd 96-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 176-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm15,%xmm7
+ vpslld $26,%xmm15,%xmm2
+ vmovdqu %xmm6,208-128(%rax)
+ vpaddd %xmm10,%xmm6,%xmm6
+
+ vpsrld $11,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm15,%xmm2
+ vpaddd 32(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm15,%xmm2
+ vpandn %xmm9,%xmm15,%xmm0
+ vpand %xmm8,%xmm15,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm11,%xmm10
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm11,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm11,%xmm12,%xmm4
+
+ vpxor %xmm1,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm11,%xmm1
+
+ vpslld $19,%xmm11,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm10,%xmm7
+
+ vpsrld $22,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm11,%xmm2
+ vpxor %xmm3,%xmm12,%xmm10
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm10,%xmm10
+ vmovdqu 240-128(%rax),%xmm6
+ vpaddd 112-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 192-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm14,%xmm7
+ vpslld $26,%xmm14,%xmm2
+ vmovdqu %xmm5,224-128(%rax)
+ vpaddd %xmm9,%xmm5,%xmm5
+
+ vpsrld $11,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm14,%xmm2
+ vpaddd 64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm14,%xmm2
+ vpandn %xmm8,%xmm14,%xmm0
+ vpand %xmm15,%xmm14,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm10,%xmm9
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm10,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm10,%xmm11,%xmm3
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm10,%xmm1
+
+ vpslld $19,%xmm10,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm9,%xmm7
+
+ vpsrld $22,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm10,%xmm2
+ vpxor %xmm4,%xmm11,%xmm9
+ vpaddd %xmm5,%xmm13,%xmm13
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm9,%xmm9
+ vmovdqu 0-128(%rax),%xmm5
+ vpaddd 128-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 208-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm13,%xmm7
+ vpslld $26,%xmm13,%xmm2
+ vmovdqu %xmm6,240-128(%rax)
+ vpaddd %xmm8,%xmm6,%xmm6
+
+ vpsrld $11,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm13,%xmm2
+ vpaddd 96(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm13,%xmm2
+ vpandn %xmm15,%xmm13,%xmm0
+ vpand %xmm14,%xmm13,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm9,%xmm8
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm9,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm9,%xmm10,%xmm4
+
+ vpxor %xmm1,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm9,%xmm1
+
+ vpslld $19,%xmm9,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm8,%xmm7
+
+ vpsrld $22,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm9,%xmm2
+ vpxor %xmm3,%xmm10,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm8,%xmm8
+ addq $256,%rbp
+ decl %ecx
+ jnz .Loop_16_xx_avx
+
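+# All 64 rounds are done.  Lanes with no further real blocks (counter at
+# (%rbx) <= 1) get their input pointers redirected to the K256 table so later
+# loads stay harmless; a mask of still-active lanes (counter > 0) is built
+# with vpcmpgtd, the counters are decremented, and the working variables are
+# ANDed with that mask before the feed-forward add, so digests of lanes that
+# already finished are left unchanged in (%rdi).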
+ movl $1,%ecx
+ leaq K256+128(%rip),%rbp
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqa (%rbx),%xmm7
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovdqa %xmm7,%xmm6
+ vpcmpgtd %xmm0,%xmm6,%xmm6
+ vpaddd %xmm6,%xmm7,%xmm7
+
+ vmovdqu 0-128(%rdi),%xmm0
+ vpand %xmm6,%xmm8,%xmm8
+ vmovdqu 32-128(%rdi),%xmm1
+ vpand %xmm6,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm2
+ vpand %xmm6,%xmm10,%xmm10
+ vmovdqu 96-128(%rdi),%xmm5
+ vpand %xmm6,%xmm11,%xmm11
+ vpaddd %xmm0,%xmm8,%xmm8
+ vmovdqu 128-128(%rdi),%xmm0
+ vpand %xmm6,%xmm12,%xmm12
+ vpaddd %xmm1,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm1
+ vpand %xmm6,%xmm13,%xmm13
+ vpaddd %xmm2,%xmm10,%xmm10
+ vmovdqu 192-128(%rdi),%xmm2
+ vpand %xmm6,%xmm14,%xmm14
+ vpaddd %xmm5,%xmm11,%xmm11
+ vmovdqu 224-128(%rdi),%xmm5
+ vpand %xmm6,%xmm15,%xmm15
+ vpaddd %xmm0,%xmm12,%xmm12
+ vpaddd %xmm1,%xmm13,%xmm13
+ vmovdqu %xmm8,0-128(%rdi)
+ vpaddd %xmm2,%xmm14,%xmm14
+ vmovdqu %xmm9,32-128(%rdi)
+ vpaddd %xmm5,%xmm15,%xmm15
+ vmovdqu %xmm10,64-128(%rdi)
+ vmovdqu %xmm11,96-128(%rdi)
+ vmovdqu %xmm12,128-128(%rdi)
+ vmovdqu %xmm13,160-128(%rdi)
+ vmovdqu %xmm14,192-128(%rdi)
+ vmovdqu %xmm15,224-128(%rdi)
+
+ vmovdqu %xmm7,(%rbx)
+ vmovdqu .Lpbswap(%rip),%xmm6
+ decl %edx
+ jnz .Loop_avx
+
+ movl 280(%rsp),%edx
+ leaq 16(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+ decl %edx
+ jnz .Loop_grande_avx
+
+.Ldone_avx:
+ movq 272(%rsp),%rax
+.cfi_def_cfa %rax,8
+ vzeroupper
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_multi_block_avx,.-sha256_multi_block_avx
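+# sha256_multi_block_avx2 is the eight-lane variant: it uses full ymm
+# registers, reads eight (pointer, block-count) pairs from the descriptor
+# array in %rsi, and keeps the eight interleaved message schedules in the
+# stack frame allocated below.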
+.type sha256_multi_block_avx2,@function
+.align 32
+sha256_multi_block_avx2:
+.cfi_startproc
+_avx2_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $576,%rsp
+ andq $-256,%rsp
+ movq %rax,544(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08
+.Lbody_avx2:
+ leaq K256+128(%rip),%rbp
+ leaq 128(%rdi),%rdi
+
+.Loop_grande_avx2:
+ movl %edx,552(%rsp)
+ xorl %edx,%edx
+ leaq 512(%rsp),%rbx
+ movq 0(%rsi),%r12
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r12
+ movq 16(%rsi),%r13
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r13
+ movq 32(%rsi),%r14
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r14
+ movq 48(%rsi),%r15
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r15
+ movq 64(%rsi),%r8
+ movl 72(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,16(%rbx)
+ cmovleq %rbp,%r8
+ movq 80(%rsi),%r9
+ movl 88(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,20(%rbx)
+ cmovleq %rbp,%r9
+ movq 96(%rsi),%r10
+ movl 104(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,24(%rbx)
+ cmovleq %rbp,%r10
+ movq 112(%rsi),%r11
+ movl 120(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,28(%rbx)
+ cmovleq %rbp,%r11
+ vmovdqu 0-128(%rdi),%ymm8
+ leaq 128(%rsp),%rax
+ vmovdqu 32-128(%rdi),%ymm9
+ leaq 256+128(%rsp),%rbx
+ vmovdqu 64-128(%rdi),%ymm10
+ vmovdqu 96-128(%rdi),%ymm11
+ vmovdqu 128-128(%rdi),%ymm12
+ vmovdqu 160-128(%rdi),%ymm13
+ vmovdqu 192-128(%rdi),%ymm14
+ vmovdqu 224-128(%rdi),%ymm15
+ vmovdqu .Lpbswap(%rip),%ymm6
+ jmp .Loop_avx2
+
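+# Each of the first 16 rounds gathers one 32-bit message word from all eight
+# inputs (vmovd/vpinsrd into two xmm halves, vpunpckldq + vinserti128 to
+# merge, vpshufb with the .Lpbswap mask in %ymm6 to byte-swap), so the rounds
+# operate on eight independent streams in parallel.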
+.align 32
+.Loop_avx2:
+ vpxor %ymm9,%ymm10,%ymm4
+ vmovd 0(%r12),%xmm5
+ vmovd 0(%r8),%xmm0
+ vmovd 0(%r13),%xmm1
+ vmovd 0(%r9),%xmm2
+ vpinsrd $1,0(%r14),%xmm5,%xmm5
+ vpinsrd $1,0(%r10),%xmm0,%xmm0
+ vpinsrd $1,0(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,0(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm12,%ymm7
+ vpslld $26,%ymm12,%ymm2
+ vmovdqu %ymm5,0-128(%rax)
+ vpaddd %ymm15,%ymm5,%ymm5
+
+ vpsrld $11,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm12,%ymm2
+ vpaddd -128(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm12,%ymm2
+ vpandn %ymm14,%ymm12,%ymm0
+ vpand %ymm13,%ymm12,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm8,%ymm15
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm8,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm8,%ymm9,%ymm3
+
+ vpxor %ymm1,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm8,%ymm1
+
+ vpslld $19,%ymm8,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm15,%ymm7
+
+ vpsrld $22,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm8,%ymm2
+ vpxor %ymm4,%ymm9,%ymm15
+ vpaddd %ymm5,%ymm11,%ymm11
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm15,%ymm15
+ vmovd 4(%r12),%xmm5
+ vmovd 4(%r8),%xmm0
+ vmovd 4(%r13),%xmm1
+ vmovd 4(%r9),%xmm2
+ vpinsrd $1,4(%r14),%xmm5,%xmm5
+ vpinsrd $1,4(%r10),%xmm0,%xmm0
+ vpinsrd $1,4(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,4(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm11,%ymm7
+ vpslld $26,%ymm11,%ymm2
+ vmovdqu %ymm5,32-128(%rax)
+ vpaddd %ymm14,%ymm5,%ymm5
+
+ vpsrld $11,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm11,%ymm2
+ vpaddd -96(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm11,%ymm2
+ vpandn %ymm13,%ymm11,%ymm0
+ vpand %ymm12,%ymm11,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm15,%ymm14
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm15,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm15,%ymm8,%ymm4
+
+ vpxor %ymm1,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm15,%ymm1
+
+ vpslld $19,%ymm15,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm14,%ymm7
+
+ vpsrld $22,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm15,%ymm2
+ vpxor %ymm3,%ymm8,%ymm14
+ vpaddd %ymm5,%ymm10,%ymm10
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm14,%ymm14
+ vmovd 8(%r12),%xmm5
+ vmovd 8(%r8),%xmm0
+ vmovd 8(%r13),%xmm1
+ vmovd 8(%r9),%xmm2
+ vpinsrd $1,8(%r14),%xmm5,%xmm5
+ vpinsrd $1,8(%r10),%xmm0,%xmm0
+ vpinsrd $1,8(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,8(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm10,%ymm7
+ vpslld $26,%ymm10,%ymm2
+ vmovdqu %ymm5,64-128(%rax)
+ vpaddd %ymm13,%ymm5,%ymm5
+
+ vpsrld $11,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm10,%ymm2
+ vpaddd -64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm10,%ymm2
+ vpandn %ymm12,%ymm10,%ymm0
+ vpand %ymm11,%ymm10,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm14,%ymm13
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm14,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm14,%ymm15,%ymm3
+
+ vpxor %ymm1,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm14,%ymm1
+
+ vpslld $19,%ymm14,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm13,%ymm7
+
+ vpsrld $22,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm14,%ymm2
+ vpxor %ymm4,%ymm15,%ymm13
+ vpaddd %ymm5,%ymm9,%ymm9
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm13,%ymm13
+ vmovd 12(%r12),%xmm5
+ vmovd 12(%r8),%xmm0
+ vmovd 12(%r13),%xmm1
+ vmovd 12(%r9),%xmm2
+ vpinsrd $1,12(%r14),%xmm5,%xmm5
+ vpinsrd $1,12(%r10),%xmm0,%xmm0
+ vpinsrd $1,12(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,12(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm9,%ymm7
+ vpslld $26,%ymm9,%ymm2
+ vmovdqu %ymm5,96-128(%rax)
+ vpaddd %ymm12,%ymm5,%ymm5
+
+ vpsrld $11,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm9,%ymm2
+ vpaddd -32(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm9,%ymm2
+ vpandn %ymm11,%ymm9,%ymm0
+ vpand %ymm10,%ymm9,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm13,%ymm12
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm13,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm13,%ymm14,%ymm4
+
+ vpxor %ymm1,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm13,%ymm1
+
+ vpslld $19,%ymm13,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm12,%ymm7
+
+ vpsrld $22,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm13,%ymm2
+ vpxor %ymm3,%ymm14,%ymm12
+ vpaddd %ymm5,%ymm8,%ymm8
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm12,%ymm12
+ vmovd 16(%r12),%xmm5
+ vmovd 16(%r8),%xmm0
+ vmovd 16(%r13),%xmm1
+ vmovd 16(%r9),%xmm2
+ vpinsrd $1,16(%r14),%xmm5,%xmm5
+ vpinsrd $1,16(%r10),%xmm0,%xmm0
+ vpinsrd $1,16(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,16(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm8,%ymm7
+ vpslld $26,%ymm8,%ymm2
+ vmovdqu %ymm5,128-128(%rax)
+ vpaddd %ymm11,%ymm5,%ymm5
+
+ vpsrld $11,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm8,%ymm2
+ vpaddd 0(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm8,%ymm2
+ vpandn %ymm10,%ymm8,%ymm0
+ vpand %ymm9,%ymm8,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm12,%ymm11
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm12,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm12,%ymm13,%ymm3
+
+ vpxor %ymm1,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm12,%ymm1
+
+ vpslld $19,%ymm12,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm11,%ymm7
+
+ vpsrld $22,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm12,%ymm2
+ vpxor %ymm4,%ymm13,%ymm11
+ vpaddd %ymm5,%ymm15,%ymm15
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm11,%ymm11
+ vmovd 20(%r12),%xmm5
+ vmovd 20(%r8),%xmm0
+ vmovd 20(%r13),%xmm1
+ vmovd 20(%r9),%xmm2
+ vpinsrd $1,20(%r14),%xmm5,%xmm5
+ vpinsrd $1,20(%r10),%xmm0,%xmm0
+ vpinsrd $1,20(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,20(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm15,%ymm7
+ vpslld $26,%ymm15,%ymm2
+ vmovdqu %ymm5,160-128(%rax)
+ vpaddd %ymm10,%ymm5,%ymm5
+
+ vpsrld $11,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm15,%ymm2
+ vpaddd 32(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm15,%ymm2
+ vpandn %ymm9,%ymm15,%ymm0
+ vpand %ymm8,%ymm15,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm11,%ymm10
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm11,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm11,%ymm12,%ymm4
+
+ vpxor %ymm1,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm11,%ymm1
+
+ vpslld $19,%ymm11,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm10,%ymm7
+
+ vpsrld $22,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm11,%ymm2
+ vpxor %ymm3,%ymm12,%ymm10
+ vpaddd %ymm5,%ymm14,%ymm14
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm10,%ymm10
+ vmovd 24(%r12),%xmm5
+ vmovd 24(%r8),%xmm0
+ vmovd 24(%r13),%xmm1
+ vmovd 24(%r9),%xmm2
+ vpinsrd $1,24(%r14),%xmm5,%xmm5
+ vpinsrd $1,24(%r10),%xmm0,%xmm0
+ vpinsrd $1,24(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,24(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm14,%ymm7
+ vpslld $26,%ymm14,%ymm2
+ vmovdqu %ymm5,192-128(%rax)
+ vpaddd %ymm9,%ymm5,%ymm5
+
+ vpsrld $11,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm14,%ymm2
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm14,%ymm2
+ vpandn %ymm8,%ymm14,%ymm0
+ vpand %ymm15,%ymm14,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm10,%ymm9
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm10,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm10,%ymm11,%ymm3
+
+ vpxor %ymm1,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm10,%ymm1
+
+ vpslld $19,%ymm10,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm9,%ymm7
+
+ vpsrld $22,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm10,%ymm2
+ vpxor %ymm4,%ymm11,%ymm9
+ vpaddd %ymm5,%ymm13,%ymm13
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm9,%ymm9
+ vmovd 28(%r12),%xmm5
+ vmovd 28(%r8),%xmm0
+ vmovd 28(%r13),%xmm1
+ vmovd 28(%r9),%xmm2
+ vpinsrd $1,28(%r14),%xmm5,%xmm5
+ vpinsrd $1,28(%r10),%xmm0,%xmm0
+ vpinsrd $1,28(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,28(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm13,%ymm7
+ vpslld $26,%ymm13,%ymm2
+ vmovdqu %ymm5,224-128(%rax)
+ vpaddd %ymm8,%ymm5,%ymm5
+
+ vpsrld $11,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm13,%ymm2
+ vpaddd 96(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm13,%ymm2
+ vpandn %ymm15,%ymm13,%ymm0
+ vpand %ymm14,%ymm13,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm9,%ymm8
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm9,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm9,%ymm10,%ymm4
+
+ vpxor %ymm1,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm9,%ymm1
+
+ vpslld $19,%ymm9,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm8,%ymm7
+
+ vpsrld $22,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm9,%ymm2
+ vpxor %ymm3,%ymm10,%ymm8
+ vpaddd %ymm5,%ymm12,%ymm12
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm8,%ymm8
+ addq $256,%rbp
+ vmovd 32(%r12),%xmm5
+ vmovd 32(%r8),%xmm0
+ vmovd 32(%r13),%xmm1
+ vmovd 32(%r9),%xmm2
+ vpinsrd $1,32(%r14),%xmm5,%xmm5
+ vpinsrd $1,32(%r10),%xmm0,%xmm0
+ vpinsrd $1,32(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,32(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm12,%ymm7
+ vpslld $26,%ymm12,%ymm2
+ vmovdqu %ymm5,256-256-128(%rbx)
+ vpaddd %ymm15,%ymm5,%ymm5
+
+ vpsrld $11,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm12,%ymm2
+ vpaddd -128(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm12,%ymm2
+ vpandn %ymm14,%ymm12,%ymm0
+ vpand %ymm13,%ymm12,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm8,%ymm15
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm8,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm8,%ymm9,%ymm3
+
+ vpxor %ymm1,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm8,%ymm1
+
+ vpslld $19,%ymm8,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm15,%ymm7
+
+ vpsrld $22,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm8,%ymm2
+ vpxor %ymm4,%ymm9,%ymm15
+ vpaddd %ymm5,%ymm11,%ymm11
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm15,%ymm15
+ vmovd 36(%r12),%xmm5
+ vmovd 36(%r8),%xmm0
+ vmovd 36(%r13),%xmm1
+ vmovd 36(%r9),%xmm2
+ vpinsrd $1,36(%r14),%xmm5,%xmm5
+ vpinsrd $1,36(%r10),%xmm0,%xmm0
+ vpinsrd $1,36(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,36(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm11,%ymm7
+ vpslld $26,%ymm11,%ymm2
+ vmovdqu %ymm5,288-256-128(%rbx)
+ vpaddd %ymm14,%ymm5,%ymm5
+
+ vpsrld $11,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm11,%ymm2
+ vpaddd -96(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm11,%ymm2
+ vpandn %ymm13,%ymm11,%ymm0
+ vpand %ymm12,%ymm11,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm15,%ymm14
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm15,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm15,%ymm8,%ymm4
+
+ vpxor %ymm1,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm15,%ymm1
+
+ vpslld $19,%ymm15,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm14,%ymm7
+
+ vpsrld $22,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm15,%ymm2
+ vpxor %ymm3,%ymm8,%ymm14
+ vpaddd %ymm5,%ymm10,%ymm10
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm14,%ymm14
+ vmovd 40(%r12),%xmm5
+ vmovd 40(%r8),%xmm0
+ vmovd 40(%r13),%xmm1
+ vmovd 40(%r9),%xmm2
+ vpinsrd $1,40(%r14),%xmm5,%xmm5
+ vpinsrd $1,40(%r10),%xmm0,%xmm0
+ vpinsrd $1,40(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,40(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm10,%ymm7
+ vpslld $26,%ymm10,%ymm2
+ vmovdqu %ymm5,320-256-128(%rbx)
+ vpaddd %ymm13,%ymm5,%ymm5
+
+ vpsrld $11,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm10,%ymm2
+ vpaddd -64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm10,%ymm2
+ vpandn %ymm12,%ymm10,%ymm0
+ vpand %ymm11,%ymm10,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm14,%ymm13
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm14,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm14,%ymm15,%ymm3
+
+ vpxor %ymm1,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm14,%ymm1
+
+ vpslld $19,%ymm14,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm13,%ymm7
+
+ vpsrld $22,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm14,%ymm2
+ vpxor %ymm4,%ymm15,%ymm13
+ vpaddd %ymm5,%ymm9,%ymm9
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm13,%ymm13
+ vmovd 44(%r12),%xmm5
+ vmovd 44(%r8),%xmm0
+ vmovd 44(%r13),%xmm1
+ vmovd 44(%r9),%xmm2
+ vpinsrd $1,44(%r14),%xmm5,%xmm5
+ vpinsrd $1,44(%r10),%xmm0,%xmm0
+ vpinsrd $1,44(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,44(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm9,%ymm7
+ vpslld $26,%ymm9,%ymm2
+ vmovdqu %ymm5,352-256-128(%rbx)
+ vpaddd %ymm12,%ymm5,%ymm5
+
+ vpsrld $11,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm9,%ymm2
+ vpaddd -32(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm9,%ymm2
+ vpandn %ymm11,%ymm9,%ymm0
+ vpand %ymm10,%ymm9,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm13,%ymm12
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm13,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm13,%ymm14,%ymm4
+
+ vpxor %ymm1,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm13,%ymm1
+
+ vpslld $19,%ymm13,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm12,%ymm7
+
+ vpsrld $22,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm13,%ymm2
+ vpxor %ymm3,%ymm14,%ymm12
+ vpaddd %ymm5,%ymm8,%ymm8
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm12,%ymm12
+ vmovd 48(%r12),%xmm5
+ vmovd 48(%r8),%xmm0
+ vmovd 48(%r13),%xmm1
+ vmovd 48(%r9),%xmm2
+ vpinsrd $1,48(%r14),%xmm5,%xmm5
+ vpinsrd $1,48(%r10),%xmm0,%xmm0
+ vpinsrd $1,48(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,48(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm8,%ymm7
+ vpslld $26,%ymm8,%ymm2
+ vmovdqu %ymm5,384-256-128(%rbx)
+ vpaddd %ymm11,%ymm5,%ymm5
+
+ vpsrld $11,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm8,%ymm2
+ vpaddd 0(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm8,%ymm2
+ vpandn %ymm10,%ymm8,%ymm0
+ vpand %ymm9,%ymm8,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm12,%ymm11
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm12,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm12,%ymm13,%ymm3
+
+ vpxor %ymm1,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm12,%ymm1
+
+ vpslld $19,%ymm12,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm11,%ymm7
+
+ vpsrld $22,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm12,%ymm2
+ vpxor %ymm4,%ymm13,%ymm11
+ vpaddd %ymm5,%ymm15,%ymm15
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm11,%ymm11
+ vmovd 52(%r12),%xmm5
+ vmovd 52(%r8),%xmm0
+ vmovd 52(%r13),%xmm1
+ vmovd 52(%r9),%xmm2
+ vpinsrd $1,52(%r14),%xmm5,%xmm5
+ vpinsrd $1,52(%r10),%xmm0,%xmm0
+ vpinsrd $1,52(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,52(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm15,%ymm7
+ vpslld $26,%ymm15,%ymm2
+ vmovdqu %ymm5,416-256-128(%rbx)
+ vpaddd %ymm10,%ymm5,%ymm5
+
+ vpsrld $11,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm15,%ymm2
+ vpaddd 32(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm15,%ymm2
+ vpandn %ymm9,%ymm15,%ymm0
+ vpand %ymm8,%ymm15,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm11,%ymm10
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm11,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm11,%ymm12,%ymm4
+
+ vpxor %ymm1,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm11,%ymm1
+
+ vpslld $19,%ymm11,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm10,%ymm7
+
+ vpsrld $22,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm11,%ymm2
+ vpxor %ymm3,%ymm12,%ymm10
+ vpaddd %ymm5,%ymm14,%ymm14
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm10,%ymm10
+ vmovd 56(%r12),%xmm5
+ vmovd 56(%r8),%xmm0
+ vmovd 56(%r13),%xmm1
+ vmovd 56(%r9),%xmm2
+ vpinsrd $1,56(%r14),%xmm5,%xmm5
+ vpinsrd $1,56(%r10),%xmm0,%xmm0
+ vpinsrd $1,56(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,56(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm14,%ymm7
+ vpslld $26,%ymm14,%ymm2
+ vmovdqu %ymm5,448-256-128(%rbx)
+ vpaddd %ymm9,%ymm5,%ymm5
+
+ vpsrld $11,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm14,%ymm2
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm14,%ymm2
+ vpandn %ymm8,%ymm14,%ymm0
+ vpand %ymm15,%ymm14,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm10,%ymm9
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm10,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm10,%ymm11,%ymm3
+
+ vpxor %ymm1,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm10,%ymm1
+
+ vpslld $19,%ymm10,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm9,%ymm7
+
+ vpsrld $22,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm10,%ymm2
+ vpxor %ymm4,%ymm11,%ymm9
+ vpaddd %ymm5,%ymm13,%ymm13
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm9,%ymm9
+ vmovd 60(%r12),%xmm5
+ leaq 64(%r12),%r12
+ vmovd 60(%r8),%xmm0
+ leaq 64(%r8),%r8
+ vmovd 60(%r13),%xmm1
+ leaq 64(%r13),%r13
+ vmovd 60(%r9),%xmm2
+ leaq 64(%r9),%r9
+ vpinsrd $1,60(%r14),%xmm5,%xmm5
+ leaq 64(%r14),%r14
+ vpinsrd $1,60(%r10),%xmm0,%xmm0
+ leaq 64(%r10),%r10
+ vpinsrd $1,60(%r15),%xmm1,%xmm1
+ leaq 64(%r15),%r15
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,60(%r11),%xmm2,%xmm2
+ leaq 64(%r11),%r11
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm13,%ymm7
+ vpslld $26,%ymm13,%ymm2
+ vmovdqu %ymm5,480-256-128(%rbx)
+ vpaddd %ymm8,%ymm5,%ymm5
+
+ vpsrld $11,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm13,%ymm2
+ vpaddd 96(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ prefetcht0 63(%r12)
+ vpslld $7,%ymm13,%ymm2
+ vpandn %ymm15,%ymm13,%ymm0
+ vpand %ymm14,%ymm13,%ymm4
+ prefetcht0 63(%r13)
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm9,%ymm8
+ vpxor %ymm2,%ymm7,%ymm7
+ prefetcht0 63(%r14)
+ vpslld $30,%ymm9,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm9,%ymm10,%ymm4
+ prefetcht0 63(%r15)
+ vpxor %ymm1,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm9,%ymm1
+ prefetcht0 63(%r8)
+ vpslld $19,%ymm9,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+ prefetcht0 63(%r9)
+ vpxor %ymm1,%ymm8,%ymm7
+
+ vpsrld $22,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ prefetcht0 63(%r10)
+ vpslld $10,%ymm9,%ymm2
+ vpxor %ymm3,%ymm10,%ymm8
+ vpaddd %ymm5,%ymm12,%ymm12
+ prefetcht0 63(%r11)
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm8,%ymm8
+ addq $256,%rbp
+ vmovdqu 0-128(%rax),%ymm5
+ movl $3,%ecx
+ jmp .Loop_16_xx_avx2
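+# Rounds 16..63: the loop below runs three times (%ecx = 3), each pass
+# expanding sixteen message-schedule words in place and applying sixteen
+# SHA-256 rounds across all eight lanes held in %ymm8-%ymm15.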
+.align 32
+.Loop_16_xx_avx2:
+ vmovdqu 32-128(%rax),%ymm6
+ vpaddd 288-256-128(%rbx),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 448-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm12,%ymm7
+ vpslld $26,%ymm12,%ymm2
+ vmovdqu %ymm5,0-128(%rax)
+ vpaddd %ymm15,%ymm5,%ymm5
+
+ vpsrld $11,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm12,%ymm2
+ vpaddd -128(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm12,%ymm2
+ vpandn %ymm14,%ymm12,%ymm0
+ vpand %ymm13,%ymm12,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm8,%ymm15
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm8,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm8,%ymm9,%ymm3
+
+ vpxor %ymm1,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm8,%ymm1
+
+ vpslld $19,%ymm8,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm15,%ymm7
+
+ vpsrld $22,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm8,%ymm2
+ vpxor %ymm4,%ymm9,%ymm15
+ vpaddd %ymm5,%ymm11,%ymm11
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm15,%ymm15
+ vmovdqu 64-128(%rax),%ymm5
+ vpaddd 320-256-128(%rbx),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 480-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm11,%ymm7
+ vpslld $26,%ymm11,%ymm2
+ vmovdqu %ymm6,32-128(%rax)
+ vpaddd %ymm14,%ymm6,%ymm6
+
+ vpsrld $11,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm11,%ymm2
+ vpaddd -96(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm11,%ymm2
+ vpandn %ymm13,%ymm11,%ymm0
+ vpand %ymm12,%ymm11,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm15,%ymm14
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm15,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm15,%ymm8,%ymm4
+
+ vpxor %ymm1,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm15,%ymm1
+
+ vpslld $19,%ymm15,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm14,%ymm7
+
+ vpsrld $22,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm15,%ymm2
+ vpxor %ymm3,%ymm8,%ymm14
+ vpaddd %ymm6,%ymm10,%ymm10
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm14,%ymm14
+ vmovdqu 96-128(%rax),%ymm6
+ vpaddd 352-256-128(%rbx),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 0-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm10,%ymm7
+ vpslld $26,%ymm10,%ymm2
+ vmovdqu %ymm5,64-128(%rax)
+ vpaddd %ymm13,%ymm5,%ymm5
+
+ vpsrld $11,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm10,%ymm2
+ vpaddd -64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm10,%ymm2
+ vpandn %ymm12,%ymm10,%ymm0
+ vpand %ymm11,%ymm10,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm14,%ymm13
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm14,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm14,%ymm15,%ymm3
+
+ vpxor %ymm1,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm14,%ymm1
+
+ vpslld $19,%ymm14,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm13,%ymm7
+
+ vpsrld $22,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm14,%ymm2
+ vpxor %ymm4,%ymm15,%ymm13
+ vpaddd %ymm5,%ymm9,%ymm9
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm13,%ymm13
+ vmovdqu 128-128(%rax),%ymm5
+ vpaddd 384-256-128(%rbx),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 32-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm9,%ymm7
+ vpslld $26,%ymm9,%ymm2
+ vmovdqu %ymm6,96-128(%rax)
+ vpaddd %ymm12,%ymm6,%ymm6
+
+ vpsrld $11,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm9,%ymm2
+ vpaddd -32(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm9,%ymm2
+ vpandn %ymm11,%ymm9,%ymm0
+ vpand %ymm10,%ymm9,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm13,%ymm12
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm13,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm13,%ymm14,%ymm4
+
+ vpxor %ymm1,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm13,%ymm1
+
+ vpslld $19,%ymm13,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm12,%ymm7
+
+ vpsrld $22,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm13,%ymm2
+ vpxor %ymm3,%ymm14,%ymm12
+ vpaddd %ymm6,%ymm8,%ymm8
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm12,%ymm12
+ vmovdqu 160-128(%rax),%ymm6
+ vpaddd 416-256-128(%rbx),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 64-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm8,%ymm7
+ vpslld $26,%ymm8,%ymm2
+ vmovdqu %ymm5,128-128(%rax)
+ vpaddd %ymm11,%ymm5,%ymm5
+
+ vpsrld $11,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm8,%ymm2
+ vpaddd 0(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm8,%ymm2
+ vpandn %ymm10,%ymm8,%ymm0
+ vpand %ymm9,%ymm8,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm12,%ymm11
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm12,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm12,%ymm13,%ymm3
+
+ vpxor %ymm1,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm12,%ymm1
+
+ vpslld $19,%ymm12,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm11,%ymm7
+
+ vpsrld $22,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm12,%ymm2
+ vpxor %ymm4,%ymm13,%ymm11
+ vpaddd %ymm5,%ymm15,%ymm15
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm11,%ymm11
+ vmovdqu 192-128(%rax),%ymm5
+ vpaddd 448-256-128(%rbx),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 96-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm15,%ymm7
+ vpslld $26,%ymm15,%ymm2
+ vmovdqu %ymm6,160-128(%rax)
+ vpaddd %ymm10,%ymm6,%ymm6
+
+ vpsrld $11,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm15,%ymm2
+ vpaddd 32(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm15,%ymm2
+ vpandn %ymm9,%ymm15,%ymm0
+ vpand %ymm8,%ymm15,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm11,%ymm10
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm11,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm11,%ymm12,%ymm4
+
+ vpxor %ymm1,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm11,%ymm1
+
+ vpslld $19,%ymm11,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm10,%ymm7
+
+ vpsrld $22,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm11,%ymm2
+ vpxor %ymm3,%ymm12,%ymm10
+ vpaddd %ymm6,%ymm14,%ymm14
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm10,%ymm10
+ vmovdqu 224-128(%rax),%ymm6
+ vpaddd 480-256-128(%rbx),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 128-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm14,%ymm7
+ vpslld $26,%ymm14,%ymm2
+ vmovdqu %ymm5,192-128(%rax)
+ vpaddd %ymm9,%ymm5,%ymm5
+
+ vpsrld $11,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm14,%ymm2
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm14,%ymm2
+ vpandn %ymm8,%ymm14,%ymm0
+ vpand %ymm15,%ymm14,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm10,%ymm9
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm10,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm10,%ymm11,%ymm3
+
+ vpxor %ymm1,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm10,%ymm1
+
+ vpslld $19,%ymm10,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm9,%ymm7
+
+ vpsrld $22,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm10,%ymm2
+ vpxor %ymm4,%ymm11,%ymm9
+ vpaddd %ymm5,%ymm13,%ymm13
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm9,%ymm9
+ vmovdqu 256-256-128(%rbx),%ymm5
+ vpaddd 0-128(%rax),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 160-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm13,%ymm7
+ vpslld $26,%ymm13,%ymm2
+ vmovdqu %ymm6,224-128(%rax)
+ vpaddd %ymm8,%ymm6,%ymm6
+
+ vpsrld $11,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm13,%ymm2
+ vpaddd 96(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm13,%ymm2
+ vpandn %ymm15,%ymm13,%ymm0
+ vpand %ymm14,%ymm13,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm9,%ymm8
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm9,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm9,%ymm10,%ymm4
+
+ vpxor %ymm1,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm9,%ymm1
+
+ vpslld $19,%ymm9,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm8,%ymm7
+
+ vpsrld $22,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm9,%ymm2
+ vpxor %ymm3,%ymm10,%ymm8
+ vpaddd %ymm6,%ymm12,%ymm12
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm8,%ymm8
+ addq $256,%rbp
+ vmovdqu 288-256-128(%rbx),%ymm6
+ vpaddd 32-128(%rax),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 192-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm12,%ymm7
+ vpslld $26,%ymm12,%ymm2
+ vmovdqu %ymm5,256-256-128(%rbx)
+ vpaddd %ymm15,%ymm5,%ymm5
+
+ vpsrld $11,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm12,%ymm2
+ vpaddd -128(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm12,%ymm2
+ vpandn %ymm14,%ymm12,%ymm0
+ vpand %ymm13,%ymm12,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm8,%ymm15
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm8,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm8,%ymm9,%ymm3
+
+ vpxor %ymm1,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm8,%ymm1
+
+ vpslld $19,%ymm8,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm15,%ymm7
+
+ vpsrld $22,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm8,%ymm2
+ vpxor %ymm4,%ymm9,%ymm15
+ vpaddd %ymm5,%ymm11,%ymm11
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm15,%ymm15
+ vmovdqu 320-256-128(%rbx),%ymm5
+ vpaddd 64-128(%rax),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 224-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm11,%ymm7
+ vpslld $26,%ymm11,%ymm2
+ vmovdqu %ymm6,288-256-128(%rbx)
+ vpaddd %ymm14,%ymm6,%ymm6
+
+ vpsrld $11,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm11,%ymm2
+ vpaddd -96(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm11,%ymm2
+ vpandn %ymm13,%ymm11,%ymm0
+ vpand %ymm12,%ymm11,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm15,%ymm14
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm15,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm15,%ymm8,%ymm4
+
+ vpxor %ymm1,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm15,%ymm1
+
+ vpslld $19,%ymm15,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm14,%ymm7
+
+ vpsrld $22,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm15,%ymm2
+ vpxor %ymm3,%ymm8,%ymm14
+ vpaddd %ymm6,%ymm10,%ymm10
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm14,%ymm14
+ vmovdqu 352-256-128(%rbx),%ymm6
+ vpaddd 96-128(%rax),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 256-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm10,%ymm7
+ vpslld $26,%ymm10,%ymm2
+ vmovdqu %ymm5,320-256-128(%rbx)
+ vpaddd %ymm13,%ymm5,%ymm5
+
+ vpsrld $11,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm10,%ymm2
+ vpaddd -64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm10,%ymm2
+ vpandn %ymm12,%ymm10,%ymm0
+ vpand %ymm11,%ymm10,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm14,%ymm13
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm14,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm14,%ymm15,%ymm3
+
+ vpxor %ymm1,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm14,%ymm1
+
+ vpslld $19,%ymm14,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm13,%ymm7
+
+ vpsrld $22,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm14,%ymm2
+ vpxor %ymm4,%ymm15,%ymm13
+ vpaddd %ymm5,%ymm9,%ymm9
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm13,%ymm13
+ vmovdqu 384-256-128(%rbx),%ymm5
+ vpaddd 128-128(%rax),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 288-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm9,%ymm7
+ vpslld $26,%ymm9,%ymm2
+ vmovdqu %ymm6,352-256-128(%rbx)
+ vpaddd %ymm12,%ymm6,%ymm6
+
+ vpsrld $11,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm9,%ymm2
+ vpaddd -32(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm9,%ymm2
+ vpandn %ymm11,%ymm9,%ymm0
+ vpand %ymm10,%ymm9,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm13,%ymm12
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm13,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm13,%ymm14,%ymm4
+
+ vpxor %ymm1,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm13,%ymm1
+
+ vpslld $19,%ymm13,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm12,%ymm7
+
+ vpsrld $22,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm13,%ymm2
+ vpxor %ymm3,%ymm14,%ymm12
+ vpaddd %ymm6,%ymm8,%ymm8
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm12,%ymm12
+ vmovdqu 416-256-128(%rbx),%ymm6
+ vpaddd 160-128(%rax),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 320-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm8,%ymm7
+ vpslld $26,%ymm8,%ymm2
+ vmovdqu %ymm5,384-256-128(%rbx)
+ vpaddd %ymm11,%ymm5,%ymm5
+
+ vpsrld $11,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm8,%ymm2
+ vpaddd 0(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm8,%ymm2
+ vpandn %ymm10,%ymm8,%ymm0
+ vpand %ymm9,%ymm8,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm12,%ymm11
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm12,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm12,%ymm13,%ymm3
+
+ vpxor %ymm1,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm12,%ymm1
+
+ vpslld $19,%ymm12,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm11,%ymm7
+
+ vpsrld $22,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm12,%ymm2
+ vpxor %ymm4,%ymm13,%ymm11
+ vpaddd %ymm5,%ymm15,%ymm15
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm11,%ymm11
+ vmovdqu 448-256-128(%rbx),%ymm5
+ vpaddd 192-128(%rax),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 352-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm15,%ymm7
+ vpslld $26,%ymm15,%ymm2
+ vmovdqu %ymm6,416-256-128(%rbx)
+ vpaddd %ymm10,%ymm6,%ymm6
+
+ vpsrld $11,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm15,%ymm2
+ vpaddd 32(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm15,%ymm2
+ vpandn %ymm9,%ymm15,%ymm0
+ vpand %ymm8,%ymm15,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm11,%ymm10
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm11,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm11,%ymm12,%ymm4
+
+ vpxor %ymm1,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm11,%ymm1
+
+ vpslld $19,%ymm11,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm10,%ymm7
+
+ vpsrld $22,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm11,%ymm2
+ vpxor %ymm3,%ymm12,%ymm10
+ vpaddd %ymm6,%ymm14,%ymm14
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm10,%ymm10
+ vmovdqu 480-256-128(%rbx),%ymm6
+ vpaddd 224-128(%rax),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 384-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm14,%ymm7
+ vpslld $26,%ymm14,%ymm2
+ vmovdqu %ymm5,448-256-128(%rbx)
+ vpaddd %ymm9,%ymm5,%ymm5
+
+ vpsrld $11,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm14,%ymm2
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm14,%ymm2
+ vpandn %ymm8,%ymm14,%ymm0
+ vpand %ymm15,%ymm14,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm10,%ymm9
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm10,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm10,%ymm11,%ymm3
+
+ vpxor %ymm1,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm10,%ymm1
+
+ vpslld $19,%ymm10,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm9,%ymm7
+
+ vpsrld $22,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm10,%ymm2
+ vpxor %ymm4,%ymm11,%ymm9
+ vpaddd %ymm5,%ymm13,%ymm13
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm9,%ymm9
+ vmovdqu 0-128(%rax),%ymm5
+ vpaddd 256-256-128(%rbx),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 416-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm13,%ymm7
+ vpslld $26,%ymm13,%ymm2
+ vmovdqu %ymm6,480-256-128(%rbx)
+ vpaddd %ymm8,%ymm6,%ymm6
+
+ vpsrld $11,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm13,%ymm2
+ vpaddd 96(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm13,%ymm2
+ vpandn %ymm15,%ymm13,%ymm0
+ vpand %ymm14,%ymm13,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm9,%ymm8
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm9,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm9,%ymm10,%ymm4
+
+ vpxor %ymm1,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm9,%ymm1
+
+ vpslld $19,%ymm9,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm8,%ymm7
+
+ vpsrld $22,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm9,%ymm2
+ vpxor %ymm3,%ymm10,%ymm8
+ vpaddd %ymm6,%ymm12,%ymm12
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm8,%ymm8
+ addq $256,%rbp
+ decl %ecx
+ jnz .Loop_16_xx_avx2
+
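+# End of a 64-byte block for every lane: lanes that have run out of input
+# are pointed at the K256 table so their loads stay harmless, the per-lane
+# block counters at 512(%rsp) are decremented, and the state update is
+# masked so finished lanes keep their final hash values.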
+ movl $1,%ecx
+ leaq 512(%rsp),%rbx
+ leaq K256+128(%rip),%rbp
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r12
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r13
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r14
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r15
+ cmpl 16(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 20(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 24(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 28(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqa (%rbx),%ymm7
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqa %ymm7,%ymm6
+ vpcmpgtd %ymm0,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm7,%ymm7
+
+ vmovdqu 0-128(%rdi),%ymm0
+ vpand %ymm6,%ymm8,%ymm8
+ vmovdqu 32-128(%rdi),%ymm1
+ vpand %ymm6,%ymm9,%ymm9
+ vmovdqu 64-128(%rdi),%ymm2
+ vpand %ymm6,%ymm10,%ymm10
+ vmovdqu 96-128(%rdi),%ymm5
+ vpand %ymm6,%ymm11,%ymm11
+ vpaddd %ymm0,%ymm8,%ymm8
+ vmovdqu 128-128(%rdi),%ymm0
+ vpand %ymm6,%ymm12,%ymm12
+ vpaddd %ymm1,%ymm9,%ymm9
+ vmovdqu 160-128(%rdi),%ymm1
+ vpand %ymm6,%ymm13,%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vmovdqu 192-128(%rdi),%ymm2
+ vpand %ymm6,%ymm14,%ymm14
+ vpaddd %ymm5,%ymm11,%ymm11
+ vmovdqu 224-128(%rdi),%ymm5
+ vpand %ymm6,%ymm15,%ymm15
+ vpaddd %ymm0,%ymm12,%ymm12
+ vpaddd %ymm1,%ymm13,%ymm13
+ vmovdqu %ymm8,0-128(%rdi)
+ vpaddd %ymm2,%ymm14,%ymm14
+ vmovdqu %ymm9,32-128(%rdi)
+ vpaddd %ymm5,%ymm15,%ymm15
+ vmovdqu %ymm10,64-128(%rdi)
+ vmovdqu %ymm11,96-128(%rdi)
+ vmovdqu %ymm12,128-128(%rdi)
+ vmovdqu %ymm13,160-128(%rdi)
+ vmovdqu %ymm14,192-128(%rdi)
+ vmovdqu %ymm15,224-128(%rdi)
+
+ vmovdqu %ymm7,(%rbx)
+ leaq 256+128(%rsp),%rbx
+ vmovdqu .Lpbswap(%rip),%ymm6
+ decl %edx
+ jnz .Loop_avx2
+
+
+
+
+
+
+
+.Ldone_avx2:
+ movq 544(%rsp),%rax
+.cfi_def_cfa %rax,8
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
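+# K256: the 64 SHA-256 round constants, each one stored eight times so a
+# single 256-bit load yields the constant broadcast across all eight lanes.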
+.align 256
+K256:
+.long 1116352408,1116352408,1116352408,1116352408
+.long 1116352408,1116352408,1116352408,1116352408
+.long 1899447441,1899447441,1899447441,1899447441
+.long 1899447441,1899447441,1899447441,1899447441
+.long 3049323471,3049323471,3049323471,3049323471
+.long 3049323471,3049323471,3049323471,3049323471
+.long 3921009573,3921009573,3921009573,3921009573
+.long 3921009573,3921009573,3921009573,3921009573
+.long 961987163,961987163,961987163,961987163
+.long 961987163,961987163,961987163,961987163
+.long 1508970993,1508970993,1508970993,1508970993
+.long 1508970993,1508970993,1508970993,1508970993
+.long 2453635748,2453635748,2453635748,2453635748
+.long 2453635748,2453635748,2453635748,2453635748
+.long 2870763221,2870763221,2870763221,2870763221
+.long 2870763221,2870763221,2870763221,2870763221
+.long 3624381080,3624381080,3624381080,3624381080
+.long 3624381080,3624381080,3624381080,3624381080
+.long 310598401,310598401,310598401,310598401
+.long 310598401,310598401,310598401,310598401
+.long 607225278,607225278,607225278,607225278
+.long 607225278,607225278,607225278,607225278
+.long 1426881987,1426881987,1426881987,1426881987
+.long 1426881987,1426881987,1426881987,1426881987
+.long 1925078388,1925078388,1925078388,1925078388
+.long 1925078388,1925078388,1925078388,1925078388
+.long 2162078206,2162078206,2162078206,2162078206
+.long 2162078206,2162078206,2162078206,2162078206
+.long 2614888103,2614888103,2614888103,2614888103
+.long 2614888103,2614888103,2614888103,2614888103
+.long 3248222580,3248222580,3248222580,3248222580
+.long 3248222580,3248222580,3248222580,3248222580
+.long 3835390401,3835390401,3835390401,3835390401
+.long 3835390401,3835390401,3835390401,3835390401
+.long 4022224774,4022224774,4022224774,4022224774
+.long 4022224774,4022224774,4022224774,4022224774
+.long 264347078,264347078,264347078,264347078
+.long 264347078,264347078,264347078,264347078
+.long 604807628,604807628,604807628,604807628
+.long 604807628,604807628,604807628,604807628
+.long 770255983,770255983,770255983,770255983
+.long 770255983,770255983,770255983,770255983
+.long 1249150122,1249150122,1249150122,1249150122
+.long 1249150122,1249150122,1249150122,1249150122
+.long 1555081692,1555081692,1555081692,1555081692
+.long 1555081692,1555081692,1555081692,1555081692
+.long 1996064986,1996064986,1996064986,1996064986
+.long 1996064986,1996064986,1996064986,1996064986
+.long 2554220882,2554220882,2554220882,2554220882
+.long 2554220882,2554220882,2554220882,2554220882
+.long 2821834349,2821834349,2821834349,2821834349
+.long 2821834349,2821834349,2821834349,2821834349
+.long 2952996808,2952996808,2952996808,2952996808
+.long 2952996808,2952996808,2952996808,2952996808
+.long 3210313671,3210313671,3210313671,3210313671
+.long 3210313671,3210313671,3210313671,3210313671
+.long 3336571891,3336571891,3336571891,3336571891
+.long 3336571891,3336571891,3336571891,3336571891
+.long 3584528711,3584528711,3584528711,3584528711
+.long 3584528711,3584528711,3584528711,3584528711
+.long 113926993,113926993,113926993,113926993
+.long 113926993,113926993,113926993,113926993
+.long 338241895,338241895,338241895,338241895
+.long 338241895,338241895,338241895,338241895
+.long 666307205,666307205,666307205,666307205
+.long 666307205,666307205,666307205,666307205
+.long 773529912,773529912,773529912,773529912
+.long 773529912,773529912,773529912,773529912
+.long 1294757372,1294757372,1294757372,1294757372
+.long 1294757372,1294757372,1294757372,1294757372
+.long 1396182291,1396182291,1396182291,1396182291
+.long 1396182291,1396182291,1396182291,1396182291
+.long 1695183700,1695183700,1695183700,1695183700
+.long 1695183700,1695183700,1695183700,1695183700
+.long 1986661051,1986661051,1986661051,1986661051
+.long 1986661051,1986661051,1986661051,1986661051
+.long 2177026350,2177026350,2177026350,2177026350
+.long 2177026350,2177026350,2177026350,2177026350
+.long 2456956037,2456956037,2456956037,2456956037
+.long 2456956037,2456956037,2456956037,2456956037
+.long 2730485921,2730485921,2730485921,2730485921
+.long 2730485921,2730485921,2730485921,2730485921
+.long 2820302411,2820302411,2820302411,2820302411
+.long 2820302411,2820302411,2820302411,2820302411
+.long 3259730800,3259730800,3259730800,3259730800
+.long 3259730800,3259730800,3259730800,3259730800
+.long 3345764771,3345764771,3345764771,3345764771
+.long 3345764771,3345764771,3345764771,3345764771
+.long 3516065817,3516065817,3516065817,3516065817
+.long 3516065817,3516065817,3516065817,3516065817
+.long 3600352804,3600352804,3600352804,3600352804
+.long 3600352804,3600352804,3600352804,3600352804
+.long 4094571909,4094571909,4094571909,4094571909
+.long 4094571909,4094571909,4094571909,4094571909
+.long 275423344,275423344,275423344,275423344
+.long 275423344,275423344,275423344,275423344
+.long 430227734,430227734,430227734,430227734
+.long 430227734,430227734,430227734,430227734
+.long 506948616,506948616,506948616,506948616
+.long 506948616,506948616,506948616,506948616
+.long 659060556,659060556,659060556,659060556
+.long 659060556,659060556,659060556,659060556
+.long 883997877,883997877,883997877,883997877
+.long 883997877,883997877,883997877,883997877
+.long 958139571,958139571,958139571,958139571
+.long 958139571,958139571,958139571,958139571
+.long 1322822218,1322822218,1322822218,1322822218
+.long 1322822218,1322822218,1322822218,1322822218
+.long 1537002063,1537002063,1537002063,1537002063
+.long 1537002063,1537002063,1537002063,1537002063
+.long 1747873779,1747873779,1747873779,1747873779
+.long 1747873779,1747873779,1747873779,1747873779
+.long 1955562222,1955562222,1955562222,1955562222
+.long 1955562222,1955562222,1955562222,1955562222
+.long 2024104815,2024104815,2024104815,2024104815
+.long 2024104815,2024104815,2024104815,2024104815
+.long 2227730452,2227730452,2227730452,2227730452
+.long 2227730452,2227730452,2227730452,2227730452
+.long 2361852424,2361852424,2361852424,2361852424
+.long 2361852424,2361852424,2361852424,2361852424
+.long 2428436474,2428436474,2428436474,2428436474
+.long 2428436474,2428436474,2428436474,2428436474
+.long 2756734187,2756734187,2756734187,2756734187
+.long 2756734187,2756734187,2756734187,2756734187
+.long 3204031479,3204031479,3204031479,3204031479
+.long 3204031479,3204031479,3204031479,3204031479
+.long 3329325298,3329325298,3329325298,3329325298
+.long 3329325298,3329325298,3329325298,3329325298
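+# Byte-swap shuffle mask for vpshufb: reverses the bytes of each 32-bit
+# word, converting the big-endian message words to host order.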
+.Lpbswap:
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
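+# K256_shaext: the same round constants in plain, non-replicated order for
+# the SHA extensions (SHA-NI) code path.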
+K256_shaext:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.byte 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha256-x86_64.s b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha256-x86_64.s
new file mode 100644
index 0000000000..42b24df18e
--- /dev/null
+++ b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha256-x86_64.s
@@ -0,0 +1,5430 @@
+.text
+
+
+.globl sha256_block_data_order
+.type sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+.cfi_startproc
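+# Pick an implementation from the OPENSSL_ia32cap_P feature flags: SHA
+# extensions, AVX2 (with BMI1/BMI2), AVX, or SSSE3; otherwise fall through
+# to the portable scalar code below.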
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $536870912,%r11d
+ jnz _shaext_shortcut
+ andl $296,%r11d
+ cmpl $296,%r11d
+ je .Lavx2_shortcut
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je .Lavx_shortcut
+ testl $512,%r10d
+ jnz .Lssse3_shortcut
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $64+32,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
+ movl 0(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 4(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 8(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 12(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 16(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 20(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 24(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 28(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
+ movl 32(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 36(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 40(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 44(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 48(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 52(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 56(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 60(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ jmp .Lrounds_16_xx
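+# Rounds 16..63: each iteration recomputes sixteen message-schedule words
+# from the 16-word window kept on the stack and performs sixteen rounds.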
+.align 16
+.Lrounds_16_xx:
+ movl 4(%rsp),%r13d
+ movl 56(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
+
+ addl 0(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 8(%rsp),%r13d
+ movl 60(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
+
+ addl 4(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 12(%rsp),%r13d
+ movl 0(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
+
+ addl 8(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 16(%rsp),%r13d
+ movl 4(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
+
+ addl 12(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 20(%rsp),%r13d
+ movl 8(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
+
+ addl 16(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 24(%rsp),%r13d
+ movl 12(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
+
+ addl 20(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 28(%rsp),%r13d
+ movl 16(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
+
+ addl 24(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 32(%rsp),%r13d
+ movl 20(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
+
+ addl 28(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ movl 36(%rsp),%r13d
+ movl 24(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
+
+ addl 32(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 40(%rsp),%r13d
+ movl 28(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
+
+ addl 36(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 44(%rsp),%r13d
+ movl 32(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
+
+ addl 40(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 48(%rsp),%r13d
+ movl 36(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
+
+ addl 44(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 52(%rsp),%r13d
+ movl 40(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
+
+ addl 48(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 56(%rsp),%r13d
+ movl 44(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
+
+ addl 52(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 60(%rsp),%r13d
+ movl 48(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
+
+ addl 56(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 0(%rsp),%r13d
+ movl 52(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
+
+ addl 60(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz .Lrounds_16_xx
+
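+# One 64-byte block is done: reload the context pointer saved at 64+0(%rsp),
+# add the working variables a..h (eax..r11d) back into the hash state, and
+# loop while %rsi is still below the saved end-of-input pointer at 64+16(%rsp).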
+ movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
+ leaq 64(%rsi),%rsi
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order,.-sha256_block_data_order
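+# K256: the 64 SHA-256 round constants.  Each group of four constants is
+# stored twice (the AVX2 path loads a group into both 128-bit lanes of a %ymm
+# register); the scalar rounds above step over the duplicate copies, which is
+# why %rbp advances by 4,4,4,20 per four rounds.  The zero top byte of the
+# 0x00010203 shuffle-mask word that follows the constants is what terminates
+# the scalar .Lrounds_16_xx loop via cmpb $0,3(%rbp).  The masks below are
+# used by the SSSE3/AVX paths, and the .byte string is an ASCII credit line.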
+.align 64
+.type K256,@object
+K256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
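+# sha256_block_data_order_shaext: SHA-NI (SHA extensions) code path.  The
+# .byte 15,56,203 / 15,56,204 / 15,56,205 sequences below are sha256rnds2,
+# sha256msg1 and sha256msg2 encoded by hand for assemblers that predate the
+# SHA extensions; 102,15,56,0 is pshufb and 102,15,58,15 is palignr.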
+.type sha256_block_data_order_shaext,@function
+.align 64
+sha256_block_data_order_shaext:
+_shaext_shortcut:
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz .Loop_shaext
+
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ .byte 0xf3,0xc3
+.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
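+# sha256_block_data_order_ssse3: SSSE3 code path.  The sigma0/sigma1 message
+# schedule is computed with SSE integer ops (the .byte 102,15,58,15 sequences
+# are palignr) and interleaved with the scalar rounds.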
+.type sha256_block_data_order_ssse3,@function
+.align 64
+sha256_block_data_order_ssse3:
+.cfi_startproc
+.Lssse3_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+
+ jmp .Lloop_ssse3
+.align 16
+.Lloop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+ movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lssse3_00_47
+
+.align 16
+.Lssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_ssse3
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
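+# sha256_block_data_order_avx: AVX (xmm) code path; same round structure as
+# the SSSE3 version, but the message schedule uses non-destructive VEX-encoded
+# instructions (vpalignr/vpshufb/vpsrld/...).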
+.type sha256_block_data_order_avx,@function
+.align 64
+sha256_block_data_order_avx:
+.cfi_startproc
+.Lavx_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8
+ vmovdqa K256+512+64(%rip),%xmm9
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ subq $-128,%rbp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lavx_00_47
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_avx
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
+.type sha256_block_data_order_avx2,@function
+.align 64
+sha256_block_data_order_avx2:
+.cfi_startproc
+.Lavx2_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $544,%rsp
+ shlq $4,%rdx
+ andq $-1024,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ addq $448,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx2:
+
+ vzeroupper
+ subq $-64,%rsi
+ movl 0(%rdi),%eax
+ movq %rsi,%r12
+ movl 4(%rdi),%ebx
+ cmpq %rdx,%rsi
+ movl 8(%rdi),%ecx
+ cmoveq %rsp,%r12
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%ymm8
+ vmovdqa K256+512+64(%rip),%ymm9
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqa K256+512(%rip),%ymm7
+ vmovdqu -64+0(%rsi),%xmm0
+ vmovdqu -64+16(%rsi),%xmm1
+ vmovdqu -64+32(%rsi),%xmm2
+ vmovdqu -64+48(%rsi),%xmm3
+
+ vinserti128 $1,(%r12),%ymm0,%ymm0
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm7,%ymm0,%ymm0
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm7,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+
+ leaq K256(%rip),%rbp
+ vpshufb %ymm7,%ymm2,%ymm2
+ vpaddd 0(%rbp),%ymm0,%ymm4
+ vpshufb %ymm7,%ymm3,%ymm3
+ vpaddd 32(%rbp),%ymm1,%ymm5
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ vpaddd 96(%rbp),%ymm3,%ymm7
+ vmovdqa %ymm4,0(%rsp)
+ xorl %r14d,%r14d
+ vmovdqa %ymm5,32(%rsp)
+ leaq -64(%rsp),%rsp
+ movl %ebx,%edi
+ vmovdqa %ymm6,0(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %ymm7,32(%rsp)
+ movl %r9d,%r12d
+ subq $-32*4,%rbp
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+ leaq -64(%rsp),%rsp
+ vpalignr $4,%ymm0,%ymm1,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm2,%ymm3,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm0,%ymm0
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ vpshufd $250,%ymm3,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm0,%ymm0
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpshufd $80,%ymm0,%ymm7
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ vpaddd 0(%rbp),%ymm0,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm1,%ymm2,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm3,%ymm0,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm1,%ymm1
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ vpshufd $250,%ymm0,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm1,%ymm1
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpshufd $80,%ymm1,%ymm7
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ vpaddd 32(%rbp),%ymm1,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq -64(%rsp),%rsp
+ vpalignr $4,%ymm2,%ymm3,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm0,%ymm1,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm2,%ymm2
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ vpshufd $250,%ymm1,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm2,%ymm2
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpshufd $80,%ymm2,%ymm7
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm3,%ymm0,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm1,%ymm2,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm3,%ymm3
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ vpshufd $250,%ymm2,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm3,%ymm3
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpshufd $80,%ymm3,%ymm7
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ vpaddd 96(%rbp),%ymm3,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq 128(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jne .Lavx2_00_47
+ addl 0+64(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+64(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+64(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+64(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+64(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36+64(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+64(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+64(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ addl 0(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ movq 512(%rsp),%rdi
+ addl %r14d,%eax
+
+ leaq 448(%rsp),%rbp
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+
+ cmpq 80(%rbp),%rsi
+ je .Ldone_avx2
+
+ xorl %r14d,%r14d
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ movl %r9d,%r12d
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ leaq -64(%rbp),%rbp
+ cmpq %rsp,%rbp
+ jae .Lower_avx2
+
+ movq 512(%rsp),%rdi
+ addl %r14d,%eax
+
+ leaq 448(%rsp),%rsp
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ leaq 128(%rsi),%rsi
+ addl 24(%rdi),%r10d
+ movq %rsi,%r12
+ addl 28(%rdi),%r11d
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ cmoveq %rsp,%r12
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+
+ jbe .Loop_avx2
+ leaq (%rsp),%rbp
+
+.Ldone_avx2:
+ leaq (%rbp),%rsp
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2
diff --git a/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha512-x86_64.s b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha512-x86_64.s
new file mode 100644
index 0000000000..5931a2a932
--- /dev/null
+++ b/deps/openssl/config/archs/solaris64-x86_64-gcc/asm_avx2/crypto/sha/sha512-x86_64.s
@@ -0,0 +1,5437 @@
+.text
+
+
+.globl sha512_block_data_order
+.type sha512_block_data_order,@function
+.align 16
+sha512_block_data_order:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $2048,%r10d
+ jnz .Lxop_shortcut
+ andl $296,%r11d
+ cmpl $296,%r11d
+ je .Lavx2_shortcut
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je .Lavx_shortcut
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $128+32,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue:
+
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movq %rbx,%rdi
+ leaq K512(%rip),%rbp
+ xorq %rcx,%rdi
+ movq 0(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 8(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 16(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 24(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 32(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 40(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 48(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 56(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rax
+ movq 64(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 72(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 80(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 88(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 96(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 104(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 112(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 120(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movq 8(%rsp),%r13
+ movq 112(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 72(%rsp),%r12
+
+ addq 0(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 16(%rsp),%r13
+ movq 120(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 80(%rsp),%r12
+
+ addq 8(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 24(%rsp),%r13
+ movq 0(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 88(%rsp),%r12
+
+ addq 16(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 32(%rsp),%r13
+ movq 8(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 96(%rsp),%r12
+
+ addq 24(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 40(%rsp),%r13
+ movq 16(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 104(%rsp),%r12
+
+ addq 32(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 48(%rsp),%r13
+ movq 24(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 112(%rsp),%r12
+
+ addq 40(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 56(%rsp),%r13
+ movq 32(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 120(%rsp),%r12
+
+ addq 48(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 64(%rsp),%r13
+ movq 40(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 0(%rsp),%r12
+
+ addq 56(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ movq 72(%rsp),%r13
+ movq 48(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 8(%rsp),%r12
+
+ addq 64(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 80(%rsp),%r13
+ movq 56(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 16(%rsp),%r12
+
+ addq 72(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 88(%rsp),%r13
+ movq 64(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 24(%rsp),%r12
+
+ addq 80(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 96(%rsp),%r13
+ movq 72(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 32(%rsp),%r12
+
+ addq 88(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 104(%rsp),%r13
+ movq 80(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 40(%rsp),%r12
+
+ addq 96(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 112(%rsp),%r13
+ movq 88(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 48(%rsp),%r12
+
+ addq 104(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 120(%rsp),%r13
+ movq 96(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 56(%rsp),%r12
+
+ addq 112(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 0(%rsp),%r13
+ movq 104(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 64(%rsp),%r12
+
+ addq 120(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ cmpb $0,7(%rbp)
+ jnz .Lrounds_16_xx
+
+ movq 128+0(%rsp),%rdi
+ addq %r14,%rax
+ leaq 128(%rsi),%rsi
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha512_block_data_order,.-sha512_block_data_order
+.align 64
+.type K512,@object
+K512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type sha512_block_data_order_xop,@function
+.align 64
+sha512_block_data_order_xop:
+.cfi_startproc
+.Lxop_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_xop:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_xop
+.align 16
+.Lloop_xop:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ rorq $23,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rax,%r14
+ vpaddq %xmm11,%xmm0,%xmm0
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+.byte 143,72,120,195,209,7
+ xorq %r10,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,223,3
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm7,%xmm10
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %rdx,%r13
+ addq %r11,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r11
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpaddq %xmm11,%xmm0,%xmm0
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ rorq $23,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r10,%r14
+ vpaddq %xmm11,%xmm1,%xmm1
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+.byte 143,72,120,195,209,7
+ xorq %r8,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,216,3
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm0,%xmm10
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rbx,%r13
+ addq %r9,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r9
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpaddq %xmm11,%xmm1,%xmm1
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ rorq $23,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r8,%r14
+ vpaddq %xmm11,%xmm2,%xmm2
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+.byte 143,72,120,195,209,7
+ xorq %rcx,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,217,3
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm1,%xmm10
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %r11,%r13
+ addq %rdx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rdx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpaddq %xmm11,%xmm2,%xmm2
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ rorq $23,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rcx,%r14
+ vpaddq %xmm11,%xmm3,%xmm3
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+.byte 143,72,120,195,209,7
+ xorq %rax,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,218,3
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm2,%xmm10
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r9,%r13
+ addq %rbx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rbx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpaddq %xmm11,%xmm3,%xmm3
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ rorq $23,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rax,%r14
+ vpaddq %xmm11,%xmm4,%xmm4
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+.byte 143,72,120,195,209,7
+ xorq %r10,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,219,3
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm3,%xmm10
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %rdx,%r13
+ addq %r11,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r11
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpaddq %xmm11,%xmm4,%xmm4
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ rorq $23,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r10,%r14
+ vpaddq %xmm11,%xmm5,%xmm5
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+.byte 143,72,120,195,209,7
+ xorq %r8,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,220,3
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm4,%xmm10
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rbx,%r13
+ addq %r9,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r9
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpaddq %xmm11,%xmm5,%xmm5
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ rorq $23,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r8,%r14
+ vpaddq %xmm11,%xmm6,%xmm6
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+.byte 143,72,120,195,209,7
+ xorq %rcx,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,221,3
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm5,%xmm10
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %r11,%r13
+ addq %rdx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rdx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpaddq %xmm11,%xmm6,%xmm6
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ rorq $23,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rcx,%r14
+ vpaddq %xmm11,%xmm7,%xmm7
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+.byte 143,72,120,195,209,7
+ xorq %rax,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,222,3
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm6,%xmm10
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r9,%r13
+ addq %rbx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rbx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpaddq %xmm11,%xmm7,%xmm7
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne .Lxop_00_47
+ rorq $23,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ rorq $4,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ rorq $6,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ rorq $23,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ rorq $23,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ rorq $4,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ rorq $6,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ rorq $28,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ rorq $23,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ rorq $23,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ rorq $4,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ rorq $6,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ rorq $28,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ rorq $23,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ rorq $23,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ rorq $4,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ rorq $6,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ rorq $23,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ rorq $23,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ rorq $4,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ rorq $6,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ rorq $23,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ rorq $23,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ rorq $4,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ rorq $6,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ rorq $28,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ rorq $23,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ rorq $23,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ rorq $4,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ rorq $6,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ rorq $28,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ rorq $23,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ rorq $23,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ rorq $4,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ rorq $6,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ rorq $23,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_xop
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_xop:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha512_block_data_order_xop,.-sha512_block_data_order_xop
+.type sha512_block_data_order_avx,@function
+.align 64
+sha512_block_data_order_avx:
+.cfi_startproc
+.Lavx_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm0,%xmm0
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm7,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm7,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm7,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm0,%xmm0
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm1,%xmm1
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm0,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm0,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm0,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm1,%xmm1
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm2,%xmm2
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm1,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm1,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm1,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm2,%xmm2
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm3,%xmm3
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm2,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm2,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm2,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm3,%xmm3
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm4,%xmm4
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm3,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm3,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm3,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm4,%xmm4
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm5,%xmm5
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm4,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm4,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm4,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm5,%xmm5
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm6,%xmm6
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm5,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm5,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm5,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm6,%xmm6
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm7,%xmm7
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm6,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm6,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm6,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm7,%xmm7
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne .Lavx_00_47
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_avx
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
+.type sha512_block_data_order_avx2,@function
+.align 64
+sha512_block_data_order_avx2:
+.cfi_startproc
+.Lavx2_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $1312,%rsp
+ shlq $4,%rdx
+ andq $-2048,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ addq $1152,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx2:
+
+ vzeroupper
+ subq $-128,%rsi
+ movq 0(%rdi),%rax
+ movq %rsi,%r12
+ movq 8(%rdi),%rbx
+ cmpq %rdx,%rsi
+ movq 16(%rdi),%rcx
+ cmoveq %rsp,%r12
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqu -128(%rsi),%xmm0
+ vmovdqu -128+16(%rsi),%xmm1
+ vmovdqu -128+32(%rsi),%xmm2
+ leaq K512+128(%rip),%rbp
+ vmovdqu -128+48(%rsi),%xmm3
+ vmovdqu -128+64(%rsi),%xmm4
+ vmovdqu -128+80(%rsi),%xmm5
+ vmovdqu -128+96(%rsi),%xmm6
+ vmovdqu -128+112(%rsi),%xmm7
+
+ vmovdqa 1152(%rbp),%ymm10
+ vinserti128 $1,(%r12),%ymm0,%ymm0
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm10,%ymm0,%ymm0
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm10,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+ vpshufb %ymm10,%ymm2,%ymm2
+ vinserti128 $1,64(%r12),%ymm4,%ymm4
+ vpshufb %ymm10,%ymm3,%ymm3
+ vinserti128 $1,80(%r12),%ymm5,%ymm5
+ vpshufb %ymm10,%ymm4,%ymm4
+ vinserti128 $1,96(%r12),%ymm6,%ymm6
+ vpshufb %ymm10,%ymm5,%ymm5
+ vinserti128 $1,112(%r12),%ymm7,%ymm7
+
+ vpaddq -128(%rbp),%ymm0,%ymm8
+ vpshufb %ymm10,%ymm6,%ymm6
+ vpaddq -96(%rbp),%ymm1,%ymm9
+ vpshufb %ymm10,%ymm7,%ymm7
+ vpaddq -64(%rbp),%ymm2,%ymm10
+ vpaddq -32(%rbp),%ymm3,%ymm11
+ vmovdqa %ymm8,0(%rsp)
+ vpaddq 0(%rbp),%ymm4,%ymm8
+ vmovdqa %ymm9,32(%rsp)
+ vpaddq 32(%rbp),%ymm5,%ymm9
+ vmovdqa %ymm10,64(%rsp)
+ vpaddq 64(%rbp),%ymm6,%ymm10
+ vmovdqa %ymm11,96(%rsp)
+ leaq -128(%rsp),%rsp
+ vpaddq 96(%rbp),%ymm7,%ymm11
+ vmovdqa %ymm8,0(%rsp)
+ xorq %r14,%r14
+ vmovdqa %ymm9,32(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %ymm10,64(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %ymm11,96(%rsp)
+ movq %r9,%r12
+ addq $32*8,%rbp
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+ leaq -128(%rsp),%rsp
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ addq 0+256(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ vpalignr $8,%ymm4,%ymm5,%ymm11
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ vpaddq %ymm11,%ymm0,%ymm0
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ vpsrlq $6,%ymm7,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ vpsllq $3,%ymm7,%ymm10
+ vpaddq %ymm8,%ymm0,%ymm0
+ addq 8+256(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ vpsrlq $19,%ymm7,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ vpaddq %ymm11,%ymm0,%ymm0
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ vpaddq -128(%rbp),%ymm0,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ vmovdqa %ymm10,0(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ addq 32+256(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ vpalignr $8,%ymm5,%ymm6,%ymm11
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ vpaddq %ymm11,%ymm1,%ymm1
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ vpsrlq $6,%ymm0,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ vpsllq $3,%ymm0,%ymm10
+ vpaddq %ymm8,%ymm1,%ymm1
+ addq 40+256(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ vpsrlq $19,%ymm0,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ vpaddq %ymm11,%ymm1,%ymm1
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ vpaddq -96(%rbp),%ymm1,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ vmovdqa %ymm10,32(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ addq 64+256(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ vpalignr $8,%ymm6,%ymm7,%ymm11
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ vpsrlq $6,%ymm1,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ vpsllq $3,%ymm1,%ymm10
+ vpaddq %ymm8,%ymm2,%ymm2
+ addq 72+256(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ vpsrlq $19,%ymm1,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ vpaddq %ymm11,%ymm2,%ymm2
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ vpaddq -64(%rbp),%ymm2,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ vmovdqa %ymm10,64(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ addq 96+256(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ vpalignr $8,%ymm7,%ymm0,%ymm11
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ vpaddq %ymm11,%ymm3,%ymm3
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ vpsrlq $6,%ymm2,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ vpsllq $3,%ymm2,%ymm10
+ vpaddq %ymm8,%ymm3,%ymm3
+ addq 104+256(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ vpsrlq $19,%ymm2,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ vpaddq %ymm11,%ymm3,%ymm3
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ vpaddq -32(%rbp),%ymm3,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ vmovdqa %ymm10,96(%rsp)
+ leaq -128(%rsp),%rsp
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ addq 0+256(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ vpalignr $8,%ymm0,%ymm1,%ymm11
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ vpaddq %ymm11,%ymm4,%ymm4
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ vpsrlq $6,%ymm3,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ vpsllq $3,%ymm3,%ymm10
+ vpaddq %ymm8,%ymm4,%ymm4
+ addq 8+256(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ vpsrlq $19,%ymm3,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ vpaddq %ymm11,%ymm4,%ymm4
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ vpaddq 0(%rbp),%ymm4,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ vmovdqa %ymm10,0(%rsp)
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ addq 32+256(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ vpalignr $8,%ymm1,%ymm2,%ymm11
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ vpaddq %ymm11,%ymm5,%ymm5
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ vpsrlq $6,%ymm4,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ vpsllq $3,%ymm4,%ymm10
+ vpaddq %ymm8,%ymm5,%ymm5
+ addq 40+256(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ vpsrlq $19,%ymm4,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ vpaddq %ymm11,%ymm5,%ymm5
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ vpaddq 32(%rbp),%ymm5,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ vmovdqa %ymm10,32(%rsp)
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ addq 64+256(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ vpalignr $8,%ymm2,%ymm3,%ymm11
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ vpsrlq $6,%ymm5,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ vpsllq $3,%ymm5,%ymm10
+ vpaddq %ymm8,%ymm6,%ymm6
+ addq 72+256(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ vpsrlq $19,%ymm5,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ vpaddq %ymm11,%ymm6,%ymm6
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ vpaddq 64(%rbp),%ymm6,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ vmovdqa %ymm10,64(%rsp)
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ addq 96+256(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ vpalignr $8,%ymm3,%ymm4,%ymm11
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ vpaddq %ymm11,%ymm7,%ymm7
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ vpsrlq $6,%ymm6,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ vpsllq $3,%ymm6,%ymm10
+ vpaddq %ymm8,%ymm7,%ymm7
+ addq 104+256(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ vpsrlq $19,%ymm6,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ vpaddq %ymm11,%ymm7,%ymm7
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ vpaddq 96(%rbp),%ymm7,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ vmovdqa %ymm10,96(%rsp)
+ leaq 256(%rbp),%rbp
+ cmpb $0,-121(%rbp)
+ jne .Lavx2_00_47
+ addq 0+128(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8+128(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32+128(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40+128(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64+128(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72+128(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96+128(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104+128(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ addq 0(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ movq 1280(%rsp),%rdi
+ addq %r14,%rax
+
+ leaq 1152(%rsp),%rbp
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+
+ cmpq 144(%rbp),%rsi
+ je .Ldone_avx2
+
+ xorq %r14,%r14
+ movq %rbx,%rdi
+ xorq %rcx,%rdi
+ movq %r9,%r12
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ addq 0+16(%rbp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8+16(%rbp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32+16(%rbp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40+16(%rbp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64+16(%rbp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72+16(%rbp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96+16(%rbp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104+16(%rbp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ leaq -128(%rbp),%rbp
+ cmpq %rsp,%rbp
+ jae .Lower_avx2
+
+ movq 1280(%rsp),%rdi
+ addq %r14,%rax
+
+ leaq 1152(%rsp),%rsp
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ leaq 256(%rsi),%rsi
+ addq 48(%rdi),%r10
+ movq %rsi,%r12
+ addq 56(%rdi),%r11
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ cmoveq %rsp,%r12
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+
+ jbe .Loop_avx2
+ leaq (%rsp),%rbp
+
+.Ldone_avx2:
+ leaq (%rbp),%rsp
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2