Diffstat (limited to 'deps/openssl/config/archs/linux-x32/asm_avx2/crypto/aes/aesni-mb-x86_64.s')
-rw-r--r--  deps/openssl/config/archs/linux-x32/asm_avx2/crypto/aes/aesni-mb-x86_64.s | 1507
1 file changed, 1507 insertions(+), 0 deletions(-)
diff --git a/deps/openssl/config/archs/linux-x32/asm_avx2/crypto/aes/aesni-mb-x86_64.s b/deps/openssl/config/archs/linux-x32/asm_avx2/crypto/aes/aesni-mb-x86_64.s
new file mode 100644
index 0000000000..f2b5662b9c
--- /dev/null
+++ b/deps/openssl/config/archs/linux-x32/asm_avx2/crypto/aes/aesni-mb-x86_64.s
@@ -0,0 +1,1507 @@
+.text
+
+
+
+.globl aesni_multi_cbc_encrypt
+.type aesni_multi_cbc_encrypt,@function
+.align 32
+aesni_multi_cbc_encrypt:
+.cfi_startproc
+ cmpl $2,%edx
+ jb .Lenc_non_avx
+ movl OPENSSL_ia32cap_P+4(%rip),%ecx
+ testl $268435456,%ecx
+ jnz _avx_cbc_enc_shortcut
+ jmp .Lenc_non_avx
+.align 16
+.Lenc_non_avx:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+
+
+
+
+
+ subq $48,%rsp
+ andq $-64,%rsp
+ movq %rax,16(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
+
+.Lenc4x_body:
+ movdqu (%rsi),%xmm12
+ leaq 120(%rsi),%rsi
+ leaq 80(%rdi),%rdi
+
+.Lenc4x_loop_grande:
+ movl %edx,24(%rsp)
+ xorl %edx,%edx
+ movl -64(%rdi),%ecx
+ movq -80(%rdi),%r8
+ cmpl %edx,%ecx
+ movq -72(%rdi),%r12
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movdqu -56(%rdi),%xmm2
+ movl %ecx,32(%rsp)
+ cmovleq %rsp,%r8
+ movl -24(%rdi),%ecx
+ movq -40(%rdi),%r9
+ cmpl %edx,%ecx
+ movq -32(%rdi),%r13
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movdqu -16(%rdi),%xmm3
+ movl %ecx,36(%rsp)
+ cmovleq %rsp,%r9
+ movl 16(%rdi),%ecx
+ movq 0(%rdi),%r10
+ cmpl %edx,%ecx
+ movq 8(%rdi),%r14
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movdqu 24(%rdi),%xmm4
+ movl %ecx,40(%rsp)
+ cmovleq %rsp,%r10
+ movl 56(%rdi),%ecx
+ movq 40(%rdi),%r11
+ cmpl %edx,%ecx
+ movq 48(%rdi),%r15
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movdqu 64(%rdi),%xmm5
+ movl %ecx,44(%rsp)
+ cmovleq %rsp,%r11
+ testl %edx,%edx
+ jz .Lenc4x_done
+
+ movups 16-120(%rsi),%xmm1
+ pxor %xmm12,%xmm2
+ movups 32-120(%rsi),%xmm0
+ pxor %xmm12,%xmm3
+ movl 240-120(%rsi),%eax
+ pxor %xmm12,%xmm4
+ movdqu (%r8),%xmm6
+ pxor %xmm12,%xmm5
+ movdqu (%r9),%xmm7
+ pxor %xmm6,%xmm2
+ movdqu (%r10),%xmm8
+ pxor %xmm7,%xmm3
+ movdqu (%r11),%xmm9
+ pxor %xmm8,%xmm4
+ pxor %xmm9,%xmm5
+ movdqa 32(%rsp),%xmm10
+ xorq %rbx,%rbx
+ jmp .Loop_enc4x
+
+.align 32
+.Loop_enc4x:
+ addq $16,%rbx
+ leaq 16(%rsp),%rbp
+ movl $1,%ecx
+ subq %rbx,%rbp
+
+.byte 102,15,56,220,209
+ prefetcht0 31(%r8,%rbx,1)
+ prefetcht0 31(%r9,%rbx,1)
+.byte 102,15,56,220,217
+ prefetcht0 31(%r10,%rbx,1)
+ prefetcht0 31(%r10,%rbx,1)
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 48-120(%rsi),%xmm1
+ cmpl 32(%rsp),%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ cmovgeq %rbp,%r8
+ cmovgq %rbp,%r12
+.byte 102,15,56,220,232
+ movups -56(%rsi),%xmm0
+ cmpl 36(%rsp),%ecx
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ cmovgeq %rbp,%r9
+ cmovgq %rbp,%r13
+.byte 102,15,56,220,233
+ movups -40(%rsi),%xmm1
+ cmpl 40(%rsp),%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ cmovgeq %rbp,%r10
+ cmovgq %rbp,%r14
+.byte 102,15,56,220,232
+ movups -24(%rsi),%xmm0
+ cmpl 44(%rsp),%ecx
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ cmovgeq %rbp,%r11
+ cmovgq %rbp,%r15
+.byte 102,15,56,220,233
+ movups -8(%rsi),%xmm1
+ movdqa %xmm10,%xmm11
+.byte 102,15,56,220,208
+ prefetcht0 15(%r12,%rbx,1)
+ prefetcht0 15(%r13,%rbx,1)
+.byte 102,15,56,220,216
+ prefetcht0 15(%r14,%rbx,1)
+ prefetcht0 15(%r15,%rbx,1)
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups 128-120(%rsi),%xmm0
+ pxor %xmm12,%xmm12
+
+.byte 102,15,56,220,209
+ pcmpgtd %xmm12,%xmm11
+ movdqu -120(%rsi),%xmm12
+.byte 102,15,56,220,217
+ paddd %xmm11,%xmm10
+ movdqa %xmm10,32(%rsp)
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 144-120(%rsi),%xmm1
+
+ cmpl $11,%eax
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups 160-120(%rsi),%xmm0
+
+ jb .Lenc4x_tail
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 176-120(%rsi),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups 192-120(%rsi),%xmm0
+
+ je .Lenc4x_tail
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 208-120(%rsi),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups 224-120(%rsi),%xmm0
+ jmp .Lenc4x_tail
+
+.align 32
+.Lenc4x_tail:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movdqu (%r8,%rbx,1),%xmm6
+ movdqu 16-120(%rsi),%xmm1
+
+.byte 102,15,56,221,208
+ movdqu (%r9,%rbx,1),%xmm7
+ pxor %xmm12,%xmm6
+.byte 102,15,56,221,216
+ movdqu (%r10,%rbx,1),%xmm8
+ pxor %xmm12,%xmm7
+.byte 102,15,56,221,224
+ movdqu (%r11,%rbx,1),%xmm9
+ pxor %xmm12,%xmm8
+.byte 102,15,56,221,232
+ movdqu 32-120(%rsi),%xmm0
+ pxor %xmm12,%xmm9
+
+ movups %xmm2,-16(%r12,%rbx,1)
+ pxor %xmm6,%xmm2
+ movups %xmm3,-16(%r13,%rbx,1)
+ pxor %xmm7,%xmm3
+ movups %xmm4,-16(%r14,%rbx,1)
+ pxor %xmm8,%xmm4
+ movups %xmm5,-16(%r15,%rbx,1)
+ pxor %xmm9,%xmm5
+
+ decl %edx
+ jnz .Loop_enc4x
+
+ movq 16(%rsp),%rax
+.cfi_def_cfa %rax,8
+ movl 24(%rsp),%edx
+
+
+
+
+
+
+
+
+
+
+ leaq 160(%rdi),%rdi
+ decl %edx
+ jnz .Lenc4x_loop_grande
+
+.Lenc4x_done:
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lenc4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
+
+.globl aesni_multi_cbc_decrypt
+.type aesni_multi_cbc_decrypt,@function
+.align 32
+aesni_multi_cbc_decrypt:
+.cfi_startproc
+ cmpl $2,%edx
+ jb .Ldec_non_avx
+ movl OPENSSL_ia32cap_P+4(%rip),%ecx
+ testl $268435456,%ecx
+ jnz _avx_cbc_dec_shortcut
+ jmp .Ldec_non_avx
+.align 16
+.Ldec_non_avx:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+
+
+
+
+
+ subq $48,%rsp
+ andq $-64,%rsp
+ movq %rax,16(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
+
+.Ldec4x_body:
+ movdqu (%rsi),%xmm12
+ leaq 120(%rsi),%rsi
+ leaq 80(%rdi),%rdi
+
+.Ldec4x_loop_grande:
+ movl %edx,24(%rsp)
+ xorl %edx,%edx
+ movl -64(%rdi),%ecx
+ movq -80(%rdi),%r8
+ cmpl %edx,%ecx
+ movq -72(%rdi),%r12
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movdqu -56(%rdi),%xmm6
+ movl %ecx,32(%rsp)
+ cmovleq %rsp,%r8
+ movl -24(%rdi),%ecx
+ movq -40(%rdi),%r9
+ cmpl %edx,%ecx
+ movq -32(%rdi),%r13
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movdqu -16(%rdi),%xmm7
+ movl %ecx,36(%rsp)
+ cmovleq %rsp,%r9
+ movl 16(%rdi),%ecx
+ movq 0(%rdi),%r10
+ cmpl %edx,%ecx
+ movq 8(%rdi),%r14
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movdqu 24(%rdi),%xmm8
+ movl %ecx,40(%rsp)
+ cmovleq %rsp,%r10
+ movl 56(%rdi),%ecx
+ movq 40(%rdi),%r11
+ cmpl %edx,%ecx
+ movq 48(%rdi),%r15
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movdqu 64(%rdi),%xmm9
+ movl %ecx,44(%rsp)
+ cmovleq %rsp,%r11
+ testl %edx,%edx
+ jz .Ldec4x_done
+
+ movups 16-120(%rsi),%xmm1
+ movups 32-120(%rsi),%xmm0
+ movl 240-120(%rsi),%eax
+ movdqu (%r8),%xmm2
+ movdqu (%r9),%xmm3
+ pxor %xmm12,%xmm2
+ movdqu (%r10),%xmm4
+ pxor %xmm12,%xmm3
+ movdqu (%r11),%xmm5
+ pxor %xmm12,%xmm4
+ pxor %xmm12,%xmm5
+ movdqa 32(%rsp),%xmm10
+ xorq %rbx,%rbx
+ jmp .Loop_dec4x
+
+.align 32
+.Loop_dec4x:
+ addq $16,%rbx
+ leaq 16(%rsp),%rbp
+ movl $1,%ecx
+ subq %rbx,%rbp
+
+.byte 102,15,56,222,209
+ prefetcht0 31(%r8,%rbx,1)
+ prefetcht0 31(%r9,%rbx,1)
+.byte 102,15,56,222,217
+ prefetcht0 31(%r10,%rbx,1)
+ prefetcht0 31(%r11,%rbx,1)
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 48-120(%rsi),%xmm1
+ cmpl 32(%rsp),%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ cmovgeq %rbp,%r8
+ cmovgq %rbp,%r12
+.byte 102,15,56,222,232
+ movups -56(%rsi),%xmm0
+ cmpl 36(%rsp),%ecx
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ cmovgeq %rbp,%r9
+ cmovgq %rbp,%r13
+.byte 102,15,56,222,233
+ movups -40(%rsi),%xmm1
+ cmpl 40(%rsp),%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ cmovgeq %rbp,%r10
+ cmovgq %rbp,%r14
+.byte 102,15,56,222,232
+ movups -24(%rsi),%xmm0
+ cmpl 44(%rsp),%ecx
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ cmovgeq %rbp,%r11
+ cmovgq %rbp,%r15
+.byte 102,15,56,222,233
+ movups -8(%rsi),%xmm1
+ movdqa %xmm10,%xmm11
+.byte 102,15,56,222,208
+ prefetcht0 15(%r12,%rbx,1)
+ prefetcht0 15(%r13,%rbx,1)
+.byte 102,15,56,222,216
+ prefetcht0 15(%r14,%rbx,1)
+ prefetcht0 15(%r15,%rbx,1)
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups 128-120(%rsi),%xmm0
+ pxor %xmm12,%xmm12
+
+.byte 102,15,56,222,209
+ pcmpgtd %xmm12,%xmm11
+ movdqu -120(%rsi),%xmm12
+.byte 102,15,56,222,217
+ paddd %xmm11,%xmm10
+ movdqa %xmm10,32(%rsp)
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 144-120(%rsi),%xmm1
+
+ cmpl $11,%eax
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups 160-120(%rsi),%xmm0
+
+ jb .Ldec4x_tail
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 176-120(%rsi),%xmm1
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups 192-120(%rsi),%xmm0
+
+ je .Ldec4x_tail
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 208-120(%rsi),%xmm1
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups 224-120(%rsi),%xmm0
+ jmp .Ldec4x_tail
+
+.align 32
+.Ldec4x_tail:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,233
+ movdqu 16-120(%rsi),%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm0,%xmm9
+ movdqu 32-120(%rsi),%xmm0
+
+.byte 102,15,56,223,214
+.byte 102,15,56,223,223
+ movdqu -16(%r8,%rbx,1),%xmm6
+ movdqu -16(%r9,%rbx,1),%xmm7
+.byte 102,65,15,56,223,224
+.byte 102,65,15,56,223,233
+ movdqu -16(%r10,%rbx,1),%xmm8
+ movdqu -16(%r11,%rbx,1),%xmm9
+
+ movups %xmm2,-16(%r12,%rbx,1)
+ movdqu (%r8,%rbx,1),%xmm2
+ movups %xmm3,-16(%r13,%rbx,1)
+ movdqu (%r9,%rbx,1),%xmm3
+ pxor %xmm12,%xmm2
+ movups %xmm4,-16(%r14,%rbx,1)
+ movdqu (%r10,%rbx,1),%xmm4
+ pxor %xmm12,%xmm3
+ movups %xmm5,-16(%r15,%rbx,1)
+ movdqu (%r11,%rbx,1),%xmm5
+ pxor %xmm12,%xmm4
+ pxor %xmm12,%xmm5
+
+ decl %edx
+ jnz .Loop_dec4x
+
+ movq 16(%rsp),%rax
+.cfi_def_cfa %rax,8
+ movl 24(%rsp),%edx
+
+ leaq 160(%rdi),%rdi
+ decl %edx
+ jnz .Ldec4x_loop_grande
+
+.Ldec4x_done:
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Ldec4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+.type aesni_multi_cbc_encrypt_avx,@function
+.align 32
+aesni_multi_cbc_encrypt_avx:
+.cfi_startproc
+_avx_cbc_enc_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+
+
+
+
+
+
+
+ subq $192,%rsp
+ andq $-128,%rsp
+ movq %rax,16(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
+
+.Lenc8x_body:
+ vzeroupper
+ vmovdqu (%rsi),%xmm15
+ leaq 120(%rsi),%rsi
+ leaq 160(%rdi),%rdi
+ shrl $1,%edx
+
+.Lenc8x_loop_grande:
+
+ xorl %edx,%edx
+ movl -144(%rdi),%ecx
+ movq -160(%rdi),%r8
+ cmpl %edx,%ecx
+ movq -152(%rdi),%rbx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -136(%rdi),%xmm2
+ movl %ecx,32(%rsp)
+ cmovleq %rsp,%r8
+ subq %r8,%rbx
+ movq %rbx,64(%rsp)
+ movl -104(%rdi),%ecx
+ movq -120(%rdi),%r9
+ cmpl %edx,%ecx
+ movq -112(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -96(%rdi),%xmm3
+ movl %ecx,36(%rsp)
+ cmovleq %rsp,%r9
+ subq %r9,%rbp
+ movq %rbp,72(%rsp)
+ movl -64(%rdi),%ecx
+ movq -80(%rdi),%r10
+ cmpl %edx,%ecx
+ movq -72(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -56(%rdi),%xmm4
+ movl %ecx,40(%rsp)
+ cmovleq %rsp,%r10
+ subq %r10,%rbp
+ movq %rbp,80(%rsp)
+ movl -24(%rdi),%ecx
+ movq -40(%rdi),%r11
+ cmpl %edx,%ecx
+ movq -32(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -16(%rdi),%xmm5
+ movl %ecx,44(%rsp)
+ cmovleq %rsp,%r11
+ subq %r11,%rbp
+ movq %rbp,88(%rsp)
+ movl 16(%rdi),%ecx
+ movq 0(%rdi),%r12
+ cmpl %edx,%ecx
+ movq 8(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 24(%rdi),%xmm6
+ movl %ecx,48(%rsp)
+ cmovleq %rsp,%r12
+ subq %r12,%rbp
+ movq %rbp,96(%rsp)
+ movl 56(%rdi),%ecx
+ movq 40(%rdi),%r13
+ cmpl %edx,%ecx
+ movq 48(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 64(%rdi),%xmm7
+ movl %ecx,52(%rsp)
+ cmovleq %rsp,%r13
+ subq %r13,%rbp
+ movq %rbp,104(%rsp)
+ movl 96(%rdi),%ecx
+ movq 80(%rdi),%r14
+ cmpl %edx,%ecx
+ movq 88(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 104(%rdi),%xmm8
+ movl %ecx,56(%rsp)
+ cmovleq %rsp,%r14
+ subq %r14,%rbp
+ movq %rbp,112(%rsp)
+ movl 136(%rdi),%ecx
+ movq 120(%rdi),%r15
+ cmpl %edx,%ecx
+ movq 128(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 144(%rdi),%xmm9
+ movl %ecx,60(%rsp)
+ cmovleq %rsp,%r15
+ subq %r15,%rbp
+ movq %rbp,120(%rsp)
+ testl %edx,%edx
+ jz .Lenc8x_done
+
+ vmovups 16-120(%rsi),%xmm1
+ vmovups 32-120(%rsi),%xmm0
+ movl 240-120(%rsi),%eax
+
+ vpxor (%r8),%xmm15,%xmm10
+ leaq 128(%rsp),%rbp
+ vpxor (%r9),%xmm15,%xmm11
+ vpxor (%r10),%xmm15,%xmm12
+ vpxor (%r11),%xmm15,%xmm13
+ vpxor %xmm10,%xmm2,%xmm2
+ vpxor (%r12),%xmm15,%xmm10
+ vpxor %xmm11,%xmm3,%xmm3
+ vpxor (%r13),%xmm15,%xmm11
+ vpxor %xmm12,%xmm4,%xmm4
+ vpxor (%r14),%xmm15,%xmm12
+ vpxor %xmm13,%xmm5,%xmm5
+ vpxor (%r15),%xmm15,%xmm13
+ vpxor %xmm10,%xmm6,%xmm6
+ movl $1,%ecx
+ vpxor %xmm11,%xmm7,%xmm7
+ vpxor %xmm12,%xmm8,%xmm8
+ vpxor %xmm13,%xmm9,%xmm9
+ jmp .Loop_enc8x
+
+.align 32
+.Loop_enc8x:
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+0(%rsp),%ecx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r8)
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r8,%rbx,1),%rbx
+ cmovgeq %rsp,%r8
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r8,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r8),%xmm15,%xmm10
+ movq %rbx,64+0(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -72(%rsi),%xmm1
+ leaq 16(%r8,%rbx,1),%r8
+ vmovdqu %xmm10,0(%rbp)
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+4(%rsp),%ecx
+ movq 64+8(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r9)
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r9,%rbx,1),%rbx
+ cmovgeq %rsp,%r9
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r9,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r9),%xmm15,%xmm11
+ movq %rbx,64+8(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups -56(%rsi),%xmm0
+ leaq 16(%r9,%rbx,1),%r9
+ vmovdqu %xmm11,16(%rbp)
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+8(%rsp),%ecx
+ movq 64+16(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r10)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r8)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r10,%rbx,1),%rbx
+ cmovgeq %rsp,%r10
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r10,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r10),%xmm15,%xmm12
+ movq %rbx,64+16(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -40(%rsi),%xmm1
+ leaq 16(%r10,%rbx,1),%r10
+ vmovdqu %xmm12,32(%rbp)
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+12(%rsp),%ecx
+ movq 64+24(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r11)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r9)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r11,%rbx,1),%rbx
+ cmovgeq %rsp,%r11
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r11,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r11),%xmm15,%xmm13
+ movq %rbx,64+24(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups -24(%rsi),%xmm0
+ leaq 16(%r11,%rbx,1),%r11
+ vmovdqu %xmm13,48(%rbp)
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+16(%rsp),%ecx
+ movq 64+32(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r12)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r10)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r12,%rbx,1),%rbx
+ cmovgeq %rsp,%r12
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r12,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r12),%xmm15,%xmm10
+ movq %rbx,64+32(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -8(%rsi),%xmm1
+ leaq 16(%r12,%rbx,1),%r12
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+20(%rsp),%ecx
+ movq 64+40(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r13)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r11)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%rbx,%r13,1),%rbx
+ cmovgeq %rsp,%r13
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r13,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r13),%xmm15,%xmm11
+ movq %rbx,64+40(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 8(%rsi),%xmm0
+ leaq 16(%r13,%rbx,1),%r13
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+24(%rsp),%ecx
+ movq 64+48(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r14)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r12)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r14,%rbx,1),%rbx
+ cmovgeq %rsp,%r14
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r14,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r14),%xmm15,%xmm12
+ movq %rbx,64+48(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 24(%rsi),%xmm1
+ leaq 16(%r14,%rbx,1),%r14
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+28(%rsp),%ecx
+ movq 64+56(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r15)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r13)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r15,%rbx,1),%rbx
+ cmovgeq %rsp,%r15
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r15,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r15),%xmm15,%xmm13
+ movq %rbx,64+56(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 40(%rsi),%xmm0
+ leaq 16(%r15,%rbx,1),%r15
+ vmovdqu 32(%rsp),%xmm14
+ prefetcht0 15(%r14)
+ prefetcht0 15(%r15)
+ cmpl $11,%eax
+ jb .Lenc8x_tail
+
+ vaesenc %xmm1,%xmm2,%xmm2
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 176-120(%rsi),%xmm1
+
+ vaesenc %xmm0,%xmm2,%xmm2
+ vaesenc %xmm0,%xmm3,%xmm3
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ vaesenc %xmm0,%xmm6,%xmm6
+ vaesenc %xmm0,%xmm7,%xmm7
+ vaesenc %xmm0,%xmm8,%xmm8
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 192-120(%rsi),%xmm0
+ je .Lenc8x_tail
+
+ vaesenc %xmm1,%xmm2,%xmm2
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 208-120(%rsi),%xmm1
+
+ vaesenc %xmm0,%xmm2,%xmm2
+ vaesenc %xmm0,%xmm3,%xmm3
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ vaesenc %xmm0,%xmm6,%xmm6
+ vaesenc %xmm0,%xmm7,%xmm7
+ vaesenc %xmm0,%xmm8,%xmm8
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 224-120(%rsi),%xmm0
+
+.Lenc8x_tail:
+ vaesenc %xmm1,%xmm2,%xmm2
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vpaddd %xmm14,%xmm15,%xmm15
+ vmovdqu 48(%rsp),%xmm14
+ vaesenc %xmm1,%xmm7,%xmm7
+ movq 64(%rsp),%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 16-120(%rsi),%xmm1
+
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vmovdqa %xmm15,32(%rsp)
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesenclast %xmm0,%xmm3,%xmm3
+ vaesenclast %xmm0,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesenclast %xmm0,%xmm5,%xmm5
+ vaesenclast %xmm0,%xmm6,%xmm6
+ vpaddd %xmm15,%xmm14,%xmm14
+ vmovdqu -120(%rsi),%xmm15
+ vaesenclast %xmm0,%xmm7,%xmm7
+ vaesenclast %xmm0,%xmm8,%xmm8
+ vmovdqa %xmm14,48(%rsp)
+ vaesenclast %xmm0,%xmm9,%xmm9
+ vmovups 32-120(%rsi),%xmm0
+
+ vmovups %xmm2,-16(%r8)
+ subq %rbx,%r8
+ vpxor 0(%rbp),%xmm2,%xmm2
+ vmovups %xmm3,-16(%r9)
+ subq 72(%rsp),%r9
+ vpxor 16(%rbp),%xmm3,%xmm3
+ vmovups %xmm4,-16(%r10)
+ subq 80(%rsp),%r10
+ vpxor 32(%rbp),%xmm4,%xmm4
+ vmovups %xmm5,-16(%r11)
+ subq 88(%rsp),%r11
+ vpxor 48(%rbp),%xmm5,%xmm5
+ vmovups %xmm6,-16(%r12)
+ subq 96(%rsp),%r12
+ vpxor %xmm10,%xmm6,%xmm6
+ vmovups %xmm7,-16(%r13)
+ subq 104(%rsp),%r13
+ vpxor %xmm11,%xmm7,%xmm7
+ vmovups %xmm8,-16(%r14)
+ subq 112(%rsp),%r14
+ vpxor %xmm12,%xmm8,%xmm8
+ vmovups %xmm9,-16(%r15)
+ subq 120(%rsp),%r15
+ vpxor %xmm13,%xmm9,%xmm9
+
+ decl %edx
+ jnz .Loop_enc8x
+
+ movq 16(%rsp),%rax
+.cfi_def_cfa %rax,8
+
+
+
+
+
+.Lenc8x_done:
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lenc8x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
+
+.type aesni_multi_cbc_decrypt_avx,@function
+.align 32
+aesni_multi_cbc_decrypt_avx:
+.cfi_startproc
+_avx_cbc_dec_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+
+
+
+
+
+
+
+
+ subq $256,%rsp
+ andq $-256,%rsp
+ subq $192,%rsp
+ movq %rax,16(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
+
+.Ldec8x_body:
+ vzeroupper
+ vmovdqu (%rsi),%xmm15
+ leaq 120(%rsi),%rsi
+ leaq 160(%rdi),%rdi
+ shrl $1,%edx
+
+.Ldec8x_loop_grande:
+
+ xorl %edx,%edx
+ movl -144(%rdi),%ecx
+ movq -160(%rdi),%r8
+ cmpl %edx,%ecx
+ movq -152(%rdi),%rbx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -136(%rdi),%xmm2
+ movl %ecx,32(%rsp)
+ cmovleq %rsp,%r8
+ subq %r8,%rbx
+ movq %rbx,64(%rsp)
+ vmovdqu %xmm2,192(%rsp)
+ movl -104(%rdi),%ecx
+ movq -120(%rdi),%r9
+ cmpl %edx,%ecx
+ movq -112(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -96(%rdi),%xmm3
+ movl %ecx,36(%rsp)
+ cmovleq %rsp,%r9
+ subq %r9,%rbp
+ movq %rbp,72(%rsp)
+ vmovdqu %xmm3,208(%rsp)
+ movl -64(%rdi),%ecx
+ movq -80(%rdi),%r10
+ cmpl %edx,%ecx
+ movq -72(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -56(%rdi),%xmm4
+ movl %ecx,40(%rsp)
+ cmovleq %rsp,%r10
+ subq %r10,%rbp
+ movq %rbp,80(%rsp)
+ vmovdqu %xmm4,224(%rsp)
+ movl -24(%rdi),%ecx
+ movq -40(%rdi),%r11
+ cmpl %edx,%ecx
+ movq -32(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -16(%rdi),%xmm5
+ movl %ecx,44(%rsp)
+ cmovleq %rsp,%r11
+ subq %r11,%rbp
+ movq %rbp,88(%rsp)
+ vmovdqu %xmm5,240(%rsp)
+ movl 16(%rdi),%ecx
+ movq 0(%rdi),%r12
+ cmpl %edx,%ecx
+ movq 8(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 24(%rdi),%xmm6
+ movl %ecx,48(%rsp)
+ cmovleq %rsp,%r12
+ subq %r12,%rbp
+ movq %rbp,96(%rsp)
+ vmovdqu %xmm6,256(%rsp)
+ movl 56(%rdi),%ecx
+ movq 40(%rdi),%r13
+ cmpl %edx,%ecx
+ movq 48(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 64(%rdi),%xmm7
+ movl %ecx,52(%rsp)
+ cmovleq %rsp,%r13
+ subq %r13,%rbp
+ movq %rbp,104(%rsp)
+ vmovdqu %xmm7,272(%rsp)
+ movl 96(%rdi),%ecx
+ movq 80(%rdi),%r14
+ cmpl %edx,%ecx
+ movq 88(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 104(%rdi),%xmm8
+ movl %ecx,56(%rsp)
+ cmovleq %rsp,%r14
+ subq %r14,%rbp
+ movq %rbp,112(%rsp)
+ vmovdqu %xmm8,288(%rsp)
+ movl 136(%rdi),%ecx
+ movq 120(%rdi),%r15
+ cmpl %edx,%ecx
+ movq 128(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 144(%rdi),%xmm9
+ movl %ecx,60(%rsp)
+ cmovleq %rsp,%r15
+ subq %r15,%rbp
+ movq %rbp,120(%rsp)
+ vmovdqu %xmm9,304(%rsp)
+ testl %edx,%edx
+ jz .Ldec8x_done
+
+ vmovups 16-120(%rsi),%xmm1
+ vmovups 32-120(%rsi),%xmm0
+ movl 240-120(%rsi),%eax
+ leaq 192+128(%rsp),%rbp
+
+ vmovdqu (%r8),%xmm2
+ vmovdqu (%r9),%xmm3
+ vmovdqu (%r10),%xmm4
+ vmovdqu (%r11),%xmm5
+ vmovdqu (%r12),%xmm6
+ vmovdqu (%r13),%xmm7
+ vmovdqu (%r14),%xmm8
+ vmovdqu (%r15),%xmm9
+ vmovdqu %xmm2,0(%rbp)
+ vpxor %xmm15,%xmm2,%xmm2
+ vmovdqu %xmm3,16(%rbp)
+ vpxor %xmm15,%xmm3,%xmm3
+ vmovdqu %xmm4,32(%rbp)
+ vpxor %xmm15,%xmm4,%xmm4
+ vmovdqu %xmm5,48(%rbp)
+ vpxor %xmm15,%xmm5,%xmm5
+ vmovdqu %xmm6,64(%rbp)
+ vpxor %xmm15,%xmm6,%xmm6
+ vmovdqu %xmm7,80(%rbp)
+ vpxor %xmm15,%xmm7,%xmm7
+ vmovdqu %xmm8,96(%rbp)
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu %xmm9,112(%rbp)
+ vpxor %xmm15,%xmm9,%xmm9
+ xorq $0x80,%rbp
+ movl $1,%ecx
+ jmp .Loop_dec8x
+
+.align 32
+.Loop_dec8x:
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+0(%rsp),%ecx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r8)
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r8,%rbx,1),%rbx
+ cmovgeq %rsp,%r8
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r8,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r8),%xmm10
+ movq %rbx,64+0(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -72(%rsi),%xmm1
+ leaq 16(%r8,%rbx,1),%r8
+ vmovdqu %xmm10,128(%rsp)
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+4(%rsp),%ecx
+ movq 64+8(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r9)
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r9,%rbx,1),%rbx
+ cmovgeq %rsp,%r9
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r9,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r9),%xmm11
+ movq %rbx,64+8(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups -56(%rsi),%xmm0
+ leaq 16(%r9,%rbx,1),%r9
+ vmovdqu %xmm11,144(%rsp)
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+8(%rsp),%ecx
+ movq 64+16(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r10)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r8)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r10,%rbx,1),%rbx
+ cmovgeq %rsp,%r10
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r10,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r10),%xmm12
+ movq %rbx,64+16(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -40(%rsi),%xmm1
+ leaq 16(%r10,%rbx,1),%r10
+ vmovdqu %xmm12,160(%rsp)
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+12(%rsp),%ecx
+ movq 64+24(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r11)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r9)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r11,%rbx,1),%rbx
+ cmovgeq %rsp,%r11
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r11,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r11),%xmm13
+ movq %rbx,64+24(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups -24(%rsi),%xmm0
+ leaq 16(%r11,%rbx,1),%r11
+ vmovdqu %xmm13,176(%rsp)
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+16(%rsp),%ecx
+ movq 64+32(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r12)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r10)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r12,%rbx,1),%rbx
+ cmovgeq %rsp,%r12
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r12,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r12),%xmm10
+ movq %rbx,64+32(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -8(%rsi),%xmm1
+ leaq 16(%r12,%rbx,1),%r12
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+20(%rsp),%ecx
+ movq 64+40(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r13)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r11)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%rbx,%r13,1),%rbx
+ cmovgeq %rsp,%r13
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r13,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r13),%xmm11
+ movq %rbx,64+40(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 8(%rsi),%xmm0
+ leaq 16(%r13,%rbx,1),%r13
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+24(%rsp),%ecx
+ movq 64+48(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r14)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r12)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r14,%rbx,1),%rbx
+ cmovgeq %rsp,%r14
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r14,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r14),%xmm12
+ movq %rbx,64+48(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 24(%rsi),%xmm1
+ leaq 16(%r14,%rbx,1),%r14
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+28(%rsp),%ecx
+ movq 64+56(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r15)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r13)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r15,%rbx,1),%rbx
+ cmovgeq %rsp,%r15
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r15,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r15),%xmm13
+ movq %rbx,64+56(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 40(%rsi),%xmm0
+ leaq 16(%r15,%rbx,1),%r15
+ vmovdqu 32(%rsp),%xmm14
+ prefetcht0 15(%r14)
+ prefetcht0 15(%r15)
+ cmpl $11,%eax
+ jb .Ldec8x_tail
+
+ vaesdec %xmm1,%xmm2,%xmm2
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vaesdec %xmm1,%xmm7,%xmm7
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 176-120(%rsi),%xmm1
+
+ vaesdec %xmm0,%xmm2,%xmm2
+ vaesdec %xmm0,%xmm3,%xmm3
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ vaesdec %xmm0,%xmm6,%xmm6
+ vaesdec %xmm0,%xmm7,%xmm7
+ vaesdec %xmm0,%xmm8,%xmm8
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 192-120(%rsi),%xmm0
+ je .Ldec8x_tail
+
+ vaesdec %xmm1,%xmm2,%xmm2
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vaesdec %xmm1,%xmm7,%xmm7
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 208-120(%rsi),%xmm1
+
+ vaesdec %xmm0,%xmm2,%xmm2
+ vaesdec %xmm0,%xmm3,%xmm3
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ vaesdec %xmm0,%xmm6,%xmm6
+ vaesdec %xmm0,%xmm7,%xmm7
+ vaesdec %xmm0,%xmm8,%xmm8
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 224-120(%rsi),%xmm0
+
+.Ldec8x_tail:
+ vaesdec %xmm1,%xmm2,%xmm2
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vpaddd %xmm14,%xmm15,%xmm15
+ vmovdqu 48(%rsp),%xmm14
+ vaesdec %xmm1,%xmm7,%xmm7
+ movq 64(%rsp),%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 16-120(%rsi),%xmm1
+
+ vaesdeclast %xmm0,%xmm2,%xmm2
+ vmovdqa %xmm15,32(%rsp)
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesdeclast %xmm0,%xmm3,%xmm3
+ vpxor 0(%rbp),%xmm2,%xmm2
+ vaesdeclast %xmm0,%xmm4,%xmm4
+ vpxor 16(%rbp),%xmm3,%xmm3
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesdeclast %xmm0,%xmm5,%xmm5
+ vpxor 32(%rbp),%xmm4,%xmm4
+ vaesdeclast %xmm0,%xmm6,%xmm6
+ vpxor 48(%rbp),%xmm5,%xmm5
+ vpaddd %xmm15,%xmm14,%xmm14
+ vmovdqu -120(%rsi),%xmm15
+ vaesdeclast %xmm0,%xmm7,%xmm7
+ vpxor 64(%rbp),%xmm6,%xmm6
+ vaesdeclast %xmm0,%xmm8,%xmm8
+ vpxor 80(%rbp),%xmm7,%xmm7
+ vmovdqa %xmm14,48(%rsp)
+ vaesdeclast %xmm0,%xmm9,%xmm9
+ vpxor 96(%rbp),%xmm8,%xmm8
+ vmovups 32-120(%rsi),%xmm0
+
+ vmovups %xmm2,-16(%r8)
+ subq %rbx,%r8
+ vmovdqu 128+0(%rsp),%xmm2
+ vpxor 112(%rbp),%xmm9,%xmm9
+ vmovups %xmm3,-16(%r9)
+ subq 72(%rsp),%r9
+ vmovdqu %xmm2,0(%rbp)
+ vpxor %xmm15,%xmm2,%xmm2
+ vmovdqu 128+16(%rsp),%xmm3
+ vmovups %xmm4,-16(%r10)
+ subq 80(%rsp),%r10
+ vmovdqu %xmm3,16(%rbp)
+ vpxor %xmm15,%xmm3,%xmm3
+ vmovdqu 128+32(%rsp),%xmm4
+ vmovups %xmm5,-16(%r11)
+ subq 88(%rsp),%r11
+ vmovdqu %xmm4,32(%rbp)
+ vpxor %xmm15,%xmm4,%xmm4
+ vmovdqu 128+48(%rsp),%xmm5
+ vmovups %xmm6,-16(%r12)
+ subq 96(%rsp),%r12
+ vmovdqu %xmm5,48(%rbp)
+ vpxor %xmm15,%xmm5,%xmm5
+ vmovdqu %xmm10,64(%rbp)
+ vpxor %xmm10,%xmm15,%xmm6
+ vmovups %xmm7,-16(%r13)
+ subq 104(%rsp),%r13
+ vmovdqu %xmm11,80(%rbp)
+ vpxor %xmm11,%xmm15,%xmm7
+ vmovups %xmm8,-16(%r14)
+ subq 112(%rsp),%r14
+ vmovdqu %xmm12,96(%rbp)
+ vpxor %xmm12,%xmm15,%xmm8
+ vmovups %xmm9,-16(%r15)
+ subq 120(%rsp),%r15
+ vmovdqu %xmm13,112(%rbp)
+ vpxor %xmm13,%xmm15,%xmm9
+
+ xorq $128,%rbp
+ decl %edx
+ jnz .Loop_dec8x
+
+ movq 16(%rsp),%rax
+.cfi_def_cfa %rax,8
+
+
+
+
+
+.Ldec8x_done:
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Ldec8x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
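
For reference, a minimal C-side sketch of how the two exported symbols above are driven. The descriptor layout is inferred from the register and offset usage in this generated file (descriptor array in %rdi with a 40-byte stride, expanded AES key schedule in %rsi, stream count in %edx); the CIPH_DESC name and field names follow the multi-block glue in OpenSSL's crypto/evp/e_aes_cbc_hmac_sha*.c and are assumptions here, as is the 64-bit pointer size implied by the 0/8/16/24 field offsets. This is an illustrative sketch, not part of the checked-in file.

    /*
     * Sketch of the C-side view of aesni_multi_cbc_encrypt/decrypt, inferred
     * from the register/offset usage in this file: descriptor array in %rdi,
     * expanded AES key in %rsi, number of streams in %edx.  The CIPH_DESC
     * name and field names are assumptions (they mirror OpenSSL's multi-block
     * glue), not definitions taken from this assembly.
     */
    #include <stdint.h>
    #include <openssl/aes.h>

    typedef struct {
        const unsigned char *inp;   /* input stream,   read at  0 + 40*i(%rdi) */
        unsigned char *out;         /* output stream,  read at  8 + 40*i(%rdi) */
        int blocks;                 /* 16-byte blocks, read at 16 + 40*i(%rdi) */
        uint64_t iv[2];             /* per-stream IV,  read at 24 + 40*i(%rdi) */
    } CIPH_DESC;                    /* 40 bytes per descriptor, as the asm strides */

    /*
     * Encrypt/decrypt up to n independent CBC streams in parallel with one
     * expanded AES key.  The AVX shortcut (taken when bit 28 of the second
     * dword of OPENSSL_ia32cap_P is set) interleaves up to 8 streams; the
     * SSE fallback interleaves up to 4.
     */
    void aesni_multi_cbc_encrypt(CIPH_DESC *desc, const AES_KEY *key, int n);
    void aesni_multi_cbc_decrypt(CIPH_DESC *desc, const AES_KEY *key, int n);

Each stream carries its own pointers and block count, which is why the loops above retire exhausted streams by redirecting their pointers to a stack scratch slot (the cmovleq/cmovgeq %rsp sequences) instead of branching per stream.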