diff options
Diffstat (limited to 'deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/modes/aesni-gcm-x86_64.asm')
-rw-r--r-- | deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/modes/aesni-gcm-x86_64.asm | 977 |
1 files changed, 977 insertions, 0 deletions
;-----------------------------------------------------------------------
; AES-NI GCM module for x86_64 (NASM syntax, Win64 calling convention).
;
; Recovered from a unified diff that adds the new file
;   deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/modes/aesni-gcm-x86_64.asm
; (blob b1d8332457, hunk "@@ -0,0 +1,977 @@").  Diff '+' markers have been
; removed and the line structure restored; the instruction stream itself is
; unchanged.
;
; Exported:  aesni_gcm_decrypt, aesni_gcm_encrypt
; Internal:  _aesni_ctr32_ghash_6x, _aesni_ctr32_6x, gcm_se_handler
;
; Register roles after the Win64 prologue of both exported functions
; (argument meanings are inferred from how each register is used below --
;  NOTE(review): confirm against the C prototype):
;   rdi  = 1st arg (rcx)      input pointer, read 16 bytes at a time
;   rsi  = 2nd arg (rdx)      output pointer
;   rdx  = 3rd arg (r8)       length in bytes (converted to 16-byte blocks)
;   rcx  = 4th arg (r9)       AES key schedule (biased by +128 in the body)
;   r8   = 5th arg [40+rsp]   16-byte counter block, read and written back
;   r9   = 6th arg [48+rsp]   GHASH state; biased by +64 in the body, tables
;                             are read at [r9-32 ...], result stored at [r9-64]
;   r10  = number of bytes processed; returned in rax
;   r11  = address of $L$bswap_mask (constants addressed as 16*k + r11)
;   ebx  = dword at [12+r8] (last four bytes of the counter block)
;   ebp  = dword at [240 + key]; selects the number of AES rounds
;-----------------------------------------------------------------------
default	rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section	.text code align=64


;-----------------------------------------------------------------------
; _aesni_ctr32_ghash_6x  (internal, reached via `call`)
;
; Main stitched loop: each iteration encrypts six counter blocks with AES
; while folding six previously stashed, byte-reversed blocks into the GHASH
; accumulators with vpclmulqdq.
;
; In (as set up by the callers in this file):
;   xmm1        current counter block
;   xmm7        one byte-reversed block to hash first
;   xmm8        GHASH accumulator
;   rdx         number of 16-byte blocks remaining
;   rdi / rsi   input / output pointers (advanced by 96 per iteration)
;   r14, r15    pointer to the data being hashed and its advance limit
;   rcx         key schedule + 128, ebp = round selector, ebx = counter word
;   r9          hash table pointer + 64, r11 = $L$bswap_mask
;   caller's [48+rsp .. 127+rsp]  five more byte-reversed blocks to hash
; Out:
;   xmm9..xmm14 last six AES results (not yet stored by this routine)
;   xmm8        updated GHASH accumulator
;   r10         += 0x60 per iteration
;
; All stack offsets are written as (N+8)+rsp: the extra 8 accounts for the
; return address pushed by `call`, so (N+8)+rsp here is N+rsp in the caller.
;-----------------------------------------------------------------------
ALIGN	32
_aesni_ctr32_ghash_6x:
	vmovdqu	xmm2,XMMWORD[32+r11]		; xmm2 = $L$one_msb (adds 1 to byte 15)
	sub	rdx,6
	vpxor	xmm4,xmm4,xmm4
	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]	; round key 0 (rcx is key+128)
	vpaddb	xmm10,xmm1,xmm2			; counters +1 .. +5 via byte add
	vpaddb	xmm11,xmm10,xmm2
	vpaddb	xmm12,xmm11,xmm2
	vpaddb	xmm13,xmm12,xmm2
	vpaddb	xmm14,xmm13,xmm2
	vpxor	xmm9,xmm1,xmm15			; first block: counter ^ round key 0
	vmovdqu	XMMWORD[(16+8)+rsp],xmm4	; zero the scratch slot XORed into xmm8 below
	jmp	NEAR $L$oop6x

ALIGN	32
$L$oop6x:
	add	ebx,100663296			; 0x06000000: +6 in the top byte of ebx
	jc	NEAR $L$handle_ctr32		; carry => byte adds would wrap; take slow path
	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	vpaddb	xmm1,xmm14,xmm2			; xmm1 = counter for the next iteration
	vpxor	xmm10,xmm10,xmm15
	vpxor	xmm11,xmm11,xmm15

$L$resume_ctr32:
	vmovdqu	XMMWORD[r8],xmm1		; write the next counter back to memory
	vpclmulqdq	xmm5,xmm7,xmm3,0x10
	vpxor	xmm12,xmm12,xmm15
	vmovups	xmm2,XMMWORD[((16-128))+rcx]	; round key 1
	vpclmulqdq	xmm6,xmm7,xmm3,0x01
	xor	r12,r12
	cmp	r15,r14				; flags consumed by setnc below; the vector
						; instructions in between do not touch them

	vaesenc	xmm9,xmm9,xmm2
	vmovdqu	xmm0,XMMWORD[((48+8))+rsp]
	vpxor	xmm13,xmm13,xmm15
	vpclmulqdq	xmm1,xmm7,xmm3,0x00
	vaesenc	xmm10,xmm10,xmm2
	vpxor	xmm14,xmm14,xmm15
	setnc	r12b				; r12 = (r15 >= r14), unsigned
	vpclmulqdq	xmm7,xmm7,xmm3,0x11
	vaesenc	xmm11,xmm11,xmm2
	vmovdqu	xmm3,XMMWORD[((16-32))+r9]
	neg	r12
	vaesenc	xmm12,xmm12,xmm2
	vpxor	xmm6,xmm6,xmm5
	vpclmulqdq	xmm5,xmm0,xmm3,0x00
	vpxor	xmm8,xmm8,xmm4
	vaesenc	xmm13,xmm13,xmm2
	vpxor	xmm4,xmm1,xmm5
	and	r12,0x60			; r12 = 0x60 if r15 >= r14, else 0
	vmovups	xmm15,XMMWORD[((32-128))+rcx]	; round key 2
	vpclmulqdq	xmm1,xmm0,xmm3,0x10
	vaesenc	xmm14,xmm14,xmm2

	vpclmulqdq	xmm2,xmm0,xmm3,0x01
	lea	r14,[r12*1+r14]			; advance hash-data pointer by 0 or 96
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
	vpclmulqdq	xmm3,xmm0,xmm3,0x11
	vmovdqu	xmm0,XMMWORD[((64+8))+rsp]
	vaesenc	xmm10,xmm10,xmm15
	movbe	r13,QWORD[88+r14]		; movbe loads byte-reversed qwords of the
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[80+r14]		; 96 bytes at r14 and stashes them on the
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((32+8))+rsp],r13		; stack for the next iteration's GHASH
	vaesenc	xmm13,xmm13,xmm15
	mov	QWORD[((40+8))+rsp],r12
	vmovdqu	xmm5,XMMWORD[((48-32))+r9]
	vaesenc	xmm14,xmm14,xmm15

	vmovups	xmm15,XMMWORD[((48-128))+rcx]	; round key 3
	vpxor	xmm6,xmm6,xmm1
	vpclmulqdq	xmm1,xmm0,xmm5,0x00
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm6,xmm6,xmm2
	vpclmulqdq	xmm2,xmm0,xmm5,0x10
	vaesenc	xmm10,xmm10,xmm15
	vpxor	xmm7,xmm7,xmm3
	vpclmulqdq	xmm3,xmm0,xmm5,0x01
	vaesenc	xmm11,xmm11,xmm15
	vpclmulqdq	xmm5,xmm0,xmm5,0x11
	vmovdqu	xmm0,XMMWORD[((80+8))+rsp]
	vaesenc	xmm12,xmm12,xmm15
	vaesenc	xmm13,xmm13,xmm15
	vpxor	xmm4,xmm4,xmm1
	vmovdqu	xmm1,XMMWORD[((64-32))+r9]
	vaesenc	xmm14,xmm14,xmm15

	vmovups	xmm15,XMMWORD[((64-128))+rcx]	; round key 4
	vpxor	xmm6,xmm6,xmm2
	vpclmulqdq	xmm2,xmm0,xmm1,0x00
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm6,xmm6,xmm3
	vpclmulqdq	xmm3,xmm0,xmm1,0x10
	vaesenc	xmm10,xmm10,xmm15
	movbe	r13,QWORD[72+r14]
	vpxor	xmm7,xmm7,xmm5
	vpclmulqdq	xmm5,xmm0,xmm1,0x01
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[64+r14]
	vpclmulqdq	xmm1,xmm0,xmm1,0x11
	vmovdqu	xmm0,XMMWORD[((96+8))+rsp]
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((48+8))+rsp],r13
	vaesenc	xmm13,xmm13,xmm15
	mov	QWORD[((56+8))+rsp],r12
	vpxor	xmm4,xmm4,xmm2
	vmovdqu	xmm2,XMMWORD[((96-32))+r9]
	vaesenc	xmm14,xmm14,xmm15

	vmovups	xmm15,XMMWORD[((80-128))+rcx]	; round key 5
	vpxor	xmm6,xmm6,xmm3
	vpclmulqdq	xmm3,xmm0,xmm2,0x00
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm6,xmm6,xmm5
	vpclmulqdq	xmm5,xmm0,xmm2,0x10
	vaesenc	xmm10,xmm10,xmm15
	movbe	r13,QWORD[56+r14]
	vpxor	xmm7,xmm7,xmm1
	vpclmulqdq	xmm1,xmm0,xmm2,0x01
	vpxor	xmm8,xmm8,XMMWORD[((112+8))+rsp]
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[48+r14]
	vpclmulqdq	xmm2,xmm0,xmm2,0x11
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((64+8))+rsp],r13
	vaesenc	xmm13,xmm13,xmm15
	mov	QWORD[((72+8))+rsp],r12
	vpxor	xmm4,xmm4,xmm3
	vmovdqu	xmm3,XMMWORD[((112-32))+r9]
	vaesenc	xmm14,xmm14,xmm15

	vmovups	xmm15,XMMWORD[((96-128))+rcx]	; round key 6
	vpxor	xmm6,xmm6,xmm5
	vpclmulqdq	xmm5,xmm8,xmm3,0x10
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm6,xmm6,xmm1
	vpclmulqdq	xmm1,xmm8,xmm3,0x01
	vaesenc	xmm10,xmm10,xmm15
	movbe	r13,QWORD[40+r14]
	vpxor	xmm7,xmm7,xmm2
	vpclmulqdq	xmm2,xmm8,xmm3,0x00
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[32+r14]
	vpclmulqdq	xmm8,xmm8,xmm3,0x11
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((80+8))+rsp],r13
	vaesenc	xmm13,xmm13,xmm15
	mov	QWORD[((88+8))+rsp],r12
	vpxor	xmm6,xmm6,xmm5
	vaesenc	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm6,xmm1

	vmovups	xmm15,XMMWORD[((112-128))+rcx]	; round key 7
	vpslldq	xmm5,xmm6,8
	vpxor	xmm4,xmm4,xmm2
	vmovdqu	xmm3,XMMWORD[16+r11]		; xmm3 = $L$poly (reduction constant)

	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm7,xmm7,xmm8
	vaesenc	xmm10,xmm10,xmm15
	vpxor	xmm4,xmm4,xmm5
	movbe	r13,QWORD[24+r14]
	vaesenc	xmm11,xmm11,xmm15
	movbe	r12,QWORD[16+r14]
	vpalignr	xmm0,xmm4,xmm4,8
	vpclmulqdq	xmm4,xmm4,xmm3,0x10	; first reduction step against $L$poly
	mov	QWORD[((96+8))+rsp],r13
	vaesenc	xmm12,xmm12,xmm15
	mov	QWORD[((104+8))+rsp],r12
	vaesenc	xmm13,xmm13,xmm15
	vmovups	xmm1,XMMWORD[((128-128))+rcx]	; round key 8
	vaesenc	xmm14,xmm14,xmm15

	vaesenc	xmm9,xmm9,xmm1
	vmovups	xmm15,XMMWORD[((144-128))+rcx]	; round key 9
	vaesenc	xmm10,xmm10,xmm1
	vpsrldq	xmm6,xmm6,8
	vaesenc	xmm11,xmm11,xmm1
	vpxor	xmm7,xmm7,xmm6
	vaesenc	xmm12,xmm12,xmm1
	vpxor	xmm4,xmm4,xmm0
	movbe	r13,QWORD[8+r14]
	vaesenc	xmm13,xmm13,xmm1
	movbe	r12,QWORD[r14]
	vaesenc	xmm14,xmm14,xmm1
	vmovups	xmm1,XMMWORD[((160-128))+rcx]	; round key 10
	cmp	ebp,11
	jb	NEAR $L$enc_tail		; ebp < 11: ten rounds in total

	vaesenc	xmm9,xmm9,xmm15
	vaesenc	xmm10,xmm10,xmm15
	vaesenc	xmm11,xmm11,xmm15
	vaesenc	xmm12,xmm12,xmm15
	vaesenc	xmm13,xmm13,xmm15
	vaesenc	xmm14,xmm14,xmm15

	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1
	vmovups	xmm15,XMMWORD[((176-128))+rcx]	; round key 11
	vaesenc	xmm14,xmm14,xmm1
	vmovups	xmm1,XMMWORD[((192-128))+rcx]	; round key 12
	je	NEAR $L$enc_tail		; flags still from `cmp ebp,11`: twelve rounds

	vaesenc	xmm9,xmm9,xmm15
	vaesenc	xmm10,xmm10,xmm15
	vaesenc	xmm11,xmm11,xmm15
	vaesenc	xmm12,xmm12,xmm15
	vaesenc	xmm13,xmm13,xmm15
	vaesenc	xmm14,xmm14,xmm15

	vaesenc	xmm9,xmm9,xmm1
	vaesenc	xmm10,xmm10,xmm1
	vaesenc	xmm11,xmm11,xmm1
	vaesenc	xmm12,xmm12,xmm1
	vaesenc	xmm13,xmm13,xmm1
	vmovups	xmm15,XMMWORD[((208-128))+rcx]	; round key 13
	vaesenc	xmm14,xmm14,xmm1
	vmovups	xmm1,XMMWORD[((224-128))+rcx]	; round key 14
	jmp	NEAR $L$enc_tail		; fourteen rounds

; Slow path for the counter: byte-swap it, do full 32-bit adds with the
; $L$one_lsb / $L$two_lsb constants, and swap each result back.
ALIGN	32
$L$handle_ctr32:
	vmovdqu	xmm0,XMMWORD[r11]		; xmm0 = $L$bswap_mask
	vpshufb	xmm6,xmm1,xmm0
	vmovdqu	xmm5,XMMWORD[48+r11]		; xmm5 = $L$two_lsb
	vpaddd	xmm10,xmm6,XMMWORD[64+r11]	; + $L$one_lsb
	vpaddd	xmm11,xmm6,xmm5
	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	vpaddd	xmm12,xmm10,xmm5
	vpshufb	xmm10,xmm10,xmm0
	vpaddd	xmm13,xmm11,xmm5
	vpshufb	xmm11,xmm11,xmm0
	vpxor	xmm10,xmm10,xmm15
	vpaddd	xmm14,xmm12,xmm5
	vpshufb	xmm12,xmm12,xmm0
	vpxor	xmm11,xmm11,xmm15
	vpaddd	xmm1,xmm13,xmm5
	vpshufb	xmm13,xmm13,xmm0
	vpshufb	xmm14,xmm14,xmm0
	vpshufb	xmm1,xmm1,xmm0
	jmp	NEAR $L$resume_ctr32

; Last two AES rounds.  The final round key (xmm1) is pre-XORed with the six
; input blocks so that vaesenclast yields the output directly; at the same
; time the counters for the next iteration are prepared in xmm0/5/6/7/3.
ALIGN	32
$L$enc_tail:
	vaesenc	xmm9,xmm9,xmm15
	vmovdqu	XMMWORD[(16+8)+rsp],xmm7
	vpalignr	xmm8,xmm4,xmm4,8
	vaesenc	xmm10,xmm10,xmm15
	vpclmulqdq	xmm4,xmm4,xmm3,0x10	; second reduction step against $L$poly
	vpxor	xmm2,xmm1,XMMWORD[rdi]
	vaesenc	xmm11,xmm11,xmm15
	vpxor	xmm0,xmm1,XMMWORD[16+rdi]
	vaesenc	xmm12,xmm12,xmm15
	vpxor	xmm5,xmm1,XMMWORD[32+rdi]
	vaesenc	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm1,XMMWORD[48+rdi]
	vaesenc	xmm14,xmm14,xmm15
	vpxor	xmm7,xmm1,XMMWORD[64+rdi]
	vpxor	xmm3,xmm1,XMMWORD[80+rdi]
	vmovdqu	xmm1,XMMWORD[r8]		; reload the counter saved at $L$resume_ctr32

	vaesenclast	xmm9,xmm9,xmm2
	vmovdqu	xmm2,XMMWORD[32+r11]		; xmm2 = $L$one_msb again
	vaesenclast	xmm10,xmm10,xmm0
	vpaddb	xmm0,xmm1,xmm2
	mov	QWORD[((112+8))+rsp],r13
	lea	rdi,[96+rdi]
	vaesenclast	xmm11,xmm11,xmm5
	vpaddb	xmm5,xmm0,xmm2
	mov	QWORD[((120+8))+rsp],r12
	lea	rsi,[96+rsi]
	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]	; round key 0 for the next iteration
	vaesenclast	xmm12,xmm12,xmm6
	vpaddb	xmm6,xmm5,xmm2
	vaesenclast	xmm13,xmm13,xmm7
	vpaddb	xmm7,xmm6,xmm2
	vaesenclast	xmm14,xmm14,xmm3
	vpaddb	xmm3,xmm7,xmm2

	add	r10,0x60			; 96 more bytes processed
	sub	rdx,0x6
	jc	NEAR $L$6x_done			; fewer than six blocks were left: stop

	vmovups	XMMWORD[(-96)+rsi],xmm9		; store the six results and move the
	vpxor	xmm9,xmm1,xmm15			; prepared counters into xmm9..xmm14
	vmovups	XMMWORD[(-80)+rsi],xmm10
	vmovdqa	xmm10,xmm0
	vmovups	XMMWORD[(-64)+rsi],xmm11
	vmovdqa	xmm11,xmm5
	vmovups	XMMWORD[(-48)+rsi],xmm12
	vmovdqa	xmm12,xmm6
	vmovups	XMMWORD[(-32)+rsi],xmm13
	vmovdqa	xmm13,xmm7
	vmovups	XMMWORD[(-16)+rsi],xmm14
	vmovdqa	xmm14,xmm3
	vmovdqu	xmm7,XMMWORD[((32+8))+rsp]
	jmp	NEAR $L$oop6x

$L$6x_done:
	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]	; fold the pending partial sums into xmm8
	vpxor	xmm8,xmm8,xmm4

	DB	0F3h,0C3h		;repret

;-----------------------------------------------------------------------
; aesni_gcm_decrypt  (exported, Win64 ABI)
;
; Returns in rax the number of bytes processed (r10): 0 when the length is
; below 0x60, otherwise 0x60 per iteration of _aesni_ctr32_ghash_6x.
; Argument roles are listed in the file header.
;
; Frame: rax keeps the entry rsp for the whole body (gcm_se_handler relies on
; this).  Six GPRs are pushed at [rax-48 .. rax-8], xmm6..xmm15 are saved at
; [rax-216 .. rax-72], then rsp is lowered and aligned to 128 bytes for the
; scratch area shared with _aesni_ctr32_ghash_6x.
;-----------------------------------------------------------------------
global	aesni_gcm_decrypt

ALIGN	32
aesni_gcm_decrypt:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesni_gcm_decrypt:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



	xor	r10,r10
	cmp	rdx,0x60
	jb	NEAR $L$gcm_dec_abort		; need at least 96 bytes (six blocks)

	lea	rax,[rsp]

	push	rbx

	push	rbp

	push	r12

	push	r13

	push	r14

	push	r15

	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[(-216)+rax],xmm6
	movaps	XMMWORD[(-200)+rax],xmm7
	movaps	XMMWORD[(-184)+rax],xmm8
	movaps	XMMWORD[(-168)+rax],xmm9
	movaps	XMMWORD[(-152)+rax],xmm10
	movaps	XMMWORD[(-136)+rax],xmm11
	movaps	XMMWORD[(-120)+rax],xmm12
	movaps	XMMWORD[(-104)+rax],xmm13
	movaps	XMMWORD[(-88)+rax],xmm14
	movaps	XMMWORD[(-72)+rax],xmm15
$L$gcm_dec_body:
	vzeroupper

	vmovdqu	xmm1,XMMWORD[r8]		; xmm1 = counter block
	add	rsp,-128
	mov	ebx,DWORD[12+r8]		; last four bytes of the counter block
	lea	r11,[$L$bswap_mask]
	lea	r14,[((-128))+rcx]
	mov	r15,0xf80
	vmovdqu	xmm8,XMMWORD[r9]		; xmm8 = current hash state
	and	rsp,-128
	vmovdqu	xmm0,XMMWORD[r11]
	lea	rcx,[128+rcx]			; bias key pointer by +128
	lea	r9,[((32+32))+r9]		; bias hash pointer by +64
	mov	ebp,DWORD[((240-128))+rcx]	; round selector at key+240
	vpshufb	xmm8,xmm8,xmm0

	; If bits 7..11 of rsp lie 0..767 bytes above those of (key-128), lower
	; rsp by that distance.  Presumably avoids the scratch frame aliasing the
	; key schedule in the cache -- NOTE(review): intent inferred from label name.
	and	r14,r15
	and	r15,rsp
	sub	r15,r14
	jc	NEAR $L$dec_no_key_aliasing
	cmp	r15,768
	jnc	NEAR $L$dec_no_key_aliasing
	sub	rsp,r15
$L$dec_no_key_aliasing:

	; Preload the first six input blocks, byte-reverse them and stash five on
	; the stack (the sixth stays in xmm7) for the first GHASH pass.
	vmovdqu	xmm7,XMMWORD[80+rdi]
	lea	r14,[rdi]			; hash-data pointer = input
	vmovdqu	xmm4,XMMWORD[64+rdi]
	lea	r15,[((-192))+rdx*1+rdi]	; advance limit = input + len - 192
	vmovdqu	xmm5,XMMWORD[48+rdi]
	shr	rdx,4				; bytes -> 16-byte blocks
	xor	r10,r10
	vmovdqu	xmm6,XMMWORD[32+rdi]
	vpshufb	xmm7,xmm7,xmm0
	vmovdqu	xmm2,XMMWORD[16+rdi]
	vpshufb	xmm4,xmm4,xmm0
	vmovdqu	xmm3,XMMWORD[rdi]
	vpshufb	xmm5,xmm5,xmm0
	vmovdqu	XMMWORD[48+rsp],xmm4
	vpshufb	xmm6,xmm6,xmm0
	vmovdqu	XMMWORD[64+rsp],xmm5
	vpshufb	xmm2,xmm2,xmm0
	vmovdqu	XMMWORD[80+rsp],xmm6
	vpshufb	xmm3,xmm3,xmm0
	vmovdqu	XMMWORD[96+rsp],xmm2
	vmovdqu	XMMWORD[112+rsp],xmm3

	call	_aesni_ctr32_ghash_6x

	vmovups	XMMWORD[(-96)+rsi],xmm9		; store the final six output blocks
	vmovups	XMMWORD[(-80)+rsi],xmm10
	vmovups	XMMWORD[(-64)+rsi],xmm11
	vmovups	XMMWORD[(-48)+rsi],xmm12
	vmovups	XMMWORD[(-32)+rsi],xmm13
	vmovups	XMMWORD[(-16)+rsi],xmm14

	vpshufb	xmm8,xmm8,XMMWORD[r11]
	vmovdqu	XMMWORD[(-64)+r9],xmm8		; write hash state back ([r9-64] = 6th arg)

	vzeroupper
	movaps	xmm6,XMMWORD[((-216))+rax]
	movaps	xmm7,XMMWORD[((-200))+rax]
	movaps	xmm8,XMMWORD[((-184))+rax]
	movaps	xmm9,XMMWORD[((-168))+rax]
	movaps	xmm10,XMMWORD[((-152))+rax]
	movaps	xmm11,XMMWORD[((-136))+rax]
	movaps	xmm12,XMMWORD[((-120))+rax]
	movaps	xmm13,XMMWORD[((-104))+rax]
	movaps	xmm14,XMMWORD[((-88))+rax]
	movaps	xmm15,XMMWORD[((-72))+rax]
	mov	r15,QWORD[((-48))+rax]

	mov	r14,QWORD[((-40))+rax]

	mov	r13,QWORD[((-32))+rax]

	mov	r12,QWORD[((-24))+rax]

	mov	rbp,QWORD[((-16))+rax]

	mov	rbx,QWORD[((-8))+rax]

	lea	rsp,[rax]

$L$gcm_dec_abort:
	mov	rax,r10				; return value = bytes processed
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_aesni_gcm_decrypt:

;-----------------------------------------------------------------------
; _aesni_ctr32_6x  (internal, reached via `call`)
;
; Encrypts six consecutive counter blocks, XORs them with 96 bytes at rdi and
; stores the result at rsi; rdi and rsi are both advanced by 96.
;
; In:   xmm1 = counter, xmm0 = $L$bswap_mask (used on the slow counter path),
;       rcx = key+128, ebp = round selector, ebx = counter word, r11 = constants
; Out:  xmm9..xmm14 = the six output blocks, xmm1 = counter advanced by six
; Uses: r12, r13, xmm2..xmm6, xmm8, xmm15
;-----------------------------------------------------------------------
ALIGN	32
_aesni_ctr32_6x:
	vmovdqu	xmm4,XMMWORD[((0-128))+rcx]	; round key 0
	vmovdqu	xmm2,XMMWORD[32+r11]		; $L$one_msb
	lea	r13,[((-1))+rbp]		; loop count = ebp - 1
	vmovups	xmm15,XMMWORD[((16-128))+rcx]	; round key 1
	lea	r12,[((32-128))+rcx]		; r12 -> round key 2
	vpxor	xmm9,xmm1,xmm4
	add	ebx,100663296			; 0x06000000: +6 in the top byte of ebx
	jc	NEAR $L$handle_ctr32_2		; carry => byte adds would wrap
	vpaddb	xmm10,xmm1,xmm2
	vpaddb	xmm11,xmm10,xmm2
	vpxor	xmm10,xmm10,xmm4
	vpaddb	xmm12,xmm11,xmm2
	vpxor	xmm11,xmm11,xmm4
	vpaddb	xmm13,xmm12,xmm2
	vpxor	xmm12,xmm12,xmm4
	vpaddb	xmm14,xmm13,xmm2
	vpxor	xmm13,xmm13,xmm4
	vpaddb	xmm1,xmm14,xmm2
	vpxor	xmm14,xmm14,xmm4
	jmp	NEAR $L$oop_ctr32

; One AES round on all six blocks per pass; the next round key is fetched at
; the bottom so it is already in xmm15 when the loop exits.
ALIGN	16
$L$oop_ctr32:
	vaesenc	xmm9,xmm9,xmm15
	vaesenc	xmm10,xmm10,xmm15
	vaesenc	xmm11,xmm11,xmm15
	vaesenc	xmm12,xmm12,xmm15
	vaesenc	xmm13,xmm13,xmm15
	vaesenc	xmm14,xmm14,xmm15
	vmovups	xmm15,XMMWORD[r12]
	lea	r12,[16+r12]
	dec	r13d
	jnz	NEAR $L$oop_ctr32

	vmovdqu	xmm3,XMMWORD[r12]		; final round key, pre-XORed with the input
	vaesenc	xmm9,xmm9,xmm15
	vpxor	xmm4,xmm3,XMMWORD[rdi]
	vaesenc	xmm10,xmm10,xmm15
	vpxor	xmm5,xmm3,XMMWORD[16+rdi]
	vaesenc	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm3,XMMWORD[32+rdi]
	vaesenc	xmm12,xmm12,xmm15
	vpxor	xmm8,xmm3,XMMWORD[48+rdi]
	vaesenc	xmm13,xmm13,xmm15
	vpxor	xmm2,xmm3,XMMWORD[64+rdi]
	vaesenc	xmm14,xmm14,xmm15
	vpxor	xmm3,xmm3,XMMWORD[80+rdi]
	lea	rdi,[96+rdi]

	vaesenclast	xmm9,xmm9,xmm4
	vaesenclast	xmm10,xmm10,xmm5
	vaesenclast	xmm11,xmm11,xmm6
	vaesenclast	xmm12,xmm12,xmm8
	vaesenclast	xmm13,xmm13,xmm2
	vaesenclast	xmm14,xmm14,xmm3
	vmovups	XMMWORD[rsi],xmm9
	vmovups	XMMWORD[16+rsi],xmm10
	vmovups	XMMWORD[32+rsi],xmm11
	vmovups	XMMWORD[48+rsi],xmm12
	vmovups	XMMWORD[64+rsi],xmm13
	vmovups	XMMWORD[80+rsi],xmm14
	lea	rsi,[96+rsi]

	DB	0F3h,0C3h		;repret
; Slow counter path: same scheme as $L$handle_ctr32, with xmm4 as round key 0
; and the byte-swap mask already in xmm0.
ALIGN	32
$L$handle_ctr32_2:
	vpshufb	xmm6,xmm1,xmm0
	vmovdqu	xmm5,XMMWORD[48+r11]		; $L$two_lsb
	vpaddd	xmm10,xmm6,XMMWORD[64+r11]	; + $L$one_lsb
	vpaddd	xmm11,xmm6,xmm5
	vpaddd	xmm12,xmm10,xmm5
	vpshufb	xmm10,xmm10,xmm0
	vpaddd	xmm13,xmm11,xmm5
	vpshufb	xmm11,xmm11,xmm0
	vpxor	xmm10,xmm10,xmm4
	vpaddd	xmm14,xmm12,xmm5
	vpshufb	xmm12,xmm12,xmm0
	vpxor	xmm11,xmm11,xmm4
	vpaddd	xmm1,xmm13,xmm5
	vpshufb	xmm13,xmm13,xmm0
	vpxor	xmm12,xmm12,xmm4
	vpshufb	xmm14,xmm14,xmm0
	vpxor	xmm13,xmm13,xmm4
	vpshufb	xmm1,xmm1,xmm0
	vpxor	xmm14,xmm14,xmm4
	jmp	NEAR $L$oop_ctr32


;-----------------------------------------------------------------------
; aesni_gcm_encrypt  (exported, Win64 ABI)
;
; Returns in rax the number of bytes processed (r10): 0 when the length is
; below 0x60*3 (288), otherwise 0x60*2 for the two up-front
; _aesni_ctr32_6x calls plus 0x60 per iteration of _aesni_ctr32_ghash_6x.
; Argument roles are listed in the file header.
;
; Flow: encrypt 12 blocks up front, run the stitched loop with the hash-data
; pointer (r14) on the output buffer, then hash the blocks still pending
; after the loop inline and write the hash state back.
; Frame layout is identical to aesni_gcm_decrypt.
;-----------------------------------------------------------------------
global	aesni_gcm_encrypt

ALIGN	32
aesni_gcm_encrypt:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesni_gcm_encrypt:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



	xor	r10,r10
	cmp	rdx,0x60*3
	jb	NEAR $L$gcm_enc_abort		; need at least 288 bytes (18 blocks)

	lea	rax,[rsp]

	push	rbx

	push	rbp

	push	r12

	push	r13

	push	r14

	push	r15

	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[(-216)+rax],xmm6
	movaps	XMMWORD[(-200)+rax],xmm7
	movaps	XMMWORD[(-184)+rax],xmm8
	movaps	XMMWORD[(-168)+rax],xmm9
	movaps	XMMWORD[(-152)+rax],xmm10
	movaps	XMMWORD[(-136)+rax],xmm11
	movaps	XMMWORD[(-120)+rax],xmm12
	movaps	XMMWORD[(-104)+rax],xmm13
	movaps	XMMWORD[(-88)+rax],xmm14
	movaps	XMMWORD[(-72)+rax],xmm15
$L$gcm_enc_body:
	vzeroupper

	vmovdqu	xmm1,XMMWORD[r8]		; xmm1 = counter block
	add	rsp,-128
	mov	ebx,DWORD[12+r8]		; last four bytes of the counter block
	lea	r11,[$L$bswap_mask]
	lea	r14,[((-128))+rcx]
	mov	r15,0xf80
	lea	rcx,[128+rcx]			; bias key pointer by +128
	vmovdqu	xmm0,XMMWORD[r11]
	and	rsp,-128
	mov	ebp,DWORD[((240-128))+rcx]	; round selector at key+240

	; Same stack-versus-key adjustment as in aesni_gcm_decrypt.
	and	r14,r15
	and	r15,rsp
	sub	r15,r14
	jc	NEAR $L$enc_no_key_aliasing
	cmp	r15,768
	jnc	NEAR $L$enc_no_key_aliasing
	sub	rsp,r15
$L$enc_no_key_aliasing:

	lea	r14,[rsi]			; hash-data pointer = output
	lea	r15,[((-192))+rdx*1+rsi]	; advance limit = output + len - 192
	shr	rdx,4				; bytes -> 16-byte blocks

	call	_aesni_ctr32_6x
	vpshufb	xmm8,xmm9,xmm0			; byte-reverse the first six outputs and
	vpshufb	xmm2,xmm10,xmm0			; stash them for the first GHASH pass
	vmovdqu	XMMWORD[112+rsp],xmm8
	vpshufb	xmm4,xmm11,xmm0
	vmovdqu	XMMWORD[96+rsp],xmm2
	vpshufb	xmm5,xmm12,xmm0
	vmovdqu	XMMWORD[80+rsp],xmm4
	vpshufb	xmm6,xmm13,xmm0
	vmovdqu	XMMWORD[64+rsp],xmm5
	vpshufb	xmm7,xmm14,xmm0			; sixth one stays in xmm7
	vmovdqu	XMMWORD[48+rsp],xmm6

	call	_aesni_ctr32_6x

	vmovdqu	xmm8,XMMWORD[r9]		; xmm8 = current hash state
	lea	r9,[((32+32))+r9]		; bias hash pointer by +64
	sub	rdx,12				; twelve blocks already encrypted
	mov	r10,0x60*2			; ... which is 192 bytes processed
	vpshufb	xmm8,xmm8,xmm0

	call	_aesni_ctr32_ghash_6x

	; Tail: store the last six outputs and hash what the loop left pending --
	; first the six byte-reversed blocks stashed at [32+rsp .. 127+rsp], then
	; the six just-produced blocks (xmm14 down to xmm9, with xmm9 spilled to
	; [16+rsp]).  NOTE(review): the exact GHASH term ordering is taken on
	; trust from the generator; only the data movement is described here.
	vmovdqu	xmm7,XMMWORD[32+rsp]
	vmovdqu	xmm0,XMMWORD[r11]
	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	vpunpckhqdq	xmm1,xmm7,xmm7
	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
	vmovups	XMMWORD[(-96)+rsi],xmm9
	vpshufb	xmm9,xmm9,xmm0
	vpxor	xmm1,xmm1,xmm7
	vmovups	XMMWORD[(-80)+rsi],xmm10
	vpshufb	xmm10,xmm10,xmm0
	vmovups	XMMWORD[(-64)+rsi],xmm11
	vpshufb	xmm11,xmm11,xmm0
	vmovups	XMMWORD[(-48)+rsi],xmm12
	vpshufb	xmm12,xmm12,xmm0
	vmovups	XMMWORD[(-32)+rsi],xmm13
	vpshufb	xmm13,xmm13,xmm0
	vmovups	XMMWORD[(-16)+rsi],xmm14
	vpshufb	xmm14,xmm14,xmm0
	vmovdqu	XMMWORD[16+rsp],xmm9
	vmovdqu	xmm6,XMMWORD[48+rsp]
	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
	vpunpckhqdq	xmm2,xmm6,xmm6
	vpclmulqdq	xmm5,xmm7,xmm3,0x00
	vpxor	xmm2,xmm2,xmm6
	vpclmulqdq	xmm7,xmm7,xmm3,0x11
	vpclmulqdq	xmm1,xmm1,xmm15,0x00

	vmovdqu	xmm9,XMMWORD[64+rsp]
	vpclmulqdq	xmm4,xmm6,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
	vpxor	xmm4,xmm4,xmm5
	vpunpckhqdq	xmm5,xmm9,xmm9
	vpclmulqdq	xmm6,xmm6,xmm0,0x11
	vpxor	xmm5,xmm5,xmm9
	vpxor	xmm6,xmm6,xmm7
	vpclmulqdq	xmm2,xmm2,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
	vpxor	xmm2,xmm2,xmm1

	vmovdqu	xmm1,XMMWORD[80+rsp]
	vpclmulqdq	xmm7,xmm9,xmm3,0x00
	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
	vpxor	xmm7,xmm7,xmm4
	vpunpckhqdq	xmm4,xmm1,xmm1
	vpclmulqdq	xmm9,xmm9,xmm3,0x11
	vpxor	xmm4,xmm4,xmm1
	vpxor	xmm9,xmm9,xmm6
	vpclmulqdq	xmm5,xmm5,xmm15,0x00
	vpxor	xmm5,xmm5,xmm2

	vmovdqu	xmm2,XMMWORD[96+rsp]
	vpclmulqdq	xmm6,xmm1,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
	vpxor	xmm6,xmm6,xmm7
	vpunpckhqdq	xmm7,xmm2,xmm2
	vpclmulqdq	xmm1,xmm1,xmm0,0x11
	vpxor	xmm7,xmm7,xmm2
	vpxor	xmm1,xmm1,xmm9
	vpclmulqdq	xmm4,xmm4,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
	vpxor	xmm4,xmm4,xmm5

	vpxor	xmm8,xmm8,XMMWORD[112+rsp]
	vpclmulqdq	xmm5,xmm2,xmm3,0x00
	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
	vpunpckhqdq	xmm9,xmm8,xmm8
	vpxor	xmm5,xmm5,xmm6
	vpclmulqdq	xmm2,xmm2,xmm3,0x11
	vpxor	xmm9,xmm9,xmm8
	vpxor	xmm2,xmm2,xmm1
	vpclmulqdq	xmm7,xmm7,xmm15,0x00
	vpxor	xmm4,xmm7,xmm4

	vpclmulqdq	xmm6,xmm8,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	vpunpckhqdq	xmm1,xmm14,xmm14
	vpclmulqdq	xmm8,xmm8,xmm0,0x11
	vpxor	xmm1,xmm1,xmm14
	vpxor	xmm5,xmm6,xmm5
	vpclmulqdq	xmm9,xmm9,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
	vpxor	xmm7,xmm8,xmm2
	vpxor	xmm6,xmm9,xmm4

	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
	vpxor	xmm9,xmm7,xmm5
	vpclmulqdq	xmm4,xmm14,xmm3,0x00
	vpxor	xmm6,xmm6,xmm9
	vpunpckhqdq	xmm2,xmm13,xmm13
	vpclmulqdq	xmm14,xmm14,xmm3,0x11
	vpxor	xmm2,xmm2,xmm13
	vpslldq	xmm9,xmm6,8
	vpclmulqdq	xmm1,xmm1,xmm15,0x00
	vpxor	xmm8,xmm5,xmm9
	vpsrldq	xmm6,xmm6,8
	vpxor	xmm7,xmm7,xmm6

	vpclmulqdq	xmm5,xmm13,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
	vpxor	xmm5,xmm5,xmm4
	vpunpckhqdq	xmm9,xmm12,xmm12
	vpclmulqdq	xmm13,xmm13,xmm0,0x11
	vpxor	xmm9,xmm9,xmm12
	vpxor	xmm13,xmm13,xmm14
	vpalignr	xmm14,xmm8,xmm8,8
	vpclmulqdq	xmm2,xmm2,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
	vpxor	xmm2,xmm2,xmm1

	vpclmulqdq	xmm4,xmm12,xmm3,0x00
	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
	vpxor	xmm4,xmm4,xmm5
	vpunpckhqdq	xmm1,xmm11,xmm11
	vpclmulqdq	xmm12,xmm12,xmm3,0x11
	vpxor	xmm1,xmm1,xmm11
	vpxor	xmm12,xmm12,xmm13
	vxorps	xmm7,xmm7,XMMWORD[16+rsp]
	vpclmulqdq	xmm9,xmm9,xmm15,0x00
	vpxor	xmm9,xmm9,xmm2

	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10	; reduction step against $L$poly
	vxorps	xmm8,xmm8,xmm14

	vpclmulqdq	xmm5,xmm11,xmm0,0x00
	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
	vpxor	xmm5,xmm5,xmm4
	vpunpckhqdq	xmm2,xmm10,xmm10
	vpclmulqdq	xmm11,xmm11,xmm0,0x11
	vpxor	xmm2,xmm2,xmm10
	vpalignr	xmm14,xmm8,xmm8,8
	vpxor	xmm11,xmm11,xmm12
	vpclmulqdq	xmm1,xmm1,xmm15,0x10
	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
	vpxor	xmm1,xmm1,xmm9

	vxorps	xmm14,xmm14,xmm7
	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10	; reduction step against $L$poly
	vxorps	xmm8,xmm8,xmm14

	vpclmulqdq	xmm4,xmm10,xmm3,0x00
	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
	vpxor	xmm4,xmm4,xmm5
	vpunpckhqdq	xmm9,xmm8,xmm8
	vpclmulqdq	xmm10,xmm10,xmm3,0x11
	vpxor	xmm9,xmm9,xmm8
	vpxor	xmm10,xmm10,xmm11
	vpclmulqdq	xmm2,xmm2,xmm15,0x00
	vpxor	xmm2,xmm2,xmm1

	vpclmulqdq	xmm5,xmm8,xmm0,0x00
	vpclmulqdq	xmm7,xmm8,xmm0,0x11
	vpxor	xmm5,xmm5,xmm4
	vpclmulqdq	xmm6,xmm9,xmm15,0x10
	vpxor	xmm7,xmm7,xmm10
	vpxor	xmm6,xmm6,xmm2

	vpxor	xmm4,xmm7,xmm5
	vpxor	xmm6,xmm6,xmm4
	vpslldq	xmm1,xmm6,8
	vmovdqu	xmm3,XMMWORD[16+r11]		; xmm3 = $L$poly
	vpsrldq	xmm6,xmm6,8
	vpxor	xmm8,xmm5,xmm1
	vpxor	xmm7,xmm7,xmm6

	vpalignr	xmm2,xmm8,xmm8,8	; final two-step reduction against $L$poly
	vpclmulqdq	xmm8,xmm8,xmm3,0x10
	vpxor	xmm8,xmm8,xmm2

	vpalignr	xmm2,xmm8,xmm8,8
	vpclmulqdq	xmm8,xmm8,xmm3,0x10
	vpxor	xmm2,xmm2,xmm7
	vpxor	xmm8,xmm8,xmm2
	vpshufb	xmm8,xmm8,XMMWORD[r11]
	vmovdqu	XMMWORD[(-64)+r9],xmm8		; write hash state back ([r9-64] = 6th arg)

	vzeroupper
	movaps	xmm6,XMMWORD[((-216))+rax]
	movaps	xmm7,XMMWORD[((-200))+rax]
	movaps	xmm8,XMMWORD[((-184))+rax]
	movaps	xmm9,XMMWORD[((-168))+rax]
	movaps	xmm10,XMMWORD[((-152))+rax]
	movaps	xmm11,XMMWORD[((-136))+rax]
	movaps	xmm12,XMMWORD[((-120))+rax]
	movaps	xmm13,XMMWORD[((-104))+rax]
	movaps	xmm14,XMMWORD[((-88))+rax]
	movaps	xmm15,XMMWORD[((-72))+rax]
	mov	r15,QWORD[((-48))+rax]

	mov	r14,QWORD[((-40))+rax]

	mov	r13,QWORD[((-32))+rax]

	mov	r12,QWORD[((-24))+rax]

	mov	rbp,QWORD[((-16))+rax]

	mov	rbx,QWORD[((-8))+rax]

	lea	rsp,[rax]

$L$gcm_enc_abort:
	mov	rax,r10				; return value = bytes processed
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_aesni_gcm_encrypt:
; Constant table.  The code addresses these as fixed 16-byte offsets from
; $L$bswap_mask (held in r11): +16 poly, +32 one_msb, +48 two_lsb, +64 one_lsb,
; so the order and size of the entries must not change.
ALIGN	64
$L$bswap_mask:
DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
$L$poly:
DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
$L$one_msb:
DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
$L$two_lsb:
DB	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
$L$one_lsb:
DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
; ASCII: "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>",0
DB	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108
DB	101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
DB	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
DB	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
ALIGN	64
EXTERN	__imp_RtlVirtualUnwind

;-----------------------------------------------------------------------
; gcm_se_handler -- Win64 language-specific exception handler shared by
; aesni_gcm_decrypt and aesni_gcm_encrypt (registered in .xdata below).
;
; In (Win64 handler convention): r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT*.
; The numeric offsets used are the standard Win64 structure layouts:
;   CONTEXT:  120 Rax, 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi, 176 Rdi,
;             216..240 R12..R15, 248 Rip, 512 Xmm6
;   DISPATCHER_CONTEXT:  0 ControlPc, 8 ImageBase, 16 FunctionEntry,
;             24 EstablisherFrame, 40 ContextRecord, 56 HandlerData
; HandlerData holds two image-relative labels: body start and abort label.
;
;   Rip <  body   : nothing saved yet; frame pointer = context Rax
;   Rip >= abort  : frame already released; frame pointer = context Rsp
;   otherwise     : context Rax is the entry rsp; reload the pushed GPRs and
;                   the ten saved xmm registers into the CONTEXT
; Then restores rdi/rsi from the home slots, calls RtlVirtualUnwind and
; returns 1 (ExceptionContinueSearch).
;-----------------------------------------------------------------------
ALIGN	16
gcm_se_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rax,QWORD[120+r8]		; context->Rax
	mov	rbx,QWORD[248+r8]		; context->Rip

	mov	rsi,QWORD[8+r9]			; disp->ImageBase
	mov	r11,QWORD[56+r9]		; disp->HandlerData

	mov	r10d,DWORD[r11]			; HandlerData[0]: body label (RVA)
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[152+r8]		; context->Rsp

	mov	r10d,DWORD[4+r11]		; HandlerData[1]: abort label (RVA)
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	mov	rax,QWORD[120+r8]		; inside the body: rax = entry rsp

	mov	r15,QWORD[((-48))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	rbx,QWORD[((-8))+rax]
	mov	QWORD[240+r8],r15
	mov	QWORD[232+r8],r14
	mov	QWORD[224+r8],r13
	mov	QWORD[216+r8],r12
	mov	QWORD[160+r8],rbp
	mov	QWORD[144+r8],rbx

	lea	rsi,[((-216))+rax]		; saved xmm6..xmm15
	lea	rdi,[512+r8]			; context->Xmm6
	mov	ecx,20				; 10 registers * 16 bytes / 8
	DD	0xa548f3fc			; bytes FC F3 48 A5 = cld; rep movsq

$L$common_seh_tail:
	mov	rdi,QWORD[8+rax]		; rdi/rsi from the home slots written
	mov	rsi,QWORD[16+rax]		; by the function prologue
	mov	QWORD[152+r8],rax		; context->Rsp
	mov	QWORD[168+r8],rsi		; context->Rsi
	mov	QWORD[176+r8],rdi		; context->Rdi

	mov	rdi,QWORD[40+r9]		; disp->ContextRecord
	mov	rsi,r8
	mov	ecx,154				; 154 qwords = 1232 bytes copied
	DD	0xa548f3fc			; cld; rep movsq

	mov	rsi,r9
	xor	rcx,rcx				; arg1 = 0
	mov	rdx,QWORD[8+rsi]		; arg2 = disp->ImageBase
	mov	r8,QWORD[rsi]			; arg3 = disp->ControlPc
	mov	r9,QWORD[16+rsi]		; arg4 = disp->FunctionEntry
	mov	r10,QWORD[40+rsi]		; arg5 = disp->ContextRecord
	lea	r11,[56+rsi]			; arg6 = &disp->HandlerData
	lea	r12,[24+rsi]			; arg7 = &disp->EstablisherFrame
	mov	QWORD[32+rsp],r10
	mov	QWORD[40+rsp],r11
	mov	QWORD[48+rsp],r12
	mov	QWORD[56+rsp],rcx		; arg8 = NULL
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1				; ExceptionContinueSearch
	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	DB	0F3h,0C3h		;repret


; Function table entries: begin RVA, end RVA, unwind-info RVA.
section	.pdata rdata align=4
ALIGN	4
	DD	$L$SEH_begin_aesni_gcm_decrypt wrt ..imagebase
	DD	$L$SEH_end_aesni_gcm_decrypt wrt ..imagebase
	DD	$L$SEH_gcm_dec_info wrt ..imagebase

	DD	$L$SEH_begin_aesni_gcm_encrypt wrt ..imagebase
	DD	$L$SEH_end_aesni_gcm_encrypt wrt ..imagebase
	DD	$L$SEH_gcm_enc_info wrt ..imagebase
; Unwind info: a 4-byte header (9,0,0,0), the handler RVA, then the two
; labels that gcm_se_handler reads through HandlerData.
section	.xdata rdata align=8
ALIGN	8
$L$SEH_gcm_dec_info:
DB	9,0,0,0
	DD	gcm_se_handler wrt ..imagebase
	DD	$L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase
$L$SEH_gcm_enc_info:
DB	9,0,0,0
	DD	gcm_se_handler wrt ..imagebase
	DD	$L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase