diff options
Diffstat (limited to 'deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/bn/rsaz-x86_64.asm')
-rw-r--r-- | deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/bn/rsaz-x86_64.asm | 2234 |
1 files changed, 2234 insertions, 0 deletions
diff --git a/deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/bn/rsaz-x86_64.asm b/deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/bn/rsaz-x86_64.asm new file mode 100644 index 0000000000..603a8d17b8 --- /dev/null +++ b/deps/openssl/config/archs/VC-WIN64A/asm_avx2/crypto/bn/rsaz-x86_64.asm @@ -0,0 +1,2234 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN OPENSSL_ia32cap_P + +global rsaz_512_sqr + +ALIGN 32 +rsaz_512_sqr: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_512_sqr: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + sub rsp,128+24 + +$L$sqr_body: + mov rbp,rdx + mov rdx,QWORD[rsi] + mov rax,QWORD[8+rsi] + mov QWORD[128+rsp],rcx + mov r11d,0x80100 + and r11d,DWORD[((OPENSSL_ia32cap_P+8))] + cmp r11d,0x80100 + je NEAR $L$oop_sqrx + jmp NEAR $L$oop_sqr + +ALIGN 32 +$L$oop_sqr: + mov DWORD[((128+8))+rsp],r8d + + mov rbx,rdx + mul rdx + mov r8,rax + mov rax,QWORD[16+rsi] + mov r9,rdx + + mul rbx + add r9,rax + mov rax,QWORD[24+rsi] + mov r10,rdx + adc r10,0 + + mul rbx + add r10,rax + mov rax,QWORD[32+rsi] + mov r11,rdx + adc r11,0 + + mul rbx + add r11,rax + mov rax,QWORD[40+rsi] + mov r12,rdx + adc r12,0 + + mul rbx + add r12,rax + mov rax,QWORD[48+rsi] + mov r13,rdx + adc r13,0 + + mul rbx + add r13,rax + mov rax,QWORD[56+rsi] + mov r14,rdx + adc r14,0 + + mul rbx + add r14,rax + mov rax,rbx + mov r15,rdx + adc r15,0 + + add r8,r8 + mov rcx,r9 + adc r9,r9 + + mul rax + mov QWORD[rsp],rax + add r8,rdx + adc r9,0 + + mov QWORD[8+rsp],r8 + shr rcx,63 + + + mov r8,QWORD[8+rsi] + mov rax,QWORD[16+rsi] + mul r8 + add r10,rax + mov rax,QWORD[24+rsi] + mov rbx,rdx + adc rbx,0 + + mul r8 + add r11,rax + mov rax,QWORD[32+rsi] + adc rdx,0 + add r11,rbx + mov rbx,rdx + adc rbx,0 + + mul r8 + add r12,rax + mov rax,QWORD[40+rsi] + adc rdx,0 + add 
r12,rbx + mov rbx,rdx + adc rbx,0 + + mul r8 + add r13,rax + mov rax,QWORD[48+rsi] + adc rdx,0 + add r13,rbx + mov rbx,rdx + adc rbx,0 + + mul r8 + add r14,rax + mov rax,QWORD[56+rsi] + adc rdx,0 + add r14,rbx + mov rbx,rdx + adc rbx,0 + + mul r8 + add r15,rax + mov rax,r8 + adc rdx,0 + add r15,rbx + mov r8,rdx + mov rdx,r10 + adc r8,0 + + add rdx,rdx + lea r10,[r10*2+rcx] + mov rbx,r11 + adc r11,r11 + + mul rax + add r9,rax + adc r10,rdx + adc r11,0 + + mov QWORD[16+rsp],r9 + mov QWORD[24+rsp],r10 + shr rbx,63 + + + mov r9,QWORD[16+rsi] + mov rax,QWORD[24+rsi] + mul r9 + add r12,rax + mov rax,QWORD[32+rsi] + mov rcx,rdx + adc rcx,0 + + mul r9 + add r13,rax + mov rax,QWORD[40+rsi] + adc rdx,0 + add r13,rcx + mov rcx,rdx + adc rcx,0 + + mul r9 + add r14,rax + mov rax,QWORD[48+rsi] + adc rdx,0 + add r14,rcx + mov rcx,rdx + adc rcx,0 + + mul r9 + mov r10,r12 + lea r12,[r12*2+rbx] + add r15,rax + mov rax,QWORD[56+rsi] + adc rdx,0 + add r15,rcx + mov rcx,rdx + adc rcx,0 + + mul r9 + shr r10,63 + add r8,rax + mov rax,r9 + adc rdx,0 + add r8,rcx + mov r9,rdx + adc r9,0 + + mov rcx,r13 + lea r13,[r13*2+r10] + + mul rax + add r11,rax + adc r12,rdx + adc r13,0 + + mov QWORD[32+rsp],r11 + mov QWORD[40+rsp],r12 + shr rcx,63 + + + mov r10,QWORD[24+rsi] + mov rax,QWORD[32+rsi] + mul r10 + add r14,rax + mov rax,QWORD[40+rsi] + mov rbx,rdx + adc rbx,0 + + mul r10 + add r15,rax + mov rax,QWORD[48+rsi] + adc rdx,0 + add r15,rbx + mov rbx,rdx + adc rbx,0 + + mul r10 + mov r12,r14 + lea r14,[r14*2+rcx] + add r8,rax + mov rax,QWORD[56+rsi] + adc rdx,0 + add r8,rbx + mov rbx,rdx + adc rbx,0 + + mul r10 + shr r12,63 + add r9,rax + mov rax,r10 + adc rdx,0 + add r9,rbx + mov r10,rdx + adc r10,0 + + mov rbx,r15 + lea r15,[r15*2+r12] + + mul rax + add r13,rax + adc r14,rdx + adc r15,0 + + mov QWORD[48+rsp],r13 + mov QWORD[56+rsp],r14 + shr rbx,63 + + + mov r11,QWORD[32+rsi] + mov rax,QWORD[40+rsi] + mul r11 + add r8,rax + mov rax,QWORD[48+rsi] + mov rcx,rdx + adc rcx,0 + + mul r11 + add 
r9,rax + mov rax,QWORD[56+rsi] + adc rdx,0 + mov r12,r8 + lea r8,[r8*2+rbx] + add r9,rcx + mov rcx,rdx + adc rcx,0 + + mul r11 + shr r12,63 + add r10,rax + mov rax,r11 + adc rdx,0 + add r10,rcx + mov r11,rdx + adc r11,0 + + mov rcx,r9 + lea r9,[r9*2+r12] + + mul rax + add r15,rax + adc r8,rdx + adc r9,0 + + mov QWORD[64+rsp],r15 + mov QWORD[72+rsp],r8 + shr rcx,63 + + + mov r12,QWORD[40+rsi] + mov rax,QWORD[48+rsi] + mul r12 + add r10,rax + mov rax,QWORD[56+rsi] + mov rbx,rdx + adc rbx,0 + + mul r12 + add r11,rax + mov rax,r12 + mov r15,r10 + lea r10,[r10*2+rcx] + adc rdx,0 + shr r15,63 + add r11,rbx + mov r12,rdx + adc r12,0 + + mov rbx,r11 + lea r11,[r11*2+r15] + + mul rax + add r9,rax + adc r10,rdx + adc r11,0 + + mov QWORD[80+rsp],r9 + mov QWORD[88+rsp],r10 + + + mov r13,QWORD[48+rsi] + mov rax,QWORD[56+rsi] + mul r13 + add r12,rax + mov rax,r13 + mov r13,rdx + adc r13,0 + + xor r14,r14 + shl rbx,1 + adc r12,r12 + adc r13,r13 + adc r14,r14 + + mul rax + add r11,rax + adc r12,rdx + adc r13,0 + + mov QWORD[96+rsp],r11 + mov QWORD[104+rsp],r12 + + + mov rax,QWORD[56+rsi] + mul rax + add r13,rax + adc rdx,0 + + add r14,rdx + + mov QWORD[112+rsp],r13 + mov QWORD[120+rsp],r14 + + mov r8,QWORD[rsp] + mov r9,QWORD[8+rsp] + mov r10,QWORD[16+rsp] + mov r11,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov r13,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r15,QWORD[56+rsp] + + call __rsaz_512_reduce + + add r8,QWORD[64+rsp] + adc r9,QWORD[72+rsp] + adc r10,QWORD[80+rsp] + adc r11,QWORD[88+rsp] + adc r12,QWORD[96+rsp] + adc r13,QWORD[104+rsp] + adc r14,QWORD[112+rsp] + adc r15,QWORD[120+rsp] + sbb rcx,rcx + + call __rsaz_512_subtract + + mov rdx,r8 + mov rax,r9 + mov r8d,DWORD[((128+8))+rsp] + mov rsi,rdi + + dec r8d + jnz NEAR $L$oop_sqr + jmp NEAR $L$sqr_tail + +ALIGN 32 +$L$oop_sqrx: + mov DWORD[((128+8))+rsp],r8d +DB 102,72,15,110,199 +DB 102,72,15,110,205 + + mulx r9,r8,rax + + mulx r10,rcx,QWORD[16+rsi] + xor rbp,rbp + + mulx r11,rax,QWORD[24+rsi] + adcx r9,rcx + + mulx 
r12,rcx,QWORD[32+rsi] + adcx r10,rax + + mulx r13,rax,QWORD[40+rsi] + adcx r11,rcx + +DB 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcx r12,rax + adcx r13,rcx + +DB 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 + adcx r14,rax + adcx r15,rbp + + mov rcx,r9 + shld r9,r8,1 + shl r8,1 + + xor ebp,ebp + mulx rdx,rax,rdx + adcx r8,rdx + mov rdx,QWORD[8+rsi] + adcx r9,rbp + + mov QWORD[rsp],rax + mov QWORD[8+rsp],r8 + + + mulx rbx,rax,QWORD[16+rsi] + adox r10,rax + adcx r11,rbx + +DB 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 + adox r11,rdi + adcx r12,r8 + + mulx rbx,rax,QWORD[32+rsi] + adox r12,rax + adcx r13,rbx + + mulx r8,rdi,QWORD[40+rsi] + adox r13,rdi + adcx r14,r8 + +DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 + adox r14,rax + adcx r15,rbx + +DB 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 + adox r15,rdi + adcx r8,rbp + adox r8,rbp + + mov rbx,r11 + shld r11,r10,1 + shld r10,rcx,1 + + xor ebp,ebp + mulx rcx,rax,rdx + mov rdx,QWORD[16+rsi] + adcx r9,rax + adcx r10,rcx + adcx r11,rbp + + mov QWORD[16+rsp],r9 +DB 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 + + +DB 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 + adox r12,rdi + adcx r13,r9 + + mulx rcx,rax,QWORD[32+rsi] + adox r13,rax + adcx r14,rcx + + mulx r9,rdi,QWORD[40+rsi] + adox r14,rdi + adcx r15,r9 + +DB 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 + adox r15,rax + adcx r8,rcx + +DB 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 + adox r8,rdi + adcx r9,rbp + adox r9,rbp + + mov rcx,r13 + shld r13,r12,1 + shld r12,rbx,1 + + xor ebp,ebp + mulx rdx,rax,rdx + adcx r11,rax + adcx r12,rdx + mov rdx,QWORD[24+rsi] + adcx r13,rbp + + mov QWORD[32+rsp],r11 +DB 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 + + +DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 + adox r14,rax + adcx r15,rbx + + mulx r10,rdi,QWORD[40+rsi] + adox r15,rdi + adcx r8,r10 + + mulx rbx,rax,QWORD[48+rsi] + adox r8,rax + adcx r9,rbx + + mulx r10,rdi,QWORD[56+rsi] + adox r9,rdi + adcx r10,rbp + adox r10,rbp + +DB 0x66 + mov rbx,r15 + shld r15,r14,1 + 
shld r14,rcx,1 + + xor ebp,ebp + mulx rdx,rax,rdx + adcx r13,rax + adcx r14,rdx + mov rdx,QWORD[32+rsi] + adcx r15,rbp + + mov QWORD[48+rsp],r13 + mov QWORD[56+rsp],r14 + + +DB 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 + adox r8,rdi + adcx r9,r11 + + mulx rcx,rax,QWORD[48+rsi] + adox r9,rax + adcx r10,rcx + + mulx r11,rdi,QWORD[56+rsi] + adox r10,rdi + adcx r11,rbp + adox r11,rbp + + mov rcx,r9 + shld r9,r8,1 + shld r8,rbx,1 + + xor ebp,ebp + mulx rdx,rax,rdx + adcx r15,rax + adcx r8,rdx + mov rdx,QWORD[40+rsi] + adcx r9,rbp + + mov QWORD[64+rsp],r15 + mov QWORD[72+rsp],r8 + + +DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 + adox r10,rax + adcx r11,rbx + +DB 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 + adox r11,rdi + adcx r12,rbp + adox r12,rbp + + mov rbx,r11 + shld r11,r10,1 + shld r10,rcx,1 + + xor ebp,ebp + mulx rdx,rax,rdx + adcx r9,rax + adcx r10,rdx + mov rdx,QWORD[48+rsi] + adcx r11,rbp + + mov QWORD[80+rsp],r9 + mov QWORD[88+rsp],r10 + + +DB 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 + adox r12,rax + adox r13,rbp + + xor r14,r14 + shld r14,r13,1 + shld r13,r12,1 + shld r12,rbx,1 + + xor ebp,ebp + mulx rdx,rax,rdx + adcx r11,rax + adcx r12,rdx + mov rdx,QWORD[56+rsi] + adcx r13,rbp + +DB 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 +DB 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 + + + mulx rdx,rax,rdx + adox r13,rax + adox rdx,rbp + +DB 0x66 + add r14,rdx + + mov QWORD[112+rsp],r13 + mov QWORD[120+rsp],r14 +DB 102,72,15,126,199 +DB 102,72,15,126,205 + + mov rdx,QWORD[128+rsp] + mov r8,QWORD[rsp] + mov r9,QWORD[8+rsp] + mov r10,QWORD[16+rsp] + mov r11,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov r13,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r15,QWORD[56+rsp] + + call __rsaz_512_reducex + + add r8,QWORD[64+rsp] + adc r9,QWORD[72+rsp] + adc r10,QWORD[80+rsp] + adc r11,QWORD[88+rsp] + adc r12,QWORD[96+rsp] + adc r13,QWORD[104+rsp] + adc r14,QWORD[112+rsp] + adc r15,QWORD[120+rsp] + sbb rcx,rcx + + call __rsaz_512_subtract + + mov rdx,r8 + mov rax,r9 + mov 
r8d,DWORD[((128+8))+rsp] + mov rsi,rdi + + dec r8d + jnz NEAR $L$oop_sqrx + +$L$sqr_tail: + + lea rax,[((128+24+48))+rsp] + + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_rsaz_512_sqr: +global rsaz_512_mul + +ALIGN 32 +rsaz_512_mul: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_512_mul: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + sub rsp,128+24 + +$L$mul_body: +DB 102,72,15,110,199 +DB 102,72,15,110,201 + mov QWORD[128+rsp],r8 + mov r11d,0x80100 + and r11d,DWORD[((OPENSSL_ia32cap_P+8))] + cmp r11d,0x80100 + je NEAR $L$mulx + mov rbx,QWORD[rdx] + mov rbp,rdx + call __rsaz_512_mul + +DB 102,72,15,126,199 +DB 102,72,15,126,205 + + mov r8,QWORD[rsp] + mov r9,QWORD[8+rsp] + mov r10,QWORD[16+rsp] + mov r11,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov r13,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r15,QWORD[56+rsp] + + call __rsaz_512_reduce + jmp NEAR $L$mul_tail + +ALIGN 32 +$L$mulx: + mov rbp,rdx + mov rdx,QWORD[rdx] + call __rsaz_512_mulx + +DB 102,72,15,126,199 +DB 102,72,15,126,205 + + mov rdx,QWORD[128+rsp] + mov r8,QWORD[rsp] + mov r9,QWORD[8+rsp] + mov r10,QWORD[16+rsp] + mov r11,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov r13,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r15,QWORD[56+rsp] + + call __rsaz_512_reducex +$L$mul_tail: + add r8,QWORD[64+rsp] + adc r9,QWORD[72+rsp] + adc r10,QWORD[80+rsp] + adc r11,QWORD[88+rsp] + adc r12,QWORD[96+rsp] + adc r13,QWORD[104+rsp] + adc r14,QWORD[112+rsp] + adc r15,QWORD[120+rsp] + sbb rcx,rcx + + call __rsaz_512_subtract + + lea rax,[((128+24+48))+rsp] + + mov r15,QWORD[((-48))+rax] + + 
mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_rsaz_512_mul: +global rsaz_512_mul_gather4 + +ALIGN 32 +rsaz_512_mul_gather4: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_512_mul_gather4: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + sub rsp,328 + + movaps XMMWORD[160+rsp],xmm6 + movaps XMMWORD[176+rsp],xmm7 + movaps XMMWORD[192+rsp],xmm8 + movaps XMMWORD[208+rsp],xmm9 + movaps XMMWORD[224+rsp],xmm10 + movaps XMMWORD[240+rsp],xmm11 + movaps XMMWORD[256+rsp],xmm12 + movaps XMMWORD[272+rsp],xmm13 + movaps XMMWORD[288+rsp],xmm14 + movaps XMMWORD[304+rsp],xmm15 +$L$mul_gather4_body: + movd xmm8,r9d + movdqa xmm1,XMMWORD[(($L$inc+16))] + movdqa xmm0,XMMWORD[$L$inc] + + pshufd xmm8,xmm8,0 + movdqa xmm7,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm8 + movdqa xmm3,xmm7 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm8 + movdqa xmm4,xmm7 + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm8 + movdqa xmm5,xmm7 + paddd xmm4,xmm3 + pcmpeqd xmm3,xmm8 + movdqa xmm6,xmm7 + paddd xmm5,xmm4 + pcmpeqd xmm4,xmm8 + paddd xmm6,xmm5 + pcmpeqd xmm5,xmm8 + paddd xmm7,xmm6 + pcmpeqd xmm6,xmm8 + pcmpeqd xmm7,xmm8 + + movdqa xmm8,XMMWORD[rdx] + movdqa xmm9,XMMWORD[16+rdx] + movdqa xmm10,XMMWORD[32+rdx] + movdqa xmm11,XMMWORD[48+rdx] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD[64+rdx] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD[80+rdx] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD[96+rdx] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD[112+rdx] + lea rbp,[128+rdx] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por 
xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,0x4e + por xmm8,xmm9 + mov r11d,0x80100 + and r11d,DWORD[((OPENSSL_ia32cap_P+8))] + cmp r11d,0x80100 + je NEAR $L$mulx_gather +DB 102,76,15,126,195 + + mov QWORD[128+rsp],r8 + mov QWORD[((128+8))+rsp],rdi + mov QWORD[((128+16))+rsp],rcx + + mov rax,QWORD[rsi] + mov rcx,QWORD[8+rsi] + mul rbx + mov QWORD[rsp],rax + mov rax,rcx + mov r8,rdx + + mul rbx + add r8,rax + mov rax,QWORD[16+rsi] + mov r9,rdx + adc r9,0 + + mul rbx + add r9,rax + mov rax,QWORD[24+rsi] + mov r10,rdx + adc r10,0 + + mul rbx + add r10,rax + mov rax,QWORD[32+rsi] + mov r11,rdx + adc r11,0 + + mul rbx + add r11,rax + mov rax,QWORD[40+rsi] + mov r12,rdx + adc r12,0 + + mul rbx + add r12,rax + mov rax,QWORD[48+rsi] + mov r13,rdx + adc r13,0 + + mul rbx + add r13,rax + mov rax,QWORD[56+rsi] + mov r14,rdx + adc r14,0 + + mul rbx + add r14,rax + mov rax,QWORD[rsi] + mov r15,rdx + adc r15,0 + + lea rdi,[8+rsp] + mov ecx,7 + jmp NEAR $L$oop_mul_gather + +ALIGN 32 +$L$oop_mul_gather: + movdqa xmm8,XMMWORD[rbp] + movdqa xmm9,XMMWORD[16+rbp] + movdqa xmm10,XMMWORD[32+rbp] + movdqa xmm11,XMMWORD[48+rbp] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD[64+rbp] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD[80+rbp] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD[96+rbp] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD[112+rbp] + lea rbp,[128+rbp] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,0x4e + por xmm8,xmm9 +DB 102,76,15,126,195 + + mul rbx + add r8,rax + mov rax,QWORD[8+rsi] + mov QWORD[rdi],r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + add r8,r9 + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rsi] + adc rdx,0 + add r9,r10 + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rsi] + adc rdx,0 + add r10,r11 + 
mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rsi] + adc rdx,0 + add r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rsi] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rsi] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + add r15,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r14,r15 + mov r15,rdx + adc r15,0 + + lea rdi,[8+rdi] + + dec ecx + jnz NEAR $L$oop_mul_gather + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + mov rdi,QWORD[((128+8))+rsp] + mov rbp,QWORD[((128+16))+rsp] + + mov r8,QWORD[rsp] + mov r9,QWORD[8+rsp] + mov r10,QWORD[16+rsp] + mov r11,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov r13,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r15,QWORD[56+rsp] + + call __rsaz_512_reduce + jmp NEAR $L$mul_gather_tail + +ALIGN 32 +$L$mulx_gather: +DB 102,76,15,126,194 + + mov QWORD[128+rsp],r8 + mov QWORD[((128+8))+rsp],rdi + mov QWORD[((128+16))+rsp],rcx + + mulx r8,rbx,QWORD[rsi] + mov QWORD[rsp],rbx + xor edi,edi + + mulx r9,rax,QWORD[8+rsi] + + mulx r10,rbx,QWORD[16+rsi] + adcx r8,rax + + mulx r11,rax,QWORD[24+rsi] + adcx r9,rbx + + mulx r12,rbx,QWORD[32+rsi] + adcx r10,rax + + mulx r13,rax,QWORD[40+rsi] + adcx r11,rbx + + mulx r14,rbx,QWORD[48+rsi] + adcx r12,rax + + mulx r15,rax,QWORD[56+rsi] + adcx r13,rbx + adcx r14,rax +DB 0x67 + mov rbx,r8 + adcx r15,rdi + + mov rcx,-7 + jmp NEAR $L$oop_mulx_gather + +ALIGN 32 +$L$oop_mulx_gather: + movdqa xmm8,XMMWORD[rbp] + movdqa xmm9,XMMWORD[16+rbp] + movdqa xmm10,XMMWORD[32+rbp] + movdqa xmm11,XMMWORD[48+rbp] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD[64+rbp] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD[80+rbp] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD[96+rbp] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD[112+rbp] + lea rbp,[128+rbp] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand 
xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,0x4e + por xmm8,xmm9 +DB 102,76,15,126,194 + +DB 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rsi] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rsi] + adcx r9,rax + adox r10,r11 + +DB 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 + adcx r10,rax + adox r11,r12 + + mulx r12,rax,QWORD[32+rsi] + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rsi] + adcx r12,rax + adox r13,r14 + +DB 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcx r13,rax +DB 0x67 + adox r14,r15 + + mulx r15,rax,QWORD[56+rsi] + mov QWORD[64+rcx*8+rsp],rbx + adcx r14,rax + adox r15,rdi + mov rbx,r8 + adcx r15,rdi + + inc rcx + jnz NEAR $L$oop_mulx_gather + + mov QWORD[64+rsp],r8 + mov QWORD[((64+8))+rsp],r9 + mov QWORD[((64+16))+rsp],r10 + mov QWORD[((64+24))+rsp],r11 + mov QWORD[((64+32))+rsp],r12 + mov QWORD[((64+40))+rsp],r13 + mov QWORD[((64+48))+rsp],r14 + mov QWORD[((64+56))+rsp],r15 + + mov rdx,QWORD[128+rsp] + mov rdi,QWORD[((128+8))+rsp] + mov rbp,QWORD[((128+16))+rsp] + + mov r8,QWORD[rsp] + mov r9,QWORD[8+rsp] + mov r10,QWORD[16+rsp] + mov r11,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov r13,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r15,QWORD[56+rsp] + + call __rsaz_512_reducex + +$L$mul_gather_tail: + add r8,QWORD[64+rsp] + adc r9,QWORD[72+rsp] + adc r10,QWORD[80+rsp] + adc r11,QWORD[88+rsp] + adc r12,QWORD[96+rsp] + adc r13,QWORD[104+rsp] + adc r14,QWORD[112+rsp] + adc r15,QWORD[120+rsp] + sbb rcx,rcx + + call __rsaz_512_subtract + + lea rax,[((128+24+48))+rsp] + movaps xmm6,XMMWORD[((160-200))+rax] + movaps xmm7,XMMWORD[((176-200))+rax] + movaps xmm8,XMMWORD[((192-200))+rax] + movaps xmm9,XMMWORD[((208-200))+rax] + movaps xmm10,XMMWORD[((224-200))+rax] + movaps xmm11,XMMWORD[((240-200))+rax] + movaps xmm12,XMMWORD[((256-200))+rax] + movaps 
xmm13,XMMWORD[((272-200))+rax] + movaps xmm14,XMMWORD[((288-200))+rax] + movaps xmm15,XMMWORD[((304-200))+rax] + lea rax,[176+rax] + + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$mul_gather4_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_rsaz_512_mul_gather4: +global rsaz_512_mul_scatter4 + +ALIGN 32 +rsaz_512_mul_scatter4: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_512_mul_scatter4: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + mov r9d,r9d + sub rsp,128+24 + +$L$mul_scatter4_body: + lea r8,[r9*8+r8] +DB 102,72,15,110,199 +DB 102,72,15,110,202 +DB 102,73,15,110,208 + mov QWORD[128+rsp],rcx + + mov rbp,rdi + mov r11d,0x80100 + and r11d,DWORD[((OPENSSL_ia32cap_P+8))] + cmp r11d,0x80100 + je NEAR $L$mulx_scatter + mov rbx,QWORD[rdi] + call __rsaz_512_mul + +DB 102,72,15,126,199 +DB 102,72,15,126,205 + + mov r8,QWORD[rsp] + mov r9,QWORD[8+rsp] + mov r10,QWORD[16+rsp] + mov r11,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov r13,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r15,QWORD[56+rsp] + + call __rsaz_512_reduce + jmp NEAR $L$mul_scatter_tail + +ALIGN 32 +$L$mulx_scatter: + mov rdx,QWORD[rdi] + call __rsaz_512_mulx + +DB 102,72,15,126,199 +DB 102,72,15,126,205 + + mov rdx,QWORD[128+rsp] + mov r8,QWORD[rsp] + mov r9,QWORD[8+rsp] + mov r10,QWORD[16+rsp] + mov r11,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov r13,QWORD[40+rsp] + mov r14,QWORD[48+rsp] + mov r15,QWORD[56+rsp] + + call __rsaz_512_reducex + +$L$mul_scatter_tail: + add r8,QWORD[64+rsp] + adc r9,QWORD[72+rsp] + adc r10,QWORD[80+rsp] + adc r11,QWORD[88+rsp] + adc r12,QWORD[96+rsp] + adc 
r13,QWORD[104+rsp] + adc r14,QWORD[112+rsp] + adc r15,QWORD[120+rsp] +DB 102,72,15,126,214 + sbb rcx,rcx + + call __rsaz_512_subtract + + mov QWORD[rsi],r8 + mov QWORD[128+rsi],r9 + mov QWORD[256+rsi],r10 + mov QWORD[384+rsi],r11 + mov QWORD[512+rsi],r12 + mov QWORD[640+rsi],r13 + mov QWORD[768+rsi],r14 + mov QWORD[896+rsi],r15 + + lea rax,[((128+24+48))+rsp] + + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$mul_scatter4_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_rsaz_512_mul_scatter4: +global rsaz_512_mul_by_one + +ALIGN 32 +rsaz_512_mul_by_one: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_512_mul_by_one: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + sub rsp,128+24 + +$L$mul_by_one_body: + mov eax,DWORD[((OPENSSL_ia32cap_P+8))] + mov rbp,rdx + mov QWORD[128+rsp],rcx + + mov r8,QWORD[rsi] + pxor xmm0,xmm0 + mov r9,QWORD[8+rsi] + mov r10,QWORD[16+rsi] + mov r11,QWORD[24+rsi] + mov r12,QWORD[32+rsi] + mov r13,QWORD[40+rsi] + mov r14,QWORD[48+rsi] + mov r15,QWORD[56+rsi] + + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm0 + movdqa XMMWORD[32+rsp],xmm0 + movdqa XMMWORD[48+rsp],xmm0 + movdqa XMMWORD[64+rsp],xmm0 + movdqa XMMWORD[80+rsp],xmm0 + movdqa XMMWORD[96+rsp],xmm0 + and eax,0x80100 + cmp eax,0x80100 + je NEAR $L$by_one_callx + call __rsaz_512_reduce + jmp NEAR $L$by_one_tail +ALIGN 32 +$L$by_one_callx: + mov rdx,QWORD[128+rsp] + call __rsaz_512_reducex +$L$by_one_tail: + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + lea rax,[((128+24+48))+rsp] + + mov 
r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$mul_by_one_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_rsaz_512_mul_by_one: + +ALIGN 32 +__rsaz_512_reduce: + mov rbx,r8 + imul rbx,QWORD[((128+8))+rsp] + mov rax,QWORD[rbp] + mov ecx,8 + jmp NEAR $L$reduction_loop + +ALIGN 32 +$L$reduction_loop: + mul rbx + mov rax,QWORD[8+rbp] + neg r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + add r10,r11 + mov rsi,QWORD[((128+8))+rsp] + + + adc rdx,0 + mov r11,rdx + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + imul rsi,r8 + add r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,rsi + add r15,rax + mov rax,QWORD[rbp] + adc rdx,0 + add r14,r15 + mov r15,rdx + adc r15,0 + + dec ecx + jne NEAR $L$reduction_loop + + DB 0F3h,0C3h ;repret + + +ALIGN 32 +__rsaz_512_reducex: + + imul rdx,r8 + xor rsi,rsi + mov ecx,8 + jmp NEAR $L$reduction_loopx + +ALIGN 32 +$L$reduction_loopx: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rax,rbx + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rbx,QWORD[16+rbp] + adcx r9,rbx + adox r10,r11 + + mulx r11,rbx,QWORD[24+rbp] + adcx r10,rbx + adox r11,r12 + +DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + mov rax,rdx + mov rdx,r8 + adcx r11,rbx + adox r12,r13 + + mulx rdx,rbx,QWORD[((128+8))+rsp] + mov rdx,rax + + mulx r13,rax,QWORD[40+rbp] + adcx 
r12,rax + adox r13,r14 + +DB 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,rbx + adcx r14,rax + adox r15,rsi + adcx r15,rsi + + dec ecx + jne NEAR $L$reduction_loopx + + DB 0F3h,0C3h ;repret + + +ALIGN 32 +__rsaz_512_subtract: + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + mov r8,QWORD[rbp] + mov r9,QWORD[8+rbp] + neg r8 + not r9 + and r8,rcx + mov r10,QWORD[16+rbp] + and r9,rcx + not r10 + mov r11,QWORD[24+rbp] + and r10,rcx + not r11 + mov r12,QWORD[32+rbp] + and r11,rcx + not r12 + mov r13,QWORD[40+rbp] + and r12,rcx + not r13 + mov r14,QWORD[48+rbp] + and r13,rcx + not r14 + mov r15,QWORD[56+rbp] + and r14,rcx + not r15 + and r15,rcx + + add r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + DB 0F3h,0C3h ;repret + + +ALIGN 32 +__rsaz_512_mul: + lea rdi,[8+rsp] + + mov rax,QWORD[rsi] + mul rbx + mov QWORD[rdi],rax + mov rax,QWORD[8+rsi] + mov r8,rdx + + mul rbx + add r8,rax + mov rax,QWORD[16+rsi] + mov r9,rdx + adc r9,0 + + mul rbx + add r9,rax + mov rax,QWORD[24+rsi] + mov r10,rdx + adc r10,0 + + mul rbx + add r10,rax + mov rax,QWORD[32+rsi] + mov r11,rdx + adc r11,0 + + mul rbx + add r11,rax + mov rax,QWORD[40+rsi] + mov r12,rdx + adc r12,0 + + mul rbx + add r12,rax + mov rax,QWORD[48+rsi] + mov r13,rdx + adc r13,0 + + mul rbx + add r13,rax + mov rax,QWORD[56+rsi] + mov r14,rdx + adc r14,0 + + mul rbx + add r14,rax + mov rax,QWORD[rsi] + mov r15,rdx + adc r15,0 + + lea rbp,[8+rbp] + lea rdi,[8+rdi] + + mov ecx,7 + jmp NEAR $L$oop_mul + 
+ALIGN	32
+; Inner loop of a routine whose entry point lies above this excerpt.
+; Each pass loads one 64-bit word from [rbp], multiplies it by the eight
+; words at [rsi]..[56+rsi], folds the products into the r8..r15 accumulators,
+; stores the completed low word to [rdi] and advances rbp and rdi by 8.
+; NOTE(review): the initial rax, r8..r15 and ecx are set up by code that is
+; not visible here -- confirm against the routine's entry.
+$L$oop_mul:
+	mov	rbx,QWORD[rbp]			; rbx = current multiplier word
+	mul	rbx
+	add	r8,rax
+	mov	rax,QWORD[8+rsi]
+	mov	QWORD[rdi],r8			; lowest accumulated word is final: write it out
+	mov	r8,rdx
+	adc	r8,0
+
+	mul	rbx
+	add	r9,rax
+	mov	rax,QWORD[16+rsi]
+	adc	rdx,0
+	add	r8,r9				; accumulators slide down one word per pass
+	mov	r9,rdx
+	adc	r9,0
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[24+rsi]
+	adc	rdx,0
+	add	r9,r10
+	mov	r10,rdx
+	adc	r10,0
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[32+rsi]
+	adc	rdx,0
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+
+	mul	rbx
+	add	r12,rax
+	mov	rax,QWORD[40+rsi]
+	adc	rdx,0
+	add	r11,r12
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	rbx
+	add	r13,rax
+	mov	rax,QWORD[48+rsi]
+	adc	rdx,0
+	add	r12,r13
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	rbx
+	add	r14,rax
+	mov	rax,QWORD[56+rsi]
+	adc	rdx,0
+	add	r13,r14
+	mov	r14,rdx
+	lea	rbp,[8+rbp]			; next multiplier word; lea leaves CF intact for the adc below
+	adc	r14,0
+
+	mul	rbx
+	add	r15,rax
+	mov	rax,QWORD[rsi]			; reload word 0 for the next pass
+	adc	rdx,0
+	add	r14,r15
+	mov	r15,rdx
+	adc	r15,0
+
+	lea	rdi,[8+rdi]			; advance the output pointer
+
+	dec	ecx
+	jnz	NEAR $L$oop_mul
+
+	mov	QWORD[rdi],r8			; flush the eight remaining accumulator words
+	mov	QWORD[8+rdi],r9
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+	mov	QWORD[32+rdi],r12
+	mov	QWORD[40+rdi],r13
+	mov	QWORD[48+rdi],r14
+	mov	QWORD[56+rdi],r15
+
+	DB	0F3h,0C3h		;repret
+
+
+ALIGN	32
+;-----------------------------------------------------------------------
+; __rsaz_512_mulx -- 8x8-word multiply built on MULX/ADCX/ADOX.
+; In:    rsi = pointer to eight 64-bit words (read at [rsi]..[56+rsi])
+;        rbp = pointer to the other operand; words at [8+rbp]..[56+rbp]
+;              are loaded here
+;        rdx = first multiplier word (MULX uses rdx implicitly)
+;              -- presumably [rbp] loaded by the caller; confirm
+; Out:   sixteen 64-bit words written to [8+rsp]..[128+rsp]
+;        (offsets are relative to rsp inside this routine)
+; Clobb: rax, rbx, rcx, rdx, rdi, r8-r15, flags
+; NOTE(review): the first adc below consumes CF as it stands on entry and
+; nothing in this routine clears it first -- presumably every caller
+; arrives with CF = 0; verify at the call sites.
+;-----------------------------------------------------------------------
+__rsaz_512_mulx:
+	mulx	r8,rbx,QWORD[rsi]
+	mov	rcx,-6				; loop counter: runs -6..-1, six passes
+
+	mulx	r9,rax,QWORD[8+rsi]
+	mov	QWORD[8+rsp],rbx		; result word 0
+
+	mulx	r10,rbx,QWORD[16+rsi]
+	adc	r8,rax
+
+	mulx	r11,rax,QWORD[24+rsi]
+	adc	r9,rbx
+
+	mulx	r12,rbx,QWORD[32+rsi]
+	adc	r10,rax
+
+	mulx	r13,rax,QWORD[40+rsi]
+	adc	r11,rbx
+
+	mulx	r14,rbx,QWORD[48+rsi]
+	adc	r12,rax
+
+	mulx	r15,rax,QWORD[56+rsi]
+	mov	rdx,QWORD[8+rbp]		; next multiplier word
+	adc	r13,rbx
+	adc	r14,rax
+	adc	r15,0
+
+	xor	rdi,rdi				; rdi = 0 for the carry folds; also clears CF and OF
+	jmp	NEAR $L$oop_mulx
+
+ALIGN	32
+; One pass per multiplier word: adcx carries the low halves through CF while
+; adox carries the previous accumulators through OF, so the two chains do not
+; disturb each other.
+$L$oop_mulx:
+	mov	rbx,r8
+	mulx	r8,rax,QWORD[rsi]
+	adcx	rbx,rax
+	adox	r8,r9
+
+	mulx	r9,rax,QWORD[8+rsi]
+	adcx	r8,rax
+	adox	r9,r10
+
+	mulx	r10,rax,QWORD[16+rsi]
+	adcx	r9,rax
+	adox	r10,r11
+
+	mulx	r11,rax,QWORD[24+rsi]
+	adcx	r10,rax
+	adox	r11,r12
+
+; The bytes below decode to mulx r12,rax,QWORD[32+rsi] with a 0x3e prefix and
+; a 32-bit displacement, i.e. a deliberately longer encoding of that mulx.
+DB	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
+	adcx	r11,rax
+	adox	r12,r13
+
+	mulx	r13,rax,QWORD[40+rsi]
+	adcx	r12,rax
+	adox	r13,r14
+
+	mulx	r14,rax,QWORD[48+rsi]
+	adcx	r13,rax
+	adox	r14,r15
+
+	mulx	r15,rax,QWORD[56+rsi]
+	mov	rdx,QWORD[64+rcx*8+rbp]		; multiplier for the next pass: [16+rbp]..[56+rbp]
+	mov	QWORD[((8+64-8))+rcx*8+rsp],rbx	; finished word goes to [16+rsp]..[56+rsp]
+	adcx	r14,rax
+	adox	r15,rdi				; fold the OF chain into the top word
+	adcx	r15,rdi				; fold the CF chain into the top word
+
+	inc	rcx
+	jnz	NEAR $L$oop_mulx
+
+; Final pass (multiplier loaded from [56+rbp]): same arithmetic as the loop
+; body, but no further multiplier is fetched.
+	mov	rbx,r8
+	mulx	r8,rax,QWORD[rsi]
+	adcx	rbx,rax
+	adox	r8,r9
+
+; decodes to mulx r9,rax,QWORD[8+rsi] (32-bit displacement form)
+DB	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
+	adcx	r8,rax
+	adox	r9,r10
+
+; decodes to mulx r10,rax,QWORD[16+rsi] (32-bit displacement form)
+DB	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
+	adcx	r9,rax
+	adox	r10,r11
+
+	mulx	r11,rax,QWORD[24+rsi]
+	adcx	r10,rax
+	adox	r11,r12
+
+	mulx	r12,rax,QWORD[32+rsi]
+	adcx	r11,rax
+	adox	r12,r13
+
+	mulx	r13,rax,QWORD[40+rsi]
+	adcx	r12,rax
+	adox	r13,r14
+
+; decodes to mulx r14,rax,QWORD[48+rsi] (32-bit displacement form)
+DB	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
+	adcx	r13,rax
+	adox	r14,r15
+
+; decodes to mulx r15,rax,QWORD[56+rsi] (32-bit displacement form)
+DB	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
+	adcx	r14,rax
+	adox	r15,rdi
+	adcx	r15,rdi
+
+	mov	QWORD[((8+64-8))+rsp],rbx	; result word 7
+	mov	QWORD[((8+64))+rsp],r8		; result words 8..15
+	mov	QWORD[((8+64+8))+rsp],r9
+	mov	QWORD[((8+64+16))+rsp],r10
+	mov	QWORD[((8+64+24))+rsp],r11
+	mov	QWORD[((8+64+32))+rsp],r12
+	mov	QWORD[((8+64+40))+rsp],r13
+	mov	QWORD[((8+64+48))+rsp],r14
+	mov	QWORD[((8+64+56))+rsp],r15
+
+	DB	0F3h,0C3h		;repret
+
+global	rsaz_512_scatter4
+
+ALIGN	16
+;-----------------------------------------------------------------------
+; rsaz_512_scatter4 -- store eight 64-bit words into one column of a table.
+; In:    rcx = table base, rdx = source (8 qwords), r8 = column index
+; Effect: for i in 0..7: [rcx + r8*8 + i*128] = [rdx + i*8]
+; Clobb: rax, rcx, rdx, r9, flags
+; The 128-byte row stride leaves room for sixteen 8-byte columns per row.
+;-----------------------------------------------------------------------
+rsaz_512_scatter4:
+	lea	rcx,[r8*8+rcx]			; rcx = address of the selected column in row 0
+	mov	r9d,8				; eight words to store
+	jmp	NEAR $L$oop_scatter
+ALIGN	16
+$L$oop_scatter:
+	mov	rax,QWORD[rdx]
+	lea	rdx,[8+rdx]
+	mov	QWORD[rcx],rax
+	lea	rcx,[128+rcx]			; step to the same column of the next row
+	dec	r9d
+	jnz	NEAR $L$oop_scatter
+	DB	0F3h,0C3h		;repret
+
+
+global	rsaz_512_gather4
+
+ALIGN	16
+;-----------------------------------------------------------------------
+; rsaz_512_gather4 -- read one column of the table written by scatter4.
+; In:    rcx = destination (8 qwords), rdx = table base, r8d = column index
+; Every row is loaded in full (all 128 bytes) and the wanted qword is picked
+; with compare-generated masks, so the addresses touched do not depend on
+; the index.
+; Uses xmm0-xmm15; xmm6-xmm15 are saved in a 0xa8-byte frame and restored
+; before returning.
+; The prologue is emitted as raw bytes; the offsets recorded in
+; $L$SEH_info_rsaz_512_gather4 refer to these exact instruction lengths.
+;-----------------------------------------------------------------------
+rsaz_512_gather4:
+$L$SEH_begin_rsaz_512_gather4:
+DB	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	; sub rsp,0xa8
+DB	0x0f,0x29,0x34,0x24			; movaps [rsp],xmm6
+DB	0x0f,0x29,0x7c,0x24,0x10		; movaps [0x10+rsp],xmm7
+DB	0x44,0x0f,0x29,0x44,0x24,0x20		; movaps [0x20+rsp],xmm8
+DB	0x44,0x0f,0x29,0x4c,0x24,0x30		; movaps [0x30+rsp],xmm9
+DB	0x44,0x0f,0x29,0x54,0x24,0x40		; movaps [0x40+rsp],xmm10
+DB	0x44,0x0f,0x29,0x5c,0x24,0x50		; movaps [0x50+rsp],xmm11
+DB	0x44,0x0f,0x29,0x64,0x24,0x60		; movaps [0x60+rsp],xmm12
+DB	0x44,0x0f,0x29,0x6c,0x24,0x70		; movaps [0x70+rsp],xmm13
+DB	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	; movaps [0x80+rsp],xmm14
+DB	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	; movaps [0x90+rsp],xmm15
+	movd	xmm8,r8d
+	movdqa	xmm1,XMMWORD[(($L$inc+16))]	; xmm1 = {2,2,2,2}
+	movdqa	xmm0,XMMWORD[$L$inc]		; xmm0 = {0,0,1,1}
+
+	pshufd	xmm8,xmm8,0			; broadcast the index to all four dwords
+; Build sixteen selection masks, two per register: xmmN starts as the pair
+; {2N,2N,2N+1,2N+1} and is then compared with the broadcast index, leaving
+; all-ones in the 64-bit half whose number equals the index.
+	movdqa	xmm7,xmm1
+	movdqa	xmm2,xmm1
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm8
+	movdqa	xmm3,xmm7
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm8
+	movdqa	xmm4,xmm7
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm8
+	movdqa	xmm5,xmm7
+	paddd	xmm4,xmm3
+	pcmpeqd	xmm3,xmm8
+	movdqa	xmm6,xmm7
+	paddd	xmm5,xmm4
+	pcmpeqd	xmm4,xmm8
+	paddd	xmm6,xmm5
+	pcmpeqd	xmm5,xmm8
+	paddd	xmm7,xmm6
+	pcmpeqd	xmm6,xmm8
+	pcmpeqd	xmm7,xmm8
+	mov	r9d,8				; eight rows to process
+	jmp	NEAR $L$oop_gather
+ALIGN	16
+$L$oop_gather:
+	movdqa	xmm8,XMMWORD[rdx]		; load the whole 128-byte row
+	movdqa	xmm9,XMMWORD[16+rdx]
+	movdqa	xmm10,XMMWORD[32+rdx]
+	movdqa	xmm11,XMMWORD[48+rdx]
+	pand	xmm8,xmm0			; keep only the selected column
+	movdqa	xmm12,XMMWORD[64+rdx]
+	pand	xmm9,xmm1
+	movdqa	xmm13,XMMWORD[80+rdx]
+	pand	xmm10,xmm2
+	movdqa	xmm14,XMMWORD[96+rdx]
+	pand	xmm11,xmm3
+	movdqa	xmm15,XMMWORD[112+rdx]
+	lea	rdx,[128+rdx]			; next row
+	pand	xmm12,xmm4
+	pand	xmm13,xmm5
+	pand	xmm14,xmm6
+	pand	xmm15,xmm7
+	por	xmm8,xmm10			; OR-reduce the eight masked registers
+	por	xmm9,xmm11
+	por	xmm8,xmm12
+	por	xmm9,xmm13
+	por	xmm8,xmm14
+	por	xmm9,xmm15
+
+	por	xmm8,xmm9
+	pshufd	xmm9,xmm8,0x4e			; swap the 64-bit halves
+	por	xmm8,xmm9			; low qword now holds the selected value
+	movq	QWORD[rcx],xmm8
+	lea	rcx,[8+rcx]
+	dec	r9d
+	jnz	NEAR $L$oop_gather
+	movaps	xmm6,XMMWORD[rsp]		; restore xmm6-xmm15 from the frame
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	add	rsp,0xa8
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_rsaz_512_gather4:
+
+
+ALIGN	64
+; Seed constants for the gather4 masks: index pair {0,1} and the step {2,2}
+; (each index occupies two dwords, i.e. one 64-bit lane).
+$L$inc:
+	DD	0,0,1,1
+	DD	2,2,2,2
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+;-----------------------------------------------------------------------
+; se_handler -- exception handler referenced by the .xdata records below.
+; In:    r8 = context record, r9 = dispatcher context
+;        (rcx and rdx are not read before being overwritten)
+; Out:   eax = 1
+; It works out where the interrupted function's saved registers live, writes
+; them into the context record, copies that record to the buffer at [40+r9]
+; and calls RtlVirtualUnwind.
+; NOTE(review): structure offsets are used as raw numbers; the field names
+; in the comments assume the Win64 CONTEXT / DISPATCHER_CONTEXT layouts
+; (120 Rax, 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi, 176 Rdi, 216-240 R12-R15,
+; 248 Rip, 512 Xmm6) -- verify against winnt.h.
+;-----------------------------------------------------------------------
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64				; outgoing-argument area for the call below
+
+	mov	rax,QWORD[120+r8]		; presumably context->Rax
+	mov	rbx,QWORD[248+r8]		; presumably context->Rip
+
+	mov	rsi,QWORD[8+r9]			; presumably disp->ImageBase
+	mov	r11,QWORD[56+r9]		; presumably disp->HandlerData (two RVAs from .xdata)
+
+	mov	r10d,DWORD[r11]			; first RVA: the function's *_body label
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail		; fault before the body label: use rax as loaded above
+
+	mov	rax,QWORD[152+r8]		; presumably context->Rsp
+
+	mov	r10d,DWORD[4+r11]		; second RVA: the function's *_epilogue label
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail		; fault at or past the epilogue label
+
+	lea	rax,[((128+24+48))+rax]		; step over 128+24 bytes of locals plus six 8-byte pushes
+
+	lea	rbx,[$L$mul_gather4_epilogue]
+	cmp	rbx,r10				; is the faulting function rsaz_512_mul_gather4?
+	jne	NEAR $L$se_not_in_mul_gather4
+
+	lea	rax,[176+rax]			; that function's frame is 176 bytes larger
+
+	lea	rsi,[((-48-168))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20				; 20 qwords = 160 bytes, presumably the ten saved xmm registers
+	DD	0xa548f3fc			; bytes fc f3 48 a5: cld; rep movsq
+
+$L$se_not_in_mul_gather4:
+	mov	rbx,QWORD[((-8))+rax]		; reload the six pushed registers
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx		; ...and write them into the context record
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]		; rdi/rsi as stored at [8+rsp]/[16+rsp] by the WIN64 prologue
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]		; presumably disp->ContextRecord
+	mov	rsi,r8
+	mov	ecx,154				; 154 qwords = 1232 bytes copied from the context record
+	DD	0xa548f3fc			; bytes fc f3 48 a5: cld; rep movsq
+
+	mov	rsi,r9
+	xor	rcx,rcx				; first argument = 0
+	mov	rdx,QWORD[8+rsi]		; second argument = [8+disp]
+	mov	r8,QWORD[rsi]			; third argument  = [disp]
+	mov	r9,QWORD[16+rsi]		; fourth argument = [16+disp]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10		; stack arguments follow the 32-byte shadow area
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1				; presumably ExceptionContinueSearch -- confirm
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+; Function table: one (begin, end, unwind-info) triple of image-relative
+; addresses per exported routine.
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_rsaz_512_sqr wrt ..imagebase
+	DD	$L$SEH_end_rsaz_512_sqr wrt ..imagebase
+	DD	$L$SEH_info_rsaz_512_sqr wrt ..imagebase
+
+	DD	$L$SEH_begin_rsaz_512_mul wrt ..imagebase
+	DD	$L$SEH_end_rsaz_512_mul wrt ..imagebase
+	DD	$L$SEH_info_rsaz_512_mul wrt ..imagebase
+
+	DD	$L$SEH_begin_rsaz_512_mul_gather4 wrt ..imagebase
+	DD	$L$SEH_end_rsaz_512_mul_gather4 wrt ..imagebase
+	DD	$L$SEH_info_rsaz_512_mul_gather4 wrt ..imagebase
+
+	DD	$L$SEH_begin_rsaz_512_mul_scatter4 wrt ..imagebase
+	DD	$L$SEH_end_rsaz_512_mul_scatter4 wrt ..imagebase
+	DD	$L$SEH_info_rsaz_512_mul_scatter4 wrt ..imagebase
+
+	DD	$L$SEH_begin_rsaz_512_mul_by_one wrt ..imagebase
+	DD	$L$SEH_end_rsaz_512_mul_by_one wrt ..imagebase
+	DD	$L$SEH_info_rsaz_512_mul_by_one wrt ..imagebase
+
+	DD	$L$SEH_begin_rsaz_512_gather4 wrt ..imagebase
+	DD	$L$SEH_end_rsaz_512_gather4 wrt ..imagebase
+	DD	$L$SEH_info_rsaz_512_gather4 wrt ..imagebase
+
+; Unwind records.  The first five name se_handler and carry the two labels
+; (body, epilogue) that se_handler reads as its handler data.  The leading
+; byte 9 is presumably version 1 with the exception-handler flag -- confirm
+; against the UNWIND_INFO definition.
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_rsaz_512_sqr:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase
+$L$SEH_info_rsaz_512_mul:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+$L$SEH_info_rsaz_512_mul_gather4:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$mul_gather4_body wrt ..imagebase,$L$mul_gather4_epilogue wrt ..imagebase
+$L$SEH_info_rsaz_512_mul_scatter4:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$mul_scatter4_body wrt ..imagebase,$L$mul_scatter4_epilogue wrt ..imagebase
+$L$SEH_info_rsaz_512_mul_by_one:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$mul_by_one_body wrt ..imagebase,$L$mul_by_one_epilogue wrt ..imagebase
+; gather4 uses table-driven unwinding instead of se_handler.  The first byte
+; of each entry is the prologue offset just past the matching DB-encoded
+; instruction in rsaz_512_gather4 (0x07 after the sub, 0x0b after the xmm6
+; save, ... 0x46 after the xmm15 save); the remaining bytes presumably
+; encode "save xmmN at slot k*16" and "allocate 0x15*8 bytes" -- confirm
+; against the UNWIND_CODE definition.
+$L$SEH_info_rsaz_512_gather4:
+DB	0x01,0x46,0x16,0x00
+DB	0x46,0xf8,0x09,0x00
+DB	0x3d,0xe8,0x08,0x00
+DB	0x34,0xd8,0x07,0x00
+DB	0x2e,0xc8,0x06,0x00
+DB	0x28,0xb8,0x05,0x00
+DB	0x22,0xa8,0x04,0x00
+DB	0x1c,0x98,0x03,0x00
+DB	0x16,0x88,0x02,0x00
+DB	0x10,0x78,0x01,0x00
+DB	0x0b,0x68,0x00,0x00
+DB	0x07,0x01,0x15,0x00 |