diff options
Diffstat (limited to 'deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl')
-rw-r--r-- | deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl | 80 |
1 files changed, 61 insertions, 19 deletions
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl index 387e3f854e..afc30c3e72 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl @@ -44,9 +44,8 @@ # See ghash-x86.pl for background information and details about coding # techniques. # -# Special thanks to David Woodhouse <dwmw2@infradead.org> for -# providing access to a Westmere-based system on behalf of Intel -# Open Source Technology Centre. +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # @@ -74,6 +73,7 @@ # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 @@ -86,6 +86,8 @@ # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # +# Knights Landing achieves 1.09 cpb. +# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest $flavour = shift; @@ -236,9 +238,21 @@ $code=<<___; .type gcm_gmult_4bit,\@function,2 .align 16 gcm_gmult_4bit: +.cfi_startproc push %rbx - push %rbp # %rbp and %r12 are pushed exclusively in +.cfi_push %rbx + push %rbp # %rbp and others are pushed exclusively in +.cfi_push %rbp push %r12 # order to reuse Win64 exception handler... 
+.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzb 15($Xi),$Zlo @@ -249,10 +263,15 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - mov 16(%rsp),%rbx - lea 24(%rsp),%rsp + lea 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: ret +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ @@ -266,13 +285,21 @@ $code.=<<___; .type gcm_ghash_4bit,\@function,4 .align 16 gcm_ghash_4bit: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: mov $inp,%r14 # reassign couple of args mov $len,%r15 @@ -400,16 +427,25 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - lea 280(%rsp),%rsi - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + lea 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: ret +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit ___ @@ -469,7 +505,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # @@ -583,7 +619,7 @@ ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); # experimental alternative. special thing about is that there - # no dependency between the two multiplications... + # no dependency between the two multiplications... 
mov \$`0xE1<<1`,%eax mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff mov \$0x07,%r11d @@ -758,7 +794,7 @@ $code.=<<___; movdqa $T2,$T1 # pslldq \$8,$T2 pclmulqdq \$0x00,$Hkey2,$Xln - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # movdqu 0($inp),$T1 @@ -894,7 +930,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # @@ -1648,14 +1684,20 @@ se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue - lea 24(%rax),%rax # adjust "rsp" + lea 48+280(%rax),%rax # adjust "rsp" mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 .Lin_prologue: mov 8(%rax),%rdi |