summaryrefslogtreecommitdiff
path: root/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
diff options
context:
space:
mode:
Diffstat (limited to 'deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl')
-rw-r--r--deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl80
1 files changed, 61 insertions, 19 deletions
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
index 387e3f854e..afc30c3e72 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -44,9 +44,8 @@
# See ghash-x86.pl for background information and details about coding
# techniques.
#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
# December 2012
#
@@ -74,6 +73,7 @@
# Skylake 0.44(+110%)(if system doesn't support AVX)
# Bulldozer 1.49(+27%)
# Silvermont 2.88(+13%)
+# Knights L 2.12(-) (if system doesn't support AVX)
# Goldmont 1.08(+24%)
# March 2013
@@ -86,6 +86,8 @@
# it performs in 0.41 cycles per byte on Haswell processor, in
# 0.29 on Broadwell, and in 0.36 on Skylake.
#
+# Knights Landing achieves 1.09 cpb.
+#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
$flavour = shift;
@@ -236,9 +238,21 @@ $code=<<___;
.type gcm_gmult_4bit,\@function,2
.align 16
gcm_gmult_4bit:
+.cfi_startproc
push %rbx
- push %rbp # %rbp and %r12 are pushed exclusively in
+.cfi_push %rbx
+ push %rbp # %rbp and others are pushed exclusively in
+.cfi_push %rbp
push %r12 # order to reuse Win64 exception handler...
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
.Lgmult_prologue:
movzb 15($Xi),$Zlo
@@ -249,10 +263,15 @@ $code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- mov 16(%rsp),%rbx
- lea 24(%rsp),%rsp
+ lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lgmult_epilogue:
ret
+.cfi_endproc
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
@@ -266,13 +285,21 @@ $code.=<<___;
.type gcm_ghash_4bit,\@function,4
.align 16
gcm_ghash_4bit:
+.cfi_startproc
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
.Lghash_prologue:
mov $inp,%r14 # reassign couple of args
mov $len,%r15
@@ -400,16 +427,25 @@ $code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- lea 280(%rsp),%rsi
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea 0(%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lghash_epilogue:
ret
+.cfi_endproc
.size gcm_ghash_4bit,.-gcm_ghash_4bit
___
@@ -469,7 +505,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
@@ -583,7 +619,7 @@ ___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about is that there
- # no dependency between the two multiplications...
+ # no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
mov \$0xA040608020C0E000,%r10 # ((7..0)ยท0xE0)&0xff
mov \$0x07,%r11d
@@ -758,7 +794,7 @@ $code.=<<___;
movdqa $T2,$T1 #
pslldq \$8,$T2
pclmulqdq \$0x00,$Hkey2,$Xln
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
movdqu 0($inp),$T1
@@ -894,7 +930,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
@@ -1648,14 +1684,20 @@ se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
- lea 24(%rax),%rax # adjust "rsp"
+ lea 48+280(%rax),%rax # adjust "rsp"
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov 8(%rax),%rdi