summaryrefslogtreecommitdiff
path: root/deps/openssl/openssl/crypto/modes/asm
diff options
context:
space:
mode:
Diffstat (limited to 'deps/openssl/openssl/crypto/modes/asm')
-rw-r--r--deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl75
-rw-r--r--deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl4
-rwxr-xr-xdeps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl2
-rw-r--r--deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl16
-rw-r--r--deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl14
-rw-r--r--deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl15
-rw-r--r--deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl80
-rwxr-xr-xdeps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl5
-rw-r--r--deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl379
9 files changed, 494 insertions, 96 deletions
diff --git a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
index 5ad62b3979..b42016101e 100644
--- a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -35,6 +35,8 @@
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -116,23 +118,6 @@ _aesni_ctr32_ghash_6x:
vpxor $rndkey,$inout3,$inout3
vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
-
- # At this point, the current block of 96 (0x60) bytes has already been
- # loaded into registers. Concurrently with processing it, we want to
- # load the next 96 bytes of input for the next round. Obviously, we can
- # only do this if there are at least 96 more bytes of input beyond the
- # input we're currently processing, or else we'd read past the end of
- # the input buffer. Here, we set |%r12| to 96 if there are at least 96
- # bytes of input beyond the 96 bytes we're already processing, and we
- # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
- # we'll read in the next block so that it is in registers for the next
- # loop iteration. In the case where we set |%r12| to 0, we'll re-read
- # the current block and then ignore what we re-read.
- #
- # At this point, |$in0| points to the current (already read into
- # registers) block, and |$end0| points to 2*96 bytes before the end of
- # the input. Thus, |$in0| > |$end0| means that we do not have the next
- # 96-byte block to read in, and |$in0| <= |$end0| means we do.
xor %r12,%r12
cmp $in0,$end0
@@ -424,20 +409,25 @@ $code.=<<___;
.type aesni_gcm_decrypt,\@function,6
.align 32
aesni_gcm_decrypt:
+.cfi_startproc
xor $ret,$ret
-
- # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
- # bytes of input.
cmp \$0x60,$len # minimal accepted length
jb .Lgcm_dec_abort
lea (%rsp),%rax # save stack pointer
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -482,15 +472,7 @@ $code.=<<___;
vmovdqu 0x50($inp),$Z3 # I[5]
lea ($inp),$in0
vmovdqu 0x40($inp),$Z0
-
- # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
- # bytes before the end of the input. Note, in particular, that this is
- # correct even if |$len| is not an even multiple of 96 or 16. XXX: This
- # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
- # not be near the very beginning of the address space when |$len| < 2*96
- # (0xc0).
lea -0xc0($inp,$len),$end0
-
vmovdqu 0x30($inp),$Z1
shr \$4,$len
xor $ret,$ret
@@ -537,15 +519,23 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -48(%rax),%r15
+.cfi_restore %r15
mov -40(%rax),%r14
+.cfi_restore %r14
mov -32(%rax),%r13
+.cfi_restore %r13
mov -24(%rax),%r12
+.cfi_restore %r12
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
mov $ret,%rax # return value
ret
+.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
@@ -645,21 +635,25 @@ _aesni_ctr32_6x:
.type aesni_gcm_encrypt,\@function,6
.align 32
aesni_gcm_encrypt:
+.cfi_startproc
xor $ret,$ret
-
- # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
- # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
- # least 96 more bytes of input.
cmp \$0x60*3,$len # minimal accepted length
jb .Lgcm_enc_abort
lea (%rsp),%rax # save stack pointer
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -699,16 +693,7 @@ $code.=<<___;
.Lenc_no_key_aliasing:
lea ($out),$in0
-
- # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
- # bytes before the end of the input. Note, in particular, that this is
- # correct even if |$len| is not an even multiple of 96 or 16. Unlike in
- # the decryption case, there's no caveat that |$out| must not be near
- # the very beginning of the address space, because we know that
- # |$len| >= 3*96 from the check above, and so we know
- # |$out| + |$len| >= 2*96 (0xc0).
lea -0xc0($out,$len),$end0
-
shr \$4,$len
call _aesni_ctr32_6x
@@ -931,15 +916,23 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -48(%rax),%r15
+.cfi_restore %r15
mov -40(%rax),%r14
+.cfi_restore %r14
mov -32(%rax),%r13
+.cfi_restore %r13
mov -24(%rax),%r12
+.cfi_restore %r12
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
mov $ret,%rax # return value
ret
+.cfi_endproc
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
___
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
index 1cf14a6c9f..dcc23f7d7d 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -54,7 +54,7 @@
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
-#
+#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
@@ -525,7 +525,7 @@ $code.=<<___;
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
- sub $Xi,#16
+ sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
index 81e75f71a8..eb9ded91e5 100755
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
@@ -156,7 +156,7 @@ $code.=<<___;
___
######################################################################
-# "528B" (well, "512B" actualy) streamed GHASH
+# "528B" (well, "512B" actually) streamed GHASH
#
$Xip="in0";
$Htbl="in1";
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
index 1d6254543b..a614c99c22 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
@@ -1,5 +1,5 @@
#! /usr/bin/env perl
-# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
@@ -705,7 +705,7 @@ my $depd = sub {
my ($mod,$args) = @_;
my $orig = "depd$mod\t$args";
- # I only have ",z" completer, it's impicitly encoded...
+ # I only have ",z" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
{ my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
my $cpos=63-$2;
@@ -724,6 +724,11 @@ sub assemble {
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler/) {
+ $gnuas = 1;
+}
+
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
if ($SIZE_T==4) {
@@ -731,7 +736,12 @@ foreach (split("\n",$code)) {
s/cmpb,\*/comb,/;
s/,\*/,/;
}
- s/\bbv\b/bve/ if ($SIZE_T==8);
+
+ s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
+ s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
+ s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
+ s/\bbv\b/bve/ if ($SIZE_T==8);
+
print $_,"\n";
}
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
index 6e628d8823..17dc375053 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -80,6 +80,8 @@ $rem_4bit="%r14";
$sp="%r15";
$code.=<<___;
+#include "s390x_arch.h"
+
.text
.globl gcm_gmult_4bit
@@ -89,12 +91,13 @@ ___
$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
larl %r1,OPENSSL_s390xcap_P
lghi %r0,0
- lg %r1,24(%r1) # load second word of kimd capabilities vector
+ lg %r1,S390X_KIMD+8(%r1) # load second word of kimd capabilities
+ # vector
tmhh %r1,0x4000 # check for function 65
jz .Lsoft_gmult
stg %r0,16($sp) # arrange 16 bytes of zero input
stg %r0,24($sp)
- lghi %r0,65 # function 65
+ lghi %r0,S390X_GHASH # function 65
la %r1,0($Xi) # H lies right after Xi in gcm128_context
la $inp,16($sp)
lghi $len,16
@@ -123,10 +126,11 @@ gcm_ghash_4bit:
___
$code.=<<___ if(!$softonly);
larl %r1,OPENSSL_s390xcap_P
- lg %r0,24(%r1) # load second word of kimd capabilities vector
+ lg %r0,S390X_KIMD+8(%r1) # load second word of kimd capabilities
+ # vector
tmhh %r0,0x4000 # check for function 65
jz .Lsoft_ghash
- lghi %r0,65 # function 65
+ lghi %r0,S390X_GHASH # function 65
la %r1,0($Xi) # H lies right after Xi in gcm128_context
.long 0xb93e0004 # kimd %r0,$inp
brc 1,.-4 # pay attention to "partial completion"
@@ -149,7 +153,7 @@ $code.=<<___;
lg $Zhi,0+1($Xi)
lghi $tmp,0
.Louter:
- xg $Zhi,0($inp) # Xi ^= inp
+ xg $Zhi,0($inp) # Xi ^= inp
xg $Zlo,8($inp)
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
index cd8458256e..bcbe6e399d 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
@@ -103,14 +103,13 @@
#
# Does it make sense to increase Naggr? To start with it's virtually
# impossible in 32-bit mode, because of limited register bank
-# capacity. Otherwise improvement has to be weighed agiainst slower
+# capacity. Otherwise improvement has to be weighed against slower
# setup, as well as code size and complexity increase. As even
# optimistic estimate doesn't promise 30% performance improvement,
# there are currently no plans to increase Naggr.
#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
# January 2010
#
@@ -139,7 +138,7 @@ require "x86asm.pl";
$output=pop;
open STDOUT,">$output";
-&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
@@ -811,7 +810,7 @@ sub mmx_loop() {
&bswap ($dat);
&pshufw ($Zhi,$Zhi,0b00011011); # 76543210
&bswap ("ebx");
-
+
&cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
&jne (&label("outer"));
}
@@ -915,7 +914,7 @@ my ($Xhi,$Xi) = @_;
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T1,8); #
+ &psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
@@ -1085,7 +1084,7 @@ my ($Xhi,$Xi) = @_;
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T1,8); #
+ &psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
index 387e3f854e..afc30c3e72 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -44,9 +44,8 @@
# See ghash-x86.pl for background information and details about coding
# techniques.
#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
# December 2012
#
@@ -74,6 +73,7 @@
# Skylake 0.44(+110%)(if system doesn't support AVX)
# Bulldozer 1.49(+27%)
# Silvermont 2.88(+13%)
+# Knights L 2.12(-) (if system doesn't support AVX)
# Goldmont 1.08(+24%)
# March 2013
@@ -86,6 +86,8 @@
# it performs in 0.41 cycles per byte on Haswell processor, in
# 0.29 on Broadwell, and in 0.36 on Skylake.
#
+# Knights Landing achieves 1.09 cpb.
+#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
$flavour = shift;
@@ -236,9 +238,21 @@ $code=<<___;
.type gcm_gmult_4bit,\@function,2
.align 16
gcm_gmult_4bit:
+.cfi_startproc
push %rbx
- push %rbp # %rbp and %r12 are pushed exclusively in
+.cfi_push %rbx
+ push %rbp # %rbp and others are pushed exclusively in
+.cfi_push %rbp
push %r12 # order to reuse Win64 exception handler...
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
.Lgmult_prologue:
movzb 15($Xi),$Zlo
@@ -249,10 +263,15 @@ $code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- mov 16(%rsp),%rbx
- lea 24(%rsp),%rsp
+ lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lgmult_epilogue:
ret
+.cfi_endproc
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
@@ -266,13 +285,21 @@ $code.=<<___;
.type gcm_ghash_4bit,\@function,4
.align 16
gcm_ghash_4bit:
+.cfi_startproc
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
.Lghash_prologue:
mov $inp,%r14 # reassign couple of args
mov $len,%r15
@@ -400,16 +427,25 @@ $code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- lea 280(%rsp),%rsi
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea 0(%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lghash_epilogue:
ret
+.cfi_endproc
.size gcm_ghash_4bit,.-gcm_ghash_4bit
___
@@ -469,7 +505,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
@@ -583,7 +619,7 @@ ___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about is that there
- # no dependency between the two multiplications...
+ # no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
mov \$0x07,%r11d
@@ -758,7 +794,7 @@ $code.=<<___;
movdqa $T2,$T1 #
pslldq \$8,$T2
pclmulqdq \$0x00,$Hkey2,$Xln
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
movdqu 0($inp),$T1
@@ -894,7 +930,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
@@ -1648,14 +1684,20 @@ se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
- lea 24(%rax),%rax # adjust "rsp"
+ lea 48+280(%rax),%rax # adjust "rsp"
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov 8(%rax),%rdi
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
index f0598cb28c..6a2ac71295 100755
--- a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
@@ -1,5 +1,5 @@
#! /usr/bin/env perl
-# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
@@ -23,13 +23,14 @@
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
-# "Initial version" means that there is room for futher improvement.
+# "Initial version" means that there is room for further improvement.
# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
+# POWER9 delivers 0.51 cpb.
$flavour=shift;
$output =shift;
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
index e13c709019..47e8820080 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
@@ -19,23 +19,29 @@
# June 2014
#
# Initial version was developed in tight cooperation with Ard
-# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
-# other assembly modules. Just like aesv8-armx.pl this module
-# supports both AArch32 and AArch64 execution modes.
+# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
+# Just like aesv8-armx.pl this module supports both AArch32 and
+# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
+# November 2017
+#
+# AArch64 register bank to "accommodate" 4x aggregated reduction and
+# improve performance by 20-70% depending on processor.
+#
# Current performance in cycles per processed byte:
#
-# PMULL[2] 32-bit NEON(*)
-# Apple A7 0.92 5.62
-# Cortex-A53 1.01 8.39
-# Cortex-A57 1.17 7.61
-# Denver 0.71 6.02
-# Mongoose 1.10 8.06
+# 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
+# Apple A7 0.58 0.92 5.62
+# Cortex-A53 0.85 1.01 8.39
+# Cortex-A57 0.73 1.17 7.61
+# Denver 0.51 0.65 6.02
+# Mongoose 0.65 1.10 8.06
+# Kryo 0.76 1.16 8.00
#
# (*) presented for reference/comparison purposes;
@@ -131,8 +137,56 @@ gcm_init_v8:
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
- vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2]
+ vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
+___
+if ($flavour =~ /64/) {
+my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
+$code.=<<___;
+ @ calculate H^3 and H^4
+ vpmull.p64 $Xl,$H, $H2
+ vpmull.p64 $Yl,$H2,$H2
+ vpmull2.p64 $Xh,$H, $H2
+ vpmull2.p64 $Yh,$H2,$H2
+ vpmull.p64 $Xm,$t0,$t1
+ vpmull.p64 $Ym,$t1,$t1
+
+ vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
+ vext.8 $t1,$Yl,$Yh,#8
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t0
+ veor $t3,$Yl,$Yh
+ veor $Ym,$Ym,$t1
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
+ veor $Ym,$Ym,$t3
+ vpmull.p64 $t3,$Yl,$xC2
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Yh#lo,$Ym#hi
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ vmov $Ym#hi,$Yl#lo
+ veor $Xl,$Xm,$t2
+ veor $Yl,$Ym,$t3
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
+ vext.8 $t3,$Yl,$Yl,#8
+ vpmull.p64 $Xl,$Xl,$xC2
+ vpmull.p64 $Yl,$Yl,$xC2
+ veor $t2,$t2,$Xh
+ veor $t3,$t3,$Yh
+ veor $H, $Xl,$t2 @ H^3
+ veor $H2,$Yl,$t3 @ H^4
+
+ vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing
+ vext.8 $t1,$H2,$H2,#8
+ veor $t0,$t0,$H
+ veor $t1,$t1,$H2
+ vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
+ vst1.64 {$H-$H2},[x0] @ store Htable[3..5]
+___
+}
+$code.=<<___;
ret
.size gcm_init_v8,.-gcm_init_v8
___
@@ -201,6 +255,10 @@ $code.=<<___;
.align 4
gcm_ghash_v8:
___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#64
+ b.hs .Lgcm_ghash_v8_4x
+___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
@@ -210,13 +268,13 @@ $code.=<<___;
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
- @ alorithm specification
+ @ algorithm specification
subs $len,$len,#32 @ see if $len is 32 or larger
mov $inc,#16 @ $inc is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ $inc is zeroed just in time
- @ to preclude oversteping
+ @ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@@ -348,7 +406,297 @@ $code.=<<___;
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___
+
+if ($flavour =~ /64/) { # 4x subroutine
+my ($I0,$j1,$j2,$j3,
+ $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
+
+$code.=<<___;
+.type gcm_ghash_v8_4x,%function
+.align 4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+ vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
+ vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
+ vmov.i8 $xC2,#0xe1
+ vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
+ vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
+
+ vld1.64 {$I0-$j3},[$inp],#64
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+ vrev64.8 $j1,$j1
+ vrev64.8 $j2,$j2
+ vrev64.8 $j3,$j3
+ vrev64.8 $I0,$I0
+#endif
+ vext.8 $I3,$j3,$j3,#8
+ vext.8 $I2,$j2,$j2,#8
+ vext.8 $I1,$j1,$j1,#8
+
+ vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
+ veor $j3,$j3,$I3
+ vpmull2.p64 $Yh,$H,$I3
+ vpmull.p64 $Ym,$Hhl,$j3
+
+ vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
+ veor $j2,$j2,$I2
+ vpmull2.p64 $I2,$H2,$I2
+ vpmull2.p64 $j2,$Hhl,$j2
+
+ veor $Yl,$Yl,$t0
+ veor $Yh,$Yh,$I2
+ veor $Ym,$Ym,$j2
+
+ vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
+ veor $j1,$j1,$I1
+ vpmull2.p64 $I1,$H3,$I1
+ vpmull.p64 $j1,$H34,$j1
+
+ veor $Yl,$Yl,$j3
+ veor $Yh,$Yh,$I1
+ veor $Ym,$Ym,$j1
+
+ subs $len,$len,#128
+ b.lo .Ltail4x
+
+ b .Loop4x
+
+.align 4
+.Loop4x:
+ veor $t0,$I0,$Xl
+ vld1.64 {$I0-$j3},[$inp],#64
+ vext.8 $IN,$t0,$t0,#8
+#ifndef __ARMEB__
+ vrev64.8 $j1,$j1
+ vrev64.8 $j2,$j2
+ vrev64.8 $j3,$j3
+ vrev64.8 $I0,$I0
+#endif
+
+ vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
+ veor $t0,$t0,$IN
+ vpmull2.p64 $Xh,$H4,$IN
+ vext.8 $I3,$j3,$j3,#8
+ vpmull2.p64 $Xm,$H34,$t0
+
+ veor $Xl,$Xl,$Yl
+ veor $Xh,$Xh,$Yh
+ vext.8 $I2,$j2,$j2,#8
+ veor $Xm,$Xm,$Ym
+ vext.8 $I1,$j1,$j1,#8
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
+ veor $j3,$j3,$I3
+ veor $Xm,$Xm,$t1
+ vpmull2.p64 $Yh,$H,$I3
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $Ym,$Hhl,$j3
+
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
+ veor $j2,$j2,$I2
+ vpmull2.p64 $I2,$H2,$I2
+ veor $Xl,$Xm,$t2
+ vpmull2.p64 $j2,$Hhl,$j2
+
+ veor $Yl,$Yl,$t0
+ veor $Yh,$Yh,$I2
+ veor $Ym,$Ym,$j2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
+ veor $j1,$j1,$I1
+ veor $t2,$t2,$Xh
+ vpmull2.p64 $I1,$H3,$I1
+ vpmull.p64 $j1,$H34,$j1
+
+ veor $Xl,$Xl,$t2
+ veor $Yl,$Yl,$j3
+ veor $Yh,$Yh,$I1
+ vext.8 $Xl,$Xl,$Xl,#8
+ veor $Ym,$Ym,$j1
+
+ subs $len,$len,#64
+ b.hs .Loop4x
+
+.Ltail4x:
+ veor $t0,$I0,$Xl
+ vext.8 $IN,$t0,$t0,#8
+
+ vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
+ veor $t0,$t0,$IN
+ vpmull2.p64 $Xh,$H4,$IN
+ vpmull2.p64 $Xm,$H34,$t0
+
+ veor $Xl,$Xl,$Yl
+ veor $Xh,$Xh,$Yh
+ veor $Xm,$Xm,$Ym
+
+ adds $len,$len,#64
+ b.eq .Ldone4x
+
+ cmp $len,#32
+ b.lo .Lone
+ b.eq .Ltwo
+.Lthree:
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ vld1.64 {$I0-$j2},[$inp]
+ veor $Xm,$Xm,$t2
+#ifndef __ARMEB__
+ vrev64.8 $j1,$j1
+ vrev64.8 $j2,$j2
+ vrev64.8 $I0,$I0
+#endif
+
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ vext.8 $I2,$j2,$j2,#8
+ vext.8 $I1,$j1,$j1,#8
+ veor $Xl,$Xm,$t2
+
+ vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
+ veor $j2,$j2,$I2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ vpmull2.p64 $Yh,$H,$I2
+ vpmull.p64 $Ym,$Hhl,$j2
+ veor $Xl,$Xl,$t2
+ vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
+ veor $j1,$j1,$I1
+ vext.8 $Xl,$Xl,$Xl,#8
+
+ vpmull2.p64 $I1,$H2,$I1
+ veor $t0,$I0,$Xl
+ vpmull2.p64 $j1,$Hhl,$j1
+ vext.8 $IN,$t0,$t0,#8
+
+ veor $Yl,$Yl,$j3
+ veor $Yh,$Yh,$I1
+ veor $Ym,$Ym,$j1
+
+ vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
+ veor $t0,$t0,$IN
+ vpmull2.p64 $Xh,$H3,$IN
+ vpmull.p64 $Xm,$H34,$t0
+
+ veor $Xl,$Xl,$Yl
+ veor $Xh,$Xh,$Yh
+ veor $Xm,$Xm,$Ym
+ b .Ldone4x
+
+.align 4
+.Ltwo:
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ vld1.64 {$I0-$j1},[$inp]
+ veor $Xm,$Xm,$t2
+#ifndef __ARMEB__
+ vrev64.8 $j1,$j1
+ vrev64.8 $I0,$I0
+#endif
+
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ vext.8 $I1,$j1,$j1,#8
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+ vext.8 $Xl,$Xl,$Xl,#8
+
+ vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
+ veor $j1,$j1,$I1
+
+ veor $t0,$I0,$Xl
+ vext.8 $IN,$t0,$t0,#8
+
+ vpmull2.p64 $Yh,$H,$I1
+ vpmull.p64 $Ym,$Hhl,$j1
+
+ vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
+ veor $t0,$t0,$IN
+ vpmull2.p64 $Xh,$H2,$IN
+ vpmull2.p64 $Xm,$Hhl,$t0
+
+ veor $Xl,$Xl,$Yl
+ veor $Xh,$Xh,$Yh
+ veor $Xm,$Xm,$Ym
+ b .Ldone4x
+
+.align 4
+.Lone:
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ vld1.64 {$I0},[$inp]
+ veor $Xm,$Xm,$t2
+#ifndef __ARMEB__
+ vrev64.8 $I0,$I0
+#endif
+
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+ vext.8 $Xl,$Xl,$Xl,#8
+
+ veor $t0,$I0,$Xl
+ vext.8 $IN,$t0,$t0,#8
+
+ vpmull.p64 $Xl,$H,$IN
+ veor $t0,$t0,$IN
+ vpmull2.p64 $Xh,$H,$IN
+ vpmull.p64 $Xm,$Hhl,$t0
+
+.Ldone4x:
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ veor $Xm,$Xm,$t2
+
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+ vext.8 $Xl,$Xl,$Xl,#8
+
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+#endif
+ vst1.64 {$Xl},[$Xi] @ write out Xi
+
+ ret
+.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+___
+
}
+}
+
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
@@ -360,7 +708,8 @@ if ($flavour =~ /64/) { ######## 64-bit code
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
- sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
+ $3<8?$3:$3+8,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
@@ -375,7 +724,7 @@ if ($flavour =~ /64/) { ######## 64-bit code
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
- # fix up remainig legacy suffixes
+ # fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
@@ -415,7 +764,7 @@ if ($flavour =~ /64/) { ######## 64-bit code
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
- # fix up remainig new-style suffixes
+ # fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or