Diffstat (limited to 'deps/openssl/openssl/crypto/modes/asm')
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl    61
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl         11
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl         98
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-c64xplus.pl     247
-rwxr-xr-x  deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl          11
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl         9
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl         24
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl       32
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl           20
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl        17
-rwxr-xr-x  deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl        478
-rw-r--r--  deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl        27
12 files changed, 946 insertions, 89 deletions
diff --git a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
index 980cfd23ef..5ad62b3979 100644
--- a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -22,10 +29,11 @@
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
-# byte processed with 128-bit key on Haswell processor, and 0.74 -
-# on Broadwell. [Mentioned results are raw profiled measurements for
-# favourable packet size, one divisible by 96. Applications using the
-# EVP interface will observe a few percent worse performance.]
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -60,7 +68,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
$avx = ($2>=3.0) + ($2>3.0);
}
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
if ($avx>1) {{{
@@ -108,6 +116,23 @@ _aesni_ctr32_ghash_6x:
vpxor $rndkey,$inout3,$inout3
vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
+
+ # At this point, the current block of 96 (0x60) bytes has already been
+ # loaded into registers. Concurrently with processing it, we want to
+ # load the next 96 bytes of input for the next round. Obviously, we can
+ # only do this if there are at least 96 more bytes of input beyond the
+ # input we're currently processing, or else we'd read past the end of
+ # the input buffer. Here, we set |%r12| to 96 if there are at least 96
+ # bytes of input beyond the 96 bytes we're already processing, and we
+ # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
+ # we'll read in the next block so that it is in registers for the next
+ # loop iteration. In the case where we set |%r12| to 0, we'll re-read
+ # the current block and then ignore what we re-read.
+ #
+ # At this point, |$in0| points to the current (already read into
+ # registers) block, and |$end0| points to 2*96 bytes before the end of
+ # the input. Thus, |$in0| > |$end0| means that we do not have the next
+ # 96-byte block to read in, and |$in0| <= |$end0| means we do.
xor %r12,%r12
cmp $in0,$end0
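
A minimal C sketch of the lookahead decision described above, with
hypothetical names standing in for |%r12|, |$in0| and |$end0|:

#include <stddef.h>
#include <stdint.h>

/* |end0| points 2*96 bytes before the end of the input, so in0 <= end0
 * holds exactly when a full 96-byte block remains beyond the six blocks
 * already loaded; otherwise the current block is re-read and discarded. */
static size_t next_block_stride(const uint8_t *in0, const uint8_t *end0)
{
    return (in0 <= end0) ? 96 : 0;    /* the value the asm leaves in %r12 */
}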
@@ -400,6 +425,9 @@ $code.=<<___;
.align 32
aesni_gcm_decrypt:
xor $ret,$ret
+
+ # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
+ # bytes of input.
cmp \$0x60,$len # minimal accepted length
jb .Lgcm_dec_abort
@@ -454,7 +482,15 @@ $code.=<<___;
vmovdqu 0x50($inp),$Z3 # I[5]
lea ($inp),$in0
vmovdqu 0x40($inp),$Z0
+
+ # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+ # bytes before the end of the input. Note, in particular, that this is
+ # correct even if |$len| is not an even multiple of 96 or 16. XXX: This
+ # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
+ # not be near the very beginning of the address space when |$len| < 2*96
+ # (0xc0).
lea -0xc0($inp,$len),$end0
+
vmovdqu 0x30($inp),$Z1
shr \$4,$len
xor $ret,$ret
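
The |$end0| setup above, rendered as a hedged C sketch (hypothetical
helper name; it carries the same wraparound caveat as the XXX note):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch of aesni_gcm_decrypt's pointer setup.  If len < 2*96, the
 * subtraction steps below inp, which is only harmless when inp is not
 * near the very beginning of the address space -- the caveat above. */
static const uint8_t *compute_end0(const uint8_t *inp, size_t len)
{
    assert(len >= 0x60);              /* minimal accepted length */
    return inp + len - 0xc0;          /* 2*96 bytes before end of input */
}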
@@ -610,6 +646,10 @@ _aesni_ctr32_6x:
.align 32
aesni_gcm_encrypt:
xor $ret,$ret
+
+ # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
+ # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
+ # least 96 more bytes of input.
cmp \$0x60*3,$len # minimal accepted length
jb .Lgcm_enc_abort
@@ -659,7 +699,16 @@ $code.=<<___;
.Lenc_no_key_aliasing:
lea ($out),$in0
+
+ # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+ # bytes before the end of the input. Note, in particular, that this is
+ # correct even if |$len| is not an even multiple of 96 or 16. Unlike in
+ # the decryption case, there's no caveat that |$out| must not be near
+ # the very beginning of the address space, because we know that
+ # |$len| >= 3*96 from the check above, and so we know
+ # |$out| + |$len| >= 2*96 (0xc0).
lea -0xc0($out,$len),$end0
+
shr \$4,$len
call _aesni_ctr32_6x
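
On the encryption path the same computation is unconditionally safe; a
sketch under the same hypothetical naming:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* With len >= 3*96 already checked at aesni_gcm_encrypt entry,
 * out + len >= 0x120 > 0xc0, so the subtraction can never wrap. */
static const uint8_t *compute_end0_enc(const uint8_t *out, size_t len)
{
    assert(len >= 3 * 0x60);
    return out + len - 0xc0;
}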
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl
index aa36029386..ccf6b2bd6f 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -454,7 +461,7 @@ rem_4bit:
.align 4
___
-$output=shift and open STDOUT,">$output";
+$output=pop and open STDOUT,">$output";
print $code;
close STDOUT;
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
index 8ccc963ef2..7d880c94a7 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -42,8 +49,8 @@
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
-# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
-# in 9.33.
+# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
+# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
@@ -71,8 +78,20 @@
# *native* byte order on current platform. See gcm128.c for working
# example...
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$Xi="r0"; # argument block
$Htbl="r1";
@@ -124,11 +143,18 @@ $code=<<___;
#include "arm_arch.h"
.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
.code 32
+#endif
-#ifdef __clang__
-#define ldrplb ldrbpl
-#define ldrneb ldrbne
+#ifdef __clang__
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
#endif
.type rem_4bit,%object
@@ -142,19 +168,27 @@ rem_4bit:
.type rem_4bit_get,%function
rem_4bit_get:
- sub $rem_4bit,pc,#8
- sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
+#if defined(__thumb2__)
+ adr $rem_4bit,rem_4bit
+#else
+ sub $rem_4bit,pc,#8+32 @ &rem_4bit
+#endif
b .Lrem_4bit_got
nop
+ nop
.size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
+.align 4
gcm_ghash_4bit:
- sub r12,pc,#8
+#if defined(__thumb2__)
+ adr r12,rem_4bit
+#else
+ sub r12,pc,#8+48 @ &rem_4bit
+#endif
add $len,$inp,$len @ $len to point at the end
stmdb sp!,{r3-r11,lr} @ save $len/end too
- sub r12,r12,#48 @ &rem_4bit
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
@@ -201,6 +235,9 @@ gcm_ghash_4bit:
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
@@ -211,6 +248,9 @@ gcm_ghash_4bit:
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
@@ -218,8 +258,14 @@ gcm_ghash_4bit:
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
+#ifdef __thumb2__
+ it pl
+#endif
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4
+#ifdef __thumb2__
+ itt pl
+#endif
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
@@ -229,7 +275,11 @@ gcm_ghash_4bit:
add $inp,$inp,#16
mov $nhi,$Zll
___
- &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+ &Zsmash("cmp\t$inp,$len","\n".
+ "#ifdef __thumb2__\n".
+ " it ne\n".
+ "#endif\n".
+ " ldrneb $nlo,[$inp,#15]");
$code.=<<___;
bne .Louter
@@ -287,6 +337,9 @@ gcm_gmult_4bit:
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
@@ -304,6 +357,9 @@ gcm_gmult_4bit:
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
+#ifdef __thumb2__
+ itt pl
+#endif
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
@@ -378,9 +434,9 @@ $code.=<<___;
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
- vld1.64 $IN#hi,[r1,:64]! @ load H
+ vld1.64 $IN#hi,[r1]! @ load H
vmov.i8 $t0,#0xe1
- vld1.64 $IN#lo,[r1,:64]
+ vld1.64 $IN#lo,[r1]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
@@ -399,8 +455,8 @@ gcm_init_neon:
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
- vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
- vld1.64 $IN#lo,[$Xi,:64]!
+ vld1.64 $IN#hi,[$Xi]! @ load Xi
+ vld1.64 $IN#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
@@ -417,8 +473,8 @@ gcm_gmult_neon:
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
- vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
- vld1.64 $Xl#lo,[$Xi,:64]!
+ vld1.64 $Xl#hi,[$Xi]! @ load Xi
+ vld1.64 $Xl#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
@@ -473,8 +529,8 @@ $code.=<<___;
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
- vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
- vst1.64 $Xl#lo,[$Xi,:64]
+ vst1.64 $Xl#hi,[$Xi]! @ write out Xi
+ vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-c64xplus.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-c64xplus.pl
new file mode 100644
index 0000000000..3cadda3994
--- /dev/null
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-c64xplus.pl
@@ -0,0 +1,247 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2011
+#
+# The module implements GCM GHASH function and underlying single
+# multiplication operation in GF(2^128). Even though subroutines
+# have _4bit suffix, they are not using any tables, but rely on
+# hardware Galois Field Multiply support. Streamed GHASH processes
+# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
+# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
+# comparing apples vs. oranges, but compiler surely could have done
+# better, because theoretical [though not necessarily achievable]
+# estimate for "4-bit" table-driven implementation is ~12 cycles.
+
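
As a reference for what the hardware Galois Field Multiply computes, here
is a self-contained C sketch of the GF(2^128) multiplication underlying
GHASH, following the shift-and-add algorithm from the GCM specification
(illustrative names, not part of the module):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;     /* hi carries GCM bits 0..63 */

/* Bit-reflected GF(2^128) multiply with reduction polynomial
 * x^128 + x^7 + x^2 + x + 1, i.e. the constant R = 0xE1 << 120. */
static u128 gf128_mul(u128 x, u128 y)
{
    u128 z = {0, 0}, v = y;
    for (int i = 0; i < 128; i++) {
        uint64_t xi = (i < 64) ? (x.hi >> (63 - i)) & 1    /* bit i of x, */
                               : (x.lo >> (127 - i)) & 1;  /* MSB first   */
        if (xi) { z.hi ^= v.hi; z.lo ^= v.lo; }            /* Z ^= V      */
        uint64_t carry = v.lo & 1;                         /* V[127]      */
        v.lo = (v.lo >> 1) | (v.hi << 63);                 /* V >>= 1     */
        v.hi >>= 1;
        if (carry) v.hi ^= 0xe100000000000000ULL;          /* fold in R   */
    }
    return z;
}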
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments
+
+($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3,
+ $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
+($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y,
+ $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
+($FF000000,$E10000)=("B30","B31");
+($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len
+ $xia="A9";
+($rem,$res)=("B4","B5"); # $rem zaps $Htable
+
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg gcm_gmult_1bit,_gcm_gmult_1bit
+ .asg gcm_gmult_4bit,_gcm_gmult_4bit
+ .asg gcm_ghash_4bit,_gcm_ghash_4bit
+ .endif
+
+ .asg B3,RA
+
+ .if 0
+ .global _gcm_gmult_1bit
+_gcm_gmult_1bit:
+ ADDAD $Htable,2,$Htable
+ .endif
+ .global _gcm_gmult_4bit
+_gcm_gmult_4bit:
+ .asmfunc
+ LDDW *${Htable}[-1],$H1:$H0 ; H.lo
+ LDDW *${Htable}[-2],$H3:$H2 ; H.hi
+|| MV $Xip,${xip} ; reassign Xi
+|| MVK 15,B1 ; SPLOOPD constant
+
+ MVK 0xE1,$E10000
+|| LDBU *++${xip}[15],$x1 ; Xi[15]
+ MVK 0xFF,$FF000000
+|| LDBU *--${xip},$x0 ; Xi[14]
+ SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
+ SHL $FF000000,24,$FF000000 ; upper byte mask
+|| BNOP ghash_loop?
+|| MVK 1,B0 ; take a single spin
+
+ PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
+ AND $H2,$FF000000,$H2u ; H2's upper byte
+ AND $H3,$FF000000,$H3u ; H3's upper byte
+|| SHRU $H2u,8,$H2u
+ SHRU $H3u,8,$H3u
+|| ZERO $Z1:$Z0
+ SHRU2 $xia,8,$H01u
+|| ZERO $Z3:$Z2
+ .endasmfunc
+
+ .global _gcm_ghash_4bit
+_gcm_ghash_4bit:
+ .asmfunc
+ LDDW *${Htable}[-1],$H1:$H0 ; H.lo
+|| SHRU $len,4,B0 ; reassign len
+ LDDW *${Htable}[-2],$H3:$H2 ; H.hi
+|| MV $Xip,${xip} ; reassign Xi
+|| MVK 15,B1 ; SPLOOPD constant
+
+ MVK 0xE1,$E10000
+|| [B0] LDNDW *${inp}[1],$H1x:$H0x
+ MVK 0xFF,$FF000000
+|| [B0] LDNDW *${inp}++[2],$H3x:$H2x
+ SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
+|| LDDW *${xip}[1],$Z1:$Z0
+ SHL $FF000000,24,$FF000000 ; upper byte mask
+|| LDDW *${xip}[0],$Z3:$Z2
+
+ PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
+ AND $H2,$FF000000,$H2u ; H2's upper byte
+ AND $H3,$FF000000,$H3u ; H3's upper byte
+|| SHRU $H2u,8,$H2u
+ SHRU $H3u,8,$H3u
+ SHRU2 $xia,8,$H01u
+
+|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
+|| [B0] XOR $H1x,$Z1,$Z1
+ .if .LITTLE_ENDIAN
+ [B0] XOR $H2x,$Z2,$Z2
+|| [B0] XOR $H3x,$Z3,$Z3
+|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
+ STDW $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
+|| [B0] ZERO $Z1:$Z0
+ .else
+ [B0] XOR $H2x,$Z2,$Z2
+|| [B0] XOR $H3x,$Z3,$Z3
+|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
+ STDW $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
+|| [B0] ZERO $Z1:$Z0
+ .endif
+ STDW $Z3:$Z2,*${xip}[0]
+|| [B0] ZERO $Z3:$Z2
+|| [B0] MV $xia,$x1
+ [B0] ADDK 14,${xip}
+
+ghash_loop?:
+ SPLOOPD 6 ; 6*16+7
+|| MVC B1,ILC
+|| [B0] SUB B0,1,B0
+|| ZERO A0
+|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib
+|| SHL $x1,1,$xia
+___
+
+########____________________________
+# 0 D2. M1 M2 |
+# 1 M1 |
+# 2 M1 M2 |
+# 3 D1. M1 M2 |
+# 4 S1. L1 |
+# 5 S2 S1x L1 D2 L2 |____________________________
+# 6/0 L1 S1 L2 S2x |D2. M1 M2 |
+# 7/1 L1 S1 D1x S2 M2 | M1 |
+# 8/2 S1 L1x S2 | M1 M2 |
+# 9/3 S1 L1x | D1. M1 M2 |
+# 10/4 D1x | S1. L1 |
+# 11/5 |S2 S1x L1 D2 L2 |____________
+# 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
+# 7/1 L1 S1 D1x S2 M2 | ....
+# 8/2 S1 L1x S2 | ....
+#####... ................|............
+$code.=<<___;
+ XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
+|| XORMPY $H01u,$xib,$H01y
+|| [A0] LDBU *--${xip},$x0
+ XORMPY $H1,$xia,$H1x ; 1
+ XORMPY $H2,$xia,$H2x ; 2
+|| XORMPY $H2u,$xib,$H2y
+ XORMPY $H3,$xia,$H3x ; 3
+|| XORMPY $H3u,$xib,$H3y
+||[!A0] MVK.D 15,A0 ; *--${xip} counter
+ XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
+|| [A0] SUB.S A0,1,A0
+ XOR.L $H1x,$Z1,$Z1 ; 5
+|| AND.D $H01y,$FF000000,$H0z
+|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
+|| SHL $x0,1,$xib
+|| SHL $x0,1,$xia
+
+ XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
+|| SHL $Z0,1,$rem ; ; rem=Z<<1
+|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
+|| AND.L $H1y,$FF000000,$H1z
+ XOR.L $H3x,$Z3,$Z3 ; 7/1
+|| SHRMB.S $Z2,$Z1,$Z1
+|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
+|| AND.S $H2y,$FF000000,$H2z
+|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
+ XOR.L $H1z,$Z1,$Z1 ; 8/2
+|| SHRMB.S $Z3,$Z2,$Z2
+|| AND.S $H3y,$FF000000,$H3z
+ XOR.L $H2z,$Z2,$Z2 ; 9/3
+|| SHRU $Z3,8,$Z3
+ XOR.D $H3z,$Z3,$Z3 ; 10/4
+ NOP ; 11/5
+
+ SPKERNEL 0,2
+|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
+
+ ; input pre-fetch is possible where D1 slot is available...
+ [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
+ [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
+ NOP ; 10/-
+ .if .LITTLE_ENDIAN
+ SWAP2 $Z0,$Z1 ; 11/-
+|| SWAP4 $Z1,$Z0
+ SWAP4 $Z1,$Z1 ; 12/-
+|| SWAP2 $Z0,$Z0
+ SWAP2 $Z2,$Z3
+|| SWAP4 $Z3,$Z2
+||[!B0] BNOP RA
+ SWAP4 $Z3,$Z3
+|| SWAP2 $Z2,$Z2
+|| [B0] BNOP ghash_loop?
+ [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
+|| [B0] XOR $H1x,$Z1,$Z1
+ [B0] XOR $H2x,$Z2,$Z2
+|| [B0] XOR $H3x,$Z3,$Z3
+|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
+ STDW $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
+|| [B0] ZERO $Z1:$Z0
+ .else
+ [!B0] BNOP RA ; 11/-
+ [B0] BNOP ghash_loop? ; 12/-
+ [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
+|| [B0] XOR $H1x,$Z1,$Z1
+ [B0] XOR $H2x,$Z2,$Z2
+|| [B0] XOR $H3x,$Z3,$Z3
+|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
+ STDW $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
+|| [B0] ZERO $Z1:$Z0
+ .endif
+ STDW $Z3:$Z2,*${xip}[0]
+|| [B0] ZERO $Z3:$Z2
+|| [B0] MV $xia,$x1
+ [B0] ADDK 14,${xip}
+ .endasmfunc
+
+ .sect .const
+ .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
index 0354c95444..81e75f71a8 100755
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -32,7 +39,7 @@
# Itanium performance should remain the same as the "256B" version,
# i.e. ~8.5 cycles.
-$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");
if ($^O eq "hpux") {
$ADDP="addp4";
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
index d5ad96b403..1d6254543b 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
index be7d55f748..6e628d8823 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -47,7 +54,7 @@ if ($flavour =~ /3[12]/) {
$g="g";
}
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$softonly=0;
@@ -81,9 +88,6 @@ gcm_gmult_4bit:
___
$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
larl %r1,OPENSSL_s390xcap_P
- lg %r0,0(%r1)
- tmhl %r0,0x4000 # check for message-security-assist
- jz .Lsoft_gmult
lghi %r0,0
lg %r1,24(%r1) # load second word of kimd capabilities vector
tmhh %r1,0x4000 # check for function 65
@@ -119,14 +123,8 @@ gcm_ghash_4bit:
___
$code.=<<___ if(!$softonly);
larl %r1,OPENSSL_s390xcap_P
- lg %r0,0(%r1)
- tmhl %r0,0x4000 # check for message-security-assist
- jz .Lsoft_ghash
- lghi %r0,0
- la %r1,16($sp)
- .long 0xb93e0004 # kimd %r0,%r4
- lg %r1,24($sp)
- tmhh %r1,0x4000 # check for function 65
+ lg %r0,24(%r1) # load second word of kimd capabilities vector
+ tmhh %r0,0x4000 # check for function 65
jz .Lsoft_ghash
lghi %r0,65 # function 65
la %r1,0($Xi) # H lies right after Xi in gcm128_context
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl
index b129ba706f..c4eb3b1f02 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -46,14 +53,12 @@
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=112; }
-
-$output=shift;
+$output=pop;
open STDOUT,">$output";
+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
+
$Zhi="%o0"; # 64-bit values
$Zlo="%o1";
$Thi="%o2";
@@ -75,11 +80,14 @@ $Htbl="%i1";
$inp="%i2";
$len="%i3";
-$code.=<<___ if ($bits==64);
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
-___
-$code.=<<___;
+#endif
+
.section ".text",#alloc,#execinstr
.align 64
@@ -183,7 +191,7 @@ gcm_ghash_4bit:
add $inp,16,$inp
cmp $inp,$len
- be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
+ be,pn SIZE_T_CC,.Ldone
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
@@ -532,7 +540,7 @@ ___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis3 {
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
index 0269169fa7..cd8458256e 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -88,7 +95,7 @@
# where Tproc is time required for Karatsuba pre- and post-processing,
# is more realistic estimate. In this case it gives ... 1.91 cycles.
# Or in other words, depending on how well we can interleave reduction
-# and one of the two multiplications the performance should be betwen
+# and one of the two multiplications the performance should be between
# 1.91 and 2.16. As already mentioned, this implementation processes
# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
# - in 2.02. x86_64 performance is better, because larger register
@@ -129,6 +136,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
+$output=pop;
+open STDOUT,">$output";
+
&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
$sse2=0;
@@ -712,7 +722,7 @@ sub mmx_loop() {
&pxor ($red[1],$red[1]);
&pxor ($red[2],$red[2]);
- # Just like in "May" verson modulo-schedule for critical path in
+ # Just like in "May" version modulo-schedule for critical path in
# 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
# is scheduled so late that rem_8bit[] has to be shifted *right*
# by 16, which is why last argument to pinsrw is 2, which
@@ -1138,7 +1148,7 @@ my ($Xhi,$Xi) = @_;
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("gcm_ghash_clmul");
-} else { # Algorith 5. Kept for reference purposes.
+} else { # Algorithm 5. Kept for reference purposes.
sub reduction_alg5 { # 19/16 times faster than Intel version
my ($Xhi,$Xi)=@_;
@@ -1369,6 +1379,8 @@ my ($Xhi,$Xi)=@_;
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
+close STDOUT;
+
# A question was raised about the choice of vanilla MMX. Or rather why wasn't
# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
# CPUs such as PIII, "4-bit" MMX version was observed to provide better
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
index f889f20187..387e3f854e 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -64,8 +71,10 @@
# Ivy Bridge 1.80(+7%)
# Haswell 0.55(+93%) (if system doesn't support AVX)
# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Skylake 0.44(+110%)(if system doesn't support AVX)
# Bulldozer 1.49(+27%)
# Silvermont 2.88(+13%)
+# Goldmont 1.08(+24%)
# March 2013
#
@@ -74,8 +83,8 @@
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to above mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-# it performs in 0.41 cycles per byte on Haswell processor, and in
-# 0.29 on Broadwell.
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
@@ -109,7 +118,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
$avx = ($2>=3.0) + ($2>3.0);
}
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$do4xaggr=1;
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
index 71457cf4fc..f0598cb28c 100755
--- a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -18,6 +25,12 @@
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for futher improvement.
+# May 2016
+#
+# 2x aggregated reduction improves performance by 50% (resulting
+# performance on POWER8 is 1 cycle per processed byte), and 4x
+# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
+
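
The aggregated reduction rests on the identity
((((X^m0)·H ^ m1)·H ^ m2)·H ^ m3)·H = (X^m0)·H^4 ^ m1·H^3 ^ m2·H^2 ^ m3·H,
so four blocks can be folded with precomputed powers of H and reduced
once. A toy check of that identity in GF(2^8) for brevity (the module
itself works in GF(2^128); names are illustrative):

#include <assert.h>
#include <stdint.h>

static uint8_t gf_mul(uint8_t a, uint8_t b)   /* GF(2^8), AES polynomial */
{
    uint8_t r = 0;
    while (b) {
        if (b & 1) r ^= a;
        a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
        b >>= 1;
    }
    return r;
}

int main(void)
{
    uint8_t H = 0x53, X = 0xca, m[4] = {1, 2, 3, 4};
    uint8_t H2 = gf_mul(H, H), H3 = gf_mul(H2, H), H4 = gf_mul(H3, H);

    uint8_t serial = X;                       /* one multiply per block */
    for (int i = 0; i < 4; i++)
        serial = gf_mul((uint8_t)(serial ^ m[i]), H);

    uint8_t aggregated =                      /* one pass, powers of H  */
        gf_mul((uint8_t)(X ^ m[0]), H4) ^ gf_mul(m[1], H3) ^
        gf_mul(m[2], H2) ^ gf_mul(m[3], H);

    assert(serial == aggregated);
    return 0;
}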
$flavour=shift;
$output =shift;
@@ -27,14 +40,21 @@ if ($flavour =~ /64/) {
$STU="stdu";
$POP="ld";
$PUSH="std";
+ $UCMP="cmpld";
+ $SHRI="srdi";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$POP="lwz";
$PUSH="stw";
+ $UCMP="cmplw";
+ $SHRI="srwi";
} else { die "nonsense $flavour"; }
+$sp="r1";
+$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -46,6 +66,7 @@ my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
$code=<<___;
@@ -56,7 +77,7 @@ $code=<<___;
.globl .gcm_init_p8
.align 5
.gcm_init_p8:
- lis r0,0xfff0
+ li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
@@ -78,17 +99,103 @@ $code=<<___;
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
- vxor $H,$H,$t1 # twisted H
+ vxor $IN,$H,$t1 # twisted H
- vsldoi $H,$H,$H,8 # twist even more ...
+ vsldoi $H,$IN,$IN,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
+ li r8,0x40
stvx_u $H, r9,r3
+ li r9,0x50
stvx_u $Hh,r10,r3
+ li r10,0x60
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
+ vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $IN1,$Xl,$t1
+
+ vsldoi $H2,$IN1,$IN1,8
+ vsldoi $H2l,$zero,$H2,8
+ vsldoi $H2h,$H2,$zero,8
+
+ stvx_u $H2l,r8,r3 # save H^2
+ li r8,0x70
+ stvx_u $H2,r9,r3
+ li r9,0x80
+ stvx_u $H2h,r10,r3
+ li r10,0x90
+___
+{
+my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
+$code.=<<___;
+ vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
+ vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
+ vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
+ vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
+ vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
+ vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+ vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vsldoi $t4,$Xm1,$zero,8
+ vsldoi $t5,$zero,$Xm1,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+ vxor $Xl1,$Xl1,$t4
+ vxor $Xh1,$Xh1,$t5
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vsldoi $Xl1,$Xl1,$Xl1,8
+ vxor $Xl,$Xl,$t2
+ vxor $Xl1,$Xl1,$t6
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vpmsumd $Xl1,$Xl1,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $t5,$t5,$Xh1
+ vxor $Xl,$Xl,$t1
+ vxor $Xl1,$Xl1,$t5
+
+ vsldoi $H,$Xl,$Xl,8
+ vsldoi $H2,$Xl1,$Xl1,8
+ vsldoi $Hl,$zero,$H,8
+ vsldoi $Hh,$H,$zero,8
+ vsldoi $H2l,$zero,$H2,8
+ vsldoi $H2h,$H2,$zero,8
+
+ stvx_u $Hl,r8,r3 # save H^3
+ li r8,0xa0
+ stvx_u $H,r9,r3
+ li r9,0xb0
+ stvx_u $Hh,r10,r3
+ li r10,0xc0
+ stvx_u $H2l,r8,r3 # save H^4
+ stvx_u $H2,r9,r3
+ stvx_u $H2h,r10,r3
mtspr 256,$vrsave
blr
@@ -96,7 +203,9 @@ $code=<<___;
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_p8,.-.gcm_init_p8
-
+___
+}
+$code.=<<___;
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
@@ -122,7 +231,7 @@ $code=<<___;
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
- vpmsumd $t2,$Xl,$xC2 # 1st phase
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
@@ -132,7 +241,7 @@ $code=<<___;
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
- vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
@@ -150,7 +259,7 @@ $code=<<___;
.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
- lis r0,0xfff8
+ li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
@@ -159,52 +268,99 @@ $code=<<___;
lvx_u $Xl,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
+ li r8,0x40
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
+ li r9,0x50
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
+ li r10,0x60
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $Xl,$Xl,$Xl,$lemask
vxor $zero,$zero,$zero
+ ${UCMP}i $len,64
+ bge Lgcm_ghash_p8_4x
+
lvx_u $IN,0,$inp
addi $inp,$inp,16
- subi $len,$len,16
+ subic. $len,$len,16
le?vperm $IN,$IN,$IN,$lemask
vxor $IN,$IN,$Xl
- b Loop
+ beq Lshort
+
+ lvx_u $H2l,r8,$Htbl # load H^2
+ li r8,16
+ lvx_u $H2, r9,$Htbl
+ add r9,$inp,$len # end of input
+ lvx_u $H2h,r10,$Htbl
+ be?b Loop_2x
.align 5
-Loop:
- subic $len,$len,16
- vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
- subfe. r0,r0,r0 # borrow?-1:0
- vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+Loop_2x:
+ lvx_u $IN1,0,$inp
+ le?vperm $IN1,$IN1,$IN1,$lemask
+
+ subic $len,$len,32
+ vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
+ vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
+ subfe r0,r0,r0 # borrow?-1:0
+ vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
+ vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
and r0,r0,$len
- vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+ vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
+ vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
add $inp,$inp,r0
- vpmsumd $t2,$Xl,$xC2 # 1st phase
+ vxor $Xl,$Xl,$Xl1
+ vxor $Xm,$Xm,$Xm1
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
+ vxor $Xh,$Xh,$Xh1
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
- lvx_u $IN,0,$inp
- addi $inp,$inp,16
+ lvx_u $IN,r8,$inp
+ addi $inp,$inp,32
- vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
le?vperm $IN,$IN,$IN,$lemask
vxor $t1,$t1,$Xh
vxor $IN,$IN,$t1
vxor $IN,$IN,$Xl
- beq Loop # did $len-=16 borrow?
+ $UCMP r9,$inp
+ bgt Loop_2x # done yet?
+
+ cmplwi $len,0
+ bne Leven
+
+Lshort:
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+Leven:
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
@@ -214,6 +370,284 @@ Loop:
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
+___
+{
+my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
+ $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
+my $IN0=$IN;
+my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
+
+$code.=<<___;
+.align 5
+.gcm_ghash_p8_4x:
+Lgcm_ghash_p8_4x:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ li r10,0x60
+ stvx v31,r11,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME-4`($sp) # save vrsave
+ mtspr 256,r0 # preserve all AltiVec registers
+
+ lvsl $t0,0,r8 # 0x0001..0e0f
+ #lvx_u $H2l,r8,$Htbl # load H^2
+ li r8,0x70
+ lvx_u $H2, r9,$Htbl
+ li r9,0x80
+ vspltisb $t1,8 # 0x0808..0808
+ #lvx_u $H2h,r10,$Htbl
+ li r10,0x90
+ lvx_u $H3l,r8,$Htbl # load H^3
+ li r8,0xa0
+ lvx_u $H3, r9,$Htbl
+ li r9,0xb0
+ lvx_u $H3h,r10,$Htbl
+ li r10,0xc0
+ lvx_u $H4l,r8,$Htbl # load H^4
+ li r8,0x10
+ lvx_u $H4, r9,$Htbl
+ li r9,0x20
+ lvx_u $H4h,r10,$Htbl
+ li r10,0x30
+
+ vsldoi $t2,$zero,$t1,8 # 0x0000..0808
+ vaddubm $hiperm,$t0,$t2 # 0x0001..1617
+ vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
+
+ $SHRI $len,$len,4 # this allows to use sign bit
+ # as carry
+ lvx_u $IN0,0,$inp # load input
+ lvx_u $IN1,r8,$inp
+ subic. $len,$len,8
+ lvx_u $IN2,r9,$inp
+ lvx_u $IN3,r10,$inp
+ addi $inp,$inp,0x40
+ le?vperm $IN0,$IN0,$IN0,$lemask
+ le?vperm $IN1,$IN1,$IN1,$lemask
+ le?vperm $IN2,$IN2,$IN2,$lemask
+ le?vperm $IN3,$IN3,$IN3,$lemask
+
+ vxor $Xh,$IN0,$Xl
+
+ vpmsumd $Xl1,$IN1,$H3l
+ vpmsumd $Xm1,$IN1,$H3
+ vpmsumd $Xh1,$IN1,$H3h
+
+ vperm $H21l,$H2,$H,$hiperm
+ vperm $t0,$IN2,$IN3,$loperm
+ vperm $H21h,$H2,$H,$loperm
+ vperm $t1,$IN2,$IN3,$hiperm
+ vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
+ vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
+ vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
+ vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
+
+ vxor $Xm2,$Xm2,$Xm1
+ vxor $Xl3,$Xl3,$Xl1
+ vxor $Xm3,$Xm3,$Xm2
+ vxor $Xh3,$Xh3,$Xh1
+
+ blt Ltail_4x
+
+Loop_4x:
+ lvx_u $IN0,0,$inp
+ lvx_u $IN1,r8,$inp
+ subic. $len,$len,4
+ lvx_u $IN2,r9,$inp
+ lvx_u $IN3,r10,$inp
+ addi $inp,$inp,0x40
+ le?vperm $IN1,$IN1,$IN1,$lemask
+ le?vperm $IN2,$IN2,$IN2,$lemask
+ le?vperm $IN3,$IN3,$IN3,$lemask
+ le?vperm $IN0,$IN0,$IN0,$lemask
+
+ vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
+ vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
+ vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
+ vpmsumd $Xl1,$IN1,$H3l
+ vpmsumd $Xm1,$IN1,$H3
+ vpmsumd $Xh1,$IN1,$H3h
+
+ vxor $Xl,$Xl,$Xl3
+ vxor $Xm,$Xm,$Xm3
+ vxor $Xh,$Xh,$Xh3
+ vperm $t0,$IN2,$IN3,$loperm
+ vperm $t1,$IN2,$IN3,$hiperm
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+ vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
+ vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
+ vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
+ vpmsumd $Xl,$Xl,$xC2
+
+ vxor $Xl3,$Xl3,$Xl1
+ vxor $Xh3,$Xh3,$Xh1
+ vxor $Xh,$Xh,$IN0
+ vxor $Xm2,$Xm2,$Xm1
+ vxor $Xh,$Xh,$t1
+ vxor $Xm3,$Xm3,$Xm2
+ vxor $Xh,$Xh,$Xl
+ bge Loop_4x
+
+Ltail_4x:
+ vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
+ vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
+ vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
+
+ vxor $Xl,$Xl,$Xl3
+ vxor $Xm,$Xm,$Xm3
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xh,$Xh,$Xh3
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $Xl,$Xl,$t1
+
+ addic. $len,$len,4
+ beq Ldone_4x
+
+ lvx_u $IN0,0,$inp
+ ${UCMP}i $len,2
+ li $len,-4
+ blt Lone
+ lvx_u $IN1,r8,$inp
+ beq Ltwo
+
+Lthree:
+ lvx_u $IN2,r9,$inp
+ le?vperm $IN0,$IN0,$IN0,$lemask
+ le?vperm $IN1,$IN1,$IN1,$lemask
+ le?vperm $IN2,$IN2,$IN2,$lemask
+
+ vxor $Xh,$IN0,$Xl
+ vmr $H4l,$H3l
+ vmr $H4, $H3
+ vmr $H4h,$H3h
+
+ vperm $t0,$IN1,$IN2,$loperm
+ vperm $t1,$IN1,$IN2,$hiperm
+ vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
+ vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
+ vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
+ vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
+
+ vxor $Xm3,$Xm3,$Xm2
+ b Ltail_4x
+
+.align 4
+Ltwo:
+ le?vperm $IN0,$IN0,$IN0,$lemask
+ le?vperm $IN1,$IN1,$IN1,$lemask
+
+ vxor $Xh,$IN0,$Xl
+ vperm $t0,$zero,$IN1,$loperm
+ vperm $t1,$zero,$IN1,$hiperm
+
+ vsldoi $H4l,$zero,$H2,8
+ vmr $H4, $H2
+ vsldoi $H4h,$H2,$zero,8
+
+ vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
+ vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
+ vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
+
+ b Ltail_4x
+
+.align 4
+Lone:
+ le?vperm $IN0,$IN0,$IN0,$lemask
+
+ vsldoi $H4l,$zero,$H,8
+ vmr $H4, $H
+ vsldoi $H4h,$H,$zero,8
+
+ vxor $Xh,$IN0,$Xl
+ vxor $Xl3,$Xl3,$Xl3
+ vxor $Xm3,$Xm3,$Xm3
+ vxor $Xh3,$Xh3,$Xh3
+
+ b Ltail_4x
+
+Ldone_4x:
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,0,4,0
+ .long 0
+___
+}
+$code.=<<___;
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
@@ -221,6 +655,8 @@ Loop:
___
foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
if ($flavour =~ /le$/o) { # little-endian
s/le\?//o or
s/be\?/#be#/o;
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
index 0886d21807..dcd5f595d2 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -27,11 +34,21 @@
# Apple A7 0.92 5.62
# Cortex-A53 1.01 8.39
# Cortex-A57 1.17 7.61
+# Denver 0.71 6.02
+# Mongoose 1.10 8.06
#
# (*) presented for reference/comparison purposes;
$flavour = shift;
-open STDOUT,">".shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$Xi="x0"; # argument block
$Htbl="x1";
@@ -50,7 +67,11 @@ $code=<<___;
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
-$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
+$code.=<<___ if ($flavour !~ /64/);
+.fpu neon
+.code 32
+#undef __thumb2__
+___
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);