Diffstat (limited to 'deps/openssl/openssl/crypto/modes/asm')
9 files changed, 494 insertions, 96 deletions
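The bulk of the x86_64 churn below follows one pattern: every push and pop of a callee-saved register in the hand-written assembly gains a matching DWARF CFI annotation, so unwinders and debuggers can walk the stack through these routines. A minimal sketch of that pattern follows (not code from the patch; "example_func" is hypothetical, and .cfi_push/.cfi_restore are perlasm shorthands which, as far as I can tell, x86_64-xlate.pl expands into raw .cfi_* directives):

#!/usr/bin/env perl
# Sketch of the CFI-annotation pattern applied throughout
# aesni-gcm-x86_64.pl and ghash-x86_64.pl in this patch.
my $code = <<'___';
example_func:
.cfi_startproc
	lea	(%rsp),%rax		# save stack pointer
.cfi_def_cfa_register	%rax		# track CFA via %rax while %rsp moves
	push	%rbx
.cfi_push	%rbx			# adjust CFA offset, record %rbx save slot
	# ... function body ...
	mov	-8(%rax),%rbx
.cfi_restore	%rbx			# %rbx is live in its register again
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
	ret
.cfi_endproc
___
print $code;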
diff --git a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl index 5ad62b3979..b42016101e 100644 --- a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -35,6 +35,8 @@ # Applications using the EVP interface will observe a few percent # worse performance.] # +# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). +# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf @@ -116,23 +118,6 @@ _aesni_ctr32_ghash_6x: vpxor $rndkey,$inout3,$inout3 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 - - # At this point, the current block of 96 (0x60) bytes has already been - # loaded into registers. Concurrently with processing it, we want to - # load the next 96 bytes of input for the next round. Obviously, we can - # only do this if there are at least 96 more bytes of input beyond the - # input we're currently processing, or else we'd read past the end of - # the input buffer. Here, we set |%r12| to 96 if there are at least 96 - # bytes of input beyond the 96 bytes we're already processing, and we - # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96, - # we'll read in the next block so that it is in registers for the next - # loop iteration. In the case where we set |%r12| to 0, we'll re-read - # the current block and then ignore what we re-read. - # - # At this point, |$in0| points to the current (already read into - # registers) block, and |$end0| points to 2*96 bytes before the end of - # the input. Thus, |$in0| > |$end0| means that we do not have the next - # 96-byte block to read in, and |$in0| <= |$end0| means we do. xor %r12,%r12 cmp $in0,$end0 @@ -424,20 +409,25 @@ $code.=<<___; .type aesni_gcm_decrypt,\@function,6 .align 32 aesni_gcm_decrypt: +.cfi_startproc xor $ret,$ret - - # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60) - # bytes of input. cmp \$0x60,$len # minimal accepted length jb .Lgcm_dec_abort lea (%rsp),%rax # save stack pointer +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -482,15 +472,7 @@ $code.=<<___; vmovdqu 0x50($inp),$Z3 # I[5] lea ($inp),$in0 vmovdqu 0x40($inp),$Z0 - - # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) - # bytes before the end of the input. Note, in particular, that this is - # correct even if |$len| is not an even multiple of 96 or 16. XXX: This - # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must - # not be near the very beginning of the address space when |$len| < 2*96 - # (0xc0). 
lea -0xc0($inp,$len),$end0 - vmovdqu 0x30($inp),$Z1 shr \$4,$len xor $ret,$ret @@ -537,15 +519,23 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lgcm_dec_abort: mov $ret,%rax # return value ret +.cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt ___ @@ -645,21 +635,25 @@ _aesni_ctr32_6x: .type aesni_gcm_encrypt,\@function,6 .align 32 aesni_gcm_encrypt: +.cfi_startproc xor $ret,$ret - - # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of - # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at - # least 96 more bytes of input. cmp \$0x60*3,$len # minimal accepted length jb .Lgcm_enc_abort lea (%rsp),%rax # save stack pointer +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -699,16 +693,7 @@ $code.=<<___; .Lenc_no_key_aliasing: lea ($out),$in0 - - # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) - # bytes before the end of the input. Note, in particular, that this is - # correct even if |$len| is not an even multiple of 96 or 16. Unlike in - # the decryption case, there's no caveat that |$out| must not be near - # the very beginning of the address space, because we know that - # |$len| >= 3*96 from the check above, and so we know - # |$out| + |$len| >= 2*96 (0xc0). lea -0xc0($out,$len),$end0 - shr \$4,$len call _aesni_ctr32_6x @@ -931,15 +916,23 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lgcm_enc_abort: mov $ret,%rax # return value ret +.cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt ___ diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl index 1cf14a6c9f..dcc23f7d7d 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl @@ -54,7 +54,7 @@ # # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software # Polynomial Multiplication on ARM Processors using the NEON Engine. -# +# # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf # ==================================================================== @@ -525,7 +525,7 @@ $code.=<<___; #ifdef __ARMEL__ vrev64.8 $Xl,$Xl #endif - sub $Xi,#16 + sub $Xi,#16 vst1.64 $Xl#hi,[$Xi]! 
@ write out Xi vst1.64 $Xl#lo,[$Xi] diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl index 81e75f71a8..eb9ded91e5 100755 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl @@ -156,7 +156,7 @@ $code.=<<___; ___ ###################################################################### -# "528B" (well, "512B" actualy) streamed GHASH +# "528B" (well, "512B" actually) streamed GHASH # $Xip="in0"; $Htbl="in1"; diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl index 1d6254543b..a614c99c22 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -705,7 +705,7 @@ my $depd = sub { my ($mod,$args) = @_; my $orig = "depd$mod\t$args"; - # I only have ",z" completer, it's impicitly encoded... + # I only have ",z" completer, it's implicitly encoded... if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16); my $cpos=63-$2; @@ -724,6 +724,11 @@ sub assemble { ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; } +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler/) { + $gnuas = 1; +} + foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; if ($SIZE_T==4) { @@ -731,7 +736,12 @@ foreach (split("\n",$code)) { s/cmpb,\*/comb,/; s/,\*/,/; } - s/\bbv\b/bve/ if ($SIZE_T==8); + + s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8); + s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8); + s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8); + s/\bbv\b/bve/ if ($SIZE_T==8); + print $_,"\n"; } diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl index 6e628d8823..17dc375053 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl @@ -80,6 +80,8 @@ $rem_4bit="%r14"; $sp="%r15"; $code.=<<___; +#include "s390x_arch.h" + .text .globl gcm_gmult_4bit @@ -89,12 +91,13 @@ ___ $code.=<<___ if(!$softonly && 0); # hardware is slow for single block... 
larl %r1,OPENSSL_s390xcap_P lghi %r0,0 - lg %r1,24(%r1) # load second word of kimd capabilities vector + lg %r1,S390X_KIMD+8(%r1) # load second word of kimd capabilities + # vector tmhh %r1,0x4000 # check for function 65 jz .Lsoft_gmult stg %r0,16($sp) # arrange 16 bytes of zero input stg %r0,24($sp) - lghi %r0,65 # function 65 + lghi %r0,S390X_GHASH # function 65 la %r1,0($Xi) # H lies right after Xi in gcm128_context la $inp,16($sp) lghi $len,16 @@ -123,10 +126,11 @@ gcm_ghash_4bit: ___ $code.=<<___ if(!$softonly); larl %r1,OPENSSL_s390xcap_P - lg %r0,24(%r1) # load second word of kimd capabilities vector + lg %r0,S390X_KIMD+8(%r1) # load second word of kimd capabilities + # vector tmhh %r0,0x4000 # check for function 65 jz .Lsoft_ghash - lghi %r0,65 # function 65 + lghi %r0,S390X_GHASH # function 65 la %r1,0($Xi) # H lies right after Xi in gcm128_context .long 0xb93e0004 # kimd %r0,$inp brc 1,.-4 # pay attention to "partial completion" @@ -149,7 +153,7 @@ $code.=<<___; lg $Zhi,0+1($Xi) lghi $tmp,0 .Louter: - xg $Zhi,0($inp) # Xi ^= inp + xg $Zhi,0($inp) # Xi ^= inp xg $Zlo,8($inp) xgr $Zhi,$tmp stg $Zlo,8+1($Xi) diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl index cd8458256e..bcbe6e399d 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl @@ -103,14 +103,13 @@ # # Does it make sense to increase Naggr? To start with it's virtually # impossible in 32-bit mode, because of limited register bank -# capacity. Otherwise improvement has to be weighed agiainst slower +# capacity. Otherwise improvement has to be weighed against slower # setup, as well as code size and complexity increase. As even # optimistic estimate doesn't promise 30% performance improvement, # there are currently no plans to increase Naggr. # -# Special thanks to David Woodhouse <dwmw2@infradead.org> for -# providing access to a Westmere-based system on behalf of Intel -# Open Source Technology Centre. +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. # January 2010 # @@ -139,7 +138,7 @@ require "x86asm.pl"; $output=pop; open STDOUT,">$output"; -&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); +&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); $sse2=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -811,7 +810,7 @@ sub mmx_loop() { &bswap ($dat); &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 &bswap ("ebx"); - + &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? &jne (&label("outer")); } @@ -915,7 +914,7 @@ my ($Xhi,$Xi) = @_; &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T1,8); # + &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # @@ -1085,7 +1084,7 @@ my ($Xhi,$Xi) = @_; &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T1,8); # + &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # &pshufd ($T1,$Xhn,0b01001110); diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl index 387e3f854e..afc30c3e72 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl @@ -44,9 +44,8 @@ # See ghash-x86.pl for background information and details about coding # techniques. 
# -# Special thanks to David Woodhouse <dwmw2@infradead.org> for -# providing access to a Westmere-based system on behalf of Intel -# Open Source Technology Centre. +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # @@ -74,6 +73,7 @@ # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 @@ -86,6 +86,8 @@ # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # +# Knights Landing achieves 1.09 cpb. +# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest $flavour = shift; @@ -236,9 +238,21 @@ $code=<<___; .type gcm_gmult_4bit,\@function,2 .align 16 gcm_gmult_4bit: +.cfi_startproc push %rbx - push %rbp # %rbp and %r12 are pushed exclusively in +.cfi_push %rbx + push %rbp # %rbp and others are pushed exclusively in +.cfi_push %rbp push %r12 # order to reuse Win64 exception handler... +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzb 15($Xi),$Zlo @@ -249,10 +263,15 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - mov 16(%rsp),%rbx - lea 24(%rsp),%rsp + lea 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: ret +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ @@ -266,13 +285,21 @@ $code.=<<___; .type gcm_ghash_4bit,\@function,4 .align 16 gcm_ghash_4bit: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: mov $inp,%r14 # reassign couple of args mov $len,%r15 @@ -400,16 +427,25 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - lea 280(%rsp),%rsi - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + lea 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: ret +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit ___ @@ -469,7 +505,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # @@ -583,7 +619,7 @@ ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); # experimental alternative. special thing about is that there - # no dependency between the two multiplications... + # no dependency between the two multiplications... 
mov \$`0xE1<<1`,%eax mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff mov \$0x07,%r11d @@ -758,7 +794,7 @@ $code.=<<___; movdqa $T2,$T1 # pslldq \$8,$T2 pclmulqdq \$0x00,$Hkey2,$Xln - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # movdqu 0($inp),$T1 @@ -894,7 +930,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # @@ -1648,14 +1684,20 @@ se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue - lea 24(%rax),%rax # adjust "rsp" + lea 48+280(%rax),%rax # adjust "rsp" mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 .Lin_prologue: mov 8(%rax),%rdi diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl index f0598cb28c..6a2ac71295 100755 --- a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -23,13 +23,14 @@ # Relative comparison is therefore more informative. This initial # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x # faster than "4-bit" integer-only compiler-generated 64-bit code. -# "Initial version" means that there is room for futher improvement. +# "Initial version" means that there is room for further improvement. # May 2016 # # 2x aggregated reduction improves performance by 50% (resulting # performance on POWER8 is 1 cycle per processed byte), and 4x # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb). +# POWER9 delivers 0.51 cpb. $flavour=shift; $output =shift; diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl index e13c709019..47e8820080 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl @@ -19,23 +19,29 @@ # June 2014 # # Initial version was developed in tight cooperation with Ard -# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from -# other assembly modules. Just like aesv8-armx.pl this module -# supports both AArch32 and AArch64 execution modes. +# Biesheuvel of Linaro from bits-n-pieces from other assembly modules. +# Just like aesv8-armx.pl this module supports both AArch32 and +# AArch64 execution modes. # # July 2014 # # Implement 2x aggregated reduction [see ghash-x86.pl for background # information]. # +# November 2017 +# +# AArch64 register bank to "accommodate" 4x aggregated reduction and +# improve performance by 20-70% depending on processor. 
+# # Current performance in cycles per processed byte: # -# PMULL[2] 32-bit NEON(*) -# Apple A7 0.92 5.62 -# Cortex-A53 1.01 8.39 -# Cortex-A57 1.17 7.61 -# Denver 0.71 6.02 -# Mongoose 1.10 8.06 +# 64-bit PMULL 32-bit PMULL 32-bit NEON(*) +# Apple A7 0.58 0.92 5.62 +# Cortex-A53 0.85 1.01 8.39 +# Cortex-A57 0.73 1.17 7.61 +# Denver 0.51 0.65 6.02 +# Mongoose 0.65 1.10 8.06 +# Kryo 0.76 1.16 8.00 # # (*) presented for reference/comparison purposes; @@ -131,8 +137,56 @@ gcm_init_v8: vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing veor $t1,$t1,$H2 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed - vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2] + vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2] +___ +if ($flavour =~ /64/) { +my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); +$code.=<<___; + @ calculate H^3 and H^4 + vpmull.p64 $Xl,$H, $H2 + vpmull.p64 $Yl,$H2,$H2 + vpmull2.p64 $Xh,$H, $H2 + vpmull2.p64 $Yh,$H2,$H2 + vpmull.p64 $Xm,$t0,$t1 + vpmull.p64 $Ym,$t1,$t1 + + vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing + vext.8 $t1,$Yl,$Yh,#8 + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t0 + veor $t3,$Yl,$Yh + veor $Ym,$Ym,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + veor $Ym,$Ym,$t3 + vpmull.p64 $t3,$Yl,$xC2 + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Yh#lo,$Ym#hi + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vmov $Ym#hi,$Yl#lo + veor $Xl,$Xm,$t2 + veor $Yl,$Ym,$t3 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vext.8 $t3,$Yl,$Yl,#8 + vpmull.p64 $Xl,$Xl,$xC2 + vpmull.p64 $Yl,$Yl,$xC2 + veor $t2,$t2,$Xh + veor $t3,$t3,$Yh + veor $H, $Xl,$t2 @ H^3 + veor $H2,$Yl,$t3 @ H^4 + + vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing + vext.8 $t1,$H2,$H2,#8 + veor $t0,$t0,$H + veor $t1,$t1,$H2 + vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed + vst1.64 {$H-$H2},[x0] @ store Htable[3..5] +___ +} +$code.=<<___; ret .size gcm_init_v8,.-gcm_init_v8 ___ @@ -201,6 +255,10 @@ $code.=<<___; .align 4 gcm_ghash_v8: ___ +$code.=<<___ if ($flavour =~ /64/); + cmp $len,#64 + b.hs .Lgcm_ghash_v8_4x +___ $code.=<<___ if ($flavour !~ /64/); vstmdb sp!,{d8-d15} @ 32-bit ABI says so ___ @@ -210,13 +268,13 @@ $code.=<<___; @ loaded value would have @ to be rotated in order to @ make it appear as in - @ alorithm specification + @ algorithm specification subs $len,$len,#32 @ see if $len is 32 or larger mov $inc,#16 @ $inc is used as post- @ increment for input pointer; @ as loop is modulo-scheduled @ $inc is zeroed just in time - @ to preclude oversteping + @ to preclude overstepping @ inp[len], which means that @ last block[s] are actually @ loaded twice, but last @@ -348,7 +406,297 @@ $code.=<<___; ret .size gcm_ghash_v8,.-gcm_ghash_v8 ___ + +if ($flavour =~ /64/) { # 4x subroutine +my ($I0,$j1,$j2,$j3, + $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23)); + +$code.=<<___; +.type gcm_ghash_v8_4x,%function +.align 4 +gcm_ghash_v8_4x: +.Lgcm_ghash_v8_4x: + vld1.64 {$Xl},[$Xi] @ load [rotated] Xi + vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2 + vmov.i8 $xC2,#0xe1 + vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4 + vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant + + vld1.64 {$I0-$j3},[$inp],#64 +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl + vrev64.8 $j1,$j1 + vrev64.8 $j2,$j2 + vrev64.8 $j3,$j3 + vrev64.8 $I0,$I0 +#endif + vext.8 $I3,$j3,$j3,#8 + vext.8 $I2,$j2,$j2,#8 + vext.8 $I1,$j1,$j1,#8 + + vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 + veor $j3,$j3,$I3 + vpmull2.p64 $Yh,$H,$I3 + vpmull.p64 $Ym,$Hhl,$j3 + + vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 + veor $j2,$j2,$I2 + vpmull2.p64 $I2,$H2,$I2 + vpmull2.p64 
$j2,$Hhl,$j2 + + veor $Yl,$Yl,$t0 + veor $Yh,$Yh,$I2 + veor $Ym,$Ym,$j2 + + vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 + veor $j1,$j1,$I1 + vpmull2.p64 $I1,$H3,$I1 + vpmull.p64 $j1,$H34,$j1 + + veor $Yl,$Yl,$j3 + veor $Yh,$Yh,$I1 + veor $Ym,$Ym,$j1 + + subs $len,$len,#128 + b.lo .Ltail4x + + b .Loop4x + +.align 4 +.Loop4x: + veor $t0,$I0,$Xl + vld1.64 {$I0-$j3},[$inp],#64 + vext.8 $IN,$t0,$t0,#8 +#ifndef __ARMEB__ + vrev64.8 $j1,$j1 + vrev64.8 $j2,$j2 + vrev64.8 $j3,$j3 + vrev64.8 $I0,$I0 +#endif + + vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H4,$IN + vext.8 $I3,$j3,$j3,#8 + vpmull2.p64 $Xm,$H34,$t0 + + veor $Xl,$Xl,$Yl + veor $Xh,$Xh,$Yh + vext.8 $I2,$j2,$j2,#8 + veor $Xm,$Xm,$Ym + vext.8 $I1,$j1,$j1,#8 + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 + veor $j3,$j3,$I3 + veor $Xm,$Xm,$t1 + vpmull2.p64 $Yh,$H,$I3 + veor $Xm,$Xm,$t2 + vpmull.p64 $Ym,$Hhl,$j3 + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 + veor $j2,$j2,$I2 + vpmull2.p64 $I2,$H2,$I2 + veor $Xl,$Xm,$t2 + vpmull2.p64 $j2,$Hhl,$j2 + + veor $Yl,$Yl,$t0 + veor $Yh,$Yh,$I2 + veor $Ym,$Ym,$j2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 + veor $j1,$j1,$I1 + veor $t2,$t2,$Xh + vpmull2.p64 $I1,$H3,$I1 + vpmull.p64 $j1,$H34,$j1 + + veor $Xl,$Xl,$t2 + veor $Yl,$Yl,$j3 + veor $Yh,$Yh,$I1 + vext.8 $Xl,$Xl,$Xl,#8 + veor $Ym,$Ym,$j1 + + subs $len,$len,#64 + b.hs .Loop4x + +.Ltail4x: + veor $t0,$I0,$Xl + vext.8 $IN,$t0,$t0,#8 + + vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H4,$IN + vpmull2.p64 $Xm,$H34,$t0 + + veor $Xl,$Xl,$Yl + veor $Xh,$Xh,$Yh + veor $Xm,$Xm,$Ym + + adds $len,$len,#64 + b.eq .Ldone4x + + cmp $len,#32 + b.lo .Lone + b.eq .Ltwo +.Lthree: + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$I0-$j2},[$inp] + veor $Xm,$Xm,$t2 +#ifndef __ARMEB__ + vrev64.8 $j1,$j1 + vrev64.8 $j2,$j2 + vrev64.8 $I0,$I0 +#endif + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vext.8 $I2,$j2,$j2,#8 + vext.8 $I1,$j1,$j1,#8 + veor $Xl,$Xm,$t2 + + vpmull.p64 $Yl,$H,$I2 @ H·Ii+2 + veor $j2,$j2,$I2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + vpmull2.p64 $Yh,$H,$I2 + vpmull.p64 $Ym,$Hhl,$j2 + veor $Xl,$Xl,$t2 + vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1 + veor $j1,$j1,$I1 + vext.8 $Xl,$Xl,$Xl,#8 + + vpmull2.p64 $I1,$H2,$I1 + veor $t0,$I0,$Xl + vpmull2.p64 $j1,$Hhl,$j1 + vext.8 $IN,$t0,$t0,#8 + + veor $Yl,$Yl,$j3 + veor $Yh,$Yh,$I1 + veor $Ym,$Ym,$j1 + + vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii) + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H3,$IN + vpmull.p64 $Xm,$H34,$t0 + + veor $Xl,$Xl,$Yl + veor $Xh,$Xh,$Yh + veor $Xm,$Xm,$Ym + b .Ldone4x + +.align 4 +.Ltwo: + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$I0-$j1},[$inp] + veor $Xm,$Xm,$t2 +#ifndef __ARMEB__ + vrev64.8 $j1,$j1 + vrev64.8 $I0,$I0 +#endif + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vext.8 $I1,$j1,$j1,#8 + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + vext.8 $Xl,$Xl,$Xl,#8 + + 
vpmull.p64 $Yl,$H,$I1 @ H·Ii+1 + veor $j1,$j1,$I1 + + veor $t0,$I0,$Xl + vext.8 $IN,$t0,$t0,#8 + + vpmull2.p64 $Yh,$H,$I1 + vpmull.p64 $Ym,$Hhl,$j1 + + vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii) + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H2,$IN + vpmull2.p64 $Xm,$Hhl,$t0 + + veor $Xl,$Xl,$Yl + veor $Xh,$Xh,$Yh + veor $Xm,$Xm,$Ym + b .Ldone4x + +.align 4 +.Lone: + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$I0},[$inp] + veor $Xm,$Xm,$t2 +#ifndef __ARMEB__ + vrev64.8 $I0,$I0 +#endif + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + vext.8 $Xl,$Xl,$Xl,#8 + + veor $t0,$I0,$Xl + vext.8 $IN,$t0,$t0,#8 + + vpmull.p64 $Xl,$H,$IN + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H,$IN + vpmull.p64 $Xm,$Hhl,$t0 + +.Ldone4x: + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + vext.8 $Xl,$Xl,$Xl,#8 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x +___ + } +} + $code.=<<___; .asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 @@ -360,7 +708,8 @@ if ($flavour =~ /64/) { ######## 64-bit code my $arg=shift; $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && - sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; + sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, + $3<8?$3:$3+8,($4 eq "lo")?0:1; } foreach(split("\n",$code)) { s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or @@ -375,7 +724,7 @@ if ($flavour =~ /64/) { ######## 64-bit code s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers s/@\s/\/\//o; # old->new style commentary - # fix up remainig legacy suffixes + # fix up remaining legacy suffixes s/\.[ui]?8(\s)/$1/o; s/\.[uis]?32//o and s/\.16b/\.4s/go; m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument @@ -415,7 +764,7 @@ if ($flavour =~ /64/) { ######## 64-bit code s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers s/\/\/\s?/@ /o; # new->old style commentary - # fix up remainig new-style suffixes + # fix up remaining new-style suffixes s/\],#[0-9]+/]!/o; s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or |
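For reference, the bound documented by the comments deleted from aesni-gcm-x86_64.pl works like this: "lea -0xc0($inp,$len),$end0" leaves $end0 pointing 2*96 bytes before the end of the input, and _aesni_ctr32_ghash_6x compares the current block pointer $in0 against it to decide whether the next 96-byte block can be prefetched while the current one is processed. A small Perl model of that check, with hypothetical names and plain byte offsets standing in for pointers:

#!/usr/bin/env perl
# Model of the lookahead decision described by the deleted comments:
#   lea -0xc0($inp,$len),$end0   # $end0 = $inp + $len - 2*96
#   cmp $in0,$end0               # next block readable iff $in0 <= $end0
use strict;
use warnings;

sub lookahead_r12 {
    my ($in0, $inp, $len) = @_;
    my $end0 = $inp + $len - 0xc0;   # 2*96 bytes before end of input
    return $in0 <= $end0 ? 96 : 0;   # value the assembly loads into %r12
}

# Three 96-byte blocks starting at offset 0:
print lookahead_r12(  0, 0, 288), "\n";   # 96: two more blocks ahead
print lookahead_r12( 96, 0, 288), "\n";   # 96: exactly one block ahead
print lookahead_r12(192, 0, 288), "\n";   # 0: last block, current block is re-read

The deleted decrypt-side comment flagged the edge case this model makes visible: for 0x60 <= $len < 0xc0 the subtraction wraps below $inp (the integer model simply goes negative), which is only harmless if $inp sits at least 0xc0 into the address space. The encrypt path has no such caveat because it rejects $len < 3*96 up front, so $out + $len >= 2*96 always holds.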