diff options
author | Shigeki Ohtsu <ohtsu@ohtsu.org> | 2018-03-29 16:39:12 +0900 |
---|---|---|
committer | Shigeki Ohtsu <ohtsu@ohtsu.org> | 2018-04-10 06:45:42 +0900 |
commit | 66cb29e64621fdd1aa5e377a395ff107d21a613b (patch) | |
tree | f05243a51577e04b6f1c4a2f8a6b7b2f05786079 /deps/openssl/openssl/crypto/sha/asm | |
parent | 38c97f5dc7ff3fbf83982d0268fc9e93cfc00c7d (diff) | |
download | android-node-v8-66cb29e64621fdd1aa5e377a395ff107d21a613b.tar.gz android-node-v8-66cb29e64621fdd1aa5e377a395ff107d21a613b.tar.bz2 android-node-v8-66cb29e64621fdd1aa5e377a395ff107d21a613b.zip |
deps: upgrade openssl sources to 1.1.0h
This updates all sources in deps/openssl/openssl with openssl-1.1.0h.
Fixes: https://github.com/nodejs/node/issues/4270
PR-URL: https://github.com/nodejs/node/pull/19794
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Rod Vagg <rod@vagg.org>
Reviewed-By: Michael Dawson <michael_dawson@ca.ibm.com>
Diffstat (limited to 'deps/openssl/openssl/crypto/sha/asm')
32 files changed, 1596 insertions, 135 deletions
diff --git a/deps/openssl/openssl/crypto/sha/asm/README b/deps/openssl/openssl/crypto/sha/asm/README deleted file mode 100644 index b7e755765f..0000000000 --- a/deps/openssl/openssl/crypto/sha/asm/README +++ /dev/null @@ -1 +0,0 @@ -C2.pl works diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-586.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-586.pl index e0b5d83b62..5adca23404 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-586.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -113,6 +120,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output=pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); $xmm=$ymm=0; @@ -647,7 +657,7 @@ my $_ror=sub { &ror(@_) }; &jmp (&label("loop")); ###################################################################### -# SSE instruction sequence is first broken to groups of indepentent +# SSE instruction sequence is first broken to groups of independent # instructions, independent in respect to their inputs and shifter # (not all architectures have more than one). Then IALU instructions # are "knitted in" between the SSE groups. Distance is maintained for @@ -656,14 +666,14 @@ my $_ror=sub { &ror(@_) }; # # Temporary registers usage. X[2] is volatile at the entry and at the # end is restored from backtrace ring buffer. X[3] is expected to -# contain current K_XX_XX constant and is used to caclulate X[-1]+K +# contain current K_XX_XX constant and is used to calculate X[-1]+K # from previous round, it becomes volatile the moment the value is # saved to stack for transfer to IALU. X[4] becomes volatile whenever # X[-4] is accumulated and offloaded to backtrace ring buffer, at the # end it is loaded with next K_XX_XX [which becomes X[3] in next # round]... # -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1186,7 +1196,7 @@ my $_ror=sub { &shrd(@_[0],@_) }; &and (@T[0],@T[1]); &jmp (&label("loop")); -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1474,3 +1484,5 @@ sub Xtail_avx() &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-alpha.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-alpha.pl index 6c4b9251fd..4124958f78 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-alpha.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-alpha.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -317,6 +324,6 @@ $code.=<<___; .ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>" .align 2 ___ -$output=shift and open STDOUT,">$output"; +$output=pop and open STDOUT,">$output"; print $code; close STDOUT; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-armv4-large.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-armv4-large.pl index b2c30322c3..7ff5bfbba6 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-armv4-large.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-armv4-large.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -60,14 +67,28 @@ # is ~2.5x larger and there are some redundant instructions executed # when processing last block, improvement is not as big for smallest # blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per -# byte, which is also >80% faster than integer-only code. +# byte, which is also >80% faster than integer-only code. Cortex-A15 +# is even faster spending 5.6 cycles per byte outperforming integer- +# only code by factor of 2. # May 2014. # # Add ARMv8 code path performing at 2.35 cpb on Apple A7. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $inp="r1"; @@ -167,7 +188,12 @@ $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) +.syntax unified +.thumb +#else .code 32 +#endif .global sha1_block_data_order .type sha1_block_data_order,%function @@ -175,9 +201,13 @@ $code=<<___; .align 5 sha1_block_data_order: #if __ARM_MAX_ARCH__>=7 - sub r3,pc,#8 @ sha1_block_data_order +.Lsha1_block: + adr r3,.Lsha1_block ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#ARMV8_SHA1 bne .LARMv8 tst r12,#ARMV7_NEON @@ -199,7 +229,12 @@ for($i=0;$i<5;$i++) { &BODY_00_15(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_00_15 @ [((11+4)*5+2)*3] sub sp,sp,#25*4 ___ @@ -218,7 +253,12 @@ for($i=0;$i<5;$i++) { &BODY_20_39(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp @ preserve carry +#endif bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes @@ -230,7 +270,12 @@ for($i=0;$i<5;$i++) { &BODY_40_59(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_40_59 @ [+((12+5)*5+2)*4] ldr $K,.LK_60_79 @@ -266,7 +311,7 @@ $code.=<<___; .LK_60_79: .word 0xca62c1d6 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha1_block_data_order +.word OPENSSL_armcap_P-.Lsha1_block #endif .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 5 @@ -441,6 +486,7 @@ sub Xuplast_80 () &teq ($inp,$len); &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX + &it ("eq"); &subeq ($inp,$inp,64); # reload last block to avoid SEGV &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); eval(shift(@insns)); @@ -491,12 +537,12 @@ sha1_block_data_order_neon: @ dmb @ errata #451034 on early Cortex A8 @ vstmdb sp!,{d8-d15} @ ABI specification says so mov $saved_sp,sp - sub sp,sp,#64 @ alloca + sub $Xfer,sp,#64 adr $K_XX_XX,.LK_00_19 - bic sp,sp,#15 @ align for 128-bit stores + bic $Xfer,$Xfer,#15 @ align for 128-bit stores ldmia $ctx,{$a,$b,$c,$d,$e} @ load context - mov $Xfer,sp + mov sp,$Xfer @ alloca vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned veor $zero,$zero,$zero @@ -543,10 +589,13 @@ $code.=<<___; add $b,$b,$t0 add $c,$c,$t1 add $d,$d,$Xfer + it eq moveq sp,$saved_sp add $e,$e,$Ki + it ne ldrne $Ki,[sp] stmia $ctx,{$a,$b,$c,$d,$e} + itt ne addne $Xfer,sp,#3*16 bne .Loop_neon @@ -567,6 +616,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); $code.=<<___; #if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + .type sha1_block_data_order_armv8,%function .align 5 sha1_block_data_order_armv8: @@ -660,7 +716,10 @@ ___ # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( - sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + + # this fix-up provides Thumb encoding in conjunction with INST + $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000); + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-armv8.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-armv8.pl index c04432a543..84a00bf2af 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-armv8.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-armv8.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -18,13 +25,22 @@ # Cortex-A57 2.35 7.88 (+74%) # Denver 2.13 3.97 (+0%)(**) # X-Gene 8.80 (+200%) +# Mongoose 2.05 6.50 (+160%) # # (*) Software results are presented mostly for reference purposes. # (**) Keep in mind that Denver relies on binary translation, which # optimizes compiler output at run-time. $flavour = shift; -open STDOUT,">".shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; ($ctx,$inp,$num)=("x0","x1","x2"); @Xw=map("w$_",(3..17,19)); @@ -158,11 +174,16 @@ $code.=<<___; .text +.extern OPENSSL_armcap_P .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 sha1_block_data_order: +#ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +#else ldr x16,.LOPENSSL_armcap_P +#endif adr x17,.LOPENSSL_armcap_P add x16,x16,x17 ldr w16,[x16] @@ -300,7 +321,11 @@ $code.=<<___; .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 .LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else .quad OPENSSL_armcap_P-. +#endif .asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 .comm OPENSSL_armcap_P,4,4 diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-c64xplus.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-c64xplus.pl new file mode 100644 index 0000000000..4db2bcb06b --- /dev/null +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-c64xplus.pl @@ -0,0 +1,337 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for C64x+. +# +# November 2011 +# +# If compared to compiler-generated code with similar characteristics, +# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs, +# this implementation is 25% smaller and >2x faster. In absolute terms +# performance is (quite impressive) ~6.5 cycles per processed byte. +# Fully unrolled assembler would be ~5x larger and is likely to be +# ~15% faster. It would be free from references to intermediate ring +# buffer, but put more pressure on L1P [both because the code would be +# larger and won't be using SPLOOP buffer]. There are no plans to +# realize fully unrolled variant though... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments + +($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25)); +($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27"); +($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31)); +($XPA,$XPB) = ("A5","B5"); # X circular buffer +($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM + +$code=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg sha1_block_data_order,_sha1_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha1_block_data_order +_sha1_block_data_order: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] LDW *${CTX}[0],$A ; load A-E... +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + [A0] LDW *${CTX}[1],$B +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + [A0] LDW *${CTX}[2],$C +|| [A0] MVK 0x00404,B0 + [A0] LDW *${CTX}[3],$D +|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB] + [A0] LDW *${CTX}[4],$E +|| [A0] MVC B0,AMR ; setup circular addressing + LDNW *${INP}++,$TX1 ; pre-fetch input + NOP 1 + +loop?: + MVK 0x00007999,$K +|| ADDAW SP,2,$XPA +|| SUB A0,1,A0 +|| MVK 13,B0 + MVKH 0x5a820000,$K ; K_00_19 +|| ADDAW SP,2,$XPB +|| MV $A,$Actx +|| MV $B,$Bctx +;;================================================== + SPLOOPD 5 ; BODY_00_13 +|| MV $C,$Cctx +|| MV $D,$Dctx +|| MV $E,$Ectx +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX3 ; byte swap + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A + + ADD $TX3,$T,$A ; A=T+Xi +|| STW $TX3,*${XPB}++ + SPKERNEL +;;================================================== + ROTL $A,5,$Arot ; BODY_14 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are +|| LDW *${XPB}[4],$X2 ; 2 iterations ahead + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +;;================================================== + ROTL $A,5,$Arot ; BODY_15 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| MVK 3,B0 +;;================================================== + SPLOOPD 5 ; BODY_16_19 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 + SPKERNEL + + MVK 0xffffeba1,$K +|| MVK 19,B0 + MVKH 0x6ed90000,$K ; K_20_39 +___ +sub BODY_20_39 { +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_20_39 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| XOR $B,$C,$F +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $D,$F,$F ; F_20_39(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ ; last one is redundant +|| XOR $TX0,$TX1,$TX1 + SPKERNEL +___ +$code.=<<___ if (!shift); + MVK 0xffffbcdc,$K + MVKH 0x8f1b0000,$K ; K_40_59 +___ +} &BODY_20_39(); +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_40_59 +|| MVC B0,ILC +|| AND $B,$C,$F +|| AND $B,$D,$F0 + + ROTL $A,5,$Arot +|| XOR $F0,$F,$F +|| AND $C,$D,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_40_59(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_40_59(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| AND $B,$C,$F +|| AND $B,$D,$F0 + SPKERNEL + + MVK 0xffffc1d6,$K +|| MVK 18,B0 + MVKH 0xca620000,$K ; K_60_79 +___ + &BODY_20_39(-1); # BODY_60_78 +$code.=<<___; +;;================================================== + [A0] B loop? +|| ROTL $A,5,$Arot ; BODY_79 +|| XOR $B,$C,$F +|| ROTL $TX1,1,$TX2 ; Xupdate output + + [A0] LDNW *${INP}++,$TX1 ; pre-fetch input +|| ADD $K,$E,$T ; T=E+K +|| XOR $D,$F,$F ; F_20_39(B,C,D) + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ADD $Ectx,$D,$E ; E=D,E+=Ectx +|| ADD $Dctx,$C,$D ; D=C,D+=Dctx +|| ROTL $B,30,$C ; C=ROL(B,30) + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| ADD $Bctx,$A,$B ; B=A,B+=Bctx + + ADD $TX2,$T,$A ; A=T+Xi + + ADD $Actx,$A,$A ; A+=Actx +|| ADD $Cctx,$C,$C ; C+=Cctx +;; end of loop? + + BNOP RA ; return +|| MV FP,SP ; restore stack pointer +|| LDW *FP[0],FP ; restore frame pointer + STW $A,*${CTX}[0] ; emit A-E... +|| MVK 0,B0 + STW $B,*${CTX}[1] +|| MVC B0,AMR ; clear AMR + STW $C,*${CTX}[2] + STW $D,*${CTX}[3] + STW $E,*${CTX}[4] + .endasmfunc + + .sect .const + .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 +___ + +print $code; +close STDOUT; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-ia64.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-ia64.pl index 02d35d1614..dec21f92d5 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-ia64.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-ia64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -14,6 +21,8 @@ # Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which # is >50% better than HP C and >2x better than gcc. +$output = pop; + $code=<<___; .ident \"sha1-ia64.s, version 1.3\" .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" @@ -301,5 +310,5 @@ $code.=<<___; stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>" ___ -$output=shift and open STDOUT,">$output"; +open STDOUT,">$output" if $output; print $code; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-mb-x86_64.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-mb-x86_64.pl index a8d8708d4b..51c73c05ac 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-mb-x86_64.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-mb-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -19,6 +26,7 @@ # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% # Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% +# Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145% # Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% # # (i) multi-block CBC encrypt with 128-bit key; @@ -62,7 +70,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void sha1_multi_block ( @@ -478,7 +486,7 @@ $code.=<<___; jnz .Loop_grande .Ldone: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 @@ -538,7 +546,7 @@ $code.=<<___; movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap .Loop_grande_shaext: - mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num + mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<2;$i++) { @@ -1116,7 +1124,7 @@ $code.=<<___; jnz .Loop_grande_avx .Ldone_avx: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp vzeroupper ___ $code.=<<___ if ($win64); @@ -1271,7 +1279,7 @@ $code.=<<___; #jnz .Loop_grande_avx2 .Ldone_avx2: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp vzeroupper ___ $code.=<<___ if ($win64); diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-mips.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-mips.pl index 3408493899..882f9731cf 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-mips.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-mips.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -70,7 +77,7 @@ if ($flavour =~ /64|n32/i) { $big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); -for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; if (!defined($big_endian)) @@ -325,8 +332,8 @@ $code.=<<___ if ($i<79); ___ } -$FRAMESIZE=16; # large enough to accomodate NUBI saved registers -$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; +$FRAMESIZE=16; # large enough to accommodate NUBI saved registers +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000"; $code=<<___; #ifdef OPENSSL_FIPSCANISTER diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-parisc.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-parisc.pl index 6e5a328a6f..a85d126ff0 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-parisc.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-parisc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-ppc.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-ppc.pl index ab655021cc..add5a9ea5c 100755 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-ppc.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -37,7 +44,7 @@ if ($flavour =~ /64/) { $PUSH ="stw"; } else { die "nonsense $flavour"; } -# Define endianess based on flavour +# Define endianness based on flavour # i.e.: linux64le $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-s390x.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-s390x.pl index d5cf1640a1..79df1ffdad 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-s390x.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-s390x.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -28,7 +35,8 @@ # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code -# remains z/Architecture specific. +# remains z/Architecture specific. On z990 it was measured to perform +# 23% better than code generated by gcc 4.3. $kimdfunc=1; # magic function code for kimd instruction @@ -42,7 +50,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $K_00_39="%r0"; $K=$K_00_39; @@ -164,9 +172,6 @@ sha1_block_data_order: ___ $code.=<<___ if ($kimdfunc); larl %r1,OPENSSL_s390xcap_P - lg %r0,0(%r1) - tmhl %r0,0x4000 # check for message-security assist - jz .Lsoftware lg %r0,16(%r1) # check kimd capabilities tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware @@ -234,7 +239,6 @@ $code.=<<___; br %r14 .size sha1_block_data_order,.-sha1_block_data_order .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" -.comm OPENSSL_s390xcap_P,80,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-sparcv9.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-sparcv9.pl index b5efcde5c1..7437ff4f05 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-sparcv9.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-sparcv9.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -25,7 +32,7 @@ # single-process result on 8-core processor, or ~9GBps per 2.85GHz # socket. -$output=shift; +$output=pop; open STDOUT,">$output"; @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7"); @@ -368,7 +375,7 @@ ___ # Purpose of these subroutines is to explicitly encode VIS instructions, # so that one can compile the module without having to specify VIS -# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. # Idea is to reserve for option to produce "universal" binary and let # programmer detect if current CPU is VIS capable at run-time. sub unvis { diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-sparcv9a.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-sparcv9a.pl index e65291bbd9..f9ed5630e8 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-sparcv9a.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-sparcv9a.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -544,7 +551,7 @@ ___ # Purpose of these subroutines is to explicitly encode VIS instructions, # so that one can compile the module without having to specify VIS -# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. # Idea is to reserve for option to produce "universal" binary and let # programmer detect if current CPU is VIS capable at run-time. sub unvis { diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-thumb.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-thumb.pl index 7c9ea9b029..661fd9f9ff 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-thumb.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-thumb.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL diff --git a/deps/openssl/openssl/crypto/sha/asm/sha1-x86_64.pl b/deps/openssl/openssl/crypto/sha/asm/sha1-x86_64.pl index 752138b0ea..6a3378ba4c 100755 --- a/deps/openssl/openssl/crypto/sha/asm/sha1-x86_64.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha1-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -73,13 +80,16 @@ # Sandy Bridge 7.70 6.10/+26% 4.99/+54% # Ivy Bridge 6.06 4.67/+30% 4.60/+32% # Haswell 5.45 4.15/+31% 3.57/+53% +# Skylake 5.18 4.06/+28% 3.54/+46% # Bulldozer 9.11 5.95/+53% # VIA Nano 9.32 7.15/+30% # Atom 10.3 9.17/+12% # Silvermont 13.1(*) 9.37/+40% +# Goldmont 8.13 6.42/+27% 1.70/+380%(**) # # (*) obviously suboptimal result, nothing was done about it, # because SSSE3 code is compiled unconditionally; +# (**) SHAEXT result $flavour = shift; $output = shift; @@ -114,7 +124,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $shaext=1; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $ctx="%rdi"; # 1st arg @@ -541,7 +551,7 @@ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -994,7 +1004,7 @@ $code.=<<___; jmp .Loop_avx ___ -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1466,7 +1476,7 @@ sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path ) } -sub Xupdate_avx2_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx2_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions diff --git a/deps/openssl/openssl/crypto/sha/asm/sha256-586.pl b/deps/openssl/openssl/crypto/sha/asm/sha256-586.pl index e907714381..6af1d84beb 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha256-586.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha256-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -63,6 +70,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output=pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); $xmm=$avx=0; @@ -1279,3 +1289,5 @@ sub bodyx_00_15 () { # +10% &function_end_B("sha256_block_data_order"); &asm_finish(); + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha256-armv4.pl b/deps/openssl/openssl/crypto/sha/asm/sha256-armv4.pl index 750216eb42..55d30cba3a 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha256-armv4.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha256-armv4.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -37,8 +44,20 @@ # # Add ARMv8 code path performing at 2.0 cpb on Apple A7. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $t0="r0"; $inp="r1"; $t4="r1"; @@ -73,7 +92,9 @@ $code.=<<___ if ($i<16); eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` add $a,$a,$t2 @ h+=Maj(a,b,c) from the past eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ rev $t1,$t1 +# endif #else @ ldrb $t1,[$inp,#3] @ $i add $a,$a,$t2 @ h+=Maj(a,b,c) from the past @@ -161,15 +182,11 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 -.code 32 -#else +#if defined(__thumb2__) .syntax unified -# ifdef __thumb2__ .thumb -# else +#else .code 32 -# endif #endif .type K256,%object @@ -195,21 +212,25 @@ K256: .word 0 @ terminator #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha256_block_data_order +.word OPENSSL_armcap_P-.Lsha256_block_data_order #endif .align 5 .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: -#if __ARM_ARCH__<7 +.Lsha256_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha256_block_data_order #else - adr r3,. + adr r3,.Lsha256_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#ARMV8_SHA256 bne .LARMv8 tst r12,#ARMV7_NEON @@ -454,7 +475,8 @@ $code.=<<___; .global sha256_block_data_order_neon .type sha256_block_data_order_neon,%function -.align 4 +.align 5 +.skip 16 sha256_block_data_order_neon: .LNEON: stmdb sp!,{r4-r12,lr} @@ -580,7 +602,7 @@ my $Ktbl="r3"; $code.=<<___; #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -# ifdef __thumb2__ +# if defined(__thumb2__) # define INST(a,b,c,d) .byte c,d|0xc,a,b # else # define INST(a,b,c,d) .byte a,b,c,d @@ -591,14 +613,11 @@ $code.=<<___; sha256_block_data_order_armv8: .LARMv8: vld1.32 {$ABCD,$EFGH},[$ctx] -# ifdef __thumb2__ - adr $Ktbl,.LARMv8 - sub $Ktbl,$Ktbl,#.LARMv8-K256 -# else - adrl $Ktbl,K256 -# endif + sub $Ktbl,$Ktbl,#256+32 add $len,$inp,$len,lsl#6 @ len to point at the end of inp + b .Loop_v8 +.align 4 .Loop_v8: vld1.8 {@MSG[0]-@MSG[1]},[$inp]! vld1.8 {@MSG[2]-@MSG[3]},[$inp]! diff --git a/deps/openssl/openssl/crypto/sha/asm/sha256-c64xplus.pl b/deps/openssl/openssl/crypto/sha/asm/sha256-c64xplus.pl new file mode 100644 index 0000000000..3ab7d9b689 --- /dev/null +++ b/deps/openssl/openssl/crypto/sha/asm/sha256-c64xplus.pl @@ -0,0 +1,320 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256 for C64x+. +# +# January 2012 +# +# Performance is just below 10 cycles per processed byte, which is +# almost 40% faster than compiler-generated code. Unroll is unlikely +# to give more than ~8% improvement... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K256="A3"; + +($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14) + =map("A$_",(16..31)); +($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15) + =map("B$_",(16..31)); + +($Xia,$Xib)=("A5","B5"); # circular/ring buffer + $CTXB=$t2e; + +($Xn,$X0,$K)=("B7","B8","B9"); +($Maj,$Ch)=($T2,"B6"); + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg sha256_block_data_order,_sha256_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg SWAP2,MV + .asg SWAP4,MV + .endif + + .global _sha256_block_data_order +_sha256_block_data_order: +__sha256_block: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] ADDKPC __sha256_block,B2 +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + .if __TI_EABI__ + [A0] MVK 0x00404,B1 +|| [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256 + .else + [A0] MVK 0x00404,B1 +|| [A0] MVKL (K256-__sha256_block),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH (K256-__sha256_block),$K256 + .endif + [A0] MVC B1,AMR ; setup circular addressing +|| [A0] MV SP,$Xia + [A0] MV SP,$Xib +|| [A0] ADD B2,$K256,$K256 +|| [A0] MV $CTXA,$CTXB +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + LDW *${CTXA}[0],$A ; load ctx +|| LDW *${CTXB}[4],$E + LDW *${CTXA}[1],$B +|| LDW *${CTXB}[5],$F + LDW *${CTXA}[2],$C +|| LDW *${CTXB}[6],$G + LDW *${CTXA}[3],$D +|| LDW *${CTXB}[7],$H + + LDNW *$INP++,$Xn ; pre-fetch input + LDW *$K256++,$K ; pre-fetch K256[0] + MVK 14,B0 ; loop counters + MVK 47,B1 +|| ADDAW $Xia,9,$Xia +outerloop?: + SUB A0,1,A0 +|| MV $A,$Actx +|| MV $E,$Ectx +|| MVD $B,$Bctx +|| MVD $F,$Fctx + MV $C,$Cctx +|| MV $G,$Gctx +|| MVD $D,$Dctx +|| MVD $H,$Hctx +|| SWAP4 $Xn,$X0 + + SPLOOPD 8 ; BODY_00_14 +|| MVC B0,ILC +|| SWAP2 $X0,$X0 + + LDNW *$INP++,$Xn +|| ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X14 +|| SWAP4 $Xn,$X0 + SWAP2 $X0,$X0 +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c + MV $B,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + SPKERNEL + + ROTL $A,30,$S0 ; BODY_15 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| LDW *${Xib}[1],$Xn ; modulo-scheduled + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| LDW *${Xib}[2],$X1 ; modulo-scheduled + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X15 + MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; modulo-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled +|| ROTL $B,0,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + + SPLOOPD 10 ; BODY_16_63 +|| MVC B1,ILC +|| ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled + + XOR $t0e,$s0,$s0 +|| XOR $t0a,$s1,$s1 +|| MV $X15,$X14 +|| MV $X1,$Xn + XOR $t1e,$s0,$s0 ; sigma0(X[i+1]) +|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14]) +|| LDW *${Xib}[2],$X1 ; module-scheduled + ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| ADD $X9,$X0,$X0 ; X[i] += X[i+9] + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| ADD $s0,$X0,$X0 ; X[i] += sigma1(X[i+1]) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $H,$K,$T1 ; T1 = h + K256[i] +|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14]) + XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 +|| ADD $X0,$T1,$T1 ; T1 += X[i] +|| STW $X0,*$Xib++ + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) +|| MV $X0,$X15 +|| ROTL $G,0,$H ; h = g +|| LDW *$K256++,$K ; pre-fetch K256[i+1] + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| MV $F,$G ; g = f +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *++$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; module-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $B,$C ; c = b + MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 +|| SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled + SPKERNEL + + [A0] B outerloop? +|| [A0] LDNW *$INP++,$Xn ; pre-fetch input +|| [A0] ADDK -260,$K256 ; rewind K256 +|| ADD $Actx,$A,$A ; accumulate ctx +|| ADD $Ectx,$E,$E +|| ADD $Bctx,$B,$B + ADD $Fctx,$F,$F +|| ADD $Cctx,$C,$C +|| ADD $Gctx,$G,$G +|| ADD $Dctx,$D,$D +|| ADD $Hctx,$H,$H +|| [A0] LDW *$K256++,$K ; pre-fetch K256[0] + + [!A0] BNOP RA +||[!A0] MV $CTXA,$CTXB + [!A0] MV FP,SP ; restore stack pointer +||[!A0] LDW *FP[0],FP ; restore frame pointer + [!A0] STW $A,*${CTXA}[0] ; save ctx +||[!A0] STW $E,*${CTXB}[4] +||[!A0] MVK 0,B0 + [!A0] STW $B,*${CTXA}[1] +||[!A0] STW $F,*${CTXB}[5] +||[!A0] MVC B0,AMR ; clear AMR + STW $C,*${CTXA}[2] +|| STW $G,*${CTXB}[6] + STW $D,*${CTXA}[3] +|| STW $H,*${CTXB}[7] + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 128 +K256: + .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 + +___ + +print $code; +close STDOUT; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha256-mb-x86_64.pl b/deps/openssl/openssl/crypto/sha/asm/sha256-mb-x86_64.pl index 9770286b95..fbcd29f2e8 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha256-mb-x86_64.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha256-mb-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -19,6 +26,7 @@ # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% +# Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170% # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100% # # (i) multi-block CBC encrypt with 128-bit key; @@ -63,7 +71,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void sha256_multi_block ( @@ -382,7 +390,7 @@ $code.=<<___; jnz .Loop_grande .Ldone: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 @@ -442,7 +450,7 @@ $code.=<<___; lea K256_shaext+0x80(%rip),$Tbl .Loop_grande_shaext: - mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num + mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<2;$i++) { @@ -1061,7 +1069,7 @@ $code.=<<___; jnz .Loop_grande_avx .Ldone_avx: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp vzeroupper ___ $code.=<<___ if ($win64); @@ -1237,7 +1245,7 @@ $code.=<<___; #jnz .Loop_grande_avx2 .Ldone_avx2: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp vzeroupper ___ $code.=<<___ if ($win64); diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-586.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-586.pl index 2f6a202c37..3873934b69 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-586.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -29,6 +36,7 @@ # VIA Nano 91 - 52 33 14.7 # Atom 126 - 68 48(***) 14.7 # Silvermont 97 - 58 42(***) 17.5 +# Goldmont 80 - 48 19.5 12.0 # # (*) whichever best applicable. # (**) x86_64 assembler performance is presented for reference @@ -50,6 +58,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output=pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); $sse2=0; @@ -909,3 +920,5 @@ sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2 &asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-armv4.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-armv4.pl index fb7dc506ac..22b5a9d0b1 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-armv4.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-armv4.pl @@ -1,10 +1,19 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPL terms is granted. # ==================================================================== # SHA512 block procedure for ARMv4. September 2007. @@ -34,16 +43,9 @@ # terms it's 22.6 cycles per byte, which is disappointing result. # Technical writers asserted that 3-way S4 pipeline can sustain # multiple NEON instructions per cycle, but dual NEON issue could -# not be observed, and for NEON-only sequences IPC(*) was found to -# be limited by 1:-( 0.33 and 0.66 were measured for sequences with -# ILPs(*) of 1 and 2 respectively. This in turn means that you can -# even find yourself striving, as I did here, for achieving IPC -# adequate to one delivered by Cortex A8 [for reference, it's -# 0.5 for ILP of 1, and 1 for higher ILPs]. -# -# (*) ILP, instruction-level parallelism, how many instructions -# *can* execute at the same time. IPC, instructions per cycle, -# indicates how many instructions actually execute. +# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. On side note Cortex-A15 processes one byte in +# 16 cycles. # Byte order [in]dependence. ========================================= # @@ -55,8 +57,20 @@ $hi="HI"; $lo="LO"; # ==================================================================== -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; # parameter block $inp="r1"; @@ -143,6 +157,9 @@ $code.=<<___; teq $t0,#$magic ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif orreq $Ktbl,$Ktbl,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @@ -180,7 +197,17 @@ $code.=<<___; ___ } $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + #ifdef __ARMEL__ # define LO 0 # define HI 4 @@ -192,7 +219,14 @@ $code=<<___; #endif .text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else .code 32 +#endif + .type K512,%object .align 5 K512: @@ -237,9 +271,9 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order +.word OPENSSL_armcap_P-.Lsha512_block_data_order .skip 32-4 #else .skip 32 @@ -248,14 +282,22 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha512_block_data_order - add $len,$inp,$len,lsl#7 @ len to point at the end of inp -#if __ARM_MAX_ARCH__>=7 +#else + adr r3,.Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#1 +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON bne .LNEON #endif + add $len,$inp,$len,lsl#7 @ len to point at the end of inp stmdb sp!,{r4-r12,lr} sub $Ktbl,r3,#672 @ K512 sub sp,sp,#9*8 @@ -369,6 +411,9 @@ $code.=<<___; ___ &BODY_00_15(0x17); $code.=<<___; +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] beq .L16_79 @@ -453,6 +498,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif +.size sha512_block_data_order,.-sha512_block_data_order ___ { @@ -559,11 +605,15 @@ $code.=<<___; .arch armv7-a .fpu neon +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function .align 4 +sha512_block_data_order_neon: .LNEON: dmb @ errata #451034 on early Cortex A8 - vstmdb sp!,{d8-d15} @ ABI specification says so - sub $Ktbl,r3,#672 @ K512 + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + adr $Ktbl,K512 + VFP_ABI_PUSH vldmia $ctx,{$A-$H} @ load context .Loop_neon: ___ @@ -588,16 +638,16 @@ $code.=<<___; sub $Ktbl,#640 @ rewind K512 bne .Loop_neon - vldmia sp!,{d8-d15} @ epilogue + VFP_ABI_POP ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon #endif ___ } $code.=<<___; -.size sha512_block_data_order,.-sha512_block_data_order .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .comm OPENSSL_armcap_P,4,4 #endif ___ @@ -605,5 +655,14 @@ ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 $code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + print $code; close STDOUT; # enforce flush diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-armv8.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-armv8.pl index f7b36b986a..c1aaf778f4 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-armv8.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-armv8.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -18,6 +25,7 @@ # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) # Denver 2.01 10.5 (+26%) 6.70 (+8%) # X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) # # (*) Software SHA256 results are of lesser relevance, presented # mostly for informational purposes. @@ -31,7 +39,14 @@ $flavour=shift; $output=shift; -open STDOUT,">$output"; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; if ($output =~ /512/) { $BITS=512; @@ -155,13 +170,18 @@ $code.=<<___; .text +.extern OPENSSL_armcap_P .globl $func .type $func,%function .align 6 $func: ___ $code.=<<___ if ($SZ==4); +#ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +#else ldr x16,.LOPENSSL_armcap_P +#endif adr x17,.LOPENSSL_armcap_P add x16,x16,x17 ldr w16,[x16] @@ -184,7 +204,7 @@ $code.=<<___; ldp $E,$F,[$ctx,#4*$SZ] add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,K$BITS + adr $Ktbl,.LK$BITS stp $ctx,$num,[x29,#96] .Loop: @@ -234,8 +254,8 @@ $code.=<<___; .size $func,.-$func .align 6 -.type K$BITS,%object -K$BITS: +.type .LK$BITS,%object +.LK$BITS: ___ $code.=<<___ if ($SZ==8); .quad 0x428a2f98d728ae22,0x7137449123ef65cd @@ -300,10 +320,14 @@ $code.=<<___ if ($SZ==4); .long 0 //terminator ___ $code.=<<___; -.size K$BITS,.-K$BITS +.size .LK$BITS,.-.LK$BITS .align 3 .LOPENSSL_armcap_P: +#ifdef __ILP32__ + .long OPENSSL_armcap_P-. +#else .quad OPENSSL_armcap_P-. +#endif .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 ___ @@ -325,7 +349,7 @@ sha256_block_armv8: add x29,sp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,K256 + adr $Ktbl,.LK256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-c64xplus.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-c64xplus.pl new file mode 100644 index 0000000000..9ebfc92e23 --- /dev/null +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-c64xplus.pl @@ -0,0 +1,438 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA512 for C64x+. +# +# January 2012 +# +# Performance is 19 cycles per processed byte. Compared to block +# transform function from sha512.c compiled with cl6x with -mv6400+ +# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller. +# Loop unroll won't make it, this implementation, any faster, because +# it's effectively dominated by SHRU||SHL pairs and you can't schedule +# more of them. +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K512="A3"; + +($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi, + $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31)); +($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo, + $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31)); + +($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13)); +($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13)); +($T1hi, $T2hi)= ("A6","A7"); +($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9"); +($Khi,$Klo)=("A9","A8"); +($MAJhi,$MAJlo)=($T2hi,$T2lo); +($t1hi,$t1lo)=($Khi,"B2"); + $CTXB=$t1lo; + +($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg sha512_block_data_order,_sha512_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg $Khi,KHI + .asg $Klo,KLO + .else + .asg $Khi,KLO + .asg $Klo,KHI + .endif + + .global _sha512_block_data_order +_sha512_block_data_order: +__sha512_block: + .asmfunc stack_usage(40+128) + MV $NUM,A0 ; reassign $NUM +|| MVK -128,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--(40) ; save frame pointer +|| [A0] MV SP,FP + [A0] STDW B13:B12,*SP[4] +|| [A0] MVK 0x00404,B1 + [A0] STDW B11:B10,*SP[3] +|| [A0] STDW A13:A12,*FP[-3] +|| [A0] MVKH 0x60000,B1 + [A0] STDW A11:A10,*SP[1] +|| [A0] MVC B1,AMR ; setup circular addressing +|| [A0] ADD B0,SP,SP ; alloca(128) + .if __TI_EABI__ + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC __sha512_block,B1 +|| [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512 + [A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .else + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC __sha512_block,B1 +|| [A0] MVKL (K512-__sha512_block),$K512 + [A0] MVKH (K512-__sha512_block),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .endif + ADDAW SP,3,$Xilo + ADDAW SP,2,$Xihi + +|| MV $CTXA,$CTXB + LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx +|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo +|| ADD B1,$K512,$K512 + LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi +|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo + LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi +|| LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo + LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi +|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo + LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi +|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo + LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi +|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo + LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi +|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo + LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi +|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo + + LDNDW *$INP++,B11:B10 ; pre-fetch input + LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] +outerloop?: + MVK 15,B0 ; loop counters +|| MVK 64,B1 +|| SUB A0,1,A0 + MV $Ahi,$Actxhi +|| MV $Alo,$Actxlo +|| MV $Bhi,$Bctxhi +|| MV $Blo,$Bctxlo +|| MV $Chi,$Cctxhi +|| MV $Clo,$Cctxlo +|| MVD $Dhi,$Dctxhi +|| MVD $Dlo,$Dctxlo + MV $Ehi,$Ectxhi +|| MV $Elo,$Ectxlo +|| MV $Fhi,$Fctxhi +|| MV $Flo,$Fctxlo +|| MV $Ghi,$Gctxhi +|| MV $Glo,$Gctxlo +|| MVD $Hhi,$Hctxhi +|| MVD $Hlo,$Hctxlo +loop0_15?: + .if .BIG_ENDIAN + MV B11,$T1hi +|| MV B10,$T1lo + .else + SWAP4 B10,$T1hi +|| SWAP4 B11,$T1lo + SWAP2 $T1hi,$T1hi +|| SWAP2 $T1lo,$T1lo + .endif +loop16_79?: + STW $T1hi,*$Xihi++[2] +|| STW $T1lo,*$Xilo++[2] ; X[i] = T1 +|| ADD $Hhi,$T1hi,$T1hi +|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h +|| SHRU $Ehi,14,$S1hi +|| SHL $Ehi,32-14,$S1lo + XOR $Fhi,$Ghi,$CHhi +|| XOR $Flo,$Glo,$CHlo +|| ADD KHI,$T1hi,$T1hi +|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i] +|| SHRU $Elo,14,$t0lo +|| SHL $Elo,32-14,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Ehi,$CHhi,$CHhi +|| AND $Elo,$CHlo,$CHlo +|| ROTL $Ghi,0,$Hhi +|| ROTL $Glo,0,$Hlo ; h = g +|| SHRU $Ehi,18,$t0hi +|| SHL $Ehi,32-18,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| XOR $Ghi,$CHhi,$CHhi +|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g +|| ROTL $Fhi,0,$Ghi +|| ROTL $Flo,0,$Glo ; g = f +|| SHRU $Elo,18,$t0lo +|| SHL $Elo,32-18,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| OR $Ahi,$Bhi,$MAJhi +|| OR $Alo,$Blo,$MAJlo +|| ROTL $Ehi,0,$Fhi +|| ROTL $Elo,0,$Flo ; f = e +|| SHRU $Ehi,41-32,$t0lo +|| SHL $Ehi,64-41,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Chi,$MAJhi,$MAJhi +|| AND $Clo,$MAJlo,$MAJlo +|| ROTL $Dhi,0,$Ehi +|| ROTL $Dlo,0,$Elo ; e = d +|| SHRU $Elo,41-32,$t0hi +|| SHL $Elo,64-41,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e) +|| AND $Ahi,$Bhi,$t1hi +|| AND $Alo,$Blo,$t1lo +|| ROTL $Chi,0,$Dhi +|| ROTL $Clo,0,$Dlo ; d = c +|| SHRU $Ahi,28,$S0hi +|| SHL $Ahi,32-28,$S0lo + OR $t1hi,$MAJhi,$MAJhi +|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g) +|| ROTL $Bhi,0,$Chi +|| ROTL $Blo,0,$Clo ; c = b +|| SHRU $Alo,28,$t0lo +|| SHL $Alo,32-28,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e) +|| ROTL $Ahi,0,$Bhi +|| ROTL $Alo,0,$Blo ; b = a +|| SHRU $Ahi,34-32,$t0lo +|| SHL $Ahi,64-34,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $MAJhi,$T1hi,$T2hi +|| ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c) +|| SHRU $Alo,34-32,$t0hi +|| SHL $Alo,64-34,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $Ehi,$T1hi,$T1hi +|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e +|| [B0] BNOP loop0_15? +|| SHRU $Ahi,39-32,$t0lo +|| SHL $Ahi,64-39,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input +||[!B1] BNOP break? +|| SHRU $Alo,39-32,$t0hi +|| SHL $Alo,64-39,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a) +|| ADD $T1carry,$T1hi,$Ehi +|| MV $T1lo,$Elo ; e = T1 +||[!B0] LDW *${Xihi}[28],$T1hi +||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14] + ADD $S0hi,$T2hi,$T2hi +|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a) +|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i] + NOP ; avoid cross-path stall + ADD $T2carry,$T2hi,$Ahi +|| MV $T2lo,$Alo ; a = T2 +|| [B0] SUB B0,1,B0 +;;===== branch to loop00_15? is taken here + NOP +;;===== branch to break? is taken here + LDW *${Xihi}[2],$T2hi +|| LDW *${Xilo}[2],$T2lo ; X[i+1] +|| SHRU $T1hi,19,$S1hi +|| SHL $T1hi,32-19,$S1lo + SHRU $T1lo,19,$t0lo +|| SHL $T1lo,32-19,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,61-32,$t0lo +|| SHL $T1hi,64-61,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,61-32,$t0hi +|| SHL $T1lo,64-61,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,6,$t0hi +|| SHL $T1hi,32-6,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,6,$t0lo +|| LDW *${Xihi}[18],$T1hi +|| LDW *${Xilo}[18],$T1lo ; X[i+9] + XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14]) + +|| LDW *${Xihi}[0],$CHhi +|| LDW *${Xilo}[0],$CHlo ; X[i] +|| SHRU $T2hi,1,$S0hi +|| SHL $T2hi,32-1,$S0lo + SHRU $T2lo,1,$t0lo +|| SHL $T2lo,32-1,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2hi,8,$t0hi +|| SHL $T2hi,32-8,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2lo,8,$t0lo +|| SHL $T2lo,32-8,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1() +|| [B1] BNOP loop16_79? +|| SHRU $T2hi,7,$t0hi +|| SHL $T2hi,32-7,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i] +|| SHRU $T2lo,7,$t0lo + XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1] + + ADD $S0hi,$T1hi,$T1hi +|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0() +|| [B1] SUB B1,1,B1 + NOP ; avoid cross-path stall + ADD $T1carry,$T1hi,$T1hi +;;===== branch to loop16_79? is taken here + +break?: + ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx +|| ADDU $Alo,$Actxlo,$Actxlo:$Alo +|| [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input +|| [A0] ADDK -640,$K512 ; rewind pointer to K512 + ADD $Bhi,$Bctxhi,$Bhi +|| ADDU $Blo,$Bctxlo,$Bctxlo:$Blo +|| [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] + ADD $Chi,$Cctxhi,$Chi +|| ADDU $Clo,$Cctxlo,$Cctxlo:$Clo +|| ADD $Actxlo,$Ahi,$Ahi +||[!A0] MV $CTXA,$CTXB + ADD $Dhi,$Dctxhi,$Dhi +|| ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo +|| ADD $Bctxlo,$Bhi,$Bhi +||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx +||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN] + ADD $Ehi,$Ectxhi,$Ehi +|| ADDU $Elo,$Ectxlo,$Ectxlo:$Elo +|| ADD $Cctxlo,$Chi,$Chi +|| [A0] BNOP outerloop? +||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN] +||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN] + ADD $Fhi,$Fctxhi,$Fhi +|| ADDU $Flo,$Fctxlo,$Fctxlo:$Flo +|| ADD $Dctxlo,$Dhi,$Dhi +||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN] +||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN] + ADD $Ghi,$Gctxhi,$Ghi +|| ADDU $Glo,$Gctxlo,$Gctxlo:$Glo +|| ADD $Ectxlo,$Ehi,$Ehi +||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN] +||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN] + ADD $Hhi,$Hctxhi,$Hhi +|| ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo +|| ADD $Fctxlo,$Fhi,$Fhi +||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN] +||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN] + ADD $Gctxlo,$Ghi,$Ghi +||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN] +||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN] + ADD $Hctxlo,$Hhi,$Hhi +||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN] +||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN] +;;===== branch to outerloop? is taken here + + STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN] +|| STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN] +|| MVK -40,B0 + ADD FP,B0,SP ; destroy circular buffer +|| LDDW *FP[-4],A11:A10 + LDDW *SP[2],A13:A12 +|| LDDW *FP[-2],B11:B10 + LDDW *SP[4],B13:B12 +|| BNOP RA + LDW *++SP(40),FP ; restore frame pointer + MVK 0,B0 + MVC B0,AMR ; clear AMR + NOP 2 ; wait till FP is committed + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 128 +K512: + .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd + .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc + .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 + .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 + .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe + .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 + .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 + .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 + .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 + .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 + .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 + .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 + .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 + .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 + .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 + .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 + .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 + .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df + .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 + .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b + .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 + .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 + .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 + .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 + .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 + .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 + .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb + .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 + .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 + .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec + .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 + .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b + .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 + .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 + .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 + .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b + .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 + .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c + .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a + .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 + .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 +___ + +print $code; +close STDOUT; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-ia64.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-ia64.pl index 59f889a095..356a46aced 100755 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-ia64.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-ia64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -68,7 +75,7 @@ # To generate code, pass the file name with either 256 or 512 in its # name and compiler flags. -$output=shift; +$output=pop; if ($output =~ /512.*\.[s|asm]/) { $SZ=8; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-mips.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-mips.pl index b468cfb456..5c2d23faaf 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-mips.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-mips.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -52,6 +59,7 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { + $PTR_LA="dla"; $PTR_ADD="dadd"; # incidentally works even on n32 $PTR_SUB="dsub"; # incidentally works even on n32 $REG_S="sd"; @@ -59,6 +67,7 @@ if ($flavour =~ /64|n32/i) { $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { + $PTR_LA="la"; $PTR_ADD="add"; $PTR_SUB="sub"; $REG_S="sw"; @@ -74,7 +83,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; $big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); -for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } @@ -286,7 +295,7 @@ ___ } $FRAMESIZE=16*$SZ+16*$SZREG; -$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000"; $code.=<<___; #ifdef OPENSSL_FIPSCANISTER @@ -343,7 +352,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Ktbl,K${label} # PIC-ified 'load address' + $PTR_LA $Ktbl,K${label} # PIC-ified 'load address' $LD $A,0*$SZ($ctx) # load context $LD $B,1*$SZ($ctx) diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-parisc.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-parisc.pl index 6cad72e255..fcb6157902 100755 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-parisc.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-parisc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-ppc.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-ppc.pl index 17fdc6e8e5..fe95b01509 100755 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-ppc.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-s390x.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-s390x.pl index 9c10e4e9ee..427d6f8252 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-s390x.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-s390x.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -33,7 +40,7 @@ # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code -# remains z/Architecture specific. On z900 SHA256 was measured to +# remains z/Architecture specific. On z990 SHA256 was measured to # perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. $flavour = shift; @@ -64,7 +71,7 @@ $tbl="%r13"; $T1="%r14"; $sp="%r15"; -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($output =~ /512/) { @@ -237,9 +244,6 @@ $Func: ___ $code.=<<___ if ($kimdfunc); larl %r1,OPENSSL_s390xcap_P - lg %r0,0(%r1) - tmhl %r0,0x4000 # check for message-security assist - jz .Lsoftware lg %r0,16(%r1) # check kimd capabilities tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware @@ -308,7 +312,6 @@ $code.=<<___; br %r14 .size $Func,.-$Func .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" -.comm OPENSSL_s390xcap_P,80,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-sparcv9.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-sparcv9.pl index 5a9c15d1d3..4a1ce5fe3e 100644 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-sparcv9.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-sparcv9.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -49,7 +56,7 @@ # saturates at 11.5x single-process result on 8-core processor, or # ~11/16GBps per 2.85GHz socket. -$output=shift; +$output=pop; open STDOUT,">$output"; if ($output =~ /512/) { @@ -791,7 +798,7 @@ ___ # Purpose of these subroutines is to explicitly encode VIS instructions, # so that one can compile the module without having to specify VIS -# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. # Idea is to reserve for option to produce "universal" binary and let # programmer detect if current CPU is VIS capable at run-time. sub unvis { diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512-x86_64.pl b/deps/openssl/openssl/crypto/sha/asm/sha512-x86_64.pl index 78e445f3fe..c9b7b28123 100755 --- a/deps/openssl/openssl/crypto/sha/asm/sha512-x86_64.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -34,7 +41,7 @@ # level parallelism, on a given CPU implementation in this case. # # Special note on Intel EM64T. While Opteron CPU exhibits perfect -# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], +# performance ratio of 1.5 between 64- and 32-bit flavors [see above], # [currently available] EM64T CPUs apparently are far from it. On the # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit # sha256_block:-( This is presumably because 64-bit shifts/rotates @@ -86,12 +93,14 @@ # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) +# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) # VIA Nano 23.0 16.5(+39%) - 14.7 - # Atom 23.0 18.9(+22%) - 14.7 - # Silvermont 27.4 20.6(+33%) - 17.5 - +# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - # -# (*) whichever best applicable; +# (*) whichever best applicable, including SHAEXT; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions @@ -131,7 +140,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $shaext=1; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if ($output =~ /512/) { diff --git a/deps/openssl/openssl/crypto/sha/asm/sha512p8-ppc.pl b/deps/openssl/openssl/crypto/sha/asm/sha512p8-ppc.pl index 47189502c6..4d3d3b2f8c 100755 --- a/deps/openssl/openssl/crypto/sha/asm/sha512p8-ppc.pl +++ b/deps/openssl/openssl/crypto/sha/asm/sha512p8-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |