diff options
Diffstat (limited to 'deps/openssl/openssl/crypto/poly1305')
13 files changed, 2553 insertions, 621 deletions
diff --git a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-armv8.pl b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-armv8.pl index 0fc8667ac7..ac06457b65 100755 --- a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-armv8.pl +++ b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-armv8.pl @@ -28,6 +28,7 @@ # Denver 1.64/+50% 1.18(*) # X-Gene 2.13/+68% 2.27 # Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 # # (*) estimate based on resources availability is less than 1.0, # i.e. measured result is worse than expected, presumably binary diff --git a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-mips.pl b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-mips.pl index d2b3e90d93..28b6772ee5 100755 --- a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-mips.pl +++ b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-mips.pl @@ -67,6 +67,8 @@ $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); $code.=<<___; +#include "mips_arch.h" + #ifdef MIPSEB # define MSB 0 # define LSB 7 @@ -92,10 +94,15 @@ poly1305_init: beqz $inp,.Lno_key +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) + ld $in1,8($inp) +#else ldl $in0,0+MSB($inp) ldl $in1,8+MSB($inp) ldr $in0,0+LSB($inp) ldr $in1,8+LSB($inp) +#endif #ifdef MIPSEB # if defined(_MIPS_ARCH_MIPS64R2) dsbh $in0,$in0 # byte swap @@ -182,7 +189,7 @@ poly1305_blocks_internal: .frame $sp,6*8,$ra .mask $SAVED_REGS_MASK,-8 .set noreorder - dsub $sp,6*8 + dsubu $sp,6*8 sd $s5,40($sp) sd $s4,32($sp) ___ @@ -204,11 +211,16 @@ $code.=<<___; ld $s1,40($ctx) .Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) +#else ldl $in0,0+MSB($inp) # load input ldl $in1,8+MSB($inp) ldr $in0,0+LSB($inp) - daddiu $len,-1 ldr $in1,8+LSB($inp) +#endif + daddiu $len,-1 daddiu $inp,16 #ifdef MIPSEB # if defined(_MIPS_ARCH_MIPS64R2) @@ -258,42 +270,42 @@ $code.=<<___; sltu $tmp1,$h1,$in1 daddu $h1,$tmp0 - dmultu $r0,$h0 # h0*r0 + dmultu ($r0,$h0) # h0*r0 daddu $h2,$padbit sltu $tmp0,$h1,$tmp0 - mflo $d0 - mfhi $d1 + mflo ($d0,$r0,$h0) + mfhi ($d1,$r0,$h0) - dmultu $s1,$h1 # h1*5*r1 + dmultu ($s1,$h1) # h1*5*r1 daddu $tmp0,$tmp1 daddu $h2,$tmp0 - mflo $tmp0 - mfhi $tmp1 + mflo ($tmp0,$s1,$h1) + mfhi ($tmp1,$s1,$h1) - dmultu $r1,$h0 # h0*r1 + dmultu ($r1,$h0) # h0*r1 daddu $d0,$tmp0 daddu $d1,$tmp1 - mflo $tmp2 - mfhi $d2 + mflo ($tmp2,$r1,$h0) + mfhi ($d2,$r1,$h0) sltu $tmp0,$d0,$tmp0 daddu $d1,$tmp0 - dmultu $r0,$h1 # h1*r0 + dmultu ($r0,$h1) # h1*r0 daddu $d1,$tmp2 sltu $tmp2,$d1,$tmp2 - mflo $tmp0 - mfhi $tmp1 + mflo ($tmp0,$r0,$h1) + mfhi ($tmp1,$r0,$h1) daddu $d2,$tmp2 - dmultu $s1,$h2 # h2*5*r1 + dmultu ($s1,$h2) # h2*5*r1 daddu $d1,$tmp0 daddu $d2,$tmp1 - mflo $tmp2 + mflo ($tmp2,$s1,$h2) - dmultu $r0,$h2 # h2*r0 + dmultu ($r0,$h2) # h2*r0 sltu $tmp0,$d1,$tmp0 daddu $d2,$tmp0 - mflo $tmp3 + mflo ($tmp3,$r0,$h2) daddu $d1,$tmp2 daddu $d2,$tmp3 @@ -329,7 +341,7 @@ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue ___ $code.=<<___; jr $ra - dadd $sp,6*8 + daddu $sp,6*8 .end poly1305_blocks_internal ___ } diff --git a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-ppc.pl b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-ppc.pl index ab65910282..0c6d015d58 100755 --- a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-ppc.pl +++ b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-ppc.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -28,6 +28,7 @@ # PPC970 7.00/+114% 3.51/+205% # POWER7 3.75/+260% 1.93/+100% # POWER8 - 2.03/+200% +# POWER9 - 2.00/+150% # # Do we need floating-point implementation for PPC? Results presented # in poly1305_ieee754.c are tricky to compare to, because they are for diff --git a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-ppcfp.pl b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-ppcfp.pl index 49f70a8c03..09f8185848 100755 --- a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-ppcfp.pl +++ b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-ppcfp.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy diff --git a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-x86.pl b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-x86.pl index 93179e37d5..1e09ddcc10 100755 --- a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-x86.pl +++ b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-x86.pl @@ -29,6 +29,7 @@ # Westmere 4.58/+100% 1.43 # Sandy Bridge 3.90/+100% 1.36 # Haswell 3.88/+70% 1.18 0.72 +# Skylake 3.10/+60% 1.14 0.62 # Silvermont 11.0/+40% 4.80 # Goldmont 4.10/+200% 2.10 # VIA Nano 6.71/+90% 2.47 @@ -49,7 +50,7 @@ require "x86asm.pl"; $output=pop; open STDOUT,">$output"; -&asm_init($ARGV[0],"poly1305-x86.pl",$ARGV[$#ARGV] eq "386"); +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $sse2=$avx=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -729,7 +730,7 @@ my $extra = shift; &movdqa ($T0,$T1); # -> base 2^26 ... &pand ($T1,$MASK); - &paddd ($D0,$T1); # ... and accumuate + &paddd ($D0,$T1); # ... and accumulate &movdqa ($T1,$T0); &psrlq ($T0,26); diff --git a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-x86_64.pl b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-x86_64.pl index 4c22ded580..342ad7f18a 100755 --- a/deps/openssl/openssl/crypto/poly1305/asm/poly1305-x86_64.pl +++ b/deps/openssl/openssl/crypto/poly1305/asm/poly1305-x86_64.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -18,21 +18,39 @@ # # March 2015 # +# Initial release. +# +# December 2016 +# +# Add AVX512F+VL+BW code path. +# +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannolake, has AVX512IFMA code path to execute... +# # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. # -# IALU/gcc-4.8(*) AVX(**) AVX2 +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 # P4 4.46/+120% - # Core 2 2.41/+90% - # Westmere 1.88/+120% - # Sandy Bridge 1.39/+140% 1.10 # Haswell 1.14/+175% 1.11 0.65 -# Skylake 1.13/+120% 0.96 0.51 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] # Silvermont 2.83/+95% - +# Knights L 3.60/? 1.65 1.10 0.41(***) # Goldmont 1.70/+180% - # VIA Nano 1.82/+150% - # Sledgehammer 1.38/+160% - # Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 # # (*) improvement coefficients relative to clang are more modest and # are ~50% on most processors, in both cases we are comparing to @@ -42,6 +60,8 @@ # Core processors, 50-30%, less newer processor is, but slower on # contemporary ones, for example almost 2x slower on Atom, and as # former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; $flavour = shift; $output = shift; @@ -56,12 +76,13 @@ die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.09) + ($1>=2.10); + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); + $avx += 2 if ($1==2.11 && $2>=8); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && @@ -171,6 +192,13 @@ $code.=<<___ if ($avx>1); bt \$`5+32`,%r9 # AVX2? cmovc %rax,%r10 ___ +$code.=<<___ if ($avx>3); + mov \$`(1<<31|1<<21|1<<16)`,%rax + shr \$32,%r9 + and %rax,%r9 + cmp %rax,%r9 + je .Linit_base2_44 +___ $code.=<<___; mov \$0x0ffffffc0fffffff,%rax mov \$0x0ffffffc0ffffffc,%rcx @@ -196,16 +224,23 @@ $code.=<<___; .type poly1305_blocks,\@function,4 .align 32 poly1305_blocks: +.cfi_startproc .Lblocks: shr \$4,$len jz .Lno_data # too short push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_body: mov $len,%r15 # reassign $len @@ -241,15 +276,23 @@ $code.=<<___; mov $h2,16($ctx) mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data: .Lblocks_epilogue: ret +.cfi_endproc .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,\@function,3 @@ -265,7 +308,7 @@ poly1305_emit: mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? + shr \$2,%r10 # did 130-bit value overflow? cmovnz %r8,%rax cmovnz %r9,%rcx @@ -470,6 +513,7 @@ __poly1305_init_avx: .type poly1305_blocks_avx,\@function,4 .align 32 poly1305_blocks_avx: +.cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx @@ -489,11 +533,17 @@ poly1305_blocks_avx: jz .Leven_avx push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_avx_body: mov $len,%r15 # reassign $len @@ -596,24 +646,39 @@ poly1305_blocks_avx: .align 16 .Ldone_avx: mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data_avx: .Lblocks_avx_epilogue: ret +.cfi_endproc .align 32 .Lbase2_64_avx: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lbase2_64_avx_body: mov $len,%r15 # reassign $len @@ -673,18 +738,27 @@ poly1305_blocks_avx: mov %r15,$len mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lbase2_64_avx_epilogue: jmp .Ldo_avx +.cfi_endproc .align 32 .Leven_avx: +.cfi_startproc vmovd 4*0($ctx),$H0 # load hash value vmovd 4*1($ctx),$H1 vmovd 4*2($ctx),$H2 @@ -695,6 +769,7 @@ poly1305_blocks_avx: ___ $code.=<<___ if (!$win64); lea -0x58(%rsp),%r11 +.cfi_def_cfa %r11,0x60 sub \$0x178,%rsp ___ $code.=<<___ if ($win64); @@ -1287,10 +1362,12 @@ $code.=<<___ if ($win64); ___ $code.=<<___ if (!$win64); lea 0x58(%r11),%rsp +.cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret +.cfi_endproc .size poly1305_blocks_avx,.-poly1305_blocks_avx .type poly1305_emit_avx,\@function,3 @@ -1336,7 +1413,7 @@ poly1305_emit_avx: mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? + shr \$2,%r10 # did 130-bit value overflow? cmovnz %r8,%rax cmovnz %r9,%rcx @@ -1358,6 +1435,7 @@ $code.=<<___; .type poly1305_blocks_avx2,\@function,4 .align 32 poly1305_blocks_avx2: +.cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx2 @@ -1377,11 +1455,17 @@ poly1305_blocks_avx2: jz .Leven_avx2 push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_avx2_body: mov $len,%r15 # reassign $len @@ -1490,24 +1574,39 @@ poly1305_blocks_avx2: .align 16 .Ldone_avx2: mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data_avx2: .Lblocks_avx2_epilogue: ret +.cfi_endproc .align 32 .Lbase2_64_avx2: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lbase2_64_avx2_body: mov $len,%r15 # reassign $len @@ -1569,21 +1668,33 @@ poly1305_blocks_avx2: call __poly1305_init_avx .Lproceed_avx2: - mov %r15,$len + mov %r15,$len # restore $len + mov OPENSSL_ia32cap_P+8(%rip),%r10d + mov \$`(1<<31|1<<30|1<<16)`,%r11d mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lbase2_64_avx2_epilogue: jmp .Ldo_avx2 +.cfi_endproc .align 32 .Leven_avx2: +.cfi_startproc + mov OPENSSL_ia32cap_P+8(%rip),%r10d vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 vmovd 4*1($ctx),%x#$H1 vmovd 4*2($ctx),%x#$H2 @@ -1592,8 +1703,17 @@ poly1305_blocks_avx2: .Ldo_avx2: ___ +$code.=<<___ if ($avx>2); + cmp \$512,$len + jb .Lskip_avx512 + and %r11d,%r10d + test \$`1<<16`,%r10d # check for AVX512F + jnz .Lblocks_avx512 +.Lskip_avx512: +___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); @@ -1612,8 +1732,9 @@ $code.=<<___ if ($win64); .Ldo_avx2_body: ___ $code.=<<___; - lea 48+64($ctx),$ctx # size optimization lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 # expand and copy pre-calculated table to stack vmovdqu `16*0-64`($ctx),%x#$T2 @@ -1623,36 +1744,28 @@ $code.=<<___; vmovdqu `16*3-64`($ctx),%x#$D0 vmovdqu `16*4-64`($ctx),%x#$D1 vmovdqu `16*5-64`($ctx),%x#$D2 + lea 0x90(%rsp),%rax # size optimization vmovdqu `16*6-64`($ctx),%x#$D3 - vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444 vmovdqu `16*7-64`($ctx),%x#$D4 - vpermq \$0x15,$T3,$T3 - vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444 + vpermd $T3,$T0,$T3 vmovdqu `16*8-64`($ctx),%x#$MASK - vpermq \$0x15,$T4,$T4 - vpshufd \$0xc8,$T3,$T3 + vpermd $T4,$T0,$T4 vmovdqa $T2,0x00(%rsp) - vpermq \$0x15,$D0,$D0 - vpshufd \$0xc8,$T4,$T4 - vmovdqa $T3,0x20(%rsp) - vpermq \$0x15,$D1,$D1 - vpshufd \$0xc8,$D0,$D0 - vmovdqa $T4,0x40(%rsp) - vpermq \$0x15,$D2,$D2 - vpshufd \$0xc8,$D1,$D1 - vmovdqa $D0,0x60(%rsp) - vpermq \$0x15,$D3,$D3 - vpshufd \$0xc8,$D2,$D2 - vmovdqa $D1,0x80(%rsp) - vpermq \$0x15,$D4,$D4 - vpshufd \$0xc8,$D3,$D3 - vmovdqa $D2,0xa0(%rsp) - vpermq \$0x15,$MASK,$MASK - vpshufd \$0xc8,$D4,$D4 - vmovdqa $D3,0xc0(%rsp) - vpshufd \$0xc8,$MASK,$MASK - vmovdqa $D4,0xe0(%rsp) - vmovdqa $MASK,0x100(%rsp) + vpermd $D0,$T0,$D0 + vmovdqa $T3,0x20-0x90(%rax) + vpermd $D1,$T0,$D1 + vmovdqa $T4,0x40-0x90(%rax) + vpermd $D2,$T0,$D2 + vmovdqa $D0,0x60-0x90(%rax) + vpermd $D3,$T0,$D3 + vmovdqa $D1,0x80-0x90(%rax) + vpermd $D4,$T0,$D4 + vmovdqa $D2,0xa0-0x90(%rax) + vpermd $MASK,$T0,$MASK + vmovdqa $D3,0xc0-0x90(%rax) + vmovdqa $D4,0xe0-0x90(%rax) + vmovdqa $MASK,0x100-0x90(%rax) vmovdqa 64(%rcx),$MASK # .Lmask26 ################################################################ @@ -1679,7 +1792,6 @@ $code.=<<___; vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always - lea 0x90(%rsp),%rax # size optimization vpaddq $H2,$T2,$H2 # accumulate input sub \$64,$len jz .Ltail_avx2 @@ -1688,11 +1800,11 @@ $code.=<<___; .align 32 .Loop_avx2: ################################################################ - # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4 - # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3 - # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2 - # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1 - # \________/\________/ + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 + # \________/\__________/ ################################################################ #vpaddq $H2,$T2,$H2 # accumulate input vpaddq $H0,$T0,$H0 @@ -1990,13 +2102,1657 @@ $code.=<<___ if ($win64); ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret +.cfi_endproc .size poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); +my $PADBIT="%zmm30"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); + +$code.=<<___; +.type poly1305_blocks_avx512,\@function,4 +.align 32 +poly1305_blocks_avx512: +.cfi_startproc +.Lblocks_avx512: + mov \$15,%eax + kmovw %eax,%k2 +___ +$code.=<<___ if (!$win64); + lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x1c8,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx512_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 + + # expand pre-calculated table + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} + vpsrlq \$32,$R1,$T1 + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} + + ################################################################ + # calculate 5th through 8th powers of the key + # + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 + + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 + vpsrlq \$32,$R2,$T2 + + vpmuludq $T1,$S4,$M0 + vpmuludq $T1,$R0,$M1 + vpmuludq $T1,$R1,$M2 + vpmuludq $T1,$R2,$M3 + vpmuludq $T1,$R3,$M4 + vpsrlq \$32,$R3,$T3 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3 + + vpmuludq $T2,$S3,$M0 + vpmuludq $T2,$S4,$M1 + vpmuludq $T2,$R1,$M3 + vpmuludq $T2,$R2,$M4 + vpmuludq $T2,$R0,$M2 + vpsrlq \$32,$R4,$T4 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0 + + vpmuludq $T3,$S2,$M0 + vpmuludq $T3,$R0,$M3 + vpmuludq $T3,$R1,$M4 + vpmuludq $T3,$S3,$M1 + vpmuludq $T3,$S4,$M2 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 + + vpmuludq $T4,$S4,$M3 + vpmuludq $T4,$R0,$M4 + vpmuludq $T4,$S1,$M0 + vpmuludq $T4,$S2,$M1 + vpmuludq $T4,$S3,$M2 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 + + ################################################################ + # load input + vmovdqu64 16*0($inp),%z#$T3 + vmovdqu64 16*4($inp),%z#$T4 + lea 16*8($inp),$inp + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D4,$M4 + vpandq $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$M1 + vpandq $MASK,$D1,$D1 + vpaddq $M1,$D2,$D2 # d1 -> d2 + + vpaddq $M4,$D0,$D0 + vpsllq \$2,$M4,$M4 + vpaddq $M4,$D0,$D0 # d4 -> d0 + + vpsrlq \$26,$D2,$M2 + vpandq $MASK,$D2,$D2 + vpaddq $M2,$D3,$D3 # d2 -> d3 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + ################################################################ + # at this point we have 14243444 in $R0-$S4 and 05060708 in + # $D0-$D4, ... + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + # ... since input 64-bit lanes are ordered as 73625140, we could + # "vperm" it to 76543210 (here and in each loop iteration), *or* + # we could just flow along, hence the goal for $R0-$S4 is + # 1858286838784888 ... + + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax + kmovw %eax,%k1 + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} + + vpslld \$2,$R1,$S1 # *5 + vpslld \$2,$R2,$S2 + vpslld \$2,$R3,$S3 + vpslld \$2,$R4,$S4 + vpaddd $R1,$S1,$S1 + vpaddd $R2,$S2,$S2 + vpaddd $R3,$S3,$S3 + vpaddd $R4,$S4,$S4 + + vpbroadcastq 32(%rcx),$PADBIT # .L129 + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + vporq $T3,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$14,$T4,$T3 + vpsrlq \$40,$T4,$T4 # 4 + vpandq $MASK,$T2,$T2 # 2 + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$192,$len + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ################################################################ + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 + # \________/\___________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpaddq $H0,$T0,$H0 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu64 16*0($inp),$T3 # load input + vmovdqu64 16*4($inp),$T4 + lea 16*8($inp),$inp + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpaddq $M3,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$S4,$M2 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + + vpsrlq \$26,$D3,$H3 + vpandq $MASK,$D3,$D3 + vpaddq $H3,$D4,$H4 # h3 -> h4 + + vporq $T3,$T2,$T2 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpandq $MASK,$T2,$T2 # 2 + + vpsrlq \$26,$H4,$D4 + vpandq $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpandq $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpandq $MASK,$H2,$H2 + vpaddq $D2,$D3,$H3 # h2 -> h3 + + vpsrlq \$14,$T4,$T3 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpandq $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + sub \$128,$len + ja .Loop_avx512 + +.Ltail_avx512: + ################################################################ + # while above multiplications were by r^8 in all lanes, in last + # iteration we multiply least significant lane by r^8 and most + # significant one by r, that's why table gets shifted... + + vpsrlq \$32,$R0,$R0 # 0105020603070408 + vpsrlq \$32,$R1,$R1 + vpsrlq \$32,$R2,$R2 + vpsrlq \$32,$S3,$S3 + vpsrlq \$32,$S4,$S4 + vpsrlq \$32,$R3,$R3 + vpsrlq \$32,$R4,$R4 + vpsrlq \$32,$S1,$S1 + vpsrlq \$32,$S2,$S2 + + ################################################################ + # load either next or last 64 byte of input + lea ($inp,$len),$inp + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu 16*0($inp),%x#$T0 + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vmovdqu 16*1($inp),%x#$T1 + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpmuludq $H3,$S4,$M2 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # horizontal addition + + mov \$1,%eax + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + kmovw %eax,%k3 + vpermq \$0x2,$H3,$D3 + vpermq \$0x2,$H4,$D4 + vpermq \$0x2,$H0,$D0 + vpermq \$0x2,$H1,$D1 + vpermq \$0x2,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + vextracti64x4 \$0x1,$H3,%y#$D3 + vextracti64x4 \$0x1,$H4,%y#$D4 + vextracti64x4 \$0x1,$H0,%y#$D0 + vextracti64x4 \$0x1,$H1,%y#$D1 + vextracti64x4 \$0x1,$H2,%y#$D2 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 + vpaddq $D0,$H0,${H0}{%k3}{z} + vpaddq $D1,$H1,${H1}{%k3}{z} + vpaddq $D2,$H2,${H2}{%k3}{z} +___ +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); +$code.=<<___; + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 + vpand $MASK,$T1,$T1 # 1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + vpaddq $D3,$H4,$H4 # h3 -> h4 + + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 + add \$64,$len + jnz .Ltail_avx2 + + vpsubq $T2,$H2,$H2 # undo input accumulation + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) + vzeroall +___ +$code.=<<___ if ($win64); + movdqa 0x50(%r11),%xmm6 + movdqa 0x60(%r11),%xmm7 + movdqa 0x70(%r11),%xmm8 + movdqa 0x80(%r11),%xmm9 + movdqa 0x90(%r11),%xmm10 + movdqa 0xa0(%r11),%xmm11 + movdqa 0xb0(%r11),%xmm12 + movdqa 0xc0(%r11),%xmm13 + movdqa 0xd0(%r11),%xmm14 + movdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx512_epilogue: +___ +$code.=<<___ if (!$win64); + lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + ret +.cfi_endproc +.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 +___ +if ($avx>3) { +######################################################################## +# VPMADD52 version using 2^44 radix. +# +# One can argue that base 2^52 would be more natural. Well, even though +# some operations would be more natural, one has to recognize couple of +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look +# at amount of multiply-n-accumulate operations. Secondly, it makes it +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in +# reference implementations], which means that more such operations +# would have to be performed in inner loop, which in turn makes critical +# path longer. In other words, even though base 2^44 reduction might +# look less elegant, overall critical path is actually shorter... + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + +$code.=<<___; +.type poly1305_init_base2_44,\@function,3 +.align 32 +poly1305_init_base2_44: + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + +.Linit_base2_44: + lea poly1305_blocks_vpmadd52(%rip),%r10 + lea poly1305_emit_base2_44(%rip),%r11 + + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + mov \$0x00000fffffffffff,%r8 + and 8($inp),%rcx + mov \$0x00000fffffffffff,%r9 + and %rax,%r8 + shrd \$44,%rcx,%rax + mov %r8,40($ctx) # r0 + and %r9,%rax + shr \$24,%rcx + mov %rax,48($ctx) # r1 + lea (%rax,%rax,4),%rax # *5 + mov %rcx,56($ctx) # r2 + shl \$2,%rax # magic <<2 + lea (%rcx,%rcx,4),%rcx # *5 + shl \$2,%rcx # magic <<2 + mov %rax,24($ctx) # s1 + mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax + ret +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +___ +{ +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52,\@function,4 +.align 32 +poly1305_blocks_vpmadd52: + shr \$4,$len + jz .Lno_data_vpmadd52 # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax + test %r8,%r8 # is power value impossible? + cmovns %r10,%rax + + and $len,%rax # is input of favourable length? + jz .Lblocks_vpmadd52_4x + + sub %rax,$len + mov \$7,%r10d + mov \$1,%r11d + kmovw %r10d,%k7 + lea .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq $padbit,%x#$PAD + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift + vpermq \$0xcf,$PAD,$PAD + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask + + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} + + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0($inp),%x#$T0 # load input as ----3210 + lea 16($inp),$inp + + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 + vpsrlvq $inp_shift,$T0,$T0 + vpandq $reduc_mask,$T0,$T0 + vporq $PAD,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo # accumulate input + + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} + + vpxord $Dlo,$Dlo,$Dlo + vpxord $Dhi,$Dhi,$Dhi + + vpmadd52luq $r2r1r0,$H0,$Dlo + vpmadd52huq $r2r1r0,$H0,$Dhi + + vpmadd52luq $r1r0s2,$H1,$Dlo + vpmadd52huq $r1r0s2,$H1,$Dhi + + vpmadd52luq $r0s2s1,$H2,$Dlo + vpmadd52huq $r0s2s1,$H2,$Dhi + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword + vpandq $reduc_mask,$Dlo,$Dlo + + vpaddq $T0,$Dhi,$Dhi + + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword + + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word + vpandq $reduc_mask,$Dlo,$Dlo + + vpermq \$0b10010011,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} + + vpaddq $T0,$Dlo,$Dlo + vpsllq \$2,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + dec %rax # len-=16 + jnz .Loop_vpmadd52 + + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + + test $len,$len + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + ret +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +___ } +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n? + jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + ret +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # fist we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... + + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + ret +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} +$code.=<<___; +.type poly1305_emit_base2_44,\@function,3 +.align 32 +poly1305_emit_base2_44: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r9,%rax + shr \$20,%r9 + shl \$44,%rax + mov %r10,%rcx + shr \$40,%r10 + shl \$24,%rcx + + add %rax,%r8 + adc %rcx,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 +___ +} } } $code.=<<___; .align 64 .Lconst: @@ -2006,16 +3762,140 @@ $code.=<<___; .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 .Lmask26: .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lfive: -.long 5,0,5,0,5,0,5,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff ___ } - $code.=<<___; .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 16 ___ +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + ret +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11,%r11 +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + ret +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { @@ -2200,6 +4080,11 @@ $code.=<<___ if ($avx>1); .rva .LSEH_end_poly1305_blocks_avx2 .rva .LSEH_info_poly1305_blocks_avx2_3 ___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_poly1305_blocks_avx512 + .rva .LSEH_end_poly1305_blocks_avx512 + .rva .LSEH_info_poly1305_blocks_avx512 +___ $code.=<<___; .section .xdata .align 8 @@ -2255,13 +4140,19 @@ $code.=<<___ if ($avx>1); .rva avx_handler .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] ___ +$code.=<<___ if ($avx>2); +.LSEH_info_poly1305_blocks_avx512: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] +___ } foreach (split('\n',$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/%r([a-z]+)#d/%e$1/g; s/%r([0-9]+)#d/%r$1d/g; - s/%x#%y/%x/g; + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; print $_,"\n"; } diff --git a/deps/openssl/openssl/crypto/poly1305/build.info b/deps/openssl/openssl/crypto/poly1305/build.info index f90ce2b950..631b32b8e0 100644 --- a/deps/openssl/openssl/crypto/poly1305/build.info +++ b/deps/openssl/openssl/crypto/poly1305/build.info @@ -1,10 +1,13 @@ LIBS=../../libcrypto SOURCE[../../libcrypto]=\ + poly1305_pmeth.c \ + poly1305_ameth.c \ poly1305.c {- $target{poly1305_asm_src} -} GENERATE[poly1305-sparcv9.S]=asm/poly1305-sparcv9.pl $(PERLASM_SCHEME) INCLUDE[poly1305-sparcv9.o]=.. -GENERATE[poly1305-x86.s]=asm/poly1305-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR) +GENERATE[poly1305-x86.s]=asm/poly1305-x86.pl \ + $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR) GENERATE[poly1305-x86_64.s]=asm/poly1305-x86_64.pl $(PERLASM_SCHEME) GENERATE[poly1305-ppc.s]=asm/poly1305-ppc.pl $(PERLASM_SCHEME) GENERATE[poly1305-ppcfp.s]=asm/poly1305-ppcfp.pl $(PERLASM_SCHEME) @@ -13,8 +16,7 @@ INCLUDE[poly1305-armv4.o]=.. GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl $(PERLASM_SCHEME) INCLUDE[poly1305-armv8.o]=.. GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME) -GENERATE[poly1305-s390x.S]=asm/poly1305-s390x.pl $(PERLASM_SCHEME) -INCLUDE[poly1305-s390x.o]=.. +INCLUDE[poly1305-mips.o]=.. BEGINRAW[Makefile(unix)] {- $builddir -}/poly1305-%.S: {- $sourcedir -}/asm/poly1305-%.pl diff --git a/deps/openssl/openssl/crypto/poly1305/poly1305.c b/deps/openssl/openssl/crypto/poly1305/poly1305.c index eec4d67f0c..1d182364ae 100644 --- a/deps/openssl/openssl/crypto/poly1305/poly1305.c +++ b/deps/openssl/openssl/crypto/poly1305/poly1305.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the OpenSSL license (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -12,27 +12,9 @@ #include <openssl/crypto.h> #include "internal/poly1305.h" +#include "poly1305_local.h" -typedef void (*poly1305_blocks_f) (void *ctx, const unsigned char *inp, - size_t len, unsigned int padbit); -typedef void (*poly1305_emit_f) (void *ctx, unsigned char mac[16], - const unsigned int nonce[4]); - -struct poly1305_context { - double opaque[24]; /* large enough to hold internal state, declared - * 'double' to ensure at least 64-bit invariant - * alignment across all platforms and - * configurations */ - unsigned int nonce[4]; - unsigned char data[POLY1305_BLOCK_SIZE]; - size_t num; - struct { - poly1305_blocks_f blocks; - poly1305_emit_f emit; - } func; -}; - -size_t Poly1305_ctx_size () +size_t Poly1305_ctx_size(void) { return sizeof(struct poly1305_context); } @@ -113,12 +95,11 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit); (a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \ ) -# if !defined(PEDANTIC) && \ - (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \ +# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \ (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8) typedef unsigned long u64; -typedef unsigned __int128 u128; +typedef __uint128_t u128; typedef struct { u64 h[3]; @@ -548,490 +529,3 @@ void Poly1305_Final(POLY1305 *ctx, unsigned char mac[16]) /* zero out the state */ OPENSSL_cleanse(ctx, sizeof(*ctx)); } - -#ifdef SELFTEST -#include <stdio.h> - -struct poly1305_test { - const char *inputhex; - const char *keyhex; - const char *outhex; -}; - -static const struct poly1305_test poly1305_tests[] = { - /* - * RFC7539 - */ - { - "43727970746f6772617068696320466f72756d2052657365617263682047726f" - "7570", - "85d6be7857556d337f4452fe42d506a8""0103808afb0db2fd4abff6af4149f51b", - "a8061dc1305136c6c22b8baf0c0127a9" - }, - /* - * test vectors from "The Poly1305-AES message-authentication code" - */ - { - "f3f6", - "851fc40c3467ac0be05cc20404f3f700""580b3b0f9447bb1e69d095b5928b6dbc", - "f4c633c3044fc145f84f335cb81953de" - }, - { - "", - "a0f3080000f46400d0c7e9076c834403""dd3fab2251f11ac759f0887129cc2ee7", - "dd3fab2251f11ac759f0887129cc2ee7" - }, - { - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136", - "48443d0bb0d21109c89a100b5ce2c208""83149c69b561dd88298a1798b10716ef", - "0ee1c16bb73f0f4fd19881753c01cdbe" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "5154ad0d2cb26e01274fc51148491f1b" - }, - /* - * self-generated vectors exercise "significant" lengths, such that - * are handled by different code paths - */ - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "812059a5da198637cac7c4a631bee466" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "5b88d7f6228b11e2e28579a5c0c1f761" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "bbb613b2b6d753ba07395b916aaece15" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" - "663cea190ffb83d89593f3f476b6bc24", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "c794d7057d1778c4bbee0a39b3d97342" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "ffbcb9b371423152d7fca5ad042fbaa9" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" - "812059a5da198637cac7c4a631bee466", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "069ed6b8ef0f207b3e243bb1019fe632" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" - "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "cca339d9a45fa2368c2c68b3a4179133" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" - "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761" - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "53f6e828a2f0fe0ee815bf0bd5841a34" - }, - { - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" - "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761" - "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" - "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" - "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" - "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" - "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761", - "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", - "b846d44e9bbd53cedffbfbb6b7fa4933" - }, - /* - * 4th power of the key spills to 131th bit in SIMD key setup - */ - { - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", - "ad628107e8351d0f2c231a05dc4a4106""00000000000000000000000000000000", - "07145a4c02fe5fa32036de68fabe9066" - }, - { - /* - * poly1305_ieee754.c failed this in final stage - */ - "842364e156336c0998b933a6237726180d9e3fdcbde4cd5d17080fc3beb49614" - "d7122c037463ff104d73f19c12704628d417c4c54a3fe30d3c3d7714382d43b0" - "382a50a5dee54be844b076e8df88201a1cd43b90eb21643fa96f39b518aa8340" - "c942ff3c31baf7c9bdbf0f31ae3fa096bf8c63030609829fe72e179824890bc8" - "e08c315c1cce2a83144dbbff09f74e3efc770b54d0984a8f19b14719e6363564" - "1d6b1eedf63efbf080e1783d32445412114c20de0b837a0dfa33d6b82825fff4" - "4c9a70ea54ce47f07df698e6b03323b53079364a5fc3e9dd034392bdde86dccd" - "da94321c5e44060489336cb65bf3989c36f7282c2f5d2b882c171e74", - "95d5c005503e510d8cd0aa072c4a4d06""6eabc52d11653df47fbf63ab198bcc26", - "f248312e578d9d58f8b7bb4d19105431" - }, - /* - * AVX2 in poly1305-x86.pl failed this with 176+32 split - */ - { - "248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd" - "2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e8" - "74cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c" - "8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936a" - "ff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a37" - "09894e4eb0a4eedc4ae19468e66b81f2" - "71351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb", - "000102030405060708090a0b0c0d0e0f""00000000000000000000000000000000", - "bc939bc5281480fa99c6d68c258ec42f" - }, - /* - * test vectors from Google - */ - { - "", - "c8afaac331ee372cd6082de134943b17""4710130e9f6fea8d72293850a667d86c", - "4710130e9f6fea8d72293850a667d86c", - }, - { - "48656c6c6f20776f726c6421", - "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035", - "a6f745008f81c916a20dcc74eef2b2f0" - }, - { - "0000000000000000000000000000000000000000000000000000000000000000", - "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035", - "49ec78090e481ec6c26b33b91ccc0307" - }, - { - "89dab80b7717c1db5db437860a3f70218e93e1b8f461fb677f16f35f6f87e2a9" - "1c99bc3a47ace47640cc95c345be5ecca5a3523c35cc01893af0b64a62033427" - "0372ec12482d1b1e363561698a578b359803495bb4e2ef1930b17a5190b580f1" - "41300df30adbeca28f6427a8bc1a999fd51c554a017d095d8c3e3127daf9f595", - "2d773be37adb1e4d683bf0075e79c4ee""037918535a7f99ccb7040fb5f5f43aea", - "c85d15ed44c378d6b00e23064c7bcd51" - }, - { - "000000000000000b1703030200000000" - "06db1f1f368d696a810a349c0c714c9a5e7850c2407d721acded95e018d7a852" - "66a6e1289cdb4aeb18da5ac8a2b0026d24a59ad485227f3eaedbb2e7e35e1c66" - "cd60f9abf716dcc9ac42682dd7dab287a7024c4eefc321cc0574e16793e37cec" - "03c5bda42b54c114a80b57af26416c7be742005e20855c73e21dc8e2edc9d435" - "cb6f6059280011c270b71570051c1c9b3052126620bc1e2730fa066c7a509d53" - "c60e5ae1b40aa6e39e49669228c90eecb4a50db32a50bc49e90b4f4b359a1dfd" - "11749cd3867fcf2fb7bb6cd4738f6a4ad6f7ca5058f7618845af9f020f6c3b96" - "7b8f4cd4a91e2813b507ae66f2d35c18284f7292186062e10fd5510d18775351" - "ef334e7634ab4743f5b68f49adcab384d3fd75f7390f4006ef2a295c8c7a076a" - "d54546cd25d2107fbe1436c840924aaebe5b370893cd63d1325b8616fc481088" - "6bc152c53221b6df373119393255ee72bcaa880174f1717f9184fa91646f17a2" - "4ac55d16bfddca9581a92eda479201f0edbf633600d6066d1ab36d5d2415d713" - "51bbcd608a25108d25641992c1f26c531cf9f90203bc4cc19f5927d834b0a471" - "16d3884bbb164b8ec883d1ac832e56b3918a98601a08d171881541d594db399c" - "6ae6151221745aec814c45b0b05b565436fd6f137aa10a0c0b643761dbd6f9a9" - "dcb99b1a6e690854ce0769cde39761d82fcdec15f0d92d7d8e94ade8eb83fbe0", - "99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546", - "2637408fe13086ea73f971e3425e2820" - }, - /* - * test vectors from Hanno Böck - */ - { - "cccccccccccccccccccccccccccccccccccccccccccccccccc80cccccccccccc" - "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccecccccc" - "ccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccc" - "cccccccccce3cccccccccccccccccccccccccccccccccccccccccccccccccccc" - "ccccccccaccccccccccccccccccccce6cccccccccc000000afcccccccccccccc" - "ccccfffffff50000000000000000000000000000000000000000000000000000" - "00ffffffe7000000000000000000000000000000000000000000000000000000" - "0000000000000000000000000000000000000000000000000000719205a8521d" - "fc", - "7f1b0264000000000000000000000000""0000000000000000cccccccccccccccc", - "8559b876eceed66eb37798c0457baff9" - }, - { - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa0000000000" - "00000000800264", - "e0001600000000000000000000000000""0000aaaaaaaaaaaaaaaaaaaaaaaaaaaa", - "00bd1258978e205444c9aaaa82006fed" - }, - { - "02fc", - "0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c""0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c", - "06120c0c0c0c0c0c0c0c0c0c0c0c0c0c" - }, - { - "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" - "7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" - "7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" - "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b" - "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b" - "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" - "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" - "7b6e7b001300000000b300000000000000000000000000000000000000000000" - "f20000000000000000000000000000000000002000efff000900000000000000" - "0000000000100000000009000000640000000000000000000000001300000000" - "b300000000000000000000000000000000000000000000f20000000000000000" - "000000000000000000002000efff00090000000000000000007a000010000000" - "000900000064000000000000000000000000000000000000000000000000fc", - "00ff0000000000000000000000000000""00000000001e00000000000000007b7b", - "33205bbf9e9f8f7212ab9e2ab9b7e4a5" - }, - { - "7777777777777777777777777777777777777777777777777777777777777777" - "7777777777777777777777777777777777777777777777777777777777777777" - "777777777777777777777777ffffffe9e9acacacacacacacacacacac0000acac" - "ec0100acacac2caca2acacacacacacacacacacac64f2", - "0000007f0000007f0100002000000000""0000cf77777777777777777777777777", - "02ee7c8c546ddeb1a467e4c3981158b9" - }, - /* - * test vectors from Andrew Moon - */ - { /* nacl */ - "8e993b9f48681273c29650ba32fc76ce48332ea7164d96a4476fb8c531a1186a" - "c0dfc17c98dce87b4da7f011ec48c97271d2c20f9b928fe2270d6fb863d51738" - "b48eeee314a7cc8ab932164548e526ae90224368517acfeabd6bb3732bc0e9da" - "99832b61ca01b6de56244a9e88d5f9b37973f622a43d14a6599b1f654cb45a74" - "e355a5", - "eea6a7251c1e72916d11c2cb214d3c25""2539121d8e234e652d651fa4c8cff880", - "f3ffc7703f9400e52a7dfb4b3d3305d9" - }, - { /* wrap 2^130-5 */ - "ffffffffffffffffffffffffffffffff", - "02000000000000000000000000000000""00000000000000000000000000000000", - "03000000000000000000000000000000" - }, - { /* wrap 2^128 */ - "02000000000000000000000000000000", - "02000000000000000000000000000000""ffffffffffffffffffffffffffffffff", - "03000000000000000000000000000000" - }, - { /* limb carry */ - "fffffffffffffffffffffffffffffffff0ffffffffffffffffffffffffffffff" - "11000000000000000000000000000000", - "01000000000000000000000000000000""00000000000000000000000000000000", - "05000000000000000000000000000000" - }, - { /* 2^130-5 */ - "fffffffffffffffffffffffffffffffffbfefefefefefefefefefefefefefefe" - "01010101010101010101010101010101", - "01000000000000000000000000000000""00000000000000000000000000000000", - "00000000000000000000000000000000" - }, - { /* 2^130-6 */ - "fdffffffffffffffffffffffffffffff", - "02000000000000000000000000000000""00000000000000000000000000000000", - "faffffffffffffffffffffffffffffff" - }, - { /* 5*H+L reduction intermediate */ - "e33594d7505e43b900000000000000003394d7505e4379cd0100000000000000" - "0000000000000000000000000000000001000000000000000000000000000000", - "01000000000000000400000000000000""00000000000000000000000000000000", - "14000000000000005500000000000000" - }, - { /* 5*H+L reduction final */ - "e33594d7505e43b900000000000000003394d7505e4379cd0100000000000000" - "00000000000000000000000000000000", - "01000000000000000400000000000000""00000000000000000000000000000000", - "13000000000000000000000000000000" - } -}; - -static unsigned char hex_digit(char h) -{ - int i = OPENSSL_hexchar2int(h); - - if (i < 0) - abort(); - return i; -} - -static void hex_decode(unsigned char *out, const char *hex) -{ - size_t j = 0; - - while (*hex != 0) { - unsigned char v = hex_digit(*hex++); - v <<= 4; - v |= hex_digit(*hex++); - out[j++] = v; - } -} - -static void hexdump(unsigned char *a, size_t len) -{ - size_t i; - - for (i = 0; i < len; i++) - printf("%02x", a[i]); -} - -int main() -{ - static const unsigned num_tests = - sizeof(poly1305_tests) / sizeof(struct poly1305_test); - unsigned i; - unsigned char key[32], out[16], expected[16]; - POLY1305 poly1305; - - for (i = 0; i < num_tests; i++) { - const struct poly1305_test *test = &poly1305_tests[i]; - unsigned char *in; - size_t inlen = strlen(test->inputhex); - - if (strlen(test->keyhex) != sizeof(key) * 2 || - strlen(test->outhex) != sizeof(out) * 2 || (inlen & 1) == 1) - return 1; - - inlen /= 2; - - hex_decode(key, test->keyhex); - hex_decode(expected, test->outhex); - - in = malloc(inlen); - - hex_decode(in, test->inputhex); - - Poly1305_Init(&poly1305, key); - Poly1305_Update(&poly1305, in, inlen); - Poly1305_Final(&poly1305, out); - - if (memcmp(out, expected, sizeof(expected)) != 0) { - printf("Poly1305 test #%d failed.\n", i); - printf("got: "); - hexdump(out, sizeof(out)); - printf("\nexpected: "); - hexdump(expected, sizeof(expected)); - printf("\n"); - return 1; - } - - if (inlen > 16) { - Poly1305_Init(&poly1305, key); - Poly1305_Update(&poly1305, in, 1); - Poly1305_Update(&poly1305, in+1, inlen-1); - Poly1305_Final(&poly1305, out); - - if (memcmp(out, expected, sizeof(expected)) != 0) { - printf("Poly1305 test #%d/1+(N-1) failed.\n", i); - printf("got: "); - hexdump(out, sizeof(out)); - printf("\nexpected: "); - hexdump(expected, sizeof(expected)); - printf("\n"); - return 1; - } - } - - if (inlen > 32) { - size_t half = inlen / 2; - - Poly1305_Init(&poly1305, key); - Poly1305_Update(&poly1305, in, half); - Poly1305_Update(&poly1305, in+half, inlen-half); - Poly1305_Final(&poly1305, out); - - if (memcmp(out, expected, sizeof(expected)) != 0) { - printf("Poly1305 test #%d/2 failed.\n", i); - printf("got: "); - hexdump(out, sizeof(out)); - printf("\nexpected: "); - hexdump(expected, sizeof(expected)); - printf("\n"); - return 1; - } - - for (half = 16; half < inlen; half += 16) { - Poly1305_Init(&poly1305, key); - Poly1305_Update(&poly1305, in, half); - Poly1305_Update(&poly1305, in+half, inlen-half); - Poly1305_Final(&poly1305, out); - - if (memcmp(out, expected, sizeof(expected)) != 0) { - printf("Poly1305 test #%d/%d+%d failed.\n", - i, half, inlen-half); - printf("got: "); - hexdump(out, sizeof(out)); - printf("\nexpected: "); - hexdump(expected, sizeof(expected)); - printf("\n"); - return 1; - } - } - } - - free(in); - } - - printf("PASS\n"); - -# ifdef OPENSSL_CPUID_OBJ - { - unsigned char buf[8192]; - unsigned long long stopwatch; - unsigned long long OPENSSL_rdtsc(); - - memset (buf,0x55,sizeof(buf)); - memset (key,0xAA,sizeof(key)); - - Poly1305_Init(&poly1305, key); - - for (i=0;i<100000;i++) - Poly1305_Update(&poly1305,buf,sizeof(buf)); - - stopwatch = OPENSSL_rdtsc(); - for (i=0;i<10000;i++) - Poly1305_Update(&poly1305,buf,sizeof(buf)); - stopwatch = OPENSSL_rdtsc() - stopwatch; - - printf("%g\n",stopwatch/(double)(i*sizeof(buf))); - - stopwatch = OPENSSL_rdtsc(); - for (i=0;i<10000;i++) { - Poly1305_Init(&poly1305, key); - Poly1305_Update(&poly1305,buf,16); - Poly1305_Final(&poly1305,buf); - } - stopwatch = OPENSSL_rdtsc() - stopwatch; - - printf("%g\n",stopwatch/(double)(i)); - } -# endif - return 0; -} -#endif diff --git a/deps/openssl/openssl/crypto/poly1305/poly1305_ameth.c b/deps/openssl/openssl/crypto/poly1305/poly1305_ameth.c new file mode 100644 index 0000000000..033ee8cd96 --- /dev/null +++ b/deps/openssl/openssl/crypto/poly1305/poly1305_ameth.c @@ -0,0 +1,122 @@ +/* + * Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include <stdio.h> +#include "internal/cryptlib.h" +#include <openssl/evp.h> +#include "internal/asn1_int.h" +#include "internal/poly1305.h" +#include "poly1305_local.h" +#include "internal/evp_int.h" + +/* + * POLY1305 "ASN1" method. This is just here to indicate the maximum + * POLY1305 output length and to free up a POLY1305 key. + */ + +static int poly1305_size(const EVP_PKEY *pkey) +{ + return POLY1305_DIGEST_SIZE; +} + +static void poly1305_key_free(EVP_PKEY *pkey) +{ + ASN1_OCTET_STRING *os = EVP_PKEY_get0(pkey); + if (os != NULL) { + if (os->data != NULL) + OPENSSL_cleanse(os->data, os->length); + ASN1_OCTET_STRING_free(os); + } +} + +static int poly1305_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) +{ + /* nothing, (including ASN1_PKEY_CTRL_DEFAULT_MD_NID), is supported */ + return -2; +} + +static int poly1305_pkey_public_cmp(const EVP_PKEY *a, const EVP_PKEY *b) +{ + return ASN1_OCTET_STRING_cmp(EVP_PKEY_get0(a), EVP_PKEY_get0(b)); +} + +static int poly1305_set_priv_key(EVP_PKEY *pkey, const unsigned char *priv, + size_t len) +{ + ASN1_OCTET_STRING *os; + + if (pkey->pkey.ptr != NULL || len != POLY1305_KEY_SIZE) + return 0; + + os = ASN1_OCTET_STRING_new(); + if (os == NULL) + return 0; + + if (!ASN1_OCTET_STRING_set(os, priv, len)) { + ASN1_OCTET_STRING_free(os); + return 0; + } + + pkey->pkey.ptr = os; + return 1; +} + +static int poly1305_get_priv_key(const EVP_PKEY *pkey, unsigned char *priv, + size_t *len) +{ + ASN1_OCTET_STRING *os = (ASN1_OCTET_STRING *)pkey->pkey.ptr; + + if (priv == NULL) { + *len = POLY1305_KEY_SIZE; + return 1; + } + + if (os == NULL || *len < POLY1305_KEY_SIZE) + return 0; + + memcpy(priv, ASN1_STRING_get0_data(os), ASN1_STRING_length(os)); + *len = POLY1305_KEY_SIZE; + + return 1; +} + +const EVP_PKEY_ASN1_METHOD poly1305_asn1_meth = { + EVP_PKEY_POLY1305, + EVP_PKEY_POLY1305, + 0, + + "POLY1305", + "OpenSSL POLY1305 method", + + 0, 0, poly1305_pkey_public_cmp, 0, + + 0, 0, 0, + + poly1305_size, + 0, 0, + 0, 0, 0, 0, 0, 0, 0, + + poly1305_key_free, + poly1305_pkey_ctrl, + NULL, + NULL, + + NULL, + NULL, + NULL, + + NULL, + NULL, + NULL, + + poly1305_set_priv_key, + NULL, + poly1305_get_priv_key, + NULL, +}; diff --git a/deps/openssl/openssl/crypto/poly1305/poly1305_base2_44.c b/deps/openssl/openssl/crypto/poly1305/poly1305_base2_44.c new file mode 100644 index 0000000000..b6313d01ba --- /dev/null +++ b/deps/openssl/openssl/crypto/poly1305/poly1305_base2_44.c @@ -0,0 +1,171 @@ +/* + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* + * This module is meant to be used as template for base 2^44 assembly + * implementation[s]. On side note compiler-generated code is not + * slower than compiler-generated base 2^64 code on [high-end] x86_64, + * even though amount of multiplications is 50% higher. Go figure... + */ +#include <stdlib.h> + +typedef unsigned char u8; +typedef unsigned int u32; +typedef unsigned long u64; +typedef unsigned __int128 u128; + +typedef struct { + u64 h[3]; + u64 s[2]; + u64 r[3]; +} poly1305_internal; + +#define POLY1305_BLOCK_SIZE 16 + +/* pick 64-bit unsigned integer in little endian order */ +static u64 U8TOU64(const unsigned char *p) +{ + return (((u64)(p[0] & 0xff)) | + ((u64)(p[1] & 0xff) << 8) | + ((u64)(p[2] & 0xff) << 16) | + ((u64)(p[3] & 0xff) << 24) | + ((u64)(p[4] & 0xff) << 32) | + ((u64)(p[5] & 0xff) << 40) | + ((u64)(p[6] & 0xff) << 48) | + ((u64)(p[7] & 0xff) << 56)); +} + +/* store a 64-bit unsigned integer in little endian */ +static void U64TO8(unsigned char *p, u64 v) +{ + p[0] = (unsigned char)((v) & 0xff); + p[1] = (unsigned char)((v >> 8) & 0xff); + p[2] = (unsigned char)((v >> 16) & 0xff); + p[3] = (unsigned char)((v >> 24) & 0xff); + p[4] = (unsigned char)((v >> 32) & 0xff); + p[5] = (unsigned char)((v >> 40) & 0xff); + p[6] = (unsigned char)((v >> 48) & 0xff); + p[7] = (unsigned char)((v >> 56) & 0xff); +} + +int poly1305_init(void *ctx, const unsigned char key[16]) +{ + poly1305_internal *st = (poly1305_internal *)ctx; + u64 r0, r1; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + + r0 = U8TOU64(&key[0]) & 0x0ffffffc0fffffff; + r1 = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc; + + /* break r1:r0 to three 44-bit digits, masks are 1<<44-1 */ + st->r[0] = r0 & 0x0fffffffffff; + st->r[1] = ((r0 >> 44) | (r1 << 20)) & 0x0fffffffffff; + st->r[2] = (r1 >> 24); + + st->s[0] = (st->r[1] + (st->r[1] << 2)) << 2; + st->s[1] = (st->r[2] + (st->r[2] << 2)) << 2; + + return 0; +} + +void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, + u32 padbit) +{ + poly1305_internal *st = (poly1305_internal *)ctx; + u64 r0, r1, r2; + u64 s1, s2; + u64 h0, h1, h2, c; + u128 d0, d1, d2; + u64 pad = (u64)padbit << 40; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + + s1 = st->s[0]; + s2 = st->s[1]; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + while (len >= POLY1305_BLOCK_SIZE) { + u64 m0, m1; + + m0 = U8TOU64(inp + 0); + m1 = U8TOU64(inp + 8); + + /* h += m[i], m[i] is broken to 44-bit digits */ + h0 += m0 & 0x0fffffffffff; + h1 += ((m0 >> 44) | (m1 << 20)) & 0x0fffffffffff; + h2 += (m1 >> 24) + pad; + + /* h *= r "%" p, where "%" stands for "partial remainder" */ + d0 = ((u128)h0 * r0) + ((u128)h1 * s2) + ((u128)h2 * s1); + d1 = ((u128)h0 * r1) + ((u128)h1 * r0) + ((u128)h2 * s2); + d2 = ((u128)h0 * r2) + ((u128)h1 * r1) + ((u128)h2 * r0); + + /* "lazy" reduction step */ + h0 = (u64)d0 & 0x0fffffffffff; + h1 = (u64)(d1 += (u64)(d0 >> 44)) & 0x0fffffffffff; + h2 = (u64)(d2 += (u64)(d1 >> 44)) & 0x03ffffffffff; /* last 42 bits */ + + c = (d2 >> 42); + h0 += c + (c << 2); + + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; +} + +void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4]) +{ + poly1305_internal *st = (poly1305_internal *) ctx; + u64 h0, h1, h2; + u64 g0, g1, g2; + u128 t; + u64 mask; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + /* after "lazy" reduction, convert 44+bit digits to 64-bit ones */ + h0 = (u64)(t = (u128)h0 + (h1 << 44)); h1 >>= 20; + h1 = (u64)(t = (u128)h1 + (h2 << 24) + (t >> 64)); h2 >>= 40; + h2 += (u64)(t >> 64); + + /* compare to modulus by computing h + -p */ + g0 = (u64)(t = (u128)h0 + 5); + g1 = (u64)(t = (u128)h1 + (t >> 64)); + g2 = h2 + (u64)(t >> 64); + + /* if there was carry into 131st bit, h1:h0 = g1:g0 */ + mask = 0 - (g2 >> 2); + g0 &= mask; + g1 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + + /* mac = (h + nonce) % (2^128) */ + h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32)); + h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64)); + + U64TO8(mac + 0, h0); + U64TO8(mac + 8, h1); +} diff --git a/deps/openssl/openssl/crypto/poly1305/poly1305_ieee754.c b/deps/openssl/openssl/crypto/poly1305/poly1305_ieee754.c index 08a5b58c2a..7cfd968645 100644 --- a/deps/openssl/openssl/crypto/poly1305/poly1305_ieee754.c +++ b/deps/openssl/openssl/crypto/poly1305/poly1305_ieee754.c @@ -1,5 +1,5 @@ /* - * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the OpenSSL license (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -20,30 +20,30 @@ * for x86_64 code. And since we are at it, just for sense of it, * large-block performance in cycles per processed byte for *this* code * is: - * gcc-4.8 icc-15.0 clang-3.4(*) + * gcc-4.8 icc-15.0 clang-3.4(*) * - * Westmere 4.96 5.09 4.37 - * Sandy Bridge 4.95 4.90 4.17 - * Haswell 4.92 4.87 3.78 - * Bulldozer 4.67 4.49 4.68 - * VIA Nano 7.07 7.05 5.98 - * Silvermont 10.6 9.61 12.6 + * Westmere 4.96 5.09 4.37 + * Sandy Bridge 4.95 4.90 4.17 + * Haswell 4.92 4.87 3.78 + * Bulldozer 4.67 4.49 4.68 + * VIA Nano 7.07 7.05 5.98 + * Silvermont 10.6 9.61 12.6 * - * (*) clang managed to discover parallelism and deployed SIMD; + * (*) clang managed to discover parallelism and deployed SIMD; * * And for range of other platforms with unspecified gcc versions: * - * Freescale e300 12.5 - * PPC74x0 10.8 - * POWER6 4.92 - * POWER7 4.50 - * POWER8 4.10 + * Freescale e300 12.5 + * PPC74x0 10.8 + * POWER6 4.92 + * POWER7 4.50 + * POWER8 4.10 * - * z10 11.2 - * z196+ 7.30 + * z10 11.2 + * z196+ 7.30 * - * UltraSPARC III 16.0 - * SPARC T4 16.1 + * UltraSPARC III 16.0 + * SPARC T4 16.1 */ #if !(defined(__GNUC__) && __GNUC__>=2) @@ -57,33 +57,33 @@ typedef unsigned int u32; typedef unsigned long long u64; typedef union { double d; u64 u; } elem64; -#define TWO(p) ((double)(1ULL<<(p))) -#define TWO0 TWO(0) -#define TWO32 TWO(32) -#define TWO64 (TWO32*TWO(32)) -#define TWO96 (TWO64*TWO(32)) -#define TWO130 (TWO96*TWO(34)) +#define TWO(p) ((double)(1ULL<<(p))) +#define TWO0 TWO(0) +#define TWO32 TWO(32) +#define TWO64 (TWO32*TWO(32)) +#define TWO96 (TWO64*TWO(32)) +#define TWO130 (TWO96*TWO(34)) -#define EXP(p) ((1023ULL+(p))<<52) +#define EXP(p) ((1023ULL+(p))<<52) #if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__)) -# define U8TOU32(p) (*(const u32 *)(p)) -# define U32TO8(p,v) (*(u32 *)(p) = (v)) +# define U8TOU32(p) (*(const u32 *)(p)) +# define U32TO8(p,v) (*(u32 *)(p) = (v)) #elif defined(__PPC__) -# define U8TOU32(p) ({u32 ret; asm ("lwbrx %0,0,%1":"=r"(ret):"b"(p)); ret; }) -# define U32TO8(p,v) asm ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory") +# define U8TOU32(p) ({u32 ret; asm ("lwbrx %0,0,%1":"=r"(ret):"b"(p)); ret; }) +# define U32TO8(p,v) asm ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory") #elif defined(__s390x__) -# define U8TOU32(p) ({u32 ret; asm ("lrv %0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; }) -# define U32TO8(p,v) asm ("strv %1,%0":"=m"(*(u32 *)(p)):"d"(v)) +# define U8TOU32(p) ({u32 ret; asm ("lrv %0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; }) +# define U32TO8(p,v) asm ("strv %1,%0":"=m"(*(u32 *)(p)):"d"(v)) #endif #ifndef U8TOU32 -# define U8TOU32(p) ((u32)(p)[0] | (u32)(p)[1]<<8 | \ - (u32)(p)[2]<<16 | (u32)(p)[3]<<24 ) +# define U8TOU32(p) ((u32)(p)[0] | (u32)(p)[1]<<8 | \ + (u32)(p)[2]<<16 | (u32)(p)[3]<<24 ) #endif #ifndef U32TO8 -# define U32TO8(p,v) ((p)[0] = (u8)(v), (p)[1] = (u8)((v)>>8), \ - (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) ) +# define U32TO8(p,v) ((p)[0] = (u8)(v), (p)[1] = (u8)((v)>>8), \ + (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) ) #endif typedef struct { @@ -101,6 +101,8 @@ static const u64 one = 1; static const u32 fpc = 1; #elif defined(__sparc__) static const u64 fsr = 1ULL<<30; +#elif defined(__mips__) +static const u32 fcsr = 1; #else #error "unrecognized platform" #endif @@ -147,6 +149,11 @@ int poly1305_init(void *ctx, const unsigned char key[16]) asm volatile ("stx %%fsr,%0":"=m"(fsr_orig)); asm volatile ("ldx %0,%%fsr"::"m"(fsr)); +#elif defined(__mips__) + u32 fcsr_orig; + + asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig)); + asm volatile ("ctc1 %0,$31"::"r"(fcsr)); #endif /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ @@ -206,6 +213,8 @@ int poly1305_init(void *ctx, const unsigned char key[16]) asm volatile ("lfpc %0"::"m"(fpc_orig)); #elif defined(__sparc__) asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig)); +#elif defined(__mips__) + asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig)); #endif } @@ -262,6 +271,11 @@ void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, asm volatile ("stx %%fsr,%0":"=m"(fsr_orig)); asm volatile ("ldx %0,%%fsr"::"m"(fsr)); +#elif defined(__mips__) + u32 fcsr_orig; + + asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig)); + asm volatile ("ctc1 %0,$31"::"r"(fcsr)); #endif /* @@ -345,9 +359,9 @@ void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, #ifndef __clang__ fast_entry: #endif - /* - * base 2^32 * base 2^16 = base 2^48 - */ + /* + * base 2^32 * base 2^16 = base 2^48 + */ h0lo = s3lo * x1 + s2lo * x2 + s1lo * x3 + r0lo * x0; h1lo = r0lo * x1 + s3lo * x2 + s2lo * x3 + r1lo * x0; h2lo = r1lo * x1 + r0lo * x2 + s3lo * x3 + r2lo * x0; @@ -408,6 +422,8 @@ void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, asm volatile ("lfpc %0"::"m"(fpc_orig)); #elif defined(__sparc__) asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig)); +#elif defined(__mips__) + asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig)); #endif } diff --git a/deps/openssl/openssl/crypto/poly1305/poly1305_local.h b/deps/openssl/openssl/crypto/poly1305/poly1305_local.h new file mode 100644 index 0000000000..6d4d9dc5b6 --- /dev/null +++ b/deps/openssl/openssl/crypto/poly1305/poly1305_local.h @@ -0,0 +1,27 @@ +/* + * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +typedef void (*poly1305_blocks_f) (void *ctx, const unsigned char *inp, + size_t len, unsigned int padbit); +typedef void (*poly1305_emit_f) (void *ctx, unsigned char mac[16], + const unsigned int nonce[4]); + +struct poly1305_context { + double opaque[24]; /* large enough to hold internal state, declared + * 'double' to ensure at least 64-bit invariant + * alignment across all platforms and + * configurations */ + unsigned int nonce[4]; + unsigned char data[POLY1305_BLOCK_SIZE]; + size_t num; + struct { + poly1305_blocks_f blocks; + poly1305_emit_f emit; + } func; +}; diff --git a/deps/openssl/openssl/crypto/poly1305/poly1305_pmeth.c b/deps/openssl/openssl/crypto/poly1305/poly1305_pmeth.c new file mode 100644 index 0000000000..3bc24c98cd --- /dev/null +++ b/deps/openssl/openssl/crypto/poly1305/poly1305_pmeth.c @@ -0,0 +1,194 @@ +/* + * Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include <stdio.h> +#include "internal/cryptlib.h" +#include <openssl/x509.h> +#include <openssl/x509v3.h> +#include <openssl/evp.h> +#include <openssl/err.h> +#include "internal/poly1305.h" +#include "poly1305_local.h" +#include "internal/evp_int.h" + +/* POLY1305 pkey context structure */ + +typedef struct { + ASN1_OCTET_STRING ktmp; /* Temp storage for key */ + POLY1305 ctx; +} POLY1305_PKEY_CTX; + +static int pkey_poly1305_init(EVP_PKEY_CTX *ctx) +{ + POLY1305_PKEY_CTX *pctx; + + if ((pctx = OPENSSL_zalloc(sizeof(*pctx))) == NULL) { + CRYPTOerr(CRYPTO_F_PKEY_POLY1305_INIT, ERR_R_MALLOC_FAILURE); + return 0; + } + pctx->ktmp.type = V_ASN1_OCTET_STRING; + + EVP_PKEY_CTX_set_data(ctx, pctx); + EVP_PKEY_CTX_set0_keygen_info(ctx, NULL, 0); + return 1; +} + +static void pkey_poly1305_cleanup(EVP_PKEY_CTX *ctx) +{ + POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(ctx); + + if (pctx != NULL) { + OPENSSL_clear_free(pctx->ktmp.data, pctx->ktmp.length); + OPENSSL_clear_free(pctx, sizeof(*pctx)); + EVP_PKEY_CTX_set_data(ctx, NULL); + } +} + +static int pkey_poly1305_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) +{ + POLY1305_PKEY_CTX *sctx, *dctx; + + /* allocate memory for dst->data and a new POLY1305_CTX in dst->data->ctx */ + if (!pkey_poly1305_init(dst)) + return 0; + sctx = EVP_PKEY_CTX_get_data(src); + dctx = EVP_PKEY_CTX_get_data(dst); + if (ASN1_STRING_get0_data(&sctx->ktmp) != NULL && + !ASN1_STRING_copy(&dctx->ktmp, &sctx->ktmp)) { + /* cleanup and free the POLY1305_PKEY_CTX in dst->data */ + pkey_poly1305_cleanup(dst); + return 0; + } + memcpy(&dctx->ctx, &sctx->ctx, sizeof(POLY1305)); + return 1; +} + +static int pkey_poly1305_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) +{ + ASN1_OCTET_STRING *key; + POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(ctx); + + if (ASN1_STRING_get0_data(&pctx->ktmp) == NULL) + return 0; + key = ASN1_OCTET_STRING_dup(&pctx->ktmp); + if (key == NULL) + return 0; + return EVP_PKEY_assign_POLY1305(pkey, key); +} + +static int int_update(EVP_MD_CTX *ctx, const void *data, size_t count) +{ + POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(EVP_MD_CTX_pkey_ctx(ctx)); + + Poly1305_Update(&pctx->ctx, data, count); + return 1; +} + +static int poly1305_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx) +{ + POLY1305_PKEY_CTX *pctx = ctx->data; + ASN1_OCTET_STRING *key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; + + if (key->length != POLY1305_KEY_SIZE) + return 0; + EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT); + EVP_MD_CTX_set_update_fn(mctx, int_update); + Poly1305_Init(&pctx->ctx, key->data); + return 1; +} +static int poly1305_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, + EVP_MD_CTX *mctx) +{ + POLY1305_PKEY_CTX *pctx = ctx->data; + + *siglen = POLY1305_DIGEST_SIZE; + if (sig != NULL) + Poly1305_Final(&pctx->ctx, sig); + return 1; +} + +static int pkey_poly1305_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) +{ + POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(ctx); + const unsigned char *key; + size_t len; + + switch (type) { + + case EVP_PKEY_CTRL_MD: + /* ignore */ + break; + + case EVP_PKEY_CTRL_SET_MAC_KEY: + case EVP_PKEY_CTRL_DIGESTINIT: + if (type == EVP_PKEY_CTRL_SET_MAC_KEY) { + /* user explicitly setting the key */ + key = p2; + len = p1; + } else { + /* user indirectly setting the key via EVP_DigestSignInit */ + key = EVP_PKEY_get0_poly1305(EVP_PKEY_CTX_get0_pkey(ctx), &len); + } + if (key == NULL || len != POLY1305_KEY_SIZE || + !ASN1_OCTET_STRING_set(&pctx->ktmp, key, len)) + return 0; + Poly1305_Init(&pctx->ctx, ASN1_STRING_get0_data(&pctx->ktmp)); + break; + + default: + return -2; + + } + return 1; +} + +static int pkey_poly1305_ctrl_str(EVP_PKEY_CTX *ctx, + const char *type, const char *value) +{ + if (value == NULL) + return 0; + if (strcmp(type, "key") == 0) + return EVP_PKEY_CTX_str2ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, value); + if (strcmp(type, "hexkey") == 0) + return EVP_PKEY_CTX_hex2ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, value); + return -2; +} + +const EVP_PKEY_METHOD poly1305_pkey_meth = { + EVP_PKEY_POLY1305, + EVP_PKEY_FLAG_SIGCTX_CUSTOM, /* we don't deal with a separate MD */ + pkey_poly1305_init, + pkey_poly1305_copy, + pkey_poly1305_cleanup, + + 0, 0, + + 0, + pkey_poly1305_keygen, + + 0, 0, + + 0, 0, + + 0, 0, + + poly1305_signctx_init, + poly1305_signctx, + + 0, 0, + + 0, 0, + + 0, 0, + + 0, 0, + + pkey_poly1305_ctrl, + pkey_poly1305_ctrl_str +}; |