diff options
Diffstat (limited to 'deps/openssl/openssl/crypto/bn/asm')
50 files changed, 2845 insertions, 10110 deletions
diff --git a/deps/openssl/openssl/crypto/bn/asm/README b/deps/openssl/openssl/crypto/bn/asm/README deleted file mode 100644 index b0f3a68a06..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/README +++ /dev/null @@ -1,27 +0,0 @@ -<OBSOLETE> - -All assember in this directory are just version of the file -crypto/bn/bn_asm.c. - -Quite a few of these files are just the assember output from gcc since on -quite a few machines they are 2 times faster than the system compiler. - -For the x86, I have hand written assember because of the bad job all -compilers seem to do on it. This normally gives a 2 time speed up in the RSA -routines. - -For the DEC alpha, I also hand wrote the assember (except the division which -is just the output from the C compiler pasted on the end of the file). -On the 2 alpha C compilers I had access to, it was not possible to do -64b x 64b -> 128b calculations (both long and the long long data types -were 64 bits). So the hand assember gives access to the 128 bit result and -a 2 times speedup :-). - -There are 3 versions of assember for the HP PA-RISC. - -pa-risc.s is the origional one which works fine and generated using gcc :-) - -pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations -by Chris Ruemmler from HP (with some help from the HP C compiler). - -</OBSOLETE> diff --git a/deps/openssl/openssl/crypto/bn/asm/alpha-mont.pl b/deps/openssl/openssl/crypto/bn/asm/alpha-mont.pl index 03596e2014..1d68d6d072 100644 --- a/deps/openssl/openssl/crypto/bn/asm/alpha-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/alpha-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -15,6 +22,9 @@ # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x # difference. +$output=pop; +open STDOUT,">$output"; + # int bn_mul_mont( $rp="a0"; # BN_ULONG *rp, $ap="a1"; # const BN_ULONG *ap, diff --git a/deps/openssl/openssl/crypto/bn/asm/armv4-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/armv4-gf2m.pl index 72381a7724..0bb5433075 100644 --- a/deps/openssl/openssl/crypto/bn/asm/armv4-gf2m.pl +++ b/deps/openssl/openssl/crypto/bn/asm/armv4-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -32,14 +39,31 @@ # # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) +.syntax unified +.thumb +#else .code 32 +#endif ___ ################ # private interface to mul_1x1_ialu @@ -120,11 +144,17 @@ mul_1x1_ialu: eor $hi,$hi,$t0,lsr#8 ldr $t0,[sp,$i0] @ tab[b >> 30 ] +#ifdef __thumb2__ + itt ne +#endif eorne $lo,$lo,$b,lsl#30 eorne $hi,$hi,$b,lsr#2 tst $a,#1<<31 eor $lo,$lo,$t1,lsl#27 eor $hi,$hi,$t1,lsr#5 +#ifdef __thumb2__ + itt ne +#endif eorne $lo,$lo,$b,lsl#31 eorne $hi,$hi,$b,lsr#1 eor $lo,$lo,$t0,lsl#30 @@ -144,20 +174,33 @@ $code.=<<___; .align 5 bn_GF2m_mul_2x2: #if __ARM_MAX_ARCH__>=7 + stmdb sp!,{r10,lr} ldr r12,.LOPENSSL_armcap -.Lpic: ldr r12,[pc,r12] - tst r12,#1 + adr r10,.LOPENSSL_armcap + ldr r12,[r12,r10] +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON + itt ne + ldrne r10,[sp],#8 bne .LNEON + stmdb sp!,{r4-r9} +#else + stmdb sp!,{r4-r10,lr} #endif ___ $ret="r10"; # reassigned 1st argument $code.=<<___; - stmdb sp!,{r4-r10,lr} mov $ret,r0 @ reassign 1st argument mov $b,r3 @ $b=b1 + sub r7,sp,#36 + mov r8,sp + and r7,r7,#-32 ldr r3,[sp,#32] @ load b0 mov $mask,#7<<2 - sub sp,sp,#32 @ allocate tab[8] + mov sp,r7 @ allocate tab[8] + str r8,[r7,#32] bl mul_1x1_ialu @ a1·b1 str $lo,[$ret,#8] @@ -181,6 +224,7 @@ ___ $code.=<<___; ldmia $ret,{@r[0]-@r[3]} eor $lo,$lo,$hi + ldr sp,[sp,#32] @ destroy tab[8] eor $hi,$hi,@r[1] eor $lo,$lo,@r[0] eor $hi,$hi,@r[2] @@ -188,7 +232,6 @@ $code.=<<___; eor $hi,$hi,@r[3] str $hi,[$ret,#8] eor $lo,$lo,$hi - add sp,sp,#32 @ destroy tab[8] str $lo,[$ret,#4] #if __ARM_ARCH__>=5 @@ -213,8 +256,8 @@ $code.=<<___; .align 5 .LNEON: ldr r12, [sp] @ 5th argument - vmov.32 $a, r2, r1 - vmov.32 $b, r12, r3 + vmov $a, r2, r1 + vmov $b, r12, r3 vmov.i64 $k48, #0x0000ffffffffffff vmov.i64 $k32, #0x00000000ffffffff vmov.i64 $k16, #0x000000000000ffff @@ -267,7 +310,7 @@ $code.=<<___; #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-(.Lpic+8) +.word OPENSSL_armcap_P-. #endif .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 5 diff --git a/deps/openssl/openssl/crypto/bn/asm/armv4-mont.pl b/deps/openssl/openssl/crypto/bn/asm/armv4-mont.pl index 1d330e9f8a..0dc4fe95e4 100644 --- a/deps/openssl/openssl/crypto/bn/asm/armv4-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/armv4-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -38,8 +45,29 @@ # for execution on all NEON-capable processors, because gain on # others outweighs the marginal loss on Cortex-A9. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +# September 2015 +# +# Align Cortex-A9 performance with November 2013 improvements, i.e. +# NEON code is now ~20-105% faster than integer-only one on this +# processor. But this optimization further improved performance even +# on other processors: NEON code path is ~45-180% faster than original +# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on +# Snapdragon S4. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $num="r0"; # starts as num argument, but holds &tp[num-1] $ap="r1"; @@ -70,12 +98,17 @@ $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) +.syntax unified +.thumb +#else .code 32 +#endif #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-bn_mul_mont +.word OPENSSL_armcap_P-.Lbn_mul_mont #endif .global bn_mul_mont @@ -83,15 +116,19 @@ $code=<<___; .align 5 bn_mul_mont: +.Lbn_mul_mont: ldr ip,[sp,#4] @ load num stmdb sp!,{r0,r2} @ sp points at argument block #if __ARM_MAX_ARCH__>=7 tst ip,#7 bne .Lialu - adr r0,bn_mul_mont + adr r0,.Lbn_mul_mont ldr r2,.LOPENSSL_armcap ldr r0,[r0,r2] - tst r0,#1 @ NEON available? +#ifdef __APPLE__ + ldr r0,[r0] +#endif + tst r0,#ARMV7_NEON @ NEON available? ldmia sp, {r0,r2} beq .Lialu add sp,sp,#8 @@ -101,6 +138,9 @@ bn_mul_mont: #endif cmp ip,#2 mov $num,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif movlt r0,#0 addlt sp,sp,#2*4 blt .Labrt @@ -148,10 +188,11 @@ bn_mul_mont: ldr $n0,[$_n0] @ restore n0 adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= + mov $tj,sp str $nhi,[$num,#4] @ tp[num]= .Louter: - sub $tj,$num,sp @ "original" $num-1 value + sub $tj,$num,$tj @ "original" $num-1 value sub $ap,$ap,$tj @ "rewind" ap to &ap[1] ldr $bi,[$tp,#4]! @ *(++bp) sub $np,$np,$tj @ "rewind" np to &np[1] @@ -196,11 +237,16 @@ bn_mul_mont: str $nhi,[$num,#4] @ tp[num]= cmp $tp,$tj +#ifdef __thumb2__ + itt ne +#endif + movne $tj,sp bne .Louter ldr $rp,[$_rp] @ pull rp + mov $aj,sp add $num,$num,#4 @ $num to point at &tp[num] - sub $aj,$num,sp @ "original" num value + sub $aj,$num,$aj @ "original" num value mov $tp,sp @ "rewind" $tp mov $ap,$tp @ "borrow" $ap sub $np,$np,$aj @ "rewind" $np to &np[0] @@ -226,7 +272,8 @@ bn_mul_mont: cmp $tp,$num bne .Lcopy - add sp,$num,#4 @ skip over tp[num+1] + mov sp,$num + add sp,sp,#4 @ skip over tp[num+1] ldmia sp!,{r4-r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 @@ -241,19 +288,16 @@ bn_mul_mont: .size bn_mul_mont,.-bn_mul_mont ___ { -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } - my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); my ($Z,$Temp)=("q4","q5"); -my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); +my @ACC=map("q$_",(6..13)); my ($Bi,$Ni,$M0)=map("d$_",(28..31)); -my $zero=&Dlo($Z); -my $temp=&Dlo($Temp); +my $zero="$Z#lo"; +my $temp="$Temp#lo"; my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); -my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); +my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11)); $code.=<<___; #if __ARM_MAX_ARCH__>=7 @@ -267,60 +311,60 @@ bn_mul8x_mont_neon: stmdb sp!,{r4-r11} vstmdb sp!,{d8-d15} @ ABI specification says so ldmia ip,{r4-r5} @ load rest of parameter block + mov ip,sp + + cmp $num,#8 + bhi .LNEON_8n + + @ special case for $num==8, everything is in register bank... - sub $toutptr,sp,#16 vld1.32 {${Bi}[0]}, [$bptr,:32]! - sub $toutptr,$toutptr,$num,lsl#4 + veor $zero,$zero,$zero + sub $toutptr,sp,$num,lsl#4 vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( and $toutptr,$toutptr,#-64 vld1.32 {${M0}[0]}, [$n0,:32] mov sp,$toutptr @ alloca - veor $zero,$zero,$zero - subs $inner,$num,#8 vzip.16 $Bi,$zero - vmull.u32 $A0xB,$Bi,${A0}[0] - vmull.u32 $A1xB,$Bi,${A0}[1] - vmull.u32 $A2xB,$Bi,${A1}[0] - vshl.i64 $temp,`&Dhi("$A0xB")`,#16 - vmull.u32 $A3xB,$Bi,${A1}[1] + vmull.u32 @ACC[0],$Bi,${A0}[0] + vmull.u32 @ACC[1],$Bi,${A0}[1] + vmull.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmull.u32 @ACC[3],$Bi,${A1}[1] - vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + vadd.u64 $Ni,$Ni,@ACC[0]#lo veor $zero,$zero,$zero - vmul.u32 $Ni,$temp,$M0 + vmul.u32 $Ni,$Ni,$M0 - vmull.u32 $A4xB,$Bi,${A2}[0] + vmull.u32 @ACC[4],$Bi,${A2}[0] vld1.32 {$N0-$N3}, [$nptr]! - vmull.u32 $A5xB,$Bi,${A2}[1] - vmull.u32 $A6xB,$Bi,${A3}[0] + vmull.u32 @ACC[5],$Bi,${A2}[1] + vmull.u32 @ACC[6],$Bi,${A3}[0] vzip.16 $Ni,$zero - vmull.u32 $A7xB,$Bi,${A3}[1] - - bne .LNEON_1st - - @ special case for num=8, everything is in register bank... + vmull.u32 @ACC[7],$Bi,${A3}[1] - vmlal.u32 $A0xB,$Ni,${N0}[0] + vmlal.u32 @ACC[0],$Ni,${N0}[0] sub $outer,$num,#1 - vmlal.u32 $A1xB,$Ni,${N0}[1] - vmlal.u32 $A2xB,$Ni,${N1}[0] - vmlal.u32 $A3xB,$Ni,${N1}[1] - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vmov $Temp,$A0xB - vmlal.u32 $A5xB,$Ni,${N2}[1] - vmov $A0xB,$A1xB - vmlal.u32 $A6xB,$Ni,${N3}[0] - vmov $A1xB,$A2xB - vmlal.u32 $A7xB,$Ni,${N3}[1] - vmov $A2xB,$A3xB - vmov $A3xB,$A4xB + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmov $Temp,@ACC[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmov @ACC[0],@ACC[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmov @ACC[1],@ACC[2] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vmov @ACC[2],@ACC[3] + vmov @ACC[3],@ACC[4] vshr.u64 $temp,$temp,#16 - vmov $A4xB,$A5xB - vmov $A5xB,$A6xB - vadd.u64 $temp,$temp,`&Dhi("$Temp")` - vmov $A6xB,$A7xB - veor $A7xB,$A7xB + vmov @ACC[4],@ACC[5] + vmov @ACC[5],@ACC[6] + vadd.u64 $temp,$temp,$Temp#hi + vmov @ACC[6],@ACC[7] + veor @ACC[7],@ACC[7] vshr.u64 $temp,$temp,#16 b .LNEON_outer8 @@ -330,279 +374,302 @@ bn_mul8x_mont_neon: vld1.32 {${Bi}[0]}, [$bptr,:32]! veor $zero,$zero,$zero vzip.16 $Bi,$zero - vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp - vmlal.u32 $A0xB,$Bi,${A0}[0] - vmlal.u32 $A1xB,$Bi,${A0}[1] - vmlal.u32 $A2xB,$Bi,${A1}[0] - vshl.i64 $temp,`&Dhi("$A0xB")`,#16 - vmlal.u32 $A3xB,$Bi,${A1}[1] + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] - vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + vadd.u64 $Ni,$Ni,@ACC[0]#lo veor $zero,$zero,$zero subs $outer,$outer,#1 - vmul.u32 $Ni,$temp,$M0 + vmul.u32 $Ni,$Ni,$M0 - vmlal.u32 $A4xB,$Bi,${A2}[0] - vmlal.u32 $A5xB,$Bi,${A2}[1] - vmlal.u32 $A6xB,$Bi,${A3}[0] + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] vzip.16 $Ni,$zero - vmlal.u32 $A7xB,$Bi,${A3}[1] - - vmlal.u32 $A0xB,$Ni,${N0}[0] - vmlal.u32 $A1xB,$Ni,${N0}[1] - vmlal.u32 $A2xB,$Ni,${N1}[0] - vmlal.u32 $A3xB,$Ni,${N1}[1] - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vmov $Temp,$A0xB - vmlal.u32 $A5xB,$Ni,${N2}[1] - vmov $A0xB,$A1xB - vmlal.u32 $A6xB,$Ni,${N3}[0] - vmov $A1xB,$A2xB - vmlal.u32 $A7xB,$Ni,${N3}[1] - vmov $A2xB,$A3xB - vmov $A3xB,$A4xB + vmlal.u32 @ACC[7],$Bi,${A3}[1] + + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmov $Temp,@ACC[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmov @ACC[0],@ACC[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmov @ACC[1],@ACC[2] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vmov @ACC[2],@ACC[3] + vmov @ACC[3],@ACC[4] vshr.u64 $temp,$temp,#16 - vmov $A4xB,$A5xB - vmov $A5xB,$A6xB - vadd.u64 $temp,$temp,`&Dhi("$Temp")` - vmov $A6xB,$A7xB - veor $A7xB,$A7xB + vmov @ACC[4],@ACC[5] + vmov @ACC[5],@ACC[6] + vadd.u64 $temp,$temp,$Temp#hi + vmov @ACC[6],@ACC[7] + veor @ACC[7],@ACC[7] vshr.u64 $temp,$temp,#16 bne .LNEON_outer8 - vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp mov $toutptr,sp - vshr.u64 $temp,`&Dlo("$A0xB")`,#16 + vshr.u64 $temp,@ACC[0]#lo,#16 mov $inner,$num - vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp - add $tinptr,sp,#16 - vshr.u64 $temp,`&Dhi("$A0xB")`,#16 - vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + add $tinptr,sp,#96 + vshr.u64 $temp,@ACC[0]#hi,#16 + vzip.16 @ACC[0]#lo,@ACC[0]#hi - b .LNEON_tail2 + b .LNEON_tail_entry .align 4 -.LNEON_1st: - vmlal.u32 $A0xB,$Ni,${N0}[0] - vld1.32 {$A0-$A3}, [$aptr]! - vmlal.u32 $A1xB,$Ni,${N0}[1] +.LNEON_8n: + veor @ACC[0],@ACC[0],@ACC[0] + sub $toutptr,sp,#128 + veor @ACC[1],@ACC[1],@ACC[1] + sub $toutptr,$toutptr,$num,lsl#4 + veor @ACC[2],@ACC[2],@ACC[2] + and $toutptr,$toutptr,#-64 + veor @ACC[3],@ACC[3],@ACC[3] + mov sp,$toutptr @ alloca + veor @ACC[4],@ACC[4],@ACC[4] + add $toutptr,$toutptr,#256 + veor @ACC[5],@ACC[5],@ACC[5] + sub $inner,$num,#8 + veor @ACC[6],@ACC[6],@ACC[6] + veor @ACC[7],@ACC[7],@ACC[7] + +.LNEON_8n_init: + vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! subs $inner,$inner,#8 - vmlal.u32 $A2xB,$Ni,${N1}[0] - vmlal.u32 $A3xB,$Ni,${N1}[1] - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vld1.32 {$N0-$N1}, [$nptr]! - vmlal.u32 $A5xB,$Ni,${N2}[1] - vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! - vmlal.u32 $A6xB,$Ni,${N3}[0] - vmlal.u32 $A7xB,$Ni,${N3}[1] - vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! - - vmull.u32 $A0xB,$Bi,${A0}[0] - vld1.32 {$N2-$N3}, [$nptr]! - vmull.u32 $A1xB,$Bi,${A0}[1] - vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! - vmull.u32 $A2xB,$Bi,${A1}[0] - vmull.u32 $A3xB,$Bi,${A1}[1] - vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! - - vmull.u32 $A4xB,$Bi,${A2}[0] - vmull.u32 $A5xB,$Bi,${A2}[1] - vmull.u32 $A6xB,$Bi,${A3}[0] - vmull.u32 $A7xB,$Bi,${A3}[1] - - bne .LNEON_1st - - vmlal.u32 $A0xB,$Ni,${N0}[0] - add $tinptr,sp,#16 - vmlal.u32 $A1xB,$Ni,${N0}[1] - sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr - vmlal.u32 $A2xB,$Ni,${N1}[0] - vld1.64 {$Temp}, [sp,:128] - vmlal.u32 $A3xB,$Ni,${N1}[1] - sub $outer,$num,#1 - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! - vmlal.u32 $A5xB,$Ni,${N2}[1] - vshr.u64 $temp,$temp,#16 - vld1.64 {$A0xB}, [$tinptr, :128]! - vmlal.u32 $A6xB,$Ni,${N3}[0] - vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! - vmlal.u32 $A7xB,$Ni,${N3}[1] - - vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! - vadd.u64 $temp,$temp,`&Dhi("$Temp")` - veor $Z,$Z,$Z - vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! - vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! - vst1.64 {$Z}, [$toutptr,:128] - vshr.u64 $temp,$temp,#16 - - b .LNEON_outer + vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! + vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! + vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]! + bne .LNEON_8n_init + + add $tinptr,sp,#256 + vld1.32 {$A0-$A3},[$aptr]! + add $bnptr,sp,#8 + vld1.32 {${M0}[0]},[$n0,:32] + mov $outer,$num + b .LNEON_8n_outer .align 4 -.LNEON_outer: - vld1.32 {${Bi}[0]}, [$bptr,:32]! - sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr - vld1.32 {$A0-$A3}, [$aptr]! +.LNEON_8n_outer: + vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ veor $zero,$zero,$zero - mov $toutptr,sp vzip.16 $Bi,$zero + add $toutptr,sp,#128 + vld1.32 {$N0-$N3},[$nptr]! + + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + veor $zero,$zero,$zero + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vadd.u64 $Ni,$Ni,@ACC[0]#lo + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmul.u32 $Ni,$Ni,$M0 + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +for ($i=0; $i<7;) { +$code.=<<___; + vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ + vmlal.u32 @ACC[0],$Ni,${N0}[0] + veor $temp,$temp,$temp + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vzip.16 $Bi,$temp + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo + vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i] +___ + push(@ACC,shift(@ACC)); $i++; +$code.=<<___; + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128]! + vmlal.u32 @ACC[1],$Bi,${A0}[1] + veor $zero,$zero,$zero + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vadd.u64 $Ni,$Ni,@ACC[0]#lo + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmul.u32 $Ni,$Ni,$M0 + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vld1.32 {$A0-$A3},[$aptr]! + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo + vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i] + add $bnptr,sp,#8 @ rewind +___ + push(@ACC,shift(@ACC)); +$code.=<<___; sub $inner,$num,#8 - vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp - - vmlal.u32 $A0xB,$Bi,${A0}[0] - vld1.64 {$A3xB-$A4xB},[$tinptr,:256]! - vmlal.u32 $A1xB,$Bi,${A0}[1] - vmlal.u32 $A2xB,$Bi,${A1}[0] - vld1.64 {$A5xB-$A6xB},[$tinptr,:256]! - vmlal.u32 $A3xB,$Bi,${A1}[1] + b .LNEON_8n_inner - vshl.i64 $temp,`&Dhi("$A0xB")`,#16 - veor $zero,$zero,$zero - vadd.u64 $temp,$temp,`&Dlo("$A0xB")` - vld1.64 {$A7xB},[$tinptr,:128]! - vmul.u32 $Ni,$temp,$M0 - - vmlal.u32 $A4xB,$Bi,${A2}[0] - vld1.32 {$N0-$N3}, [$nptr]! - vmlal.u32 $A5xB,$Bi,${A2}[1] - vmlal.u32 $A6xB,$Bi,${A3}[0] - vzip.16 $Ni,$zero - vmlal.u32 $A7xB,$Bi,${A3}[1] - -.LNEON_inner: - vmlal.u32 $A0xB,$Ni,${N0}[0] - vld1.32 {$A0-$A3}, [$aptr]! - vmlal.u32 $A1xB,$Ni,${N0}[1] - subs $inner,$inner,#8 - vmlal.u32 $A2xB,$Ni,${N1}[0] - vmlal.u32 $A3xB,$Ni,${N1}[1] - vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vld1.64 {$A0xB}, [$tinptr, :128]! - vmlal.u32 $A5xB,$Ni,${N2}[1] - vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! - vmlal.u32 $A6xB,$Ni,${N3}[0] - vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! - vmlal.u32 $A7xB,$Ni,${N3}[1] - vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! - - vmlal.u32 $A0xB,$Bi,${A0}[0] - vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! - vmlal.u32 $A1xB,$Bi,${A0}[1] - vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! - vmlal.u32 $A2xB,$Bi,${A1}[0] - vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! - vmlal.u32 $A3xB,$Bi,${A1}[1] - vld1.32 {$N0-$N3}, [$nptr]! - - vmlal.u32 $A4xB,$Bi,${A2}[0] - vld1.64 {$A7xB}, [$tinptr, :128]! - vmlal.u32 $A5xB,$Bi,${A2}[1] - vmlal.u32 $A6xB,$Bi,${A3}[0] - vmlal.u32 $A7xB,$Bi,${A3}[1] - - bne .LNEON_inner - - vmlal.u32 $A0xB,$Ni,${N0}[0] - add $tinptr,sp,#16 - vmlal.u32 $A1xB,$Ni,${N0}[1] - sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr - vmlal.u32 $A2xB,$Ni,${N1}[0] - vld1.64 {$Temp}, [sp,:128] - vmlal.u32 $A3xB,$Ni,${N1}[1] - subs $outer,$outer,#1 - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! - vmlal.u32 $A5xB,$Ni,${N2}[1] - vld1.64 {$A0xB}, [$tinptr, :128]! - vshr.u64 $temp,$temp,#16 - vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! - vmlal.u32 $A6xB,$Ni,${N3}[0] - vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! - vmlal.u32 $A7xB,$Ni,${N3}[1] - - vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! - vadd.u64 $temp,$temp,`&Dhi("$Temp")` - vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! - vshr.u64 $temp,$temp,#16 - - bne .LNEON_outer +.align 4 +.LNEON_8n_inner: + subs $inner,$inner,#8 + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vld1.32 {$N0-$N3},[$nptr]! + vmlal.u32 @ACC[3],$Bi,${A1}[1] + it ne + addne $tinptr,$tinptr,#16 @ don't advance in last iteration + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i] + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vst1.64 {@ACC[0]},[$toutptr,:128]! +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + it ne + addne $tinptr,$tinptr,#16 @ don't advance in last iteration + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + it eq + subeq $aptr,$aptr,$num,lsl#2 @ rewind + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vld1.32 {$A0-$A3},[$aptr]! + vmlal.u32 @ACC[2],$Ni,${N1}[0] + add $bnptr,sp,#8 @ rewind + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vst1.64 {@ACC[0]},[$toutptr,:128]! + vmlal.u32 @ACC[7],$Ni,${N3}[1] + + bne .LNEON_8n_inner +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + add $tinptr,sp,#128 + vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! + veor q2,q2,q2 @ $N0-$N1 + vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! + veor q3,q3,q3 @ $N2-$N3 + vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! + vst1.64 {@ACC[6]},[$toutptr,:128] + + subs $outer,$outer,#8 + vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]! + vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]! + vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]! + vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]! + + itt ne + subne $nptr,$nptr,$num,lsl#2 @ rewind + bne .LNEON_8n_outer + + add $toutptr,sp,#128 + vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 $temp,@ACC[0]#lo,#16 + vst1.64 {q2-q3},[sp,:256]! + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + vst1.64 {q2-q3}, [sp,:256]! + vshr.u64 $temp,@ACC[0]#hi,#16 + vst1.64 {q2-q3}, [sp,:256]! + vzip.16 @ACC[0]#lo,@ACC[0]#hi - mov $toutptr,sp mov $inner,$num + b .LNEON_tail_entry +.align 4 .LNEON_tail: - vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp - vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! - vshr.u64 $temp,`&Dlo("$A0xB")`,#16 - vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp - vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! - vshr.u64 $temp,`&Dhi("$A0xB")`,#16 - vld1.64 {$A7xB}, [$tinptr, :128]! - vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` - -.LNEON_tail2: - vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp - vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A1xB")`,#16 - vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp - vshr.u64 $temp,`&Dhi("$A1xB")`,#16 - vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")` - - vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp - vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A2xB")`,#16 - vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp - vshr.u64 $temp,`&Dhi("$A2xB")`,#16 - vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")` - - vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp - vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A3xB")`,#16 - vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp - vshr.u64 $temp,`&Dhi("$A3xB")`,#16 - vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")` - - vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp - vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A4xB")`,#16 - vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp - vshr.u64 $temp,`&Dhi("$A4xB")`,#16 - vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")` - - vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp - vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A5xB")`,#16 - vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp - vshr.u64 $temp,`&Dhi("$A5xB")`,#16 - vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")` - - vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp - vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A6xB")`,#16 - vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp - vld1.64 {$A0xB}, [$tinptr, :128]! - vshr.u64 $temp,`&Dhi("$A6xB")`,#16 - vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")` - - vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp - vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A7xB")`,#16 - vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp - vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! - vshr.u64 $temp,`&Dhi("$A7xB")`,#16 - vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")` + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp + vshr.u64 $temp,@ACC[0]#lo,#16 + vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]! + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]! + vshr.u64 $temp,@ACC[0]#hi,#16 + vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]! + vzip.16 @ACC[0]#lo,@ACC[0]#hi + +.LNEON_tail_entry: +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp + vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]! + vshr.u64 $temp,@ACC[1]#lo,#16 + vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp + vshr.u64 $temp,@ACC[1]#hi,#16 + vzip.16 @ACC[1]#lo,@ACC[1]#hi +___ + push(@ACC,shift(@ACC)); +} + push(@ACC,shift(@ACC)); +$code.=<<___; + vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]! subs $inner,$inner,#8 - vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! - + vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]! bne .LNEON_tail vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit @@ -622,8 +689,9 @@ bn_mul8x_mont_neon: bne .LNEON_sub ldr r10, [$aptr] @ load top-most bit + mov r11,sp veor q0,q0,q0 - sub r11,$bptr,sp @ this is num*4 + sub r11,$bptr,r11 @ this is num*4 veor q1,q1,q1 mov $aptr,sp sub $rptr,$rptr,r11 @ rewind $rptr @@ -633,27 +701,33 @@ bn_mul8x_mont_neon: .LNEON_copy_n_zap: ldmia $aptr!, {r4-r7} ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 ldmia $aptr, {r4-r7} stmia $rptr!, {r8-r11} sub $aptr,$aptr,#16 ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 teq $aptr,$bptr @ preserves carry stmia $rptr!, {r8-r11} bne .LNEON_copy_n_zap - sub sp,ip,#96 + mov sp,ip vldmia sp!,{d8-d15} ldmia sp!,{r4-r11} ret @ bx lr @@ -669,8 +743,14 @@ $code.=<<___; #endif ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -$code =~ s/\bret\b/bx lr/gm; -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or + s/\bret\b/bx lr/g or + s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + close STDOUT; diff --git a/deps/openssl/openssl/crypto/bn/asm/armv8-mont.pl b/deps/openssl/openssl/crypto/bn/asm/armv8-mont.pl new file mode 100755 index 0000000000..5d5af1b6be --- /dev/null +++ b/deps/openssl/openssl/crypto/bn/asm/armv8-mont.pl @@ -0,0 +1,1510 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# March 2015 +# +# "Teaser" Montgomery multiplication module for ARMv8. Needs more +# work. While it does improve RSA sign performance by 20-30% (less for +# longer keys) on most processors, for some reason RSA2048 is not +# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication +# instruction issue rate is limited on processor in question, meaning +# that dedicated squaring procedure is a must. Well, actually all +# contemporary AArch64 processors seem to have limited multiplication +# issue rate, i.e. they can't issue multiplication every cycle, which +# explains moderate improvement coefficients in comparison to +# compiler-generated code. Recall that compiler is instructed to use +# umulh and therefore uses same amount of multiplication instructions +# to do the job. Assembly's edge is to minimize number of "collateral" +# instructions and of course instruction scheduling. +# +# April 2015 +# +# Squaring procedure that handles lengths divisible by 8 improves +# RSA/DSA performance by 25-40-60% depending on processor and key +# length. Overall improvement coefficients are always positive in +# comparison to compiler-generated code. On Cortex-A57 improvement +# is still modest on longest key lengths, while others exhibit e.g. +# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster +# on Cortex-A57 and ~60-100% faster on others. + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +($lo0,$hi0,$aj,$m0,$alo,$ahi, + $lo1,$hi1,$nj,$m1,$nlo,$nhi, + $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24); + +# int bn_mul_mont( +$rp="x0"; # BN_ULONG *rp, +$ap="x1"; # const BN_ULONG *ap, +$bp="x2"; # const BN_ULONG *bp, +$np="x3"; # const BN_ULONG *np, +$n0="x4"; # const BN_ULONG *n0, +$num="x5"; # int num); + +$code.=<<___; +.text + +.globl bn_mul_mont +.type bn_mul_mont,%function +.align 5 +bn_mul_mont: + tst $num,#7 + b.eq __bn_sqr8x_mont + tst $num,#3 + b.eq __bn_mul4x_mont +.Lmul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr $m0,[$bp],#8 // bp[0] + sub $tp,sp,$num,lsl#3 + ldp $hi0,$aj,[$ap],#16 // ap[0..1] + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + and $tp,$tp,#-16 // ABI says so + ldp $hi1,$nj,[$np],#16 // np[0..1] + + mul $lo0,$hi0,$m0 // ap[0]*bp[0] + sub $j,$num,#16 // j=num-2 + umulh $hi0,$hi0,$m0 + mul $alo,$aj,$m0 // ap[1]*bp[0] + umulh $ahi,$aj,$m0 + + mul $m1,$lo0,$n0 // "tp[0]"*n0 + mov sp,$tp // alloca + + // (*) mul $lo1,$hi1,$m1 // np[0]*m1 + umulh $hi1,$hi1,$m1 + mul $nlo,$nj,$m1 // np[1]*m1 + // (*) adds $lo1,$lo1,$lo0 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // $lo0 being non-zero. So that carry can be calculated + // by adding -1 to $lo0. That's what next instruction does. + subs xzr,$lo0,#1 // (*) + umulh $nhi,$nj,$m1 + adc $hi1,$hi1,xzr + cbz $j,.L1st_skip + +.L1st: + ldr $aj,[$ap],#8 + adds $lo0,$alo,$hi0 + sub $j,$j,#8 // j-- + adc $hi0,$ahi,xzr + + ldr $nj,[$np],#8 + adds $lo1,$nlo,$hi1 + mul $alo,$aj,$m0 // ap[j]*bp[0] + adc $hi1,$nhi,xzr + umulh $ahi,$aj,$m0 + + adds $lo1,$lo1,$lo0 + mul $nlo,$nj,$m1 // np[j]*m1 + adc $hi1,$hi1,xzr + umulh $nhi,$nj,$m1 + str $lo1,[$tp],#8 // tp[j-1] + cbnz $j,.L1st + +.L1st_skip: + adds $lo0,$alo,$hi0 + sub $ap,$ap,$num // rewind $ap + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + sub $np,$np,$num // rewind $np + adc $hi1,$nhi,xzr + + adds $lo1,$lo1,$lo0 + sub $i,$num,#8 // i=num-1 + adcs $hi1,$hi1,$hi0 + + adc $ovf,xzr,xzr // upmost overflow bit + stp $lo1,$hi1,[$tp] + +.Louter: + ldr $m0,[$bp],#8 // bp[i] + ldp $hi0,$aj,[$ap],#16 + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + + mul $lo0,$hi0,$m0 // ap[0]*bp[i] + sub $j,$num,#16 // j=num-2 + umulh $hi0,$hi0,$m0 + ldp $hi1,$nj,[$np],#16 + mul $alo,$aj,$m0 // ap[1]*bp[i] + adds $lo0,$lo0,$tj + umulh $ahi,$aj,$m0 + adc $hi0,$hi0,xzr + + mul $m1,$lo0,$n0 + sub $i,$i,#8 // i-- + + // (*) mul $lo1,$hi1,$m1 // np[0]*m1 + umulh $hi1,$hi1,$m1 + mul $nlo,$nj,$m1 // np[1]*m1 + // (*) adds $lo1,$lo1,$lo0 + subs xzr,$lo0,#1 // (*) + umulh $nhi,$nj,$m1 + cbz $j,.Linner_skip + +.Linner: + ldr $aj,[$ap],#8 + adc $hi1,$hi1,xzr + ldr $tj,[$tp],#8 // tp[j] + adds $lo0,$alo,$hi0 + sub $j,$j,#8 // j-- + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + ldr $nj,[$np],#8 + adc $hi1,$nhi,xzr + + mul $alo,$aj,$m0 // ap[j]*bp[i] + adds $lo0,$lo0,$tj + umulh $ahi,$aj,$m0 + adc $hi0,$hi0,xzr + + mul $nlo,$nj,$m1 // np[j]*m1 + adds $lo1,$lo1,$lo0 + umulh $nhi,$nj,$m1 + str $lo1,[$tp,#-16] // tp[j-1] + cbnz $j,.Linner + +.Linner_skip: + ldr $tj,[$tp],#8 // tp[j] + adc $hi1,$hi1,xzr + adds $lo0,$alo,$hi0 + sub $ap,$ap,$num // rewind $ap + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + sub $np,$np,$num // rewind $np + adcs $hi1,$nhi,$ovf + adc $ovf,xzr,xzr + + adds $lo0,$lo0,$tj + adc $hi0,$hi0,xzr + + adds $lo1,$lo1,$lo0 + adcs $hi1,$hi1,$hi0 + adc $ovf,$ovf,xzr // upmost overflow bit + stp $lo1,$hi1,[$tp,#-16] + + cbnz $i,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + ldr $nj,[$np],#8 // np[0] + subs $j,$num,#8 // j=num-1 and clear borrow + mov $ap,$rp +.Lsub: + sbcs $aj,$tj,$nj // tp[j]-np[j] + ldr $tj,[$tp],#8 + sub $j,$j,#8 // j-- + ldr $nj,[$np],#8 + str $aj,[$ap],#8 // rp[j]=tp[j]-np[j] + cbnz $j,.Lsub + + sbcs $aj,$tj,$nj + sbcs $ovf,$ovf,xzr // did it borrow? + str $aj,[$ap],#8 // rp[num-1] + + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + ldr $aj,[$rp],#8 // rp[0] + sub $num,$num,#8 // num-- + nop +.Lcond_copy: + sub $num,$num,#8 // num-- + csel $nj,$tj,$aj,lo // did it borrow? + ldr $tj,[$tp],#8 + ldr $aj,[$rp],#8 + str xzr,[$tp,#-16] // wipe tp + str $nj,[$rp,#-16] + cbnz $num,.Lcond_copy + + csel $nj,$tj,$aj,lo + str xzr,[$tp,#-8] // wipe tp + str $nj,[$rp,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size bn_mul_mont,.-bn_mul_mont +___ +{ +######################################################################## +# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. + +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13)); +my ($t0,$t1,$t2,$t3)=map("x$_",(14..17)); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26)); +my ($cnt,$carry,$topmost)=("x27","x28","x30"); +my ($tp,$ap_end,$na0)=($bp,$np,$carry); + +$code.=<<___; +.type __bn_sqr8x_mont,%function +.align 5 +__bn_sqr8x_mont: + cmp $ap,$bp + b.ne __bn_mul4x_mont +.Lsqr8x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $rp,$np,[sp,#96] // offload rp and np + + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + ldp $a4,$a5,[$ap,#8*4] + ldp $a6,$a7,[$ap,#8*6] + + sub $tp,sp,$num,lsl#4 + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + mov sp,$tp // alloca + sub $cnt,$num,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub $cnt,$cnt,#8*8 + stp xzr,xzr,[$tp,#8*0] + stp xzr,xzr,[$tp,#8*2] + stp xzr,xzr,[$tp,#8*4] + stp xzr,xzr,[$tp,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[$tp,#8*8] + stp xzr,xzr,[$tp,#8*10] + stp xzr,xzr,[$tp,#8*12] + stp xzr,xzr,[$tp,#8*14] + add $tp,$tp,#8*16 + cbnz $cnt,.Lsqr8x_zero + + add $ap_end,$ap,$num + add $ap,$ap,#8*8 + mov $acc0,xzr + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + mov $acc4,xzr + mov $acc5,xzr + mov $acc6,xzr + mov $acc7,xzr + mov $tp,sp + str $n0,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i) + mul $t1,$a2,$a0 + mul $t2,$a3,$a0 + mul $t3,$a4,$a0 + adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0]) + mul $t0,$a5,$a0 + adcs $acc2,$acc2,$t1 + mul $t1,$a6,$a0 + adcs $acc3,$acc3,$t2 + mul $t2,$a7,$a0 + adcs $acc4,$acc4,$t3 + umulh $t3,$a1,$a0 // hi(a[1..7]*a[0]) + adcs $acc5,$acc5,$t0 + umulh $t0,$a2,$a0 + adcs $acc6,$acc6,$t1 + umulh $t1,$a3,$a0 + adcs $acc7,$acc7,$t2 + umulh $t2,$a4,$a0 + stp $acc0,$acc1,[$tp],#8*2 // t[0..1] + adc $acc0,xzr,xzr // t[8] + adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0]) + umulh $t3,$a5,$a0 + adcs $acc3,$acc3,$t0 + umulh $t0,$a6,$a0 + adcs $acc4,$acc4,$t1 + umulh $t1,$a7,$a0 + adcs $acc5,$acc5,$t2 + mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii) + adcs $acc6,$acc6,$t3 + mul $t3,$a3,$a1 + adcs $acc7,$acc7,$t0 + mul $t0,$a4,$a1 + adc $acc0,$acc0,$t1 + + mul $t1,$a5,$a1 + adds $acc3,$acc3,$t2 + mul $t2,$a6,$a1 + adcs $acc4,$acc4,$t3 + mul $t3,$a7,$a1 + adcs $acc5,$acc5,$t0 + umulh $t0,$a2,$a1 // hi(a[2..7]*a[1]) + adcs $acc6,$acc6,$t1 + umulh $t1,$a3,$a1 + adcs $acc7,$acc7,$t2 + umulh $t2,$a4,$a1 + adcs $acc0,$acc0,$t3 + umulh $t3,$a5,$a1 + stp $acc2,$acc3,[$tp],#8*2 // t[2..3] + adc $acc1,xzr,xzr // t[9] + adds $acc4,$acc4,$t0 + umulh $t0,$a6,$a1 + adcs $acc5,$acc5,$t1 + umulh $t1,$a7,$a1 + adcs $acc6,$acc6,$t2 + mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii) + adcs $acc7,$acc7,$t3 + mul $t3,$a4,$a2 + adcs $acc0,$acc0,$t0 + mul $t0,$a5,$a2 + adc $acc1,$acc1,$t1 + + mul $t1,$a6,$a2 + adds $acc5,$acc5,$t2 + mul $t2,$a7,$a2 + adcs $acc6,$acc6,$t3 + umulh $t3,$a3,$a2 // hi(a[3..7]*a[2]) + adcs $acc7,$acc7,$t0 + umulh $t0,$a4,$a2 + adcs $acc0,$acc0,$t1 + umulh $t1,$a5,$a2 + adcs $acc1,$acc1,$t2 + umulh $t2,$a6,$a2 + stp $acc4,$acc5,[$tp],#8*2 // t[4..5] + adc $acc2,xzr,xzr // t[10] + adds $acc6,$acc6,$t3 + umulh $t3,$a7,$a2 + adcs $acc7,$acc7,$t0 + mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv) + adcs $acc0,$acc0,$t1 + mul $t1,$a5,$a3 + adcs $acc1,$acc1,$t2 + mul $t2,$a6,$a3 + adc $acc2,$acc2,$t3 + + mul $t3,$a7,$a3 + adds $acc7,$acc7,$t0 + umulh $t0,$a4,$a3 // hi(a[4..7]*a[3]) + adcs $acc0,$acc0,$t1 + umulh $t1,$a5,$a3 + adcs $acc1,$acc1,$t2 + umulh $t2,$a6,$a3 + adcs $acc2,$acc2,$t3 + umulh $t3,$a7,$a3 + stp $acc6,$acc7,[$tp],#8*2 // t[6..7] + adc $acc3,xzr,xzr // t[11] + adds $acc0,$acc0,$t0 + mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v) + adcs $acc1,$acc1,$t1 + mul $t1,$a6,$a4 + adcs $acc2,$acc2,$t2 + mul $t2,$a7,$a4 + adc $acc3,$acc3,$t3 + + umulh $t3,$a5,$a4 // hi(a[5..7]*a[4]) + adds $acc1,$acc1,$t0 + umulh $t0,$a6,$a4 + adcs $acc2,$acc2,$t1 + umulh $t1,$a7,$a4 + adcs $acc3,$acc3,$t2 + mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi) + adc $acc4,xzr,xzr // t[12] + adds $acc2,$acc2,$t3 + mul $t3,$a7,$a5 + adcs $acc3,$acc3,$t0 + umulh $t0,$a6,$a5 // hi(a[6..7]*a[5]) + adc $acc4,$acc4,$t1 + + umulh $t1,$a7,$a5 + adds $acc3,$acc3,$t2 + mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii) + adcs $acc4,$acc4,$t3 + umulh $t3,$a7,$a6 // hi(a[7]*a[6]) + adc $acc5,xzr,xzr // t[13] + adds $acc4,$acc4,$t0 + sub $cnt,$ap_end,$ap // done yet? + adc $acc5,$acc5,$t1 + + adds $acc5,$acc5,$t2 + sub $t0,$ap_end,$num // rewinded ap + adc $acc6,xzr,xzr // t[14] + add $acc6,$acc6,$t3 + + cbz $cnt,.Lsqr8x_outer_break + + mov $n0,$a0 + ldp $a0,$a1,[$tp,#8*0] + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + adds $acc0,$acc0,$a0 + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$ap,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$ap,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$ap,#8*4] + adcs $acc6,$acc6,$a6 + mov $rp,$ap + adcs $acc7,xzr,$a7 + ldp $a6,$a7,[$ap,#8*6] + add $ap,$ap,#8*8 + //adc $carry,xzr,xzr // moved below + mov $cnt,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +.Lsqr8x_mul: + mul $t0,$a0,$n0 + adc $carry,xzr,xzr // carry bit, modulo-scheduled + mul $t1,$a1,$n0 + add $cnt,$cnt,#8 + mul $t2,$a2,$n0 + mul $t3,$a3,$n0 + adds $acc0,$acc0,$t0 + mul $t0,$a4,$n0 + adcs $acc1,$acc1,$t1 + mul $t1,$a5,$n0 + adcs $acc2,$acc2,$t2 + mul $t2,$a6,$n0 + adcs $acc3,$acc3,$t3 + mul $t3,$a7,$n0 + adcs $acc4,$acc4,$t0 + umulh $t0,$a0,$n0 + adcs $acc5,$acc5,$t1 + umulh $t1,$a1,$n0 + adcs $acc6,$acc6,$t2 + umulh $t2,$a2,$n0 + adcs $acc7,$acc7,$t3 + umulh $t3,$a3,$n0 + adc $carry,$carry,xzr + str $acc0,[$tp],#8 + adds $acc0,$acc1,$t0 + umulh $t0,$a4,$n0 + adcs $acc1,$acc2,$t1 + umulh $t1,$a5,$n0 + adcs $acc2,$acc3,$t2 + umulh $t2,$a6,$n0 + adcs $acc3,$acc4,$t3 + umulh $t3,$a7,$n0 + ldr $n0,[$rp,$cnt] + adcs $acc4,$acc5,$t0 + adcs $acc5,$acc6,$t1 + adcs $acc6,$acc7,$t2 + adcs $acc7,$carry,$t3 + //adc $carry,xzr,xzr // moved above + cbnz $cnt,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp $ap,$ap_end // done yet? + b.eq .Lsqr8x_break + + ldp $a0,$a1,[$tp,#8*0] + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + adds $acc0,$acc0,$a0 + ldr $n0,[$rp,#-8*8] + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$ap,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$ap,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$ap,#8*4] + adcs $acc6,$acc6,$a6 + mov $cnt,#-8*8 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$ap,#8*6] + add $ap,$ap,#8*8 + //adc $carry,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp $a0,$a1,[$rp,#8*0] + add $ap,$rp,#8*8 + ldp $a2,$a3,[$rp,#8*2] + sub $t0,$ap_end,$ap // is it last iteration? + ldp $a4,$a5,[$rp,#8*4] + sub $t1,$tp,$t0 + ldp $a6,$a7,[$rp,#8*6] + cbz $t0,.Lsqr8x_outer_loop + + stp $acc0,$acc1,[$tp,#8*0] + ldp $acc0,$acc1,[$t1,#8*0] + stp $acc2,$acc3,[$tp,#8*2] + ldp $acc2,$acc3,[$t1,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[$t1,#8*4] + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,$t1 + ldp $acc6,$acc7,[$t1,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0] + ldp $t1,$t2,[sp,#8*1] + ldp $a5,$a7,[$t0,#8*2] + add $ap,$t0,#8*4 + ldp $t3,$t0,[sp,#8*3] + + stp $acc0,$acc1,[$tp,#8*0] + mul $acc0,$a1,$a1 + stp $acc2,$acc3,[$tp,#8*2] + umulh $a1,$a1,$a1 + stp $acc4,$acc5,[$tp,#8*4] + mul $a2,$a3,$a3 + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,sp + umulh $a3,$a3,$a3 + adds $acc1,$a1,$t1,lsl#1 + extr $t1,$t2,$t1,#63 + sub $cnt,$num,#8*4 + +.Lsqr4x_shift_n_add: + adcs $acc2,$a2,$t1 + extr $t2,$t3,$t2,#63 + sub $cnt,$cnt,#8*4 + adcs $acc3,$a3,$t2 + ldp $t1,$t2,[$tp,#8*5] + mul $a4,$a5,$a5 + ldp $a1,$a3,[$ap],#8*2 + umulh $a5,$a5,$a5 + mul $a6,$a7,$a7 + umulh $a7,$a7,$a7 + extr $t3,$t0,$t3,#63 + stp $acc0,$acc1,[$tp,#8*0] + adcs $acc4,$a4,$t3 + extr $t0,$t1,$t0,#63 + stp $acc2,$acc3,[$tp,#8*2] + adcs $acc5,$a5,$t0 + ldp $t3,$t0,[$tp,#8*7] + extr $t1,$t2,$t1,#63 + adcs $acc6,$a6,$t1 + extr $t2,$t3,$t2,#63 + adcs $acc7,$a7,$t2 + ldp $t1,$t2,[$tp,#8*9] + mul $a0,$a1,$a1 + ldp $a5,$a7,[$ap],#8*2 + umulh $a1,$a1,$a1 + mul $a2,$a3,$a3 + umulh $a3,$a3,$a3 + stp $acc4,$acc5,[$tp,#8*4] + extr $t3,$t0,$t3,#63 + stp $acc6,$acc7,[$tp,#8*6] + add $tp,$tp,#8*8 + adcs $acc0,$a0,$t3 + extr $t0,$t1,$t0,#63 + adcs $acc1,$a1,$t0 + ldp $t3,$t0,[$tp,#8*3] + extr $t1,$t2,$t1,#63 + cbnz $cnt,.Lsqr4x_shift_n_add +___ +my ($np,$np_end)=($ap,$ap_end); +$code.=<<___; + ldp $np,$n0,[x29,#104] // pull np and n0 + + adcs $acc2,$a2,$t1 + extr $t2,$t3,$t2,#63 + adcs $acc3,$a3,$t2 + ldp $t1,$t2,[$tp,#8*5] + mul $a4,$a5,$a5 + umulh $a5,$a5,$a5 + stp $acc0,$acc1,[$tp,#8*0] + mul $a6,$a7,$a7 + umulh $a7,$a7,$a7 + stp $acc2,$acc3,[$tp,#8*2] + extr $t3,$t0,$t3,#63 + adcs $acc4,$a4,$t3 + extr $t0,$t1,$t0,#63 + ldp $acc0,$acc1,[sp,#8*0] + adcs $acc5,$a5,$t0 + extr $t1,$t2,$t1,#63 + ldp $a0,$a1,[$np,#8*0] + adcs $acc6,$a6,$t1 + extr $t2,xzr,$t2,#63 + ldp $a2,$a3,[$np,#8*2] + adc $acc7,$a7,$t2 + ldp $a4,$a5,[$np,#8*4] + + // Reduce by 512 bits per iteration + mul $na0,$n0,$acc0 // t[0]*n0 + ldp $a6,$a7,[$np,#8*6] + add $np_end,$np,$num + ldp $acc2,$acc3,[sp,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[sp,#8*4] + stp $acc6,$acc7,[$tp,#8*6] + ldp $acc6,$acc7,[sp,#8*6] + add $np,$np,#8*8 + mov $topmost,xzr // initial top-most carry + mov $tp,sp + mov $cnt,#8 + +.Lsqr8x_reduction: + // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0) + mul $t1,$a1,$na0 + sub $cnt,$cnt,#1 + mul $t2,$a2,$na0 + str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing + mul $t3,$a3,$na0 + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + mul $t0,$a4,$na0 + adcs $acc0,$acc1,$t1 + mul $t1,$a5,$na0 + adcs $acc1,$acc2,$t2 + mul $t2,$a6,$na0 + adcs $acc2,$acc3,$t3 + mul $t3,$a7,$na0 + adcs $acc3,$acc4,$t0 + umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0) + adcs $acc4,$acc5,$t1 + umulh $t1,$a1,$na0 + adcs $acc5,$acc6,$t2 + umulh $t2,$a2,$na0 + adcs $acc6,$acc7,$t3 + umulh $t3,$a3,$na0 + adc $acc7,xzr,xzr + adds $acc0,$acc0,$t0 + umulh $t0,$a4,$na0 + adcs $acc1,$acc1,$t1 + umulh $t1,$a5,$na0 + adcs $acc2,$acc2,$t2 + umulh $t2,$a6,$na0 + adcs $acc3,$acc3,$t3 + umulh $t3,$a7,$na0 + mul $na0,$n0,$acc0 // next t[0]*n0 + adcs $acc4,$acc4,$t0 + adcs $acc5,$acc5,$t1 + adcs $acc6,$acc6,$t2 + adc $acc7,$acc7,$t3 + cbnz $cnt,.Lsqr8x_reduction + + ldp $t0,$t1,[$tp,#8*0] + ldp $t2,$t3,[$tp,#8*2] + mov $rp,$tp + sub $cnt,$np_end,$np // done yet? + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + ldp $t0,$t1,[$tp,#8*4] + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + ldp $t2,$t3,[$tp,#8*6] + adcs $acc4,$acc4,$t0 + adcs $acc5,$acc5,$t1 + adcs $acc6,$acc6,$t2 + adcs $acc7,$acc7,$t3 + //adc $carry,xzr,xzr // moved below + cbz $cnt,.Lsqr8x8_post_condition + + ldr $n0,[$tp,#-8*8] + ldp $a0,$a1,[$np,#8*0] + ldp $a2,$a3,[$np,#8*2] + ldp $a4,$a5,[$np,#8*4] + mov $cnt,#-8*8 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + +.Lsqr8x_tail: + mul $t0,$a0,$n0 + adc $carry,xzr,xzr // carry bit, modulo-scheduled + mul $t1,$a1,$n0 + add $cnt,$cnt,#8 + mul $t2,$a2,$n0 + mul $t3,$a3,$n0 + adds $acc0,$acc0,$t0 + mul $t0,$a4,$n0 + adcs $acc1,$acc1,$t1 + mul $t1,$a5,$n0 + adcs $acc2,$acc2,$t2 + mul $t2,$a6,$n0 + adcs $acc3,$acc3,$t3 + mul $t3,$a7,$n0 + adcs $acc4,$acc4,$t0 + umulh $t0,$a0,$n0 + adcs $acc5,$acc5,$t1 + umulh $t1,$a1,$n0 + adcs $acc6,$acc6,$t2 + umulh $t2,$a2,$n0 + adcs $acc7,$acc7,$t3 + umulh $t3,$a3,$n0 + adc $carry,$carry,xzr + str $acc0,[$tp],#8 + adds $acc0,$acc1,$t0 + umulh $t0,$a4,$n0 + adcs $acc1,$acc2,$t1 + umulh $t1,$a5,$n0 + adcs $acc2,$acc3,$t2 + umulh $t2,$a6,$n0 + adcs $acc3,$acc4,$t3 + umulh $t3,$a7,$n0 + ldr $n0,[$rp,$cnt] + adcs $acc4,$acc5,$t0 + adcs $acc5,$acc6,$t1 + adcs $acc6,$acc7,$t2 + adcs $acc7,$carry,$t3 + //adc $carry,xzr,xzr // moved above + cbnz $cnt,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp $a0,$a1,[$tp,#8*0] + sub $cnt,$np_end,$np // done yet? + sub $t2,$np_end,$num // rewinded np + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + cbz $cnt,.Lsqr8x_tail_break + + ldr $n0,[$rp,#-8*8] + adds $acc0,$acc0,$a0 + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$np,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$np,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$np,#8*4] + adcs $acc6,$acc6,$a6 + mov $cnt,#-8*8 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + //adc $carry,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr $n0,[x29,#112] // pull n0 + add $cnt,$tp,#8*8 // end of current t[num] window + + subs xzr,$topmost,#1 // "move" top-most carry to carry bit + adcs $t0,$acc0,$a0 + adcs $t1,$acc1,$a1 + ldp $acc0,$acc1,[$rp,#8*0] + adcs $acc2,$acc2,$a2 + ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0] + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$t2,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$t2,#8*4] + adcs $acc6,$acc6,$a6 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$t2,#8*6] + add $np,$t2,#8*8 + adc $topmost,xzr,xzr // top-most carry + mul $na0,$n0,$acc0 + stp $t0,$t1,[$tp,#8*0] + stp $acc2,$acc3,[$tp,#8*2] + ldp $acc2,$acc3,[$rp,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[$rp,#8*4] + cmp $cnt,x29 // did we hit the bottom? + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,$rp // slide the window + ldp $acc6,$acc7,[$rp,#8*6] + mov $cnt,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr $rp,[x29,#96] // pull rp + add $tp,$tp,#8*8 + subs $t0,$acc0,$a0 + sbcs $t1,$acc1,$a1 + sub $cnt,$num,#8*8 + mov $ap_end,$rp // $rp copy + +.Lsqr8x_sub: + sbcs $t2,$acc2,$a2 + ldp $a0,$a1,[$np,#8*0] + sbcs $t3,$acc3,$a3 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc4,$a4 + ldp $a2,$a3,[$np,#8*2] + sbcs $t1,$acc5,$a5 + stp $t2,$t3,[$rp,#8*2] + sbcs $t2,$acc6,$a6 + ldp $a4,$a5,[$np,#8*4] + sbcs $t3,$acc7,$a7 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + ldp $acc0,$acc1,[$tp,#8*0] + sub $cnt,$cnt,#8*8 + ldp $acc2,$acc3,[$tp,#8*2] + ldp $acc4,$acc5,[$tp,#8*4] + ldp $acc6,$acc7,[$tp,#8*6] + add $tp,$tp,#8*8 + stp $t0,$t1,[$rp,#8*4] + sbcs $t0,$acc0,$a0 + stp $t2,$t3,[$rp,#8*6] + add $rp,$rp,#8*8 + sbcs $t1,$acc1,$a1 + cbnz $cnt,.Lsqr8x_sub + + sbcs $t2,$acc2,$a2 + mov $tp,sp + add $ap,sp,$num + ldp $a0,$a1,[$ap_end,#8*0] + sbcs $t3,$acc3,$a3 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc4,$a4 + ldp $a2,$a3,[$ap_end,#8*2] + sbcs $t1,$acc5,$a5 + stp $t2,$t3,[$rp,#8*2] + sbcs $t2,$acc6,$a6 + ldp $acc0,$acc1,[$ap,#8*0] + sbcs $t3,$acc7,$a7 + ldp $acc2,$acc3,[$ap,#8*2] + sbcs xzr,$topmost,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp $t0,$t1,[$rp,#8*4] + stp $t2,$t3,[$rp,#8*6] + + sub $cnt,$num,#8*4 +.Lsqr4x_cond_copy: + sub $cnt,$cnt,#8*4 + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + ldp $a0,$a1,[$ap_end,#8*4] + ldp $acc0,$acc1,[$ap,#8*4] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*2] + add $tp,$tp,#8*4 + csel $t3,$acc3,$a3,lo + ldp $a2,$a3,[$ap_end,#8*6] + ldp $acc2,$acc3,[$ap,#8*6] + add $ap,$ap,#8*4 + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + add $ap_end,$ap_end,#8*4 + stp xzr,xzr,[$ap,#8*0] + stp xzr,xzr,[$ap,#8*2] + cbnz $cnt,.Lsqr4x_cond_copy + + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + stp xzr,xzr,[$tp,#8*2] + csel $t2,$acc2,$a2,lo + csel $t3,$acc3,$a3,lo + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc $carry,xzr,xzr + ldr x30,[x29,#8] // pull return address + // $acc0-7,$carry hold result, $a0-7 hold modulus + subs $a0,$acc0,$a0 + ldr $ap,[x29,#96] // pull rp + sbcs $a1,$acc1,$a1 + stp xzr,xzr,[sp,#8*0] + sbcs $a2,$acc2,$a2 + stp xzr,xzr,[sp,#8*2] + sbcs $a3,$acc3,$a3 + stp xzr,xzr,[sp,#8*4] + sbcs $a4,$acc4,$a4 + stp xzr,xzr,[sp,#8*6] + sbcs $a5,$acc5,$a5 + stp xzr,xzr,[sp,#8*8] + sbcs $a6,$acc6,$a6 + stp xzr,xzr,[sp,#8*10] + sbcs $a7,$acc7,$a7 + stp xzr,xzr,[sp,#8*12] + sbcs $carry,$carry,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // $a0-7 hold result-modulus + csel $a0,$acc0,$a0,lo + csel $a1,$acc1,$a1,lo + csel $a2,$acc2,$a2,lo + csel $a3,$acc3,$a3,lo + stp $a0,$a1,[$ap,#8*0] + csel $a4,$acc4,$a4,lo + csel $a5,$acc5,$a5,lo + stp $a2,$a3,[$ap,#8*2] + csel $a6,$acc6,$a6,lo + csel $a7,$acc7,$a7,lo + stp $a4,$a5,[$ap,#8*4] + stp $a6,$a7,[$ap,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret +.size __bn_sqr8x_mont,.-__bn_sqr8x_mont +___ +} + +{ +######################################################################## +# Even though this might look as ARMv8 adaptation of mulx4x_mont from +# x86_64-mont5 module, it's different in sense that it performs +# reduction 256 bits at a time. + +my ($a0,$a1,$a2,$a3, + $t0,$t1,$t2,$t3, + $m0,$m1,$m2,$m3, + $acc0,$acc1,$acc2,$acc3,$acc4, + $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); +my $bp_end=$rp; +my ($carry,$topmost) = ($rp,"x30"); + +$code.=<<___; +.type __bn_mul4x_mont,%function +.align 5 +__bn_mul4x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub $tp,sp,$num,lsl#3 + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + sub sp,$tp,#8*4 // alloca + + add $t0,$bp,$num + add $ap_end,$ap,$num + stp $rp,$t0,[x29,#96] // offload rp and &b[num] + + ldr $bi,[$bp,#8*0] // b[0] + ldp $a0,$a1,[$ap,#8*0] // a[0..3] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + mov $acc0,xzr + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + ldp $m0,$m1,[$np,#8*0] // n[0..3] + ldp $m2,$m3,[$np,#8*2] + adds $np,$np,#8*4 // clear carry bit + mov $carry,xzr + mov $cnt,#0 + mov $tp,sp + +.Loop_mul4x_1st_reduction: + mul $t0,$a0,$bi // lo(a[0..3]*b[0]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[0..3]*b[0]) + adcs $acc1,$acc1,$t1 + mul $mi,$acc0,$n0 // t[0]*n0 + adcs $acc2,$acc2,$t2 + umulh $t1,$a1,$bi + adcs $acc3,$acc3,$t3 + umulh $t2,$a2,$bi + adc $acc4,xzr,xzr + umulh $t3,$a3,$bi + ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) + adds $acc1,$acc1,$t0 + // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0) + str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0) + adcs $acc0,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc1,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc2,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc3,$acc4,$carry + adc $carry,xzr,xzr + adds $acc0,$acc0,$t0 + sub $t0,$ap_end,$ap + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_1st_reduction + + cbz $t0,.Lmul4x4_post_condition + + ldp $a0,$a1,[$ap,#8*0] // a[4..7] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + ldr $mi,[sp] // a[0]*n0 + ldp $m0,$m1,[$np,#8*0] // n[4..7] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + +.Loop_mul4x_1st_tail: + mul $t0,$a0,$bi // lo(a[4..7]*b[i]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[4..7]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,xzr,xzr + ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) + adds $acc1,$acc1,$t0 + mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0) + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + adds $acc0,$acc0,$t0 + umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0) + adcs $acc1,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc2,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc3,$acc3,$t3 + adcs $acc4,$acc4,$carry + umulh $t3,$m3,$mi + adc $carry,xzr,xzr + ldr $mi,[sp,$cnt] // next t[0]*n0 + str $acc0,[$tp],#8 // result!!! + adds $acc0,$acc1,$t0 + sub $t0,$ap_end,$ap // done yet? + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 + adcs $acc3,$acc4,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_1st_tail + + sub $t1,$ap_end,$num // rewinded $ap + cbz $t0,.Lmul4x_proceed + + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + ldp $m0,$m1,[$np,#8*0] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr $bi,[$bp,#8*4]! // *++b + adc $topmost,$carry,xzr + ldp $a0,$a1,[$t1,#8*0] // a[0..3] + sub $np,$np,$num // rewind np + ldp $a2,$a3,[$t1,#8*2] + add $ap,$t1,#8*4 + + stp $acc0,$acc1,[$tp,#8*0] // result!!! + ldp $acc0,$acc1,[sp,#8*4] // t[0..3] + stp $acc2,$acc3,[$tp,#8*2] // result!!! + ldp $acc2,$acc3,[sp,#8*6] + + ldp $m0,$m1,[$np,#8*0] // n[0..3] + mov $tp,sp + ldp $m2,$m3,[$np,#8*2] + adds $np,$np,#8*4 // clear carry bit + mov $carry,xzr + +.align 4 +.Loop_mul4x_reduction: + mul $t0,$a0,$bi // lo(a[0..3]*b[4]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[0..3]*b[4]) + adcs $acc1,$acc1,$t1 + mul $mi,$acc0,$n0 // t[0]*n0 + adcs $acc2,$acc2,$t2 + umulh $t1,$a1,$bi + adcs $acc3,$acc3,$t3 + umulh $t2,$a2,$bi + adc $acc4,xzr,xzr + umulh $t3,$a3,$bi + ldr $bi,[$bp,$cnt] // next b[i] + adds $acc1,$acc1,$t0 + // (*) mul $t0,$m0,$mi + str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0 + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0 + adcs $acc0,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc1,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc2,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc3,$acc4,$carry + adc $carry,xzr,xzr + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_reduction + + adc $carry,$carry,xzr + ldp $t0,$t1,[$tp,#8*4] // t[4..7] + ldp $t2,$t3,[$tp,#8*6] + ldp $a0,$a1,[$ap,#8*0] // a[4..7] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + + ldr $mi,[sp] // t[0]*n0 + ldp $m0,$m1,[$np,#8*0] // n[4..7] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + +.align 4 +.Loop_mul4x_tail: + mul $t0,$a0,$bi // lo(a[4..7]*b[4]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[4..7]*b[4]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,xzr,xzr + ldr $bi,[$bp,$cnt] // next b[i] + adds $acc1,$acc1,$t0 + mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0) + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + adds $acc0,$acc0,$t0 + umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0) + adcs $acc1,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc2,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc3,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc4,$acc4,$carry + ldr $mi,[sp,$cnt] // next a[0]*n0 + adc $carry,xzr,xzr + str $acc0,[$tp],#8 // result!!! + adds $acc0,$acc1,$t0 + sub $t0,$ap_end,$ap // done yet? + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 + adcs $acc3,$acc4,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_tail + + sub $t1,$np,$num // rewinded np? + adc $carry,$carry,xzr + cbz $t0,.Loop_mul4x_break + + ldp $t0,$t1,[$tp,#8*4] + ldp $t2,$t3,[$tp,#8*6] + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + ldp $m0,$m1,[$np,#8*0] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + b .Loop_mul4x_tail + +.align 4 +.Loop_mul4x_break: + ldp $t2,$t3,[x29,#96] // pull rp and &b[num] + adds $acc0,$acc0,$topmost + add $bp,$bp,#8*4 // bp++ + adcs $acc1,$acc1,xzr + sub $ap,$ap,$num // rewind ap + adcs $acc2,$acc2,xzr + stp $acc0,$acc1,[$tp,#8*0] // result!!! + adcs $acc3,$acc3,xzr + ldp $acc0,$acc1,[sp,#8*4] // t[0..3] + adc $topmost,$carry,xzr + stp $acc2,$acc3,[$tp,#8*2] // result!!! + cmp $bp,$t3 // done yet? + ldp $acc2,$acc3,[sp,#8*6] + ldp $m0,$m1,[$t1,#8*0] // n[0..3] + ldp $m2,$m3,[$t1,#8*2] + add $np,$t1,#8*4 + b.eq .Lmul4x_post + + ldr $bi,[$bp] + ldp $a0,$a1,[$ap,#8*0] // a[0..3] + ldp $a2,$a3,[$ap,#8*2] + adds $ap,$ap,#8*4 // clear carry bit + mov $carry,xzr + mov $tp,sp + b .Loop_mul4x_reduction + +.align 4 +.Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov $rp,$t2 + mov $ap_end,$t2 // $rp copy + subs $t0,$acc0,$m0 + add $tp,sp,#8*8 + sbcs $t1,$acc1,$m1 + sub $cnt,$num,#8*4 + +.Lmul4x_sub: + sbcs $t2,$acc2,$m2 + ldp $m0,$m1,[$np,#8*0] + sub $cnt,$cnt,#8*4 + ldp $acc0,$acc1,[$tp,#8*0] + sbcs $t3,$acc3,$m3 + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + ldp $acc2,$acc3,[$tp,#8*2] + add $tp,$tp,#8*4 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc0,$m0 + stp $t2,$t3,[$rp,#8*2] + add $rp,$rp,#8*4 + sbcs $t1,$acc1,$m1 + cbnz $cnt,.Lmul4x_sub + + sbcs $t2,$acc2,$m2 + mov $tp,sp + add $ap,sp,#8*4 + ldp $a0,$a1,[$ap_end,#8*0] + sbcs $t3,$acc3,$m3 + stp $t0,$t1,[$rp,#8*0] + ldp $a2,$a3,[$ap_end,#8*2] + stp $t2,$t3,[$rp,#8*2] + ldp $acc0,$acc1,[$ap,#8*0] + ldp $acc2,$acc3,[$ap,#8*2] + sbcs xzr,$topmost,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub $cnt,$num,#8*4 +.Lmul4x_cond_copy: + sub $cnt,$cnt,#8*4 + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + ldp $a0,$a1,[$ap_end,#8*4] + ldp $acc0,$acc1,[$ap,#8*4] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*2] + add $tp,$tp,#8*4 + csel $t3,$acc3,$a3,lo + ldp $a2,$a3,[$ap_end,#8*6] + ldp $acc2,$acc3,[$ap,#8*6] + add $ap,$ap,#8*4 + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + add $ap_end,$ap_end,#8*4 + cbnz $cnt,.Lmul4x_cond_copy + + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + stp xzr,xzr,[$tp,#8*2] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*3] + csel $t3,$acc3,$a3,lo + stp xzr,xzr,[$tp,#8*4] + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + adc $carry,$carry,xzr + ldr $ap,[x29,#96] // pull rp + // $acc0-3,$carry hold result, $m0-7 hold modulus + subs $a0,$acc0,$m0 + ldr x30,[x29,#8] // pull return address + sbcs $a1,$acc1,$m1 + stp xzr,xzr,[sp,#8*0] + sbcs $a2,$acc2,$m2 + stp xzr,xzr,[sp,#8*2] + sbcs $a3,$acc3,$m3 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,$carry,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // $a0-3 hold result-modulus + csel $a0,$acc0,$a0,lo + csel $a1,$acc1,$a1,lo + csel $a2,$acc2,$a2,lo + csel $a3,$acc3,$a3,lo + stp $a0,$a1,[$ap,#8*0] + stp $a2,$a3,[$ap,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret +.size __bn_mul4x_mont,.-__bn_mul4x_mont +___ +} +$code.=<<___; +.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ + +print $code; + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/bn/asm/bn-586.pl b/deps/openssl/openssl/crypto/bn/asm/bn-586.pl index 332ef3e91d..1ca1bbf7d4 100644 --- a/deps/openssl/openssl/crypto/bn/asm/bn-586.pl +++ b/deps/openssl/openssl/crypto/bn/asm/bn-586.pl @@ -1,9 +1,19 @@ -#!/usr/local/bin/perl +#! /usr/bin/env perl +# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output = pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],$0); $sse2=0; @@ -21,6 +31,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &asm_finish(); +close STDOUT; + sub bn_mul_add_words { local($name)=@_; @@ -771,4 +783,3 @@ sub bn_sub_part_words &function_end($name); } - diff --git a/deps/openssl/openssl/crypto/bn/asm/bn-c64xplus.asm b/deps/openssl/openssl/crypto/bn/asm/bn-c64xplus.asm new file mode 100644 index 0000000000..de6d37728f --- /dev/null +++ b/deps/openssl/openssl/crypto/bn/asm/bn-c64xplus.asm @@ -0,0 +1,382 @@ +;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +;; +;; Licensed under the OpenSSL license (the "License"). You may not use +;; this file except in compliance with the License. You can obtain a copy +;; in the file LICENSE in the source distribution or at +;; https://www.openssl.org/source/license.html +;; +;;==================================================================== +;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +;; project. +;; +;; Rights for redistribution and usage in source and binary forms are +;; granted according to the OpenSSL license. Warranty of any kind is +;; disclaimed. +;;==================================================================== +;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n +;; being the number of 32-bit words, addition - 8*n. Corresponding 4x +;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler +;; SPLOOPs spin at ... 2*n cycles [plus epilogue]. +;;==================================================================== + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg bn_mul_add_words,_bn_mul_add_words + .asg bn_mul_words,_bn_mul_words + .asg bn_sqr_words,_bn_sqr_words + .asg bn_add_words,_bn_add_words + .asg bn_sub_words,_bn_sub_words + .asg bn_div_words,_bn_div_words + .asg bn_sqr_comba8,_bn_sqr_comba8 + .asg bn_mul_comba8,_bn_mul_comba8 + .asg bn_sqr_comba4,_bn_sqr_comba4 + .asg bn_mul_comba4,_bn_mul_comba4 + .endif + + .asg B3,RA + .asg A4,ARG0 + .asg B4,ARG1 + .asg A6,ARG2 + .asg B6,ARG3 + .asg A8,ARG4 + .asg B8,ARG5 + .asg A4,RET + .asg A15,FP + .asg B14,DP + .asg B15,SP + + .global _bn_mul_add_words +_bn_mul_add_words: + .asmfunc + MV ARG2,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A19 ; high part of accumulator +|| [B0] MV ARG0,A2 +|| [B0] MV ARG3,A3 + NOP 3 + + SPLOOP 2 ; 2*n+10 +;;==================================================================== + LDW *ARG1++,B7 ; ap[i] + NOP 3 + LDW *ARG0++,A7 ; rp[i] + MPY32U B7,A3,A17:A16 + NOP 3 ; [2,0] in epilogue + ADDU A16,A7,A21:A20 + ADDU A19,A21:A20,A19:A18 +|| MV.S A17,A23 + SPKERNEL 2,1 ; leave slot for "return value" +|| STW A18,*A2++ ; rp[i] +|| ADD A19,A23,A19 +;;==================================================================== + BNOP RA,4 + MV A19,RET ; return value + .endasmfunc + + .global _bn_mul_words +_bn_mul_words: + .asmfunc + MV ARG2,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A19 ; high part of accumulator + NOP 3 + + SPLOOP 2 ; 2*n+10 +;;==================================================================== + LDW *ARG1++,A7 ; ap[i] + NOP 4 + MPY32U A7,ARG3,A17:A16 + NOP 4 ; [2,0] in epiloque + ADDU A19,A16,A19:A18 +|| MV.S A17,A21 + SPKERNEL 2,1 ; leave slot for "return value" +|| STW A18,*ARG0++ ; rp[i] +|| ADD.L A19,A21,A19 +;;==================================================================== + BNOP RA,4 + MV A19,RET ; return value + .endasmfunc + + .global _bn_sqr_words +_bn_sqr_words: + .asmfunc + MV ARG2,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] MV ARG0,B2 +|| [B0] ADD 4,ARG0,ARG0 + NOP 3 + + SPLOOP 2 ; 2*n+10 +;;==================================================================== + LDW *ARG1++,B7 ; ap[i] + NOP 4 + MPY32U B7,B7,B1:B0 + NOP 3 ; [2,0] in epilogue + STW B0,*B2++(8) ; rp[2*i] + MV B1,A1 + SPKERNEL 2,0 ; fully overlap BNOP RA,5 +|| STW A1,*ARG0++(8) ; rp[2*i+1] +;;==================================================================== + BNOP RA,5 + .endasmfunc + + .global _bn_add_words +_bn_add_words: + .asmfunc + MV ARG3,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A1 ; carry flag +|| [B0] MV ARG0,A3 + NOP 3 + + SPLOOP 2 ; 2*n+6 +;;==================================================================== + LDW *ARG2++,A7 ; bp[i] +|| LDW *ARG1++,B7 ; ap[i] + NOP 4 + ADDU A7,B7,A9:A8 + ADDU A1,A9:A8,A1:A0 + SPKERNEL 0,0 ; fully overlap BNOP RA,5 +|| STW A0,*A3++ ; write result +|| MV A1,RET ; keep carry flag in RET +;;==================================================================== + BNOP RA,5 + .endasmfunc + + .global _bn_sub_words +_bn_sub_words: + .asmfunc + MV ARG3,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A2 ; borrow flag +|| [B0] MV ARG0,A3 + NOP 3 + + SPLOOP 2 ; 2*n+6 +;;==================================================================== + LDW *ARG2++,A7 ; bp[i] +|| LDW *ARG1++,B7 ; ap[i] + NOP 4 + SUBU B7,A7,A1:A0 + [A2] SUB A1:A0,1,A1:A0 + SPKERNEL 0,1 ; leave slot for "return borrow flag" +|| STW A0,*A3++ ; write result +|| AND 1,A1,A2 ; pass on borrow flag +;;==================================================================== + BNOP RA,4 + AND 1,A1,RET ; return borrow flag + .endasmfunc + + .global _bn_div_words +_bn_div_words: + .asmfunc + LMBD 1,A6,A0 ; leading zero bits in dv + LMBD 1,A4,A1 ; leading zero bits in hi +|| MVK 32,B0 + CMPLTU A1,A0,A2 +|| ADD A0,B0,B0 + [ A2] BNOP RA +||[ A2] MVK -1,A4 ; return overflow +||[!A2] MV A4,A3 ; reassign hi + [!A2] MV B4,A4 ; reassign lo, will be quotient +||[!A2] MVC B0,ILC + [!A2] SHL A6,A0,A6 ; normalize dv +|| MVK 1,A1 + + [!A2] CMPLTU A3,A6,A1 ; hi<dv? +||[!A2] SHL A4,1,A5:A4 ; lo<<1 + [!A1] SUB A3,A6,A3 ; hi-=dv +||[!A1] OR 1,A4,A4 + [!A2] SHRU A3,31,A1 ; upper bit +||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31 + + SPLOOP 3 + [!A1] CMPLTU A3,A6,A1 ; hi<dv? +||[ A1] ZERO A1 +|| SHL A4,1,A5:A4 ; lo<<1 + [!A1] SUB A3,A6,A3 ; hi-=dv +||[!A1] OR 1,A4,A4 ; quotient + SHRU A3,31,A1 ; upper bit +|| ADDAH A5,A3,A3 ; hi<<1|lo>>31 + SPKERNEL + + BNOP RA,5 + .endasmfunc + +;;==================================================================== +;; Not really Comba algorithm, just straightforward NxM... Dedicated +;; fully unrolled real Comba implementations are asymptotically 2x +;; faster, but naturally larger undertaking. Purpose of this exercise +;; was rather to learn to master nested SPLOOPs... +;;==================================================================== + .global _bn_sqr_comba8 + .global _bn_mul_comba8 +_bn_sqr_comba8: + MV ARG1,ARG2 +_bn_mul_comba8: + .asmfunc + MVK 8,B0 ; N, RILC +|| MVK 8,A0 ; M, outer loop counter +|| MV ARG1,A5 ; copy ap +|| MV ARG0,B4 ; copy rp +|| ZERO B19 ; high part of accumulator + MVC B0,RILC +|| SUB B0,2,B1 ; N-2, initial ILC +|| SUB B0,1,B2 ; const B2=N-1 +|| LDW *A5++,B6 ; ap[0] +|| MV A0,A3 ; const A3=M +sploopNxM?: ; for best performance arrange M<=N + [A0] SPLOOPD 2 ; 2*n+10 +|| MVC B1,ILC +|| ADDAW B4,B0,B5 +|| ZERO B7 +|| LDW *A5++,A9 ; pre-fetch ap[1] +|| ZERO A1 +|| SUB A0,1,A0 +;;==================================================================== +;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files. +;; This is because of Advisory 15 from TI publication SPRZ247I. + LDW *ARG2++,A7 ; bp[i] + NOP 3 + [A1] LDW *B5++,B7 ; rp[i] + MPY32U A7,B6,B17:B16 + NOP 3 + ADDU B16,B7,B21:B20 + ADDU B19,B21:B20,B19:B18 +|| MV.S B17,B23 + SPKERNEL +|| STW B18,*B4++ ; rp[i] +|| ADD.S B19,B23,B19 +;;==================================================================== +outer?: ; m*2*(n+1)+10 + SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0] + SPMASKR +|| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]? + MVD A9,B6 ; move through .M unit(*) + [A2] LDW *A5++,A9 ; pre-fetch ap[i+1] + SUBAW B5,B2,B5 ; rewind rp to rp[1] + MVK 1,A1 + [A0] BNOP.S1 outer?,4 +|| [A0] SUB.L A0,1,A0 + STW B19,*B4--[B2] ; rewind rp tp rp[1] +|| ZERO.S B19 ; high part of accumulator +;; end of outer? + BNOP RA,5 ; return + .endasmfunc +;; (*) It should be noted that B6 is used as input to MPY32U in +;; chronologically next cycle in *preceding* SPLOOP iteration. +;; Normally such arrangement would require DINT, but at this +;; point SPLOOP is draining and interrupts are disabled +;; implicitly. + + .global _bn_sqr_comba4 + .global _bn_mul_comba4 +_bn_sqr_comba4: + MV ARG1,ARG2 +_bn_mul_comba4: + .asmfunc + .if 0 + BNOP sploopNxM?,3 + ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case, + ;; because of low-counter effect, when prologue phase finishes + ;; before SPKERNEL instruction is reached. As result it's 25% + ;; slower than expected... + MVK 4,B0 ; N, RILC +|| MVK 4,A0 ; M, outer loop counter +|| MV ARG1,A5 ; copy ap +|| MV ARG0,B4 ; copy rp +|| ZERO B19 ; high part of accumulator + MVC B0,RILC +|| SUB B0,2,B1 ; first ILC +|| SUB B0,1,B2 ; const B2=N-1 +|| LDW *A5++,B6 ; ap[0] +|| MV A0,A3 ; const A3=M + .else + ;; This alternative is an exercise in fully unrolled Comba + ;; algorithm implementation that operates at n*(n+1)+12, or + ;; as little as 32 cycles... + LDW *ARG1[0],B16 ; a[0] +|| LDW *ARG2[0],A16 ; b[0] + LDW *ARG1[1],B17 ; a[1] +|| LDW *ARG2[1],A17 ; b[1] + LDW *ARG1[2],B18 ; a[2] +|| LDW *ARG2[2],A18 ; b[2] + LDW *ARG1[3],B19 ; a[3] +|| LDW *ARG2[3],A19 ; b[3] + NOP + MPY32U A16,B16,A1:A0 ; a[0]*b[0] + MPY32U A17,B16,A23:A22 ; a[0]*b[1] + MPY32U A16,B17,A25:A24 ; a[1]*b[0] + MPY32U A16,B18,A27:A26 ; a[2]*b[0] + STW A0,*ARG0[0] +|| MPY32U A17,B17,A29:A28 ; a[1]*b[1] + MPY32U A18,B16,A31:A30 ; a[0]*b[2] +|| ADDU A22,A1,A1:A0 + MV A23,B0 +|| MPY32U A19,B16,A21:A20 ; a[3]*b[0] +|| ADDU A24,A1:A0,A1:A0 + ADDU A25,B0,B1:B0 +|| STW A0,*ARG0[1] +|| MPY32U A18,B17,A23:A22 ; a[2]*b[1] +|| ADDU A26,A1,A9:A8 + ADDU A27,B1,B9:B8 +|| MPY32U A17,B18,A25:A24 ; a[1]*b[2] +|| ADDU A28,A9:A8,A9:A8 + ADDU A29,B9:B8,B9:B8 +|| MPY32U A16,B19,A27:A26 ; a[0]*b[3] +|| ADDU A30,A9:A8,A9:A8 + ADDU A31,B9:B8,B9:B8 +|| ADDU B0,A9:A8,A9:A8 + STW A8,*ARG0[2] +|| ADDU A20,A9,A1:A0 + ADDU A21,B9,B1:B0 +|| MPY32U A19,B17,A21:A20 ; a[3]*b[1] +|| ADDU A22,A1:A0,A1:A0 + ADDU A23,B1:B0,B1:B0 +|| MPY32U A18,B18,A23:A22 ; a[2]*b[2] +|| ADDU A24,A1:A0,A1:A0 + ADDU A25,B1:B0,B1:B0 +|| MPY32U A17,B19,A25:A24 ; a[1]*b[3] +|| ADDU A26,A1:A0,A1:A0 + ADDU A27,B1:B0,B1:B0 +|| ADDU B8,A1:A0,A1:A0 + STW A0,*ARG0[3] +|| MPY32U A19,B18,A27:A26 ; a[3]*b[2] +|| ADDU A20,A1,A9:A8 + ADDU A21,B1,B9:B8 +|| MPY32U A18,B19,A29:A28 ; a[2]*b[3] +|| ADDU A22,A9:A8,A9:A8 + ADDU A23,B9:B8,B9:B8 +|| MPY32U A19,B19,A31:A30 ; a[3]*b[3] +|| ADDU A24,A9:A8,A9:A8 + ADDU A25,B9:B8,B9:B8 +|| ADDU B0,A9:A8,A9:A8 + STW A8,*ARG0[4] +|| ADDU A26,A9,A1:A0 + ADDU A27,B9,B1:B0 +|| ADDU A28,A1:A0,A1:A0 + ADDU A29,B1:B0,B1:B0 +|| BNOP RA +|| ADDU B8,A1:A0,A1:A0 + STW A0,*ARG0[5] +|| ADDU A30,A1,A9:A8 + ADD A31,B1,B8 + ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below + ADD B8,A9,A9 +|| STW A8,*ARG0[6] + STW A9,*ARG0[7] + .endif + .endasmfunc diff --git a/deps/openssl/openssl/crypto/bn/asm/c64xplus-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/c64xplus-gf2m.pl new file mode 100644 index 0000000000..c0e5400807 --- /dev/null +++ b/deps/openssl/openssl/crypto/bn/asm/c64xplus-gf2m.pl @@ -0,0 +1,160 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# February 2012 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication +# used in bn_gf2m.c. It's kind of low-hanging mechanical port from +# C for the time being... The subroutine runs in 37 cycles, which is +# 4.5x faster than compiler-generated code. Though comparison is +# totally unfair, because this module utilizes Galois Field Multiply +# instruction. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector + +($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20)); +($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20)); +($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7"); +($A,$B)=($Alo,$B_1); +$xFF="B1"; + +sub mul_1x1_upper { +my ($A,$B)=@_; +$code.=<<___; + EXTU $B,8,24,$B_2 ; smash $B to 4 bytes +|| AND $B,$xFF,$B_0 +|| SHRU $B,24,$B_3 + SHRU $A,16, $Ahi ; smash $A to two halfwords +|| EXTU $A,16,16,$Alo + + XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits muliplication +|| XORMPY $Ahi,$B_2,$Ahix2 +|| EXTU $B,16,24,$B_1 + XORMPY $Alo,$B_0,$Alox0 +|| XORMPY $Ahi,$B_0,$Ahix0 + XORMPY $Alo,$B_3,$Alox3 +|| XORMPY $Ahi,$B_3,$Ahix3 + XORMPY $Alo,$B_1,$Alox1 +|| XORMPY $Ahi,$B_1,$Ahix1 +___ +} +sub mul_1x1_merged { +my ($OUTlo,$OUThi,$A,$B)=@_; +$code.=<<___; + EXTU $B,8,24,$B_2 ; smash $B to 4 bytes +|| AND $B,$xFF,$B_0 +|| SHRU $B,24,$B_3 + SHRU $A,16, $Ahi ; smash $A to two halfwords +|| EXTU $A,16,16,$Alo + + XOR $Ahix0,$Alox2,$Ahix0 +|| MV $Ahix2,$OUThi +|| XORMPY $Alo,$B_2,$Alox2 + XORMPY $Ahi,$B_2,$Ahix2 +|| EXTU $B,16,24,$B_1 +|| XORMPY $Alo,$B_0,A1 ; $Alox0 + XOR $Ahix1,$Alox3,$Ahix1 +|| SHL $Ahix0,16,$OUTlo +|| SHRU $Ahix0,16,$Ahix0 + XOR $Alox0,$OUTlo,$OUTlo +|| XOR $Ahix0,$OUThi,$OUThi +|| XORMPY $Ahi,$B_0,$Ahix0 +|| XORMPY $Alo,$B_3,$Alox3 +|| SHL $Alox1,8,$Alox1 +|| SHL $Ahix3,8,$Ahix3 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix3,$OUThi,$OUThi +|| XORMPY $Ahi,$B_3,$Ahix3 +|| SHL $Ahix1,24,$Alox1 +|| SHRU $Ahix1,8, $Ahix1 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix1,$OUThi,$OUThi +|| XORMPY $Alo,$B_1,$Alox1 +|| XORMPY $Ahi,$B_1,$Ahix1 +|| MV A1,$Alox0 +___ +} +sub mul_1x1_lower { +my ($OUTlo,$OUThi)=@_; +$code.=<<___; + ;NOP + XOR $Ahix0,$Alox2,$Ahix0 +|| MV $Ahix2,$OUThi + NOP + XOR $Ahix1,$Alox3,$Ahix1 +|| SHL $Ahix0,16,$OUTlo +|| SHRU $Ahix0,16,$Ahix0 + XOR $Alox0,$OUTlo,$OUTlo +|| XOR $Ahix0,$OUThi,$OUThi +|| SHL $Alox1,8,$Alox1 +|| SHL $Ahix3,8,$Ahix3 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix3,$OUThi,$OUThi +|| SHL $Ahix1,24,$Alox1 +|| SHRU $Ahix1,8, $Ahix1 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix1,$OUThi,$OUThi +___ +} +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2 + .endif + + .global _bn_GF2m_mul_2x2 +_bn_GF2m_mul_2x2: + .asmfunc + MVK 0xFF,$xFF +___ + &mul_1x1_upper($a0,$b0); # a0·b0 +$code.=<<___; +|| MV $b1,$B + MV $a1,$A +___ + &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1 +$code.=<<___; +|| XOR $b0,$b1,$B + XOR $a0,$a1,$A +___ + &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1) +$code.=<<___; + XOR A28,A31,A29 +|| XOR B28,B31,B29 ; a0·b0+a1·b1 +___ + &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1) +$code.=<<___; +|| BNOP B3 + XOR A29,A30,A30 +|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1 + XOR B28,A30,A30 +|| STW A28,*${rp}[0] + XOR B30,A31,A31 +|| STW A30,*${rp}[1] + STW A31,*${rp}[2] + STW B31,*${rp}[3] + .endasmfunc +___ + +print $code; +close STDOUT; diff --git a/deps/openssl/openssl/crypto/bn/asm/co-586.pl b/deps/openssl/openssl/crypto/bn/asm/co-586.pl index 57101a6bd7..60d0363660 100644 --- a/deps/openssl/openssl/crypto/bn/asm/co-586.pl +++ b/deps/openssl/openssl/crypto/bn/asm/co-586.pl @@ -1,9 +1,18 @@ -#!/usr/local/bin/perl +#! /usr/bin/env perl +# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output = pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],$0); &bn_mul_comba("bn_mul_comba8",8); @@ -13,6 +22,8 @@ require "x86asm.pl"; &asm_finish(); +close STDOUT; + sub mul_add_c { local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; diff --git a/deps/openssl/openssl/crypto/bn/asm/ia64-mont.pl b/deps/openssl/openssl/crypto/bn/asm/ia64-mont.pl index e258658428..5cc5c599f9 100644 --- a/deps/openssl/openssl/crypto/bn/asm/ia64-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/ia64-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -60,6 +67,8 @@ # hereafter less for longer keys, while verify - by 74-13%. # DSA performance improves by 115-30%. +$output=pop; + if ($^O eq "hpux") { $ADDP="addp4"; for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } @@ -846,6 +855,6 @@ copyright: stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" ___ -$output=shift and open STDOUT,">$output"; +open STDOUT,">$output" if $output; print $code; close STDOUT; diff --git a/deps/openssl/openssl/crypto/bn/asm/ia64.S b/deps/openssl/openssl/crypto/bn/asm/ia64.S index a9a42abfc3..f2404a3c1e 100644 --- a/deps/openssl/openssl/crypto/bn/asm/ia64.S +++ b/deps/openssl/openssl/crypto/bn/asm/ia64.S @@ -3,6 +3,13 @@ .ident "ia64.S, Version 2.1" .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + // // ==================================================================== // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -22,7 +29,7 @@ // ports is the same, i.e. 2, while I need 4. In other words, to this // module Itanium2 remains effectively as "wide" as Itanium. Yet it's // essentially different in respect to this module, and a re-tune was -// required. Well, because some intruction latencies has changed. Most +// required. Well, because some instruction latencies has changed. Most // noticeably those intensively used: // // Itanium Itanium2 @@ -363,7 +370,7 @@ bn_mul_words: // The loop therefore spins at the latency of xma minus 1, or in other // words at 6*(n+4) ticks:-( Compare to the "production" loop above // that runs in 2*(n+11) where the low latency problem is worked around -// by moving the dependency to one-tick latent interger ALU. Note that +// by moving the dependency to one-tick latent integer ALU. Note that // "distance" between ldf8 and xma is not latency of ldf8, but the // *difference* between xma and ldf8 latencies. .L_bn_mul_words_ctop: @@ -425,7 +432,7 @@ bn_mul_add_words: // version was performing *all* additions in IALU and was starving // for those even on Itanium 2. In this version one addition is // moved to FPU and is folded with multiplication. This is at cost -// of propogating the result from previous call to this subroutine +// of propagating the result from previous call to this subroutine // to L2 cache... In other words negligible even for shorter keys. // *Overall* performance improvement [over previous version] varies // from 11 to 22 percent depending on key length. @@ -495,7 +502,7 @@ bn_sqr_words: // scalability. The decision will very likely be reconsidered after the // benchmark program is profiled. I.e. if perfomance gain on Itanium // will appear larger than loss on "wider" IA-64, then the loop should -// be explicitely split and the epilogue compressed. +// be explicitly split and the epilogue compressed. .L_bn_sqr_words_ctop: { .mfi; (p16) ldf8 f32=[r33],8 (p25) xmpy.lu f42=f41,f41 diff --git a/deps/openssl/openssl/crypto/bn/asm/mips-mont.pl b/deps/openssl/openssl/crypto/bn/asm/mips-mont.pl index a33cdf4111..a907571bec 100644 --- a/deps/openssl/openssl/crypto/bn/asm/mips-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/mips-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -67,7 +74,7 @@ $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; # ###################################################################### -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($flavour =~ /64|n32/i) { diff --git a/deps/openssl/openssl/crypto/bn/asm/mips.pl b/deps/openssl/openssl/crypto/bn/asm/mips.pl index acafde5e56..420f01f3a4 100644 --- a/deps/openssl/openssl/crypto/bn/asm/mips.pl +++ b/deps/openssl/openssl/crypto/bn/asm/mips.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -15,7 +22,7 @@ # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. # # The module is designed to work with either of the "new" MIPS ABI(5), -# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under +# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under # IRIX 5.x not only because it doesn't support new ABIs but also # because 5.x kernels put R4x00 CPU into 32-bit mode and all those # 64-bit instructions (daddu, dmultu, etc.) found below gonna only @@ -49,7 +56,7 @@ # key length, more for longer keys. $flavour = shift || "o32"; -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($flavour =~ /64|n32/i) { diff --git a/deps/openssl/openssl/crypto/bn/asm/mips3-mont.pl b/deps/openssl/openssl/crypto/bn/asm/mips3-mont.pl deleted file mode 100644 index 8f9156e02a..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/mips3-mont.pl +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# This module doesn't present direct interest for OpenSSL, because it -# doesn't provide better performance for longer keys. While 512-bit -# RSA private key operations are 40% faster, 1024-bit ones are hardly -# faster at all, while longer key operations are slower by up to 20%. -# It might be of interest to embedded system developers though, as -# it's smaller than 1KB, yet offers ~3x improvement over compiler -# generated code. -# -# The module targets N32 and N64 MIPS ABIs and currently is a bit -# IRIX-centric, i.e. is likely to require adaptation for other OSes. - -# int bn_mul_mont( -$rp="a0"; # BN_ULONG *rp, -$ap="a1"; # const BN_ULONG *ap, -$bp="a2"; # const BN_ULONG *bp, -$np="a3"; # const BN_ULONG *np, -$n0="a4"; # const BN_ULONG *n0, -$num="a5"; # int num); - -$lo0="a6"; -$hi0="a7"; -$lo1="v0"; -$hi1="v1"; -$aj="t0"; -$bi="t1"; -$nj="t2"; -$tp="t3"; -$alo="s0"; -$ahi="s1"; -$nlo="s2"; -$nhi="s3"; -$tj="s4"; -$i="s5"; -$j="s6"; -$fp="t8"; -$m1="t9"; - -$FRAME=8*(2+8); - -$code=<<___; -#include <asm.h> -#include <regdef.h> - -.text - -.set noat -.set reorder - -.align 5 -.globl bn_mul_mont -.ent bn_mul_mont -bn_mul_mont: - .set noreorder - PTR_SUB sp,64 - move $fp,sp - .frame $fp,64,ra - slt AT,$num,4 - li v0,0 - beqzl AT,.Lproceed - nop - jr ra - PTR_ADD sp,$fp,64 - .set reorder -.align 5 -.Lproceed: - ld $n0,0($n0) - ld $bi,0($bp) # bp[0] - ld $aj,0($ap) # ap[0] - ld $nj,0($np) # np[0] - PTR_SUB sp,16 # place for two extra words - sll $num,3 - li AT,-4096 - PTR_SUB sp,$num - and sp,AT - - sd s0,0($fp) - sd s1,8($fp) - sd s2,16($fp) - sd s3,24($fp) - sd s4,32($fp) - sd s5,40($fp) - sd s6,48($fp) - sd s7,56($fp) - - dmultu $aj,$bi - ld $alo,8($ap) - ld $nlo,8($np) - mflo $lo0 - mfhi $hi0 - dmultu $lo0,$n0 - mflo $m1 - - dmultu $alo,$bi - mflo $alo - mfhi $ahi - - dmultu $nj,$m1 - mflo $lo1 - mfhi $hi1 - dmultu $nlo,$m1 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - mflo $nlo - mfhi $nhi - - move $tp,sp - li $j,16 -.align 4 -.L1st: - .set noreorder - PTR_ADD $aj,$ap,$j - ld $aj,($aj) - PTR_ADD $nj,$np,$j - ld $nj,($nj) - - dmultu $aj,$bi - daddu $lo0,$alo,$hi0 - daddu $lo1,$nlo,$hi1 - sltu AT,$lo0,$hi0 - sltu s7,$lo1,$hi1 - daddu $hi0,$ahi,AT - daddu $hi1,$nhi,s7 - mflo $alo - mfhi $ahi - - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - dmultu $nj,$m1 - daddu $hi1,AT - addu $j,8 - sd $lo1,($tp) - sltu s7,$j,$num - mflo $nlo - mfhi $nhi - - bnez s7,.L1st - PTR_ADD $tp,8 - .set reorder - - daddu $lo0,$alo,$hi0 - sltu AT,$lo0,$hi0 - daddu $hi0,$ahi,AT - - daddu $lo1,$nlo,$hi1 - sltu s7,$lo1,$hi1 - daddu $hi1,$nhi,s7 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - - sd $lo1,($tp) - - daddu $hi1,$hi0 - sltu AT,$hi1,$hi0 - sd $hi1,8($tp) - sd AT,16($tp) - - li $i,8 -.align 4 -.Louter: - PTR_ADD $bi,$bp,$i - ld $bi,($bi) - ld $aj,($ap) - ld $alo,8($ap) - ld $tj,(sp) - - dmultu $aj,$bi - ld $nj,($np) - ld $nlo,8($np) - mflo $lo0 - mfhi $hi0 - daddu $lo0,$tj - dmultu $lo0,$n0 - sltu AT,$lo0,$tj - daddu $hi0,AT - mflo $m1 - - dmultu $alo,$bi - mflo $alo - mfhi $ahi - - dmultu $nj,$m1 - mflo $lo1 - mfhi $hi1 - - dmultu $nlo,$m1 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - mflo $nlo - mfhi $nhi - - move $tp,sp - li $j,16 - ld $tj,8($tp) -.align 4 -.Linner: - .set noreorder - PTR_ADD $aj,$ap,$j - ld $aj,($aj) - PTR_ADD $nj,$np,$j - ld $nj,($nj) - - dmultu $aj,$bi - daddu $lo0,$alo,$hi0 - daddu $lo1,$nlo,$hi1 - sltu AT,$lo0,$hi0 - sltu s7,$lo1,$hi1 - daddu $hi0,$ahi,AT - daddu $hi1,$nhi,s7 - mflo $alo - mfhi $ahi - - daddu $lo0,$tj - addu $j,8 - dmultu $nj,$m1 - sltu AT,$lo0,$tj - daddu $lo1,$lo0 - daddu $hi0,AT - sltu s7,$lo1,$lo0 - ld $tj,16($tp) - daddu $hi1,s7 - sltu AT,$j,$num - mflo $nlo - mfhi $nhi - sd $lo1,($tp) - bnez AT,.Linner - PTR_ADD $tp,8 - .set reorder - - daddu $lo0,$alo,$hi0 - sltu AT,$lo0,$hi0 - daddu $hi0,$ahi,AT - daddu $lo0,$tj - sltu s7,$lo0,$tj - daddu $hi0,s7 - - ld $tj,16($tp) - daddu $lo1,$nlo,$hi1 - sltu AT,$lo1,$hi1 - daddu $hi1,$nhi,AT - daddu $lo1,$lo0 - sltu s7,$lo1,$lo0 - daddu $hi1,s7 - sd $lo1,($tp) - - daddu $lo1,$hi1,$hi0 - sltu $hi1,$lo1,$hi0 - daddu $lo1,$tj - sltu AT,$lo1,$tj - daddu $hi1,AT - sd $lo1,8($tp) - sd $hi1,16($tp) - - addu $i,8 - sltu s7,$i,$num - bnez s7,.Louter - - .set noreorder - PTR_ADD $tj,sp,$num # &tp[num] - move $tp,sp - move $ap,sp - li $hi0,0 # clear borrow bit - -.align 4 -.Lsub: ld $lo0,($tp) - ld $lo1,($np) - PTR_ADD $tp,8 - PTR_ADD $np,8 - dsubu $lo1,$lo0,$lo1 # tp[i]-np[i] - sgtu AT,$lo1,$lo0 - dsubu $lo0,$lo1,$hi0 - sgtu $hi0,$lo0,$lo1 - sd $lo0,($rp) - or $hi0,AT - sltu AT,$tp,$tj - bnez AT,.Lsub - PTR_ADD $rp,8 - - dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit - move $tp,sp - PTR_SUB $rp,$num # restore rp - not $hi1,$hi0 - - and $ap,$hi0,sp - and $bp,$hi1,$rp - or $ap,$ap,$bp # ap=borrow?tp:rp - -.align 4 -.Lcopy: ld $aj,($ap) - PTR_ADD $ap,8 - PTR_ADD $tp,8 - sd zero,-8($tp) - sltu AT,$tp,$tj - sd $aj,($rp) - bnez AT,.Lcopy - PTR_ADD $rp,8 - - ld s0,0($fp) - ld s1,8($fp) - ld s2,16($fp) - ld s3,24($fp) - ld s4,32($fp) - ld s5,40($fp) - ld s6,48($fp) - ld s7,56($fp) - li v0,1 - jr ra - PTR_ADD sp,$fp,64 - .set reorder -END(bn_mul_mont) -.rdata -.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>" -___ - -print $code; -close STDOUT; diff --git a/deps/openssl/openssl/crypto/bn/asm/mips3.s b/deps/openssl/openssl/crypto/bn/asm/mips3.s deleted file mode 100644 index dca4105c7d..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/mips3.s +++ /dev/null @@ -1,2201 +0,0 @@ -.rdata -.asciiz "mips3.s, Version 1.1" -.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" - -/* - * ==================================================================== - * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL - * project. - * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. - * ==================================================================== - */ - -/* - * This is my modest contributon to the OpenSSL project (see - * http://www.openssl.org/ for more information about it) and is - * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c - * module. For updates see http://fy.chalmers.se/~appro/hpe/. - * - * The module is designed to work with either of the "new" MIPS ABI(5), - * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under - * IRIX 5.x not only because it doesn't support new ABIs but also - * because 5.x kernels put R4x00 CPU into 32-bit mode and all those - * 64-bit instructions (daddu, dmultu, etc.) found below gonna only - * cause illegal instruction exception:-( - * - * In addition the code depends on preprocessor flags set up by MIPSpro - * compiler driver (either as or cc) and therefore (probably?) can't be - * compiled by the GNU assembler. GNU C driver manages fine though... - * I mean as long as -mmips-as is specified or is the default option, - * because then it simply invokes /usr/bin/as which in turn takes - * perfect care of the preprocessor definitions. Another neat feature - * offered by the MIPSpro assembler is an optimization pass. This gave - * me the opportunity to have the code looking more regular as all those - * architecture dependent instruction rescheduling details were left to - * the assembler. Cool, huh? - * - * Performance improvement is astonishing! 'apps/openssl speed rsa dsa' - * goes way over 3 times faster! - * - * <appro@fy.chalmers.se> - */ -#include <asm.h> -#include <regdef.h> - -#if _MIPS_ISA>=4 -#define MOVNZ(cond,dst,src) \ - movn dst,src,cond -#else -#define MOVNZ(cond,dst,src) \ - .set noreorder; \ - bnezl cond,.+8; \ - move dst,src; \ - .set reorder -#endif - -.text - -.set noat -.set reorder - -#define MINUS4 v1 - -.align 5 -LEAF(bn_mul_add_words) - .set noreorder - bgtzl a2,.L_bn_mul_add_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_mul_add_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_mul_add_words_tail - -.L_bn_mul_add_words_loop: - dmultu t0,a3 - ld t1,0(a0) - ld t2,8(a1) - ld t3,8(a0) - ld ta0,16(a1) - ld ta1,16(a0) - daddu t1,v0 - sltu v0,t1,v0 /* All manuals say it "compares 32-bit - * values", but it seems to work fine - * even on 64-bit registers. */ - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,0(a0) - daddu v0,AT - - dmultu t2,a3 - ld ta2,24(a1) - ld ta3,24(a0) - daddu t3,v0 - sltu v0,t3,v0 - mflo AT - mfhi t2 - daddu t3,AT - daddu v0,t2 - sltu AT,t3,AT - sd t3,8(a0) - daddu v0,AT - - dmultu ta0,a3 - subu a2,4 - PTR_ADD a0,32 - PTR_ADD a1,32 - daddu ta1,v0 - sltu v0,ta1,v0 - mflo AT - mfhi ta0 - daddu ta1,AT - daddu v0,ta0 - sltu AT,ta1,AT - sd ta1,-16(a0) - daddu v0,AT - - - dmultu ta2,a3 - and ta0,a2,MINUS4 - daddu ta3,v0 - sltu v0,ta3,v0 - mflo AT - mfhi ta2 - daddu ta3,AT - daddu v0,ta2 - sltu AT,ta3,AT - sd ta3,-8(a0) - daddu v0,AT - .set noreorder - bgtzl ta0,.L_bn_mul_add_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_mul_add_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_mul_add_words_return: - jr ra - -.L_bn_mul_add_words_tail: - dmultu t0,a3 - ld t1,0(a0) - subu a2,1 - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,0(a0) - daddu v0,AT - beqz a2,.L_bn_mul_add_words_return - - ld t0,8(a1) - dmultu t0,a3 - ld t1,8(a0) - subu a2,1 - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,8(a0) - daddu v0,AT - beqz a2,.L_bn_mul_add_words_return - - ld t0,16(a1) - dmultu t0,a3 - ld t1,16(a0) - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,16(a0) - daddu v0,AT - jr ra -END(bn_mul_add_words) - -.align 5 -LEAF(bn_mul_words) - .set noreorder - bgtzl a2,.L_bn_mul_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_mul_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_mul_words_tail - -.L_bn_mul_words_loop: - dmultu t0,a3 - ld t2,8(a1) - ld ta0,16(a1) - ld ta2,24(a1) - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,0(a0) - daddu v0,t1,t0 - - dmultu t2,a3 - subu a2,4 - PTR_ADD a0,32 - PTR_ADD a1,32 - mflo AT - mfhi t2 - daddu v0,AT - sltu t3,v0,AT - sd v0,-24(a0) - daddu v0,t3,t2 - - dmultu ta0,a3 - mflo AT - mfhi ta0 - daddu v0,AT - sltu ta1,v0,AT - sd v0,-16(a0) - daddu v0,ta1,ta0 - - - dmultu ta2,a3 - and ta0,a2,MINUS4 - mflo AT - mfhi ta2 - daddu v0,AT - sltu ta3,v0,AT - sd v0,-8(a0) - daddu v0,ta3,ta2 - .set noreorder - bgtzl ta0,.L_bn_mul_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_mul_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_mul_words_return: - jr ra - -.L_bn_mul_words_tail: - dmultu t0,a3 - subu a2,1 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,0(a0) - daddu v0,t1,t0 - beqz a2,.L_bn_mul_words_return - - ld t0,8(a1) - dmultu t0,a3 - subu a2,1 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,8(a0) - daddu v0,t1,t0 - beqz a2,.L_bn_mul_words_return - - ld t0,16(a1) - dmultu t0,a3 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,16(a0) - daddu v0,t1,t0 - jr ra -END(bn_mul_words) - -.align 5 -LEAF(bn_sqr_words) - .set noreorder - bgtzl a2,.L_bn_sqr_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_sqr_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_sqr_words_tail - -.L_bn_sqr_words_loop: - dmultu t0,t0 - ld t2,8(a1) - ld ta0,16(a1) - ld ta2,24(a1) - mflo t1 - mfhi t0 - sd t1,0(a0) - sd t0,8(a0) - - dmultu t2,t2 - subu a2,4 - PTR_ADD a0,64 - PTR_ADD a1,32 - mflo t3 - mfhi t2 - sd t3,-48(a0) - sd t2,-40(a0) - - dmultu ta0,ta0 - mflo ta1 - mfhi ta0 - sd ta1,-32(a0) - sd ta0,-24(a0) - - - dmultu ta2,ta2 - and ta0,a2,MINUS4 - mflo ta3 - mfhi ta2 - sd ta3,-16(a0) - sd ta2,-8(a0) - - .set noreorder - bgtzl ta0,.L_bn_sqr_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_sqr_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_sqr_words_return: - move v0,zero - jr ra - -.L_bn_sqr_words_tail: - dmultu t0,t0 - subu a2,1 - mflo t1 - mfhi t0 - sd t1,0(a0) - sd t0,8(a0) - beqz a2,.L_bn_sqr_words_return - - ld t0,8(a1) - dmultu t0,t0 - subu a2,1 - mflo t1 - mfhi t0 - sd t1,16(a0) - sd t0,24(a0) - beqz a2,.L_bn_sqr_words_return - - ld t0,16(a1) - dmultu t0,t0 - mflo t1 - mfhi t0 - sd t1,32(a0) - sd t0,40(a0) - jr ra -END(bn_sqr_words) - -.align 5 -LEAF(bn_add_words) - .set noreorder - bgtzl a3,.L_bn_add_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_add_words_proceed: - li MINUS4,-4 - and AT,a3,MINUS4 - move v0,zero - beqz AT,.L_bn_add_words_tail - -.L_bn_add_words_loop: - ld ta0,0(a2) - subu a3,4 - ld t1,8(a1) - and AT,a3,MINUS4 - ld t2,16(a1) - PTR_ADD a2,32 - ld t3,24(a1) - PTR_ADD a0,32 - ld ta1,-24(a2) - PTR_ADD a1,32 - ld ta2,-16(a2) - ld ta3,-8(a2) - daddu ta0,t0 - sltu t8,ta0,t0 - daddu t0,ta0,v0 - sltu v0,t0,ta0 - sd t0,-32(a0) - daddu v0,t8 - - daddu ta1,t1 - sltu t9,ta1,t1 - daddu t1,ta1,v0 - sltu v0,t1,ta1 - sd t1,-24(a0) - daddu v0,t9 - - daddu ta2,t2 - sltu t8,ta2,t2 - daddu t2,ta2,v0 - sltu v0,t2,ta2 - sd t2,-16(a0) - daddu v0,t8 - - daddu ta3,t3 - sltu t9,ta3,t3 - daddu t3,ta3,v0 - sltu v0,t3,ta3 - sd t3,-8(a0) - daddu v0,t9 - - .set noreorder - bgtzl AT,.L_bn_add_words_loop - ld t0,0(a1) - - bnezl a3,.L_bn_add_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_add_words_return: - jr ra - -.L_bn_add_words_tail: - ld ta0,0(a2) - daddu ta0,t0 - subu a3,1 - sltu t8,ta0,t0 - daddu t0,ta0,v0 - sltu v0,t0,ta0 - sd t0,0(a0) - daddu v0,t8 - beqz a3,.L_bn_add_words_return - - ld t1,8(a1) - ld ta1,8(a2) - daddu ta1,t1 - subu a3,1 - sltu t9,ta1,t1 - daddu t1,ta1,v0 - sltu v0,t1,ta1 - sd t1,8(a0) - daddu v0,t9 - beqz a3,.L_bn_add_words_return - - ld t2,16(a1) - ld ta2,16(a2) - daddu ta2,t2 - sltu t8,ta2,t2 - daddu t2,ta2,v0 - sltu v0,t2,ta2 - sd t2,16(a0) - daddu v0,t8 - jr ra -END(bn_add_words) - -.align 5 -LEAF(bn_sub_words) - .set noreorder - bgtzl a3,.L_bn_sub_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_sub_words_proceed: - li MINUS4,-4 - and AT,a3,MINUS4 - move v0,zero - beqz AT,.L_bn_sub_words_tail - -.L_bn_sub_words_loop: - ld ta0,0(a2) - subu a3,4 - ld t1,8(a1) - and AT,a3,MINUS4 - ld t2,16(a1) - PTR_ADD a2,32 - ld t3,24(a1) - PTR_ADD a0,32 - ld ta1,-24(a2) - PTR_ADD a1,32 - ld ta2,-16(a2) - ld ta3,-8(a2) - sltu t8,t0,ta0 - dsubu t0,ta0 - dsubu ta0,t0,v0 - sd ta0,-32(a0) - MOVNZ (t0,v0,t8) - - sltu t9,t1,ta1 - dsubu t1,ta1 - dsubu ta1,t1,v0 - sd ta1,-24(a0) - MOVNZ (t1,v0,t9) - - - sltu t8,t2,ta2 - dsubu t2,ta2 - dsubu ta2,t2,v0 - sd ta2,-16(a0) - MOVNZ (t2,v0,t8) - - sltu t9,t3,ta3 - dsubu t3,ta3 - dsubu ta3,t3,v0 - sd ta3,-8(a0) - MOVNZ (t3,v0,t9) - - .set noreorder - bgtzl AT,.L_bn_sub_words_loop - ld t0,0(a1) - - bnezl a3,.L_bn_sub_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_sub_words_return: - jr ra - -.L_bn_sub_words_tail: - ld ta0,0(a2) - subu a3,1 - sltu t8,t0,ta0 - dsubu t0,ta0 - dsubu ta0,t0,v0 - MOVNZ (t0,v0,t8) - sd ta0,0(a0) - beqz a3,.L_bn_sub_words_return - - ld t1,8(a1) - subu a3,1 - ld ta1,8(a2) - sltu t9,t1,ta1 - dsubu t1,ta1 - dsubu ta1,t1,v0 - MOVNZ (t1,v0,t9) - sd ta1,8(a0) - beqz a3,.L_bn_sub_words_return - - ld t2,16(a1) - ld ta2,16(a2) - sltu t8,t2,ta2 - dsubu t2,ta2 - dsubu ta2,t2,v0 - MOVNZ (t2,v0,t8) - sd ta2,16(a0) - jr ra -END(bn_sub_words) - -#undef MINUS4 - -.align 5 -LEAF(bn_div_3_words) - .set reorder - move a3,a0 /* we know that bn_div_words doesn't - * touch a3, ta2, ta3 and preserves a2 - * so that we can save two arguments - * and return address in registers - * instead of stack:-) - */ - ld a0,(a3) - move ta2,a1 - ld a1,-8(a3) - bne a0,a2,.L_bn_div_3_words_proceed - li v0,-1 - jr ra -.L_bn_div_3_words_proceed: - move ta3,ra - bal bn_div_words - move ra,ta3 - dmultu ta2,v0 - ld t2,-16(a3) - move ta0,zero - mfhi t1 - mflo t0 - sltu t8,t1,v1 -.L_bn_div_3_words_inner_loop: - bnez t8,.L_bn_div_3_words_inner_loop_done - sgeu AT,t2,t0 - seq t9,t1,v1 - and AT,t9 - sltu t3,t0,ta2 - daddu v1,a2 - dsubu t1,t3 - dsubu t0,ta2 - sltu t8,t1,v1 - sltu ta0,v1,a2 - or t8,ta0 - .set noreorder - beqzl AT,.L_bn_div_3_words_inner_loop - dsubu v0,1 - .set reorder -.L_bn_div_3_words_inner_loop_done: - jr ra -END(bn_div_3_words) - -.align 5 -LEAF(bn_div_words) - .set noreorder - bnezl a2,.L_bn_div_words_proceed - move v1,zero - jr ra - li v0,-1 /* I'd rather signal div-by-zero - * which can be done with 'break 7' */ - -.L_bn_div_words_proceed: - bltz a2,.L_bn_div_words_body - move t9,v1 - dsll a2,1 - bgtz a2,.-4 - addu t9,1 - - .set reorder - negu t1,t9 - li t2,-1 - dsll t2,t1 - and t2,a0 - dsrl AT,a1,t1 - .set noreorder - bnezl t2,.+8 - break 6 /* signal overflow */ - .set reorder - dsll a0,t9 - dsll a1,t9 - or a0,AT - -#define QT ta0 -#define HH ta1 -#define DH v1 -.L_bn_div_words_body: - dsrl DH,a2,32 - sgeu AT,a0,a2 - .set noreorder - bnezl AT,.+8 - dsubu a0,a2 - .set reorder - - li QT,-1 - dsrl HH,a0,32 - dsrl QT,32 /* q=0xffffffff */ - beq DH,HH,.L_bn_div_words_skip_div1 - ddivu zero,a0,DH - mflo QT -.L_bn_div_words_skip_div1: - dmultu a2,QT - dsll t3,a0,32 - dsrl AT,a1,32 - or t3,AT - mflo t0 - mfhi t1 -.L_bn_div_words_inner_loop1: - sltu t2,t3,t0 - seq t8,HH,t1 - sltu AT,HH,t1 - and t2,t8 - sltu v0,t0,a2 - or AT,t2 - .set noreorder - beqz AT,.L_bn_div_words_inner_loop1_done - dsubu t1,v0 - dsubu t0,a2 - b .L_bn_div_words_inner_loop1 - dsubu QT,1 - .set reorder -.L_bn_div_words_inner_loop1_done: - - dsll a1,32 - dsubu a0,t3,t0 - dsll v0,QT,32 - - li QT,-1 - dsrl HH,a0,32 - dsrl QT,32 /* q=0xffffffff */ - beq DH,HH,.L_bn_div_words_skip_div2 - ddivu zero,a0,DH - mflo QT -.L_bn_div_words_skip_div2: -#undef DH - dmultu a2,QT - dsll t3,a0,32 - dsrl AT,a1,32 - or t3,AT - mflo t0 - mfhi t1 -.L_bn_div_words_inner_loop2: - sltu t2,t3,t0 - seq t8,HH,t1 - sltu AT,HH,t1 - and t2,t8 - sltu v1,t0,a2 - or AT,t2 - .set noreorder - beqz AT,.L_bn_div_words_inner_loop2_done - dsubu t1,v1 - dsubu t0,a2 - b .L_bn_div_words_inner_loop2 - dsubu QT,1 - .set reorder -.L_bn_div_words_inner_loop2_done: -#undef HH - - dsubu a0,t3,t0 - or v0,QT - dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */ - dsrl a2,t9 /* restore a2 */ - jr ra -#undef QT -END(bn_div_words) - -#define a_0 t0 -#define a_1 t1 -#define a_2 t2 -#define a_3 t3 -#define b_0 ta0 -#define b_1 ta1 -#define b_2 ta2 -#define b_3 ta3 - -#define a_4 s0 -#define a_5 s2 -#define a_6 s4 -#define a_7 a1 /* once we load a[7] we don't need a anymore */ -#define b_4 s1 -#define b_5 s3 -#define b_6 s5 -#define b_7 a2 /* once we load b[7] we don't need b anymore */ - -#define t_1 t8 -#define t_2 t9 - -#define c_1 v0 -#define c_2 v1 -#define c_3 a3 - -#define FRAME_SIZE 48 - -.align 5 -LEAF(bn_mul_comba8) - .set noreorder - PTR_SUB sp,FRAME_SIZE - .frame sp,64,ra - .set reorder - ld a_0,0(a1) /* If compiled with -mips3 option on - * R5000 box assembler barks on this - * line with "shouldn't have mult/div - * as last instruction in bb (R10K - * bug)" warning. If anybody out there - * has a clue about how to circumvent - * this do send me a note. - * <appro@fy.chalmers.se> - */ - ld b_0,0(a2) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - ld b_1,8(a2) - ld b_2,16(a2) - ld b_3,24(a2) - dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - sd s0,0(sp) - sd s1,8(sp) - sd s2,16(sp) - sd s3,24(sp) - sd s4,32(sp) - sd s5,40(sp) - mflo c_1 - mfhi c_2 - - dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ - ld a_4,32(a1) - ld a_5,40(a1) - ld a_6,48(a1) - ld a_7,56(a1) - ld b_4,32(a2) - ld b_5,40(a2) - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ - ld b_6,48(a2) - ld b_7,56(a2) - sd c_1,0(a0) /* r[0]=c1; */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - sd c_2,8(a0) /* r[1]=c2; */ - - dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) /* r[2]=c3; */ - - dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) /* r[3]=c1; */ - - dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) /* r[4]=c2; */ - - dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) /* r[5]=c3; */ - - dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,48(a0) /* r[6]=c1; */ - - dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,56(a0) /* r[7]=c2; */ - - dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,64(a0) /* r[8]=c3; */ - - dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,72(a0) /* r[9]=c1; */ - - dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,80(a0) /* r[10]=c2; */ - - dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,88(a0) /* r[11]=c3; */ - - dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,96(a0) /* r[12]=c1; */ - - dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,104(a0) /* r[13]=c2; */ - - dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ - ld s0,0(sp) - ld s1,8(sp) - ld s2,16(sp) - ld s3,24(sp) - ld s4,32(sp) - ld s5,40(sp) - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sd c_3,112(a0) /* r[14]=c3; */ - sd c_1,120(a0) /* r[15]=c1; */ - - PTR_ADD sp,FRAME_SIZE - - jr ra -END(bn_mul_comba8) - -.align 5 -LEAF(bn_mul_comba4) - .set reorder - ld a_0,0(a1) - ld b_0,0(a2) - ld a_1,8(a1) - ld a_2,16(a1) - dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - ld a_3,24(a1) - ld b_1,8(a2) - ld b_2,16(a2) - ld b_3,24(a2) - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - sd c_2,8(a0) - - dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sd c_1,48(a0) - sd c_2,56(a0) - - jr ra -END(bn_mul_comba4) - -#undef a_4 -#undef a_5 -#undef a_6 -#undef a_7 -#define a_4 b_0 -#define a_5 b_1 -#define a_6 b_2 -#define a_7 b_3 - -.align 5 -LEAF(bn_sqr_comba8) - .set reorder - ld a_0,0(a1) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - - dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - ld a_4,32(a1) - ld a_5,40(a1) - ld a_6,48(a1) - ld a_7,56(a1) - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - sd c_2,8(a0) - - dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,48(a0) - - dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,56(a0) - - dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,64(a0) - - dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,72(a0) - - dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,80(a0) - - dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,88(a0) - - dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,96(a0) - - dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,104(a0) - - dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sd c_3,112(a0) - sd c_1,120(a0) - - jr ra -END(bn_sqr_comba8) - -.align 5 -LEAF(bn_sqr_comba4) - .set reorder - ld a_0,0(a1) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - sd c_2,8(a0) - - dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,a_2 /* mul_add_c(a2[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sd c_1,48(a0) - sd c_2,56(a0) - - jr ra -END(bn_sqr_comba4) diff --git a/deps/openssl/openssl/crypto/bn/asm/pa-risc2.s b/deps/openssl/openssl/crypto/bn/asm/pa-risc2.s index f3b16290eb..413eac7123 100644 --- a/deps/openssl/openssl/crypto/bn/asm/pa-risc2.s +++ b/deps/openssl/openssl/crypto/bn/asm/pa-risc2.s @@ -1,3 +1,9 @@ +; Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved. +; +; Licensed under the OpenSSL license (the "License"). You may not use +; this file except in compliance with the License. You can obtain a copy +; in the file LICENSE in the source distribution or at +; https://www.openssl.org/source/license.html ; ; PA-RISC 2.0 implementation of bn_asm code, based on the ; 64-bit version of the code. This code is effectively the diff --git a/deps/openssl/openssl/crypto/bn/asm/pa-risc2W.s b/deps/openssl/openssl/crypto/bn/asm/pa-risc2W.s index a99545754d..97381172e7 100644 --- a/deps/openssl/openssl/crypto/bn/asm/pa-risc2W.s +++ b/deps/openssl/openssl/crypto/bn/asm/pa-risc2W.s @@ -1,3 +1,10 @@ +; Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +; +; Licensed under the OpenSSL license (the "License"). You may not use +; this file except in compliance with the License. You can obtain a copy +; in the file LICENSE in the source distribution or at +; https://www.openssl.org/source/license.html + ; ; PA-RISC 64-bit implementation of bn_asm code ; diff --git a/deps/openssl/openssl/crypto/bn/asm/parisc-mont.pl b/deps/openssl/openssl/crypto/bn/asm/parisc-mont.pl index c02ef6f014..8aa94e8511 100644 --- a/deps/openssl/openssl/crypto/bn/asm/parisc-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/parisc-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -126,7 +133,7 @@ $fp="%r3"; $hi1="%r2"; $hi0="%r1"; -$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s +$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s $fm0="%fr4"; $fti=$fm0; $fbi="%fr5L"; diff --git a/deps/openssl/openssl/crypto/bn/asm/ppc-mont.pl b/deps/openssl/openssl/crypto/bn/asm/ppc-mont.pl index 6930a3aceb..5802260ca6 100644 --- a/deps/openssl/openssl/crypto/bn/asm/ppc-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/ppc-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL diff --git a/deps/openssl/openssl/crypto/bn/asm/ppc.pl b/deps/openssl/openssl/crypto/bn/asm/ppc.pl index 446d8ba949..4ea534a1c7 100644 --- a/deps/openssl/openssl/crypto/bn/asm/ppc.pl +++ b/deps/openssl/openssl/crypto/bn/asm/ppc.pl @@ -1,5 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. # +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # Implemented as a Perl wrapper as we want to support several different # architectures with single file. We pick up the target based on the # file name we are asked to generate. @@ -419,7 +425,7 @@ $data=<<EOF; # r9,r10, r11 are the equivalents of c1,c2, c3. # # Possible optimization of loading all 8 longs of a into registers -# doesnt provide any speedup +# doesn't provide any speedup # xor r0,r0,r0 #set r0 = 0.Used in addze @@ -1009,7 +1015,7 @@ $data=<<EOF; $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r11,r11,r8 - addze r12,r9 # since we didnt set r12 to zero before. + addze r12,r9 # since we didn't set r12 to zero before. addze r10,r0 #mul_add_c(a[1],b[0],c2,c3,c1); $LD r6,`1*$BNSZ`(r4) diff --git a/deps/openssl/openssl/crypto/bn/asm/ppc64-mont.pl b/deps/openssl/openssl/crypto/bn/asm/ppc64-mont.pl index 595fc6d31f..1e19c958a1 100644 --- a/deps/openssl/openssl/crypto/bn/asm/ppc64-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/ppc64-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL diff --git a/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl b/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl index 2b3f8b0e21..46d746b7d0 100755 --- a/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl +++ b/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + ############################################################################## # # @@ -103,7 +110,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0- $addx = ($ver>=3.03); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT = *OUT; if ($avx>1) {{{ diff --git a/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl b/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl index 87ce2c34d9..6f3b664f7a 100755 --- a/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl +++ b/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + ############################################################################## # # @@ -95,7 +102,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -1767,7 +1774,7 @@ ___ { # __rsaz_512_mul # # input: %rsi - ap, %rbp - bp - # ouput: + # output: # clobbers: everything my ($ap,$bp) = ("%rsi","%rbp"); $code.=<<___; @@ -1919,7 +1926,7 @@ if ($addx) { # __rsaz_512_mulx # # input: %rsi - ap, %rbp - bp - # ouput: + # output: # clobbers: everything my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); $code.=<<___; diff --git a/deps/openssl/openssl/crypto/bn/asm/s390x-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/s390x-gf2m.pl index 9d18d40e77..cbd16f4214 100644 --- a/deps/openssl/openssl/crypto/bn/asm/s390x-gf2m.pl +++ b/deps/openssl/openssl/crypto/bn/asm/s390x-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -35,7 +42,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $stdframe=16*$SIZE_T+4*8; diff --git a/deps/openssl/openssl/crypto/bn/asm/s390x-mont.pl b/deps/openssl/openssl/crypto/bn/asm/s390x-mont.pl index 9fd64e81ee..2205bc2ca0 100644 --- a/deps/openssl/openssl/crypto/bn/asm/s390x-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/s390x-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -54,7 +61,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $stdframe=16*$SIZE_T+4*8; diff --git a/deps/openssl/openssl/crypto/bn/asm/s390x.S b/deps/openssl/openssl/crypto/bn/asm/s390x.S index f5eebe413a..292a7a9998 100755..100644 --- a/deps/openssl/openssl/crypto/bn/asm/s390x.S +++ b/deps/openssl/openssl/crypto/bn/asm/s390x.S @@ -1,11 +1,11 @@ .ident "s390x.S, version 1.1" // ==================================================================== -// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -// project. +// Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. // -// Rights for redistribution and usage in source and binary forms are -// granted according to the OpenSSL license. Warranty of any kind is -// disclaimed. +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html // ==================================================================== .text diff --git a/deps/openssl/openssl/crypto/bn/asm/sparct4-mont.pl b/deps/openssl/openssl/crypto/bn/asm/sparct4-mont.pl index 71b45002a4..4faf66f10a 100755 --- a/deps/openssl/openssl/crypto/bn/asm/sparct4-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/sparct4-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by David S. Miller <davem@devemloft.net> and Andy Polyakov @@ -76,6 +83,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "sparcv9_modes.pl"; +$output = pop; +open STDOUT,">$output"; + $code.=<<___; #include "sparc_arch.h" diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv8.S b/deps/openssl/openssl/crypto/bn/asm/sparcv8.S index 88c5dc480a..9c31073b24 100644 --- a/deps/openssl/openssl/crypto/bn/asm/sparcv8.S +++ b/deps/openssl/openssl/crypto/bn/asm/sparcv8.S @@ -3,12 +3,12 @@ /* * ==================================================================== - * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL - * project. + * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html * ==================================================================== */ diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv8plus.S b/deps/openssl/openssl/crypto/bn/asm/sparcv8plus.S index 63de1860f2..714a136675 100644 --- a/deps/openssl/openssl/crypto/bn/asm/sparcv8plus.S +++ b/deps/openssl/openssl/crypto/bn/asm/sparcv8plus.S @@ -3,12 +3,12 @@ /* * ==================================================================== - * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL - * project. + * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html * ==================================================================== */ @@ -52,7 +52,7 @@ * # cd ../.. * # make; make test * - * Q. V8plus achitecture? What kind of beast is that? + * Q. V8plus architecture? What kind of beast is that? * A. Well, it's rather a programming model than an architecture... * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under * special conditions, namely when kernel doesn't preserve upper @@ -71,7 +71,7 @@ * * Q. 64-bit registers under 32-bit kernels? Didn't you just say it * doesn't work? - * A. You can't adress *all* registers as 64-bit wide:-( The catch is + * A. You can't address *all* registers as 64-bit wide:-( The catch is * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully * preserved if you're in a leaf function, i.e. such never calling * any other functions. All functions in this module are leaf and @@ -144,6 +144,10 @@ * } */ +#ifdef OPENSSL_FIPSCANISTER +#include <openssl/fipssyms.h> +#endif + #if defined(__SUNPRO_C) && defined(__sparcv9) /* They've said -xarch=v9 at command line */ .register %g2,#scratch diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv9-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/sparcv9-gf2m.pl index ab94cd917c..dcf11a87a1 100644 --- a/deps/openssl/openssl/crypto/bn/asm/sparcv9-gf2m.pl +++ b/deps/openssl/openssl/crypto/bn/asm/sparcv9-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -18,6 +25,9 @@ # ~100-230% faster than gcc-generated code and ~35-90% faster than # the pure SPARCv9 code path. +$output = pop; +open STDOUT,">$output"; + $locals=16*8; $tab="%l0"; diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv9-mont.pl b/deps/openssl/openssl/crypto/bn/asm/sparcv9-mont.pl index d866287800..6807c8b6e0 100644 --- a/deps/openssl/openssl/crypto/bn/asm/sparcv9-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/sparcv9-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -13,7 +20,7 @@ # for undertaken effort are multiple. First of all, UltraSPARC is not # the whole SPARCv9 universe and other VIS-free implementations deserve # optimized code as much. Secondly, newly introduced UltraSPARC T1, -# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes, +# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths, # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with # several integrated RSA/DSA accelerator circuits accessible through # kernel driver [only(*)], but having decent user-land software @@ -23,7 +30,7 @@ # instructions... # (*) Engine accessing the driver in question is on my TODO list. -# For reference, acceleator is estimated to give 6 to 10 times +# For reference, accelerator is estimated to give 6 to 10 times # improvement on single-threaded RSA sign. It should be noted # that 6-10x improvement coefficient does not actually mean # something extraordinary in terms of absolute [single-threaded] @@ -42,6 +49,9 @@ # module still have hidden potential [see TODO list there], which is # estimated to be larger than 20%... +$output = pop; +open STDOUT,">$output"; + # int bn_mul_mont( $rp="%i0"; # BN_ULONG *rp, $ap="%i1"; # const BN_ULONG *ap, @@ -50,10 +60,8 @@ $np="%i3"; # const BN_ULONG *np, $n0="%i4"; # const BN_ULONG *n0, $num="%i5"; # int num); -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } -if ($bits==64) { $bias=2047; $frame=192; } -else { $bias=0; $frame=128; } +$frame="STACK_FRAME"; +$bias="STACK_BIAS"; $car0="%o0"; $car1="%o1"; @@ -76,6 +84,8 @@ $tpj="%l7"; $fname="bn_mul_mont_int"; $code=<<___; +#include "sparc_arch.h" + .section ".text",#alloc,#execinstr .global $fname @@ -105,7 +115,7 @@ $fname: ld [$np],$car1 ! np[0] sub %o7,$bias,%sp ! alloca ld [$np+4],$npj ! np[1] - be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont + be,pt SIZE_T_CC,.Lbn_sqr_mont mov 12,$j mulx $car0,$mul0,$car0 ! ap[0]*bp[0] diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv9a-mont.pl b/deps/openssl/openssl/crypto/bn/asm/sparcv9a-mont.pl index a14205f2f0..50b690653f 100755 --- a/deps/openssl/openssl/crypto/bn/asm/sparcv9a-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/sparcv9a-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -51,21 +58,17 @@ # # Modulo-scheduled inner loops allow to interleave floating point and # integer instructions and minimize Read-After-Write penalties. This -# results in *further* 20-50% perfromance improvement [depending on +# results in *further* 20-50% performance improvement [depending on # key length, more for longer keys] on USI&II cores and 30-80% - on # USIII&IV. +$output = pop; +open STDOUT,">$output"; + $fname="bn_mul_mont_fpu"; -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } - -if ($bits==64) { - $bias=2047; - $frame=192; -} else { - $bias=0; - $frame=128; # 96 rounded up to largest known cache-line -} + +$frame="STACK_FRAME"; +$bias="STACK_BIAS"; $locals=64; # In order to provide for 32-/64-bit ABI duality, I keep integers wider @@ -121,6 +124,8 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load $code=<<___; +#include "sparc_arch.h" + .section ".text",#alloc,#execinstr .global $fname @@ -867,7 +872,7 @@ ___ $code =~ s/\`([^\`]*)\`/eval($1)/gem; # Below substitution makes it possible to compile without demanding -# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I +# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I # dare to do this, because VIS capability is detected at run-time now # and this routine is not called on CPU not capable to execute it. Do # note that fzeros is not the only VIS dependency! Another dependency diff --git a/deps/openssl/openssl/crypto/bn/asm/via-mont.pl b/deps/openssl/openssl/crypto/bn/asm/via-mont.pl index c046a514c8..9f81bc822e 100644 --- a/deps/openssl/openssl/crypto/bn/asm/via-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/via-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -81,6 +88,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output = pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],"via-mont.pl"); # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); @@ -240,3 +250,5 @@ $sp=&DWP(28,"esp"); &asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/bn/asm/vis3-mont.pl b/deps/openssl/openssl/crypto/bn/asm/vis3-mont.pl index 263ac02b6f..64dba4480f 100644 --- a/deps/openssl/openssl/crypto/bn/asm/vis3-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/vis3-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -18,16 +25,20 @@ # for reference purposes, because T4 has dedicated Montgomery # multiplication and squaring *instructions* that deliver even more. -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } -if ($bits==64) { $bias=2047; $frame=192; } -else { $bias=0; $frame=112; } +$output = pop; +open STDOUT,">$output"; + +$frame = "STACK_FRAME"; +$bias = "STACK_BIAS"; + +$code.=<<___; +#include "sparc_arch.h" -$code.=<<___ if ($bits==64); +#ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch -___ -$code.=<<___; +#endif + .section ".text",#alloc,#execinstr ___ @@ -333,7 +344,7 @@ ___ # Purpose of these subroutines is to explicitly encode VIS instructions, # so that one can compile the module without having to specify VIS -# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. # Idea is to reserve for option to produce "universal" binary and let # programmer detect if current CPU is VIS capable at run-time. sub unvis3 { diff --git a/deps/openssl/openssl/crypto/bn/asm/vms.mar b/deps/openssl/openssl/crypto/bn/asm/vms.mar deleted file mode 100644 index aefab15cdb..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/vms.mar +++ /dev/null @@ -1,6440 +0,0 @@ - .title vax_bn_mul_add_words unsigned multiply & add, 32*32+32+32=>64 -; -; w.j.m. 15-jan-1999 -; -; it's magic ... -; -; ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) { -; ULONG c = 0; -; int i; -; for(i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w ; -; return c; -; } - -r=4 ;(AP) -a=8 ;(AP) -n=12 ;(AP) n by value (input) -w=16 ;(AP) w by value (input) - - - .psect code,nowrt - -.entry bn_mul_add_words,^m<r2,r3,r4,r5,r6> - - moval @r(ap),r2 - moval @a(ap),r3 - movl n(ap),r4 ; assumed >0 by C code - movl w(ap),r5 - clrl r6 ; c - -0$: - emul r5,(r3),(r2),r0 ; w, a[], r[] considered signed - - ; fixup for "negative" r[] - tstl (r2) - bgeq 10$ - incl r1 -10$: - - ; add in c - addl2 r6,r0 - adwc #0,r1 - - ; combined fixup for "negative" w, a[] - tstl r5 - bgeq 20$ - addl2 (r3),r1 -20$: - tstl (r3) - bgeq 30$ - addl2 r5,r1 -30$: - - movl r0,(r2)+ ; store lo result in r[] & advance - addl #4,r3 ; advance a[] - movl r1,r6 ; store hi result => c - - sobgtr r4,0$ - - movl r6,r0 ; return c - ret - - .title vax_bn_mul_words unsigned multiply & add, 32*32+32=>64 -; -; w.j.m. 15-jan-1999 -; -; it's magic ... -; -; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) { -; ULONG c = 0; -; int i; -; for(i = 0; i < num; i++) <c,r[i]> := a[i] * w + c ; -; return(c); -; } - -r=4 ;(AP) -a=8 ;(AP) -n=12 ;(AP) n by value (input) -w=16 ;(AP) w by value (input) - - - .psect code,nowrt - -.entry bn_mul_words,^m<r2,r3,r4,r5,r6> - - moval @r(ap),r2 ; r2 -> r[] - moval @a(ap),r3 ; r3 -> a[] - movl n(ap),r4 ; r4 = loop count (assumed >0 by C code) - movl w(ap),r5 ; r5 = w - clrl r6 ; r6 = c - -0$: - ; <r1,r0> := w * a[] + c - emul r5,(r3),r6,r0 ; w, a[], c considered signed - - ; fixup for "negative" c - tstl r6 ; c - bgeq 10$ - incl r1 -10$: - - ; combined fixup for "negative" w, a[] - tstl r5 ; w - bgeq 20$ - addl2 (r3),r1 ; a[] -20$: - tstl (r3) ; a[] - bgeq 30$ - addl2 r5,r1 ; w -30$: - - movl r0,(r2)+ ; store lo result in r[] & advance - addl #4,r3 ; advance a[] - movl r1,r6 ; store hi result => c - - sobgtr r4,0$ - - movl r6,r0 ; return c - ret - - .title vax_bn_sqr_words unsigned square, 32*32=>64 -; -; w.j.m. 15-jan-1999 -; -; it's magic ... -; -; void bn_sqr_words(ULONG r[],ULONG a[],int n) { -; int i; -; for(i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i] ; -; } - -r=4 ;(AP) -a=8 ;(AP) -n=12 ;(AP) n by value (input) - - - .psect code,nowrt - -.entry bn_sqr_words,^m<r2,r3,r4,r5> - - moval @r(ap),r2 ; r2 -> r[] - moval @a(ap),r3 ; r3 -> a[] - movl n(ap),r4 ; r4 = n (assumed >0 by C code) - -0$: - movl (r3)+,r5 ; r5 = a[] & advance - - ; <r1,r0> := a[] * a[] - emul r5,r5,#0,r0 ; a[] considered signed - - ; fixup for "negative" a[] - tstl r5 ; a[] - bgeq 30$ - addl2 r5,r1 ; a[] - addl2 r5,r1 ; a[] -30$: - - movl r0,(r2)+ ; store lo result in r[] & advance - movl r1,(r2)+ ; store hi result in r[] & advance - - sobgtr r4,0$ - - movl #1,r0 ; return SS$_NORMAL - ret - - .title vax_bn_div_words unsigned divide -; -; Richard Levitte 20-Nov-2000 -; -; ULONG bn_div_words(ULONG h, ULONG l, ULONG d) -; { -; return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d); -; } -; -; Using EDIV would be very easy, if it didn't do signed calculations. -; Any time any of the input numbers are signed, there are problems, -; usually with integer overflow, at which point it returns useless -; data (the quotient gets the value of l, and the remainder becomes 0). -; -; If it was just for the dividend, it would be very easy, just divide -; it by 2 (unsigned), do the division, multiply the resulting quotient -; and remainder by 2, add the bit that was dropped when dividing by 2 -; to the remainder, and do some adjustment so the remainder doesn't -; end up larger than the divisor. For some cases when the divisor is -; negative (from EDIV's point of view, i.e. when the highest bit is set), -; dividing the dividend by 2 isn't enough, and since some operations -; might generate integer overflows even when the dividend is divided by -; 4 (when the high part of the shifted down dividend ends up being exactly -; half of the divisor, the result is the quotient 0x80000000, which is -; negative...) it needs to be divided by 8. Furthermore, the divisor needs -; to be divided by 2 (unsigned) as well, to avoid more problems with the sign. -; In this case, a little extra fiddling with the remainder is required. -; -; So, the simplest way to handle this is always to divide the dividend -; by 8, and to divide the divisor by 2 if it's highest bit is set. -; After EDIV has been used, the quotient gets multiplied by 8 if the -; original divisor was positive, otherwise 4. The remainder, oddly -; enough, is *always* multiplied by 8. -; NOTE: in the case mentioned above, where the high part of the shifted -; down dividend ends up being exactly half the shifted down divisor, we -; end up with a 33 bit quotient. That's no problem however, it usually -; means we have ended up with a too large remainder as well, and the -; problem is fixed by the last part of the algorithm (next paragraph). -; -; The routine ends with comparing the resulting remainder with the -; original divisor and if the remainder is larger, subtract the -; original divisor from it, and increase the quotient by 1. This is -; done until the remainder is smaller than the divisor. -; -; The complete algorithm looks like this: -; -; d' = d -; l' = l & 7 -; [h,l] = [h,l] >> 3 -; [q,r] = floor([h,l] / d) # This is the EDIV operation -; if (q < 0) q = -q # I doubt this is necessary any more -; -; r' = r >> 29 -; if (d' >= 0) -; q' = q >> 29 -; q = q << 3 -; else -; q' = q >> 30 -; q = q << 2 -; r = (r << 3) + l' -; -; if (d' < 0) -; { -; [r',r] = [r',r] - q -; while ([r',r] < 0) -; { -; [r',r] = [r',r] + d -; [q',q] = [q',q] - 1 -; } -; } -; -; while ([r',r] >= d') -; { -; [r',r] = [r',r] - d' -; [q',q] = [q',q] + 1 -; } -; -; return q - -h=4 ;(AP) h by value (input) -l=8 ;(AP) l by value (input) -d=12 ;(AP) d by value (input) - -;r2 = l, q -;r3 = h, r -;r4 = d -;r5 = l' -;r6 = r' -;r7 = d' -;r8 = q' - - .psect code,nowrt - -.entry bn_div_words,^m<r2,r3,r4,r5,r6,r7,r8> - movl l(ap),r2 - movl h(ap),r3 - movl d(ap),r4 - - bicl3 #^XFFFFFFF8,r2,r5 ; l' = l & 7 - bicl3 #^X00000007,r2,r2 - - bicl3 #^XFFFFFFF8,r3,r6 - bicl3 #^X00000007,r3,r3 - - addl r6,r2 - - rotl #-3,r2,r2 ; l = l >> 3 - rotl #-3,r3,r3 ; h = h >> 3 - - movl r4,r7 ; d' = d - - movl #0,r6 ; r' = 0 - movl #0,r8 ; q' = 0 - - tstl r4 - beql 666$ ; Uh-oh, the divisor is 0... - bgtr 1$ - rotl #-1,r4,r4 ; If d is negative, shift it right. - bicl2 #^X80000000,r4 ; Since d is then a large number, the - ; lowest bit is insignificant - ; (contradict that, and I'll fix the problem!) -1$: - ediv r4,r2,r2,r3 ; Do the actual division - - tstl r2 - bgeq 3$ - mnegl r2,r2 ; if q < 0, negate it -3$: - tstl r7 - blss 4$ - rotl #3,r2,r2 ; q = q << 3 - bicl3 #^XFFFFFFF8,r2,r8 ; q' gets the high bits from q - bicl3 #^X00000007,r2,r2 - bsb 41$ -4$: ; else - rotl #2,r2,r2 ; q = q << 2 - bicl3 #^XFFFFFFFC,r2,r8 ; q' gets the high bits from q - bicl3 #^X00000003,r2,r2 -41$: - rotl #3,r3,r3 ; r = r << 3 - bicl3 #^XFFFFFFF8,r3,r6 ; r' gets the high bits from r - bicl3 #^X00000007,r3,r3 - addl r5,r3 ; r = r + l' - - tstl r7 - bgeq 5$ - bitl #1,r7 - beql 5$ ; if d' < 0 && d' & 1 - subl r2,r3 ; [r',r] = [r',r] - [q',q] - sbwc r8,r6 -45$: - bgeq 5$ ; while r < 0 - decl r2 ; [q',q] = [q',q] - 1 - sbwc #0,r8 - addl r7,r3 ; [r',r] = [r',r] + d' - adwc #0,r6 - brb 45$ - -; The return points are placed in the middle to keep a short distance from -; all the branch points -42$: -; movl r3,r1 - movl r2,r0 - ret -666$: - movl #^XFFFFFFFF,r0 - ret - -5$: - tstl r6 - bneq 6$ - cmpl r3,r7 - blssu 42$ ; while [r',r] >= d' -6$: - subl r7,r3 ; [r',r] = [r',r] - d' - sbwc #0,r6 - incl r2 ; [q',q] = [q',q] + 1 - adwc #0,r8 - brb 5$ - - .title vax_bn_add_words unsigned add of two arrays -; -; Richard Levitte 20-Nov-2000 -; -; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) { -; ULONG c = 0; -; int i; -; for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c; -; return(c); -; } - -r=4 ;(AP) r by reference (output) -a=8 ;(AP) a by reference (input) -b=12 ;(AP) b by reference (input) -n=16 ;(AP) n by value (input) - - - .psect code,nowrt - -.entry bn_add_words,^m<r2,r3,r4,r5,r6> - - moval @r(ap),r2 - moval @a(ap),r3 - moval @b(ap),r4 - movl n(ap),r5 ; assumed >0 by C code - clrl r0 ; c - - tstl r5 ; carry = 0 - bleq 666$ - -0$: - movl (r3)+,r6 ; carry untouched - adwc (r4)+,r6 ; carry used and touched - movl r6,(r2)+ ; carry untouched - sobgtr r5,0$ ; carry untouched - - adwc #0,r0 -666$: - ret - - .title vax_bn_sub_words unsigned add of two arrays -; -; Richard Levitte 20-Nov-2000 -; -; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) { -; ULONG c = 0; -; int i; -; for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c; -; return(c); -; } - -r=4 ;(AP) r by reference (output) -a=8 ;(AP) a by reference (input) -b=12 ;(AP) b by reference (input) -n=16 ;(AP) n by value (input) - - - .psect code,nowrt - -.entry bn_sub_words,^m<r2,r3,r4,r5,r6> - - moval @r(ap),r2 - moval @a(ap),r3 - moval @b(ap),r4 - movl n(ap),r5 ; assumed >0 by C code - clrl r0 ; c - - tstl r5 ; carry = 0 - bleq 666$ - -0$: - movl (r3)+,r6 ; carry untouched - sbwc (r4)+,r6 ; carry used and touched - movl r6,(r2)+ ; carry untouched - sobgtr r5,0$ ; carry untouched - - adwc #0,r0 -666$: - ret - - -;r=4 ;(AP) -;a=8 ;(AP) -;b=12 ;(AP) -;n=16 ;(AP) n by value (input) - - .psect code,nowrt - -.entry BN_MUL_COMBA8,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11> - movab -924(sp),sp - clrq r8 - - clrl r10 - - movl 8(ap),r6 - movzwl 2(r6),r3 - movl 12(ap),r7 - bicl3 #-65536,(r7),r2 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,(r6),-12(fp) - bicl3 #-65536,r3,-16(fp) - mull3 r0,-12(fp),-4(fp) - mull2 r2,-12(fp) - mull3 r2,-16(fp),-8(fp) - mull2 r0,-16(fp) - addl3 -4(fp),-8(fp),r0 - bicl3 #0,r0,-4(fp) - cmpl -4(fp),-8(fp) - bgequ noname.45 - addl2 #65536,-16(fp) -noname.45: - movzwl -2(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-16(fp) - bicl3 #-65536,-4(fp),r0 - ashl #16,r0,-8(fp) - addl3 -8(fp),-12(fp),r0 - bicl3 #0,r0,-12(fp) - cmpl -12(fp),-8(fp) - bgequ noname.46 - incl -16(fp) -noname.46: - movl -12(fp),r1 - movl -16(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.47 - incl r2 -noname.47: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.48 - incl r10 -noname.48: - - movl 4(ap),r11 - movl r9,(r11) - - clrl r9 - - movzwl 2(r6),r2 - bicl3 #-65536,4(r7),r3 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,(r6),-28(fp) - bicl3 #-65536,r2,-32(fp) - mull3 r0,-28(fp),-20(fp) - mull2 r3,-28(fp) - mull3 r3,-32(fp),-24(fp) - mull2 r0,-32(fp) - addl3 -20(fp),-24(fp),r0 - bicl3 #0,r0,-20(fp) - cmpl -20(fp),-24(fp) - bgequ noname.49 - addl2 #65536,-32(fp) -noname.49: - movzwl -18(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-32(fp) - bicl3 #-65536,-20(fp),r0 - ashl #16,r0,-24(fp) - addl3 -24(fp),-28(fp),r0 - bicl3 #0,r0,-28(fp) - cmpl -28(fp),-24(fp) - bgequ noname.50 - incl -32(fp) -noname.50: - movl -28(fp),r1 - movl -32(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.51 - incl r2 -noname.51: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.52 - incl r9 -noname.52: - - movzwl 6(r6),r2 - bicl3 #-65536,(r7),r3 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-44(fp) - bicl3 #-65536,r2,-48(fp) - mull3 r0,-44(fp),-36(fp) - mull2 r3,-44(fp) - mull3 r3,-48(fp),-40(fp) - mull2 r0,-48(fp) - addl3 -36(fp),-40(fp),r0 - bicl3 #0,r0,-36(fp) - cmpl -36(fp),-40(fp) - bgequ noname.53 - addl2 #65536,-48(fp) -noname.53: - movzwl -34(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-48(fp) - bicl3 #-65536,-36(fp),r0 - ashl #16,r0,-40(fp) - addl3 -40(fp),-44(fp),r0 - bicl3 #0,r0,-44(fp) - cmpl -44(fp),-40(fp) - bgequ noname.54 - incl -48(fp) -noname.54: - movl -44(fp),r1 - movl -48(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.55 - incl r2 -noname.55: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.56 - incl r9 -noname.56: - - movl r8,4(r11) - - clrl r8 - - movzwl 10(r6),r2 - bicl3 #-65536,(r7),r3 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r6),-60(fp) - bicl3 #-65536,r2,-64(fp) - mull3 r0,-60(fp),-52(fp) - mull2 r3,-60(fp) - mull3 r3,-64(fp),-56(fp) - mull2 r0,-64(fp) - addl3 -52(fp),-56(fp),r0 - bicl3 #0,r0,-52(fp) - cmpl -52(fp),-56(fp) - bgequ noname.57 - addl2 #65536,-64(fp) -noname.57: - movzwl -50(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-64(fp) - bicl3 #-65536,-52(fp),r0 - ashl #16,r0,-56(fp) - addl3 -56(fp),-60(fp),r0 - bicl3 #0,r0,-60(fp) - cmpl -60(fp),-56(fp) - bgequ noname.58 - incl -64(fp) -noname.58: - movl -60(fp),r1 - movl -64(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.59 - incl r2 -noname.59: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.60 - incl r8 -noname.60: - - movzwl 6(r6),r2 - bicl3 #-65536,4(r7),r3 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-76(fp) - bicl3 #-65536,r2,-80(fp) - mull3 r0,-76(fp),-68(fp) - mull2 r3,-76(fp) - mull3 r3,-80(fp),-72(fp) - mull2 r0,-80(fp) - addl3 -68(fp),-72(fp),r0 - bicl3 #0,r0,-68(fp) - cmpl -68(fp),-72(fp) - bgequ noname.61 - addl2 #65536,-80(fp) -noname.61: - movzwl -66(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-80(fp) - bicl3 #-65536,-68(fp),r0 - ashl #16,r0,-72(fp) - addl3 -72(fp),-76(fp),r0 - bicl3 #0,r0,-76(fp) - cmpl -76(fp),-72(fp) - bgequ noname.62 - incl -80(fp) -noname.62: - movl -76(fp),r1 - movl -80(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.63 - incl r2 -noname.63: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.64 - incl r8 -noname.64: - - movzwl 2(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,(r6),-92(fp) - bicl3 #-65536,r2,-96(fp) - mull3 r0,-92(fp),-84(fp) - mull2 r3,-92(fp) - mull3 r3,-96(fp),-88(fp) - mull2 r0,-96(fp) - addl3 -84(fp),-88(fp),r0 - bicl3 #0,r0,-84(fp) - cmpl -84(fp),-88(fp) - bgequ noname.65 - addl2 #65536,-96(fp) -noname.65: - movzwl -82(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-96(fp) - bicl3 #-65536,-84(fp),r0 - ashl #16,r0,-88(fp) - addl3 -88(fp),-92(fp),r0 - bicl3 #0,r0,-92(fp) - cmpl -92(fp),-88(fp) - bgequ noname.66 - incl -96(fp) -noname.66: - movl -92(fp),r1 - movl -96(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.67 - incl r2 -noname.67: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.68 - incl r8 -noname.68: - - movl r10,8(r11) - - clrl r10 - - movzwl 2(r6),r2 - bicl3 #-65536,12(r7),r3 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,(r6),-108(fp) - bicl3 #-65536,r2,-112(fp) - mull3 r0,-108(fp),-100(fp) - mull2 r3,-108(fp) - mull3 r3,-112(fp),-104(fp) - mull2 r0,-112(fp) - addl3 -100(fp),-104(fp),r0 - bicl3 #0,r0,-100(fp) - cmpl -100(fp),-104(fp) - bgequ noname.69 - addl2 #65536,-112(fp) -noname.69: - movzwl -98(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-112(fp) - bicl3 #-65536,-100(fp),r0 - ashl #16,r0,-104(fp) - addl3 -104(fp),-108(fp),r0 - bicl3 #0,r0,-108(fp) - cmpl -108(fp),-104(fp) - bgequ noname.70 - incl -112(fp) -noname.70: - movl -108(fp),r1 - movl -112(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.71 - incl r2 -noname.71: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.72 - incl r10 -noname.72: - - movzwl 6(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-124(fp) - bicl3 #-65536,r2,-128(fp) - mull3 r0,-124(fp),-116(fp) - mull2 r3,-124(fp) - mull3 r3,-128(fp),-120(fp) - mull2 r0,-128(fp) - addl3 -116(fp),-120(fp),r0 - bicl3 #0,r0,-116(fp) - cmpl -116(fp),-120(fp) - bgequ noname.73 - addl2 #65536,-128(fp) -noname.73: - movzwl -114(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-128(fp) - bicl3 #-65536,-116(fp),r0 - ashl #16,r0,-120(fp) - addl3 -120(fp),-124(fp),r0 - bicl3 #0,r0,-124(fp) - cmpl -124(fp),-120(fp) - bgequ noname.74 - incl -128(fp) -noname.74: - movl -124(fp),r1 - movl -128(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.75 - incl r2 -noname.75: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.76 - incl r10 -noname.76: - - movzwl 10(r6),r2 - bicl3 #-65536,4(r7),r3 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r6),-140(fp) - bicl3 #-65536,r2,-144(fp) - mull3 r0,-140(fp),-132(fp) - mull2 r3,-140(fp) - mull3 r3,-144(fp),-136(fp) - mull2 r0,-144(fp) - addl3 -132(fp),-136(fp),r0 - bicl3 #0,r0,-132(fp) - cmpl -132(fp),-136(fp) - bgequ noname.77 - addl2 #65536,-144(fp) -noname.77: - movzwl -130(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-144(fp) - bicl3 #-65536,-132(fp),r0 - ashl #16,r0,-136(fp) - addl3 -136(fp),-140(fp),r0 - bicl3 #0,r0,-140(fp) - cmpl -140(fp),-136(fp) - bgequ noname.78 - incl -144(fp) -noname.78: - movl -140(fp),r1 - movl -144(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.79 - incl r2 -noname.79: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.80 - incl r10 -noname.80: - - movzwl 14(r6),r2 - bicl3 #-65536,(r7),r3 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-156(fp) - bicl3 #-65536,r2,-160(fp) - mull3 r0,-156(fp),-148(fp) - mull2 r3,-156(fp) - mull3 r3,-160(fp),-152(fp) - mull2 r0,-160(fp) - addl3 -148(fp),-152(fp),r0 - bicl3 #0,r0,-148(fp) - cmpl -148(fp),-152(fp) - bgequ noname.81 - addl2 #65536,-160(fp) -noname.81: - movzwl -146(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-160(fp) - bicl3 #-65536,-148(fp),r0 - ashl #16,r0,-152(fp) - addl3 -152(fp),-156(fp),r0 - bicl3 #0,r0,-156(fp) - cmpl -156(fp),-152(fp) - bgequ noname.82 - incl -160(fp) -noname.82: - movl -156(fp),r1 - movl -160(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.83 - incl r2 -noname.83: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.84 - incl r10 -noname.84: - - movl r9,12(r11) - - clrl r9 - - movzwl 18(r6),r2 - bicl3 #-65536,(r7),r3 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,16(r6),-172(fp) - bicl3 #-65536,r2,-176(fp) - mull3 r0,-172(fp),-164(fp) - mull2 r3,-172(fp) - mull3 r3,-176(fp),-168(fp) - mull2 r0,-176(fp) - addl3 -164(fp),-168(fp),r0 - bicl3 #0,r0,-164(fp) - cmpl -164(fp),-168(fp) - bgequ noname.85 - addl2 #65536,-176(fp) -noname.85: - movzwl -162(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-176(fp) - bicl3 #-65536,-164(fp),r0 - ashl #16,r0,-168(fp) - addl3 -168(fp),-172(fp),r0 - bicl3 #0,r0,-172(fp) - cmpl -172(fp),-168(fp) - bgequ noname.86 - incl -176(fp) -noname.86: - movl -172(fp),r1 - movl -176(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.87 - incl r2 -noname.87: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.88 - incl r9 -noname.88: - - movzwl 14(r6),r2 - bicl3 #-65536,4(r7),r3 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-188(fp) - bicl3 #-65536,r2,-192(fp) - mull3 r0,-188(fp),-180(fp) - mull2 r3,-188(fp) - mull3 r3,-192(fp),-184(fp) - mull2 r0,-192(fp) - addl3 -180(fp),-184(fp),r0 - bicl3 #0,r0,-180(fp) - cmpl -180(fp),-184(fp) - bgequ noname.89 - addl2 #65536,-192(fp) -noname.89: - movzwl -178(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-192(fp) - bicl3 #-65536,-180(fp),r0 - ashl #16,r0,-184(fp) - addl3 -184(fp),-188(fp),r0 - bicl3 #0,r0,-188(fp) - cmpl -188(fp),-184(fp) - bgequ noname.90 - incl -192(fp) -noname.90: - movl -188(fp),r1 - movl -192(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.91 - incl r2 -noname.91: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.92 - incl r9 -noname.92: - - movzwl 10(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r6),-204(fp) - bicl3 #-65536,r2,-208(fp) - mull3 r0,-204(fp),-196(fp) - mull2 r3,-204(fp) - mull3 r3,-208(fp),-200(fp) - mull2 r0,-208(fp) - addl3 -196(fp),-200(fp),r0 - bicl3 #0,r0,-196(fp) - cmpl -196(fp),-200(fp) - bgequ noname.93 - addl2 #65536,-208(fp) -noname.93: - movzwl -194(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-208(fp) - bicl3 #-65536,-196(fp),r0 - ashl #16,r0,-200(fp) - addl3 -200(fp),-204(fp),r0 - bicl3 #0,r0,-204(fp) - cmpl -204(fp),-200(fp) - bgequ noname.94 - incl -208(fp) -noname.94: - movl -204(fp),r1 - movl -208(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.95 - incl r2 -noname.95: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.96 - incl r9 -noname.96: - - movzwl 6(r6),r2 - bicl3 #-65536,12(r7),r3 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-220(fp) - bicl3 #-65536,r2,-224(fp) - mull3 r0,-220(fp),-212(fp) - mull2 r3,-220(fp) - mull3 r3,-224(fp),-216(fp) - mull2 r0,-224(fp) - addl3 -212(fp),-216(fp),r0 - bicl3 #0,r0,-212(fp) - cmpl -212(fp),-216(fp) - bgequ noname.97 - addl2 #65536,-224(fp) -noname.97: - movzwl -210(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-224(fp) - bicl3 #-65536,-212(fp),r0 - ashl #16,r0,-216(fp) - addl3 -216(fp),-220(fp),r0 - bicl3 #0,r0,-220(fp) - cmpl -220(fp),-216(fp) - bgequ noname.98 - incl -224(fp) -noname.98: - movl -220(fp),r1 - movl -224(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.99 - incl r2 -noname.99: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.100 - incl r9 -noname.100: - - movzwl 2(r6),r2 - bicl3 #-65536,16(r7),r3 - movzwl 18(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,(r6),-236(fp) - bicl3 #-65536,r2,-240(fp) - mull3 r0,-236(fp),-228(fp) - mull2 r3,-236(fp) - mull3 r3,-240(fp),-232(fp) - mull2 r0,-240(fp) - addl3 -228(fp),-232(fp),r0 - bicl3 #0,r0,-228(fp) - cmpl -228(fp),-232(fp) - bgequ noname.101 - addl2 #65536,-240(fp) -noname.101: - movzwl -226(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-240(fp) - bicl3 #-65536,-228(fp),r0 - ashl #16,r0,-232(fp) - addl3 -232(fp),-236(fp),r0 - bicl3 #0,r0,-236(fp) - cmpl -236(fp),-232(fp) - bgequ noname.102 - incl -240(fp) -noname.102: - movl -236(fp),r1 - movl -240(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.103 - incl r2 -noname.103: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.104 - incl r9 -noname.104: - - movl r8,16(r11) - - clrl r8 - - movzwl 2(r6),r2 - bicl3 #-65536,20(r7),r3 - movzwl 22(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,(r6),-252(fp) - bicl3 #-65536,r2,-256(fp) - mull3 r0,-252(fp),-244(fp) - mull2 r3,-252(fp) - mull3 r3,-256(fp),-248(fp) - mull2 r0,-256(fp) - addl3 -244(fp),-248(fp),r0 - bicl3 #0,r0,-244(fp) - cmpl -244(fp),-248(fp) - bgequ noname.105 - addl2 #65536,-256(fp) -noname.105: - movzwl -242(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-256(fp) - bicl3 #-65536,-244(fp),r0 - ashl #16,r0,-248(fp) - addl3 -248(fp),-252(fp),r0 - bicl3 #0,r0,-252(fp) - cmpl -252(fp),-248(fp) - bgequ noname.106 - incl -256(fp) -noname.106: - movl -252(fp),r1 - movl -256(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.107 - incl r2 -noname.107: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.108 - incl r8 -noname.108: - - movzwl 6(r6),r2 - bicl3 #-65536,16(r7),r3 - movzwl 18(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-268(fp) - bicl3 #-65536,r2,-272(fp) - mull3 r0,-268(fp),-260(fp) - mull2 r3,-268(fp) - mull3 r3,-272(fp),-264(fp) - mull2 r0,-272(fp) - addl3 -260(fp),-264(fp),r0 - bicl3 #0,r0,-260(fp) - cmpl -260(fp),-264(fp) - bgequ noname.109 - addl2 #65536,-272(fp) -noname.109: - movzwl -258(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-272(fp) - bicl3 #-65536,-260(fp),r0 - ashl #16,r0,-264(fp) - addl3 -264(fp),-268(fp),r0 - bicl3 #0,r0,-268(fp) - cmpl -268(fp),-264(fp) - bgequ noname.110 - incl -272(fp) -noname.110: - movl -268(fp),r1 - movl -272(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.111 - incl r2 -noname.111: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.112 - incl r8 -noname.112: - - movzwl 10(r6),r2 - bicl3 #-65536,12(r7),r3 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r6),-284(fp) - bicl3 #-65536,r2,-288(fp) - mull3 r0,-284(fp),-276(fp) - mull2 r3,-284(fp) - mull3 r3,-288(fp),-280(fp) - mull2 r0,-288(fp) - addl3 -276(fp),-280(fp),r0 - bicl3 #0,r0,-276(fp) - cmpl -276(fp),-280(fp) - bgequ noname.113 - addl2 #65536,-288(fp) -noname.113: - movzwl -274(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-288(fp) - bicl3 #-65536,-276(fp),r0 - ashl #16,r0,-280(fp) - addl3 -280(fp),-284(fp),r0 - bicl3 #0,r0,-284(fp) - cmpl -284(fp),-280(fp) - bgequ noname.114 - incl -288(fp) -noname.114: - movl -284(fp),r1 - movl -288(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.115 - incl r2 -noname.115: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.116 - incl r8 -noname.116: - - movzwl 14(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-300(fp) - bicl3 #-65536,r2,-304(fp) - mull3 r0,-300(fp),-292(fp) - mull2 r3,-300(fp) - mull3 r3,-304(fp),-296(fp) - mull2 r0,-304(fp) - addl3 -292(fp),-296(fp),r0 - bicl3 #0,r0,-292(fp) - cmpl -292(fp),-296(fp) - bgequ noname.117 - addl2 #65536,-304(fp) -noname.117: - movzwl -290(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-304(fp) - bicl3 #-65536,-292(fp),r0 - ashl #16,r0,-296(fp) - addl3 -296(fp),-300(fp),r0 - bicl3 #0,r0,-300(fp) - cmpl -300(fp),-296(fp) - bgequ noname.118 - incl -304(fp) -noname.118: - movl -300(fp),r1 - movl -304(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.119 - incl r2 -noname.119: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.120 - incl r8 -noname.120: - - movzwl 18(r6),r2 - bicl3 #-65536,4(r7),r3 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,16(r6),-316(fp) - bicl3 #-65536,r2,-320(fp) - mull3 r0,-316(fp),-308(fp) - mull2 r3,-316(fp) - mull3 r3,-320(fp),-312(fp) - mull2 r0,-320(fp) - addl3 -308(fp),-312(fp),r0 - bicl3 #0,r0,-308(fp) - cmpl -308(fp),-312(fp) - bgequ noname.121 - addl2 #65536,-320(fp) -noname.121: - movzwl -306(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-320(fp) - bicl3 #-65536,-308(fp),r0 - ashl #16,r0,-312(fp) - addl3 -312(fp),-316(fp),r0 - bicl3 #0,r0,-316(fp) - cmpl -316(fp),-312(fp) - bgequ noname.122 - incl -320(fp) -noname.122: - movl -316(fp),r1 - movl -320(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.123 - incl r2 - -noname.123: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.124 - incl r8 -noname.124: - - movzwl 22(r6),r2 - bicl3 #-65536,(r7),r3 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,20(r6),-332(fp) - bicl3 #-65536,r2,-336(fp) - mull3 r0,-332(fp),-324(fp) - mull2 r3,-332(fp) - mull3 r3,-336(fp),-328(fp) - mull2 r0,-336(fp) - addl3 -324(fp),-328(fp),r0 - bicl3 #0,r0,-324(fp) - cmpl -324(fp),-328(fp) - bgequ noname.125 - addl2 #65536,-336(fp) -noname.125: - movzwl -322(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-336(fp) - bicl3 #-65536,-324(fp),r0 - ashl #16,r0,-328(fp) - addl3 -328(fp),-332(fp),r0 - bicl3 #0,r0,-332(fp) - cmpl -332(fp),-328(fp) - bgequ noname.126 - incl -336(fp) -noname.126: - movl -332(fp),r1 - movl -336(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.127 - incl r2 -noname.127: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.128 - incl r8 -noname.128: - - movl r10,20(r11) - - clrl r10 - - movzwl 26(r6),r2 - bicl3 #-65536,(r7),r3 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,24(r6),-348(fp) - bicl3 #-65536,r2,-352(fp) - mull3 r0,-348(fp),-340(fp) - mull2 r3,-348(fp) - mull3 r3,-352(fp),-344(fp) - mull2 r0,-352(fp) - addl3 -340(fp),-344(fp),r0 - bicl3 #0,r0,-340(fp) - cmpl -340(fp),-344(fp) - bgequ noname.129 - addl2 #65536,-352(fp) -noname.129: - movzwl -338(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-352(fp) - bicl3 #-65536,-340(fp),r0 - ashl #16,r0,-344(fp) - addl3 -344(fp),-348(fp),r0 - bicl3 #0,r0,-348(fp) - cmpl -348(fp),-344(fp) - bgequ noname.130 - incl -352(fp) -noname.130: - movl -348(fp),r1 - movl -352(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.131 - incl r2 -noname.131: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.132 - incl r10 -noname.132: - - movzwl 22(r6),r2 - bicl3 #-65536,4(r7),r3 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,20(r6),-364(fp) - bicl3 #-65536,r2,-368(fp) - mull3 r0,-364(fp),-356(fp) - mull2 r3,-364(fp) - mull3 r3,-368(fp),-360(fp) - mull2 r0,-368(fp) - addl3 -356(fp),-360(fp),r0 - bicl3 #0,r0,-356(fp) - cmpl -356(fp),-360(fp) - bgequ noname.133 - addl2 #65536,-368(fp) -noname.133: - movzwl -354(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-368(fp) - bicl3 #-65536,-356(fp),r0 - ashl #16,r0,-360(fp) - addl3 -360(fp),-364(fp),r0 - bicl3 #0,r0,-364(fp) - cmpl -364(fp),-360(fp) - bgequ noname.134 - incl -368(fp) -noname.134: - movl -364(fp),r1 - movl -368(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.135 - incl r2 -noname.135: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.136 - incl r10 -noname.136: - - movzwl 18(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,16(r6),-380(fp) - bicl3 #-65536,r2,-384(fp) - mull3 r0,-380(fp),-372(fp) - mull2 r3,-380(fp) - mull3 r3,-384(fp),-376(fp) - mull2 r0,-384(fp) - addl3 -372(fp),-376(fp),r0 - bicl3 #0,r0,-372(fp) - cmpl -372(fp),-376(fp) - bgequ noname.137 - addl2 #65536,-384(fp) -noname.137: - movzwl -370(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-384(fp) - bicl3 #-65536,-372(fp),r0 - ashl #16,r0,-376(fp) - addl3 -376(fp),-380(fp),r0 - bicl3 #0,r0,-380(fp) - cmpl -380(fp),-376(fp) - bgequ noname.138 - incl -384(fp) -noname.138: - movl -380(fp),r1 - movl -384(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.139 - incl r2 -noname.139: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.140 - incl r10 -noname.140: - - movzwl 14(r6),r2 - bicl3 #-65536,12(r7),r3 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-396(fp) - bicl3 #-65536,r2,-400(fp) - mull3 r0,-396(fp),-388(fp) - mull2 r3,-396(fp) - mull3 r3,-400(fp),-392(fp) - mull2 r0,-400(fp) - addl3 -388(fp),-392(fp),r0 - bicl3 #0,r0,-388(fp) - cmpl -388(fp),-392(fp) - bgequ noname.141 - addl2 #65536,-400(fp) -noname.141: - movzwl -386(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-400(fp) - bicl3 #-65536,-388(fp),r0 - ashl #16,r0,-392(fp) - addl3 -392(fp),-396(fp),r0 - bicl3 #0,r0,-396(fp) - cmpl -396(fp),-392(fp) - bgequ noname.142 - incl -400(fp) -noname.142: - movl -396(fp),r1 - movl -400(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.143 - incl r2 -noname.143: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.144 - incl r10 -noname.144: - - movzwl 10(r6),r2 - bicl3 #-65536,16(r7),r3 - movzwl 18(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r6),-412(fp) - bicl3 #-65536,r2,-416(fp) - mull3 r0,-412(fp),-404(fp) - mull2 r3,-412(fp) - mull3 r3,-416(fp),-408(fp) - mull2 r0,-416(fp) - addl3 -404(fp),-408(fp),r0 - bicl3 #0,r0,-404(fp) - cmpl -404(fp),-408(fp) - bgequ noname.145 - addl2 #65536,-416(fp) -noname.145: - movzwl -402(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-416(fp) - bicl3 #-65536,-404(fp),r0 - ashl #16,r0,-408(fp) - addl3 -408(fp),-412(fp),r0 - bicl3 #0,r0,-412(fp) - cmpl -412(fp),-408(fp) - bgequ noname.146 - incl -416(fp) -noname.146: - movl -412(fp),r1 - movl -416(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.147 - incl r2 -noname.147: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.148 - incl r10 -noname.148: - - movzwl 6(r6),r2 - bicl3 #-65536,20(r7),r3 - movzwl 22(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-428(fp) - bicl3 #-65536,r2,-432(fp) - mull3 r0,-428(fp),-420(fp) - mull2 r3,-428(fp) - mull3 r3,-432(fp),-424(fp) - mull2 r0,-432(fp) - addl3 -420(fp),-424(fp),r0 - bicl3 #0,r0,-420(fp) - cmpl -420(fp),-424(fp) - bgequ noname.149 - addl2 #65536,-432(fp) -noname.149: - movzwl -418(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-432(fp) - bicl3 #-65536,-420(fp),r0 - ashl #16,r0,-424(fp) - addl3 -424(fp),-428(fp),r0 - bicl3 #0,r0,-428(fp) - cmpl -428(fp),-424(fp) - bgequ noname.150 - incl -432(fp) -noname.150: - movl -428(fp),r1 - movl -432(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.151 - incl r2 -noname.151: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.152 - incl r10 -noname.152: - - movzwl 2(r6),r2 - bicl3 #-65536,24(r7),r3 - movzwl 26(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,(r6),-444(fp) - bicl3 #-65536,r2,-448(fp) - mull3 r0,-444(fp),-436(fp) - mull2 r3,-444(fp) - mull3 r3,-448(fp),-440(fp) - mull2 r0,-448(fp) - addl3 -436(fp),-440(fp),r0 - bicl3 #0,r0,-436(fp) - cmpl -436(fp),-440(fp) - bgequ noname.153 - addl2 #65536,-448(fp) -noname.153: - movzwl -434(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-448(fp) - bicl3 #-65536,-436(fp),r0 - ashl #16,r0,-440(fp) - addl3 -440(fp),-444(fp),r0 - bicl3 #0,r0,-444(fp) - cmpl -444(fp),-440(fp) - bgequ noname.154 - incl -448(fp) -noname.154: - movl -444(fp),r1 - movl -448(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.155 - incl r2 -noname.155: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.156 - incl r10 -noname.156: - - movl r9,24(r11) - - clrl r9 - - movzwl 2(r6),r2 - bicl3 #-65536,28(r7),r3 - movzwl 30(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,(r6),-460(fp) - bicl3 #-65536,r2,-464(fp) - mull3 r0,-460(fp),-452(fp) - mull2 r3,-460(fp) - mull3 r3,-464(fp),-456(fp) - mull2 r0,-464(fp) - addl3 -452(fp),-456(fp),r0 - bicl3 #0,r0,-452(fp) - cmpl -452(fp),-456(fp) - bgequ noname.157 - addl2 #65536,-464(fp) -noname.157: - movzwl -450(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-464(fp) - bicl3 #-65536,-452(fp),r0 - ashl #16,r0,-456(fp) - addl3 -456(fp),-460(fp),r0 - bicl3 #0,r0,-460(fp) - cmpl -460(fp),-456(fp) - bgequ noname.158 - incl -464(fp) -noname.158: - movl -460(fp),r1 - movl -464(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.159 - incl r2 -noname.159: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.160 - incl r9 -noname.160: - - movzwl 6(r6),r2 - bicl3 #-65536,24(r7),r3 - movzwl 26(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-476(fp) - bicl3 #-65536,r2,-480(fp) - mull3 r0,-476(fp),-468(fp) - mull2 r3,-476(fp) - mull3 r3,-480(fp),-472(fp) - mull2 r0,-480(fp) - addl3 -468(fp),-472(fp),r0 - bicl3 #0,r0,-468(fp) - cmpl -468(fp),-472(fp) - bgequ noname.161 - addl2 #65536,-480(fp) -noname.161: - movzwl -466(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-480(fp) - bicl3 #-65536,-468(fp),r0 - ashl #16,r0,-472(fp) - addl3 -472(fp),-476(fp),r0 - bicl3 #0,r0,-476(fp) - cmpl -476(fp),-472(fp) - bgequ noname.162 - incl -480(fp) -noname.162: - movl -476(fp),r1 - movl -480(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.163 - incl r2 -noname.163: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.164 - incl r9 -noname.164: - - movzwl 10(r6),r2 - bicl3 #-65536,20(r7),r3 - movzwl 22(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r6),-492(fp) - bicl3 #-65536,r2,-496(fp) - mull3 r0,-492(fp),-484(fp) - mull2 r3,-492(fp) - mull3 r3,-496(fp),-488(fp) - mull2 r0,-496(fp) - addl3 -484(fp),-488(fp),r0 - bicl3 #0,r0,-484(fp) - cmpl -484(fp),-488(fp) - bgequ noname.165 - addl2 #65536,-496(fp) -noname.165: - movzwl -482(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-496(fp) - bicl3 #-65536,-484(fp),r0 - ashl #16,r0,-488(fp) - addl3 -488(fp),-492(fp),r0 - bicl3 #0,r0,-492(fp) - cmpl -492(fp),-488(fp) - bgequ noname.166 - incl -496(fp) -noname.166: - movl -492(fp),r1 - movl -496(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.167 - incl r2 -noname.167: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.168 - incl r9 -noname.168: - - movzwl 14(r6),r2 - bicl3 #-65536,16(r7),r3 - movzwl 18(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-508(fp) - bicl3 #-65536,r2,-512(fp) - mull3 r0,-508(fp),-500(fp) - mull2 r3,-508(fp) - mull3 r3,-512(fp),-504(fp) - mull2 r0,-512(fp) - addl3 -500(fp),-504(fp),r0 - bicl3 #0,r0,-500(fp) - cmpl -500(fp),-504(fp) - bgequ noname.169 - addl2 #65536,-512(fp) -noname.169: - movzwl -498(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-512(fp) - bicl3 #-65536,-500(fp),r0 - ashl #16,r0,-504(fp) - addl3 -504(fp),-508(fp),r0 - bicl3 #0,r0,-508(fp) - cmpl -508(fp),-504(fp) - bgequ noname.170 - incl -512(fp) -noname.170: - movl -508(fp),r1 - movl -512(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.171 - incl r2 -noname.171: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.172 - incl r9 -noname.172: - - movzwl 18(r6),r2 - bicl3 #-65536,12(r7),r3 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,16(r6),-524(fp) - bicl3 #-65536,r2,-528(fp) - mull3 r0,-524(fp),-516(fp) - mull2 r3,-524(fp) - mull3 r3,-528(fp),-520(fp) - mull2 r0,-528(fp) - addl3 -516(fp),-520(fp),r0 - bicl3 #0,r0,-516(fp) - cmpl -516(fp),-520(fp) - bgequ noname.173 - addl2 #65536,-528(fp) -noname.173: - movzwl -514(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-528(fp) - bicl3 #-65536,-516(fp),r0 - ashl #16,r0,-520(fp) - addl3 -520(fp),-524(fp),r0 - bicl3 #0,r0,-524(fp) - cmpl -524(fp),-520(fp) - bgequ noname.174 - incl -528(fp) -noname.174: - movl -524(fp),r1 - movl -528(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.175 - incl r2 -noname.175: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.176 - incl r9 -noname.176: - - movzwl 22(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,20(r6),-540(fp) - bicl3 #-65536,r2,-544(fp) - mull3 r0,-540(fp),-532(fp) - mull2 r3,-540(fp) - mull3 r3,-544(fp),-536(fp) - mull2 r0,-544(fp) - addl3 -532(fp),-536(fp),r0 - bicl3 #0,r0,-532(fp) - cmpl -532(fp),-536(fp) - bgequ noname.177 - addl2 #65536,-544(fp) -noname.177: - movzwl -530(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-544(fp) - bicl3 #-65536,-532(fp),r0 - ashl #16,r0,-536(fp) - addl3 -536(fp),-540(fp),r0 - bicl3 #0,r0,-540(fp) - cmpl -540(fp),-536(fp) - bgequ noname.178 - incl -544(fp) -noname.178: - movl -540(fp),r1 - movl -544(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.179 - incl r2 -noname.179: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.180 - incl r9 -noname.180: - - movzwl 26(r6),r2 - bicl3 #-65536,4(r7),r3 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,24(r6),-556(fp) - bicl3 #-65536,r2,-560(fp) - mull3 r0,-556(fp),-548(fp) - mull2 r3,-556(fp) - mull3 r3,-560(fp),-552(fp) - mull2 r0,-560(fp) - addl3 -548(fp),-552(fp),r0 - bicl3 #0,r0,-548(fp) - cmpl -548(fp),-552(fp) - bgequ noname.181 - addl2 #65536,-560(fp) -noname.181: - movzwl -546(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-560(fp) - bicl3 #-65536,-548(fp),r0 - ashl #16,r0,-552(fp) - addl3 -552(fp),-556(fp),r0 - bicl3 #0,r0,-556(fp) - cmpl -556(fp),-552(fp) - bgequ noname.182 - incl -560(fp) -noname.182: - movl -556(fp),r1 - movl -560(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.183 - incl r2 -noname.183: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.184 - incl r9 -noname.184: - - movzwl 30(r6),r2 - bicl3 #-65536,(r7),r3 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,28(r6),-572(fp) - bicl3 #-65536,r2,-576(fp) - mull3 r0,-572(fp),-564(fp) - mull2 r3,-572(fp) - mull3 r3,-576(fp),-568(fp) - mull2 r0,-576(fp) - addl3 -564(fp),-568(fp),r0 - bicl3 #0,r0,-564(fp) - cmpl -564(fp),-568(fp) - bgequ noname.185 - addl2 #65536,-576(fp) -noname.185: - movzwl -562(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-576(fp) - bicl3 #-65536,-564(fp),r0 - ashl #16,r0,-568(fp) - addl3 -568(fp),-572(fp),r0 - bicl3 #0,r0,-572(fp) - cmpl -572(fp),-568(fp) - bgequ noname.186 - incl -576(fp) -noname.186: - movl -572(fp),r1 - movl -576(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.187 - incl r2 -noname.187: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.188 - incl r9 -noname.188: - - movl r8,28(r11) - - clrl r8 - - movzwl 30(r6),r2 - bicl3 #-65536,4(r7),r3 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,28(r6),-588(fp) - bicl3 #-65536,r2,-592(fp) - mull3 r0,-588(fp),-580(fp) - mull2 r3,-588(fp) - mull3 r3,-592(fp),-584(fp) - mull2 r0,-592(fp) - addl3 -580(fp),-584(fp),r0 - bicl3 #0,r0,-580(fp) - cmpl -580(fp),-584(fp) - bgequ noname.189 - addl2 #65536,-592(fp) -noname.189: - movzwl -578(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-592(fp) - bicl3 #-65536,-580(fp),r0 - ashl #16,r0,-584(fp) - addl3 -584(fp),-588(fp),r0 - bicl3 #0,r0,-588(fp) - cmpl -588(fp),-584(fp) - bgequ noname.190 - incl -592(fp) -noname.190: - movl -588(fp),r1 - movl -592(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.191 - incl r2 -noname.191: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.192 - incl r8 -noname.192: - - movzwl 26(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,24(r6),-604(fp) - bicl3 #-65536,r2,-608(fp) - mull3 r0,-604(fp),-596(fp) - mull2 r3,-604(fp) - mull3 r3,-608(fp),-600(fp) - mull2 r0,-608(fp) - addl3 -596(fp),-600(fp),r0 - bicl3 #0,r0,-596(fp) - cmpl -596(fp),-600(fp) - bgequ noname.193 - addl2 #65536,-608(fp) -noname.193: - movzwl -594(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-608(fp) - bicl3 #-65536,-596(fp),r0 - ashl #16,r0,-600(fp) - addl3 -600(fp),-604(fp),r0 - bicl3 #0,r0,-604(fp) - cmpl -604(fp),-600(fp) - bgequ noname.194 - incl -608(fp) -noname.194: - movl -604(fp),r1 - movl -608(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.195 - incl r2 -noname.195: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.196 - incl r8 -noname.196: - - movzwl 22(r6),r2 - bicl3 #-65536,12(r7),r3 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,20(r6),-620(fp) - bicl3 #-65536,r2,-624(fp) - mull3 r0,-620(fp),-612(fp) - mull2 r3,-620(fp) - mull3 r3,-624(fp),-616(fp) - mull2 r0,-624(fp) - addl3 -612(fp),-616(fp),r0 - bicl3 #0,r0,-612(fp) - cmpl -612(fp),-616(fp) - bgequ noname.197 - addl2 #65536,-624(fp) -noname.197: - movzwl -610(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-624(fp) - bicl3 #-65536,-612(fp),r0 - ashl #16,r0,-616(fp) - addl3 -616(fp),-620(fp),r0 - bicl3 #0,r0,-620(fp) - cmpl -620(fp),-616(fp) - bgequ noname.198 - incl -624(fp) -noname.198: - movl -620(fp),r1 - movl -624(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.199 - incl r2 -noname.199: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.200 - incl r8 -noname.200: - - movzwl 18(r6),r2 - bicl3 #-65536,16(r7),r3 - movzwl 18(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,16(r6),-636(fp) - bicl3 #-65536,r2,-640(fp) - mull3 r0,-636(fp),-628(fp) - mull2 r3,-636(fp) - mull3 r3,-640(fp),-632(fp) - mull2 r0,-640(fp) - addl3 -628(fp),-632(fp),r0 - bicl3 #0,r0,-628(fp) - cmpl -628(fp),-632(fp) - bgequ noname.201 - addl2 #65536,-640(fp) -noname.201: - movzwl -626(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-640(fp) - bicl3 #-65536,-628(fp),r0 - ashl #16,r0,-632(fp) - addl3 -632(fp),-636(fp),r0 - bicl3 #0,r0,-636(fp) - cmpl -636(fp),-632(fp) - bgequ noname.202 - incl -640(fp) -noname.202: - movl -636(fp),r1 - movl -640(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.203 - incl r2 -noname.203: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.204 - incl r8 -noname.204: - - movzwl 14(r6),r2 - bicl3 #-65536,20(r7),r3 - movzwl 22(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-652(fp) - bicl3 #-65536,r2,-656(fp) - mull3 r0,-652(fp),-644(fp) - mull2 r3,-652(fp) - mull3 r3,-656(fp),-648(fp) - mull2 r0,-656(fp) - addl3 -644(fp),-648(fp),r0 - bicl3 #0,r0,-644(fp) - cmpl -644(fp),-648(fp) - bgequ noname.205 - addl2 #65536,-656(fp) -noname.205: - movzwl -642(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-656(fp) - bicl3 #-65536,-644(fp),r0 - ashl #16,r0,-648(fp) - addl3 -648(fp),-652(fp),r0 - bicl3 #0,r0,-652(fp) - cmpl -652(fp),-648(fp) - bgequ noname.206 - incl -656(fp) -noname.206: - movl -652(fp),r1 - movl -656(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.207 - incl r2 -noname.207: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.208 - incl r8 -noname.208: - - movzwl 10(r6),r2 - bicl3 #-65536,24(r7),r3 - movzwl 26(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r6),-668(fp) - bicl3 #-65536,r2,-672(fp) - mull3 r0,-668(fp),-660(fp) - mull2 r3,-668(fp) - mull3 r3,-672(fp),-664(fp) - mull2 r0,-672(fp) - addl3 -660(fp),-664(fp),r0 - bicl3 #0,r0,-660(fp) - cmpl -660(fp),-664(fp) - bgequ noname.209 - addl2 #65536,-672(fp) -noname.209: - movzwl -658(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-672(fp) - bicl3 #-65536,-660(fp),r0 - ashl #16,r0,-664(fp) - addl3 -664(fp),-668(fp),r0 - bicl3 #0,r0,-668(fp) - cmpl -668(fp),-664(fp) - bgequ noname.210 - incl -672(fp) -noname.210: - movl -668(fp),r1 - movl -672(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.211 - incl r2 -noname.211: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.212 - incl r8 -noname.212: - - movzwl 6(r6),r2 - bicl3 #-65536,28(r7),r3 - movzwl 30(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-684(fp) - bicl3 #-65536,r2,-688(fp) - mull3 r0,-684(fp),-676(fp) - mull2 r3,-684(fp) - mull3 r3,-688(fp),-680(fp) - mull2 r0,-688(fp) - addl3 -676(fp),-680(fp),r0 - bicl3 #0,r0,-676(fp) - cmpl -676(fp),-680(fp) - bgequ noname.213 - addl2 #65536,-688(fp) -noname.213: - movzwl -674(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-688(fp) - bicl3 #-65536,-676(fp),r0 - ashl #16,r0,-680(fp) - addl3 -680(fp),-684(fp),r0 - bicl3 #0,r0,-684(fp) - cmpl -684(fp),-680(fp) - bgequ noname.214 - incl -688(fp) -noname.214: - movl -684(fp),r1 - movl -688(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.215 - incl r2 -noname.215: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.216 - incl r8 -noname.216: - - movl r10,32(r11) - - clrl r10 - - movzwl 10(r6),r2 - bicl3 #-65536,28(r7),r3 - movzwl 30(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r6),-700(fp) - bicl3 #-65536,r2,-704(fp) - mull3 r0,-700(fp),-692(fp) - mull2 r3,-700(fp) - mull3 r3,-704(fp),-696(fp) - mull2 r0,-704(fp) - addl3 -692(fp),-696(fp),r0 - bicl3 #0,r0,-692(fp) - cmpl -692(fp),-696(fp) - bgequ noname.217 - addl2 #65536,-704(fp) -noname.217: - movzwl -690(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-704(fp) - bicl3 #-65536,-692(fp),r0 - ashl #16,r0,-696(fp) - addl3 -696(fp),-700(fp),r0 - bicl3 #0,r0,-700(fp) - cmpl -700(fp),-696(fp) - bgequ noname.218 - incl -704(fp) -noname.218: - movl -700(fp),r1 - movl -704(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.219 - incl r2 -noname.219: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.220 - incl r10 -noname.220: - - movzwl 14(r6),r2 - bicl3 #-65536,24(r7),r3 - movzwl 26(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-716(fp) - bicl3 #-65536,r2,-720(fp) - mull3 r0,-716(fp),-708(fp) - mull2 r3,-716(fp) - mull3 r3,-720(fp),-712(fp) - mull2 r0,-720(fp) - addl3 -708(fp),-712(fp),r0 - bicl3 #0,r0,-708(fp) - cmpl -708(fp),-712(fp) - bgequ noname.221 - addl2 #65536,-720(fp) -noname.221: - movzwl -706(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-720(fp) - bicl3 #-65536,-708(fp),r0 - ashl #16,r0,-712(fp) - addl3 -712(fp),-716(fp),r0 - bicl3 #0,r0,-716(fp) - cmpl -716(fp),-712(fp) - bgequ noname.222 - incl -720(fp) -noname.222: - movl -716(fp),r1 - movl -720(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.223 - incl r2 -noname.223: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.224 - incl r10 -noname.224: - - movzwl 18(r6),r2 - bicl3 #-65536,20(r7),r3 - movzwl 22(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,16(r6),-732(fp) - bicl3 #-65536,r2,-736(fp) - mull3 r0,-732(fp),-724(fp) - mull2 r3,-732(fp) - mull3 r3,-736(fp),-728(fp) - mull2 r0,-736(fp) - addl3 -724(fp),-728(fp),r0 - bicl3 #0,r0,-724(fp) - cmpl -724(fp),-728(fp) - bgequ noname.225 - addl2 #65536,-736(fp) -noname.225: - movzwl -722(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-736(fp) - bicl3 #-65536,-724(fp),r0 - ashl #16,r0,-728(fp) - addl3 -728(fp),-732(fp),r0 - bicl3 #0,r0,-732(fp) - cmpl -732(fp),-728(fp) - bgequ noname.226 - incl -736(fp) -noname.226: - movl -732(fp),r1 - movl -736(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.227 - incl r2 -noname.227: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.228 - incl r10 -noname.228: - - movzwl 22(r6),r2 - bicl3 #-65536,16(r7),r3 - movzwl 18(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,20(r6),-748(fp) - bicl3 #-65536,r2,-752(fp) - mull3 r0,-748(fp),-740(fp) - mull2 r3,-748(fp) - mull3 r3,-752(fp),-744(fp) - mull2 r0,-752(fp) - addl3 -740(fp),-744(fp),r0 - bicl3 #0,r0,-740(fp) - cmpl -740(fp),-744(fp) - bgequ noname.229 - addl2 #65536,-752(fp) -noname.229: - movzwl -738(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-752(fp) - bicl3 #-65536,-740(fp),r0 - ashl #16,r0,-744(fp) - addl3 -744(fp),-748(fp),r0 - bicl3 #0,r0,-748(fp) - cmpl -748(fp),-744(fp) - bgequ noname.230 - incl -752(fp) -noname.230: - movl -748(fp),r1 - movl -752(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.231 - incl r2 -noname.231: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.232 - incl r10 -noname.232: - - movzwl 26(r6),r2 - bicl3 #-65536,12(r7),r3 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,24(r6),-764(fp) - bicl3 #-65536,r2,-768(fp) - mull3 r0,-764(fp),-756(fp) - mull2 r3,-764(fp) - mull3 r3,-768(fp),-760(fp) - mull2 r0,-768(fp) - addl3 -756(fp),-760(fp),r0 - bicl3 #0,r0,-756(fp) - cmpl -756(fp),-760(fp) - bgequ noname.233 - addl2 #65536,-768(fp) -noname.233: - movzwl -754(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-768(fp) - bicl3 #-65536,-756(fp),r0 - ashl #16,r0,-760(fp) - addl3 -760(fp),-764(fp),r0 - bicl3 #0,r0,-764(fp) - cmpl -764(fp),-760(fp) - bgequ noname.234 - incl -768(fp) -noname.234: - movl -764(fp),r1 - movl -768(fp),r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.235 - incl r2 -noname.235: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.236 - incl r10 -noname.236: - - bicl3 #-65536,28(r6),r3 - movzwl 30(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,8(r7),r2 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-772(fp) - mull2 r2,r5 - mull3 r2,r4,-776(fp) - mull2 r0,r4 - addl3 -772(fp),-776(fp),r0 - bicl3 #0,r0,-772(fp) - cmpl -772(fp),-776(fp) - bgequ noname.237 - addl2 #65536,r4 -noname.237: - movzwl -770(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-772(fp),r0 - ashl #16,r0,-776(fp) - addl2 -776(fp),r5 - bicl2 #0,r5 - cmpl r5,-776(fp) - bgequ noname.238 - incl r4 -noname.238: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.239 - incl r2 -noname.239: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.240 - incl r10 -noname.240: - - movl r9,36(r11) - - clrl r9 - - bicl3 #-65536,28(r6),r3 - movzwl 30(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,12(r7),r2 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-780(fp) - mull2 r2,r5 - mull3 r2,r4,-784(fp) - mull2 r0,r4 - addl3 -780(fp),-784(fp),r0 - bicl3 #0,r0,-780(fp) - cmpl -780(fp),-784(fp) - bgequ noname.241 - addl2 #65536,r4 -noname.241: - movzwl -778(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-780(fp),r0 - ashl #16,r0,-784(fp) - addl2 -784(fp),r5 - bicl2 #0,r5 - cmpl r5,-784(fp) - bgequ noname.242 - incl r4 -noname.242: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.243 - incl r2 -noname.243: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.244 - incl r9 -noname.244: - - bicl3 #-65536,24(r6),r3 - movzwl 26(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,16(r7),r2 - movzwl 18(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-788(fp) - mull2 r2,r5 - mull3 r2,r4,-792(fp) - mull2 r0,r4 - addl3 -788(fp),-792(fp),r0 - bicl3 #0,r0,-788(fp) - cmpl -788(fp),-792(fp) - bgequ noname.245 - addl2 #65536,r4 -noname.245: - movzwl -786(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-788(fp),r0 - ashl #16,r0,-792(fp) - addl2 -792(fp),r5 - bicl2 #0,r5 - cmpl r5,-792(fp) - bgequ noname.246 - incl r4 -noname.246: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.247 - incl r2 -noname.247: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.248 - incl r9 -noname.248: - - bicl3 #-65536,20(r6),r3 - movzwl 22(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,20(r7),r2 - movzwl 22(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-796(fp) - mull2 r2,r5 - mull3 r2,r4,-800(fp) - mull2 r0,r4 - addl3 -796(fp),-800(fp),r0 - bicl3 #0,r0,-796(fp) - cmpl -796(fp),-800(fp) - bgequ noname.249 - addl2 #65536,r4 -noname.249: - movzwl -794(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-796(fp),r0 - ashl #16,r0,-800(fp) - addl2 -800(fp),r5 - bicl2 #0,r5 - cmpl r5,-800(fp) - bgequ noname.250 - incl r4 -noname.250: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.251 - incl r2 -noname.251: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.252 - incl r9 -noname.252: - - bicl3 #-65536,16(r6),r3 - movzwl 18(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,24(r7),r2 - movzwl 26(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-804(fp) - mull2 r2,r5 - mull3 r2,r4,-808(fp) - mull2 r0,r4 - addl3 -804(fp),-808(fp),r0 - bicl3 #0,r0,-804(fp) - cmpl -804(fp),-808(fp) - bgequ noname.253 - addl2 #65536,r4 -noname.253: - movzwl -802(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-804(fp),r0 - ashl #16,r0,-808(fp) - addl2 -808(fp),r5 - bicl2 #0,r5 - cmpl r5,-808(fp) - bgequ noname.254 - incl r4 -noname.254: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.255 - incl r2 -noname.255: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.256 - incl r9 -noname.256: - - bicl3 #-65536,12(r6),r3 - movzwl 14(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,28(r7),r2 - movzwl 30(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-812(fp) - mull2 r2,r5 - mull3 r2,r4,-816(fp) - mull2 r0,r4 - addl3 -812(fp),-816(fp),r0 - bicl3 #0,r0,-812(fp) - cmpl -812(fp),-816(fp) - bgequ noname.257 - addl2 #65536,r4 -noname.257: - movzwl -810(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-812(fp),r0 - ashl #16,r0,-816(fp) - addl2 -816(fp),r5 - bicl2 #0,r5 - cmpl r5,-816(fp) - bgequ noname.258 - incl r4 -noname.258: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.259 - incl r2 -noname.259: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.260 - incl r9 -noname.260: - - movl r8,40(r11) - - clrl r8 - - bicl3 #-65536,16(r6),r3 - movzwl 18(r6),r2 - bicl3 #-65536,28(r7),r1 - movzwl 30(r7),r0 - bicl2 #-65536,r0 - movl r3,r4 - bicl3 #-65536,r2,-828(fp) - mull3 r0,r4,-820(fp) - mull2 r1,r4 - mull3 r1,-828(fp),-824(fp) - mull2 r0,-828(fp) - addl3 -820(fp),-824(fp),r0 - bicl3 #0,r0,-820(fp) - cmpl -820(fp),-824(fp) - bgequ noname.261 - addl2 #65536,-828(fp) -noname.261: - movzwl -818(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-828(fp) - bicl3 #-65536,-820(fp),r0 - ashl #16,r0,-824(fp) - addl2 -824(fp),r4 - bicl2 #0,r4 - cmpl r4,-824(fp) - bgequ noname.262 - incl -828(fp) -noname.262: - movl r4,r1 - movl -828(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.263 - incl r2 -noname.263: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.264 - incl r8 -noname.264: - - movzwl 22(r6),r2 - bicl3 #-65536,24(r7),r3 - movzwl 26(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,20(r6),-840(fp) - bicl3 #-65536,r2,-844(fp) - mull3 r0,-840(fp),-832(fp) - mull2 r3,-840(fp) - mull3 r3,-844(fp),-836(fp) - mull2 r0,-844(fp) - addl3 -832(fp),-836(fp),r0 - bicl3 #0,r0,-832(fp) - cmpl -832(fp),-836(fp) - bgequ noname.265 - addl2 #65536,-844(fp) -noname.265: - movzwl -830(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-844(fp) - bicl3 #-65536,-832(fp),r0 - ashl #16,r0,-836(fp) - addl3 -836(fp),-840(fp),r0 - bicl3 #0,r0,-840(fp) - cmpl -840(fp),-836(fp) - bgequ noname.266 - incl -844(fp) -noname.266: - movl -840(fp),r1 - movl -844(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.267 - incl r2 -noname.267: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.268 - incl r8 -noname.268: - - bicl3 #-65536,24(r6),r3 - movzwl 26(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,20(r7),r2 - movzwl 22(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-848(fp) - mull2 r2,r5 - mull3 r2,r4,-852(fp) - mull2 r0,r4 - addl3 -848(fp),-852(fp),r0 - bicl3 #0,r0,-848(fp) - cmpl -848(fp),-852(fp) - bgequ noname.269 - addl2 #65536,r4 -noname.269: - movzwl -846(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-848(fp),r0 - ashl #16,r0,-852(fp) - addl2 -852(fp),r5 - bicl2 #0,r5 - cmpl r5,-852(fp) - bgequ noname.270 - incl r4 -noname.270: - movl r5,r1 - movl r4,r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.271 - incl r2 -noname.271: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.272 - incl r8 -noname.272: - - bicl3 #-65536,28(r6),r3 - movzwl 30(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,16(r7),r2 - movzwl 18(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-856(fp) - mull2 r2,r5 - mull3 r2,r4,-860(fp) - mull2 r0,r4 - addl3 -856(fp),-860(fp),r0 - bicl3 #0,r0,-856(fp) - cmpl -856(fp),-860(fp) - bgequ noname.273 - addl2 #65536,r4 -noname.273: - movzwl -854(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-856(fp),r0 - ashl #16,r0,-860(fp) - addl2 -860(fp),r5 - bicl2 #0,r5 - cmpl r5,-860(fp) - bgequ noname.274 - incl r4 -noname.274: - movl r5,r1 - movl r4,r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.275 - incl r2 -noname.275: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.276 - incl r8 -noname.276: - - movl r10,44(r11) - - clrl r10 - - bicl3 #-65536,28(r6),r3 - movzwl 30(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,20(r7),r2 - movzwl 22(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-864(fp) - mull2 r2,r5 - mull3 r2,r4,-868(fp) - mull2 r0,r4 - addl3 -864(fp),-868(fp),r0 - bicl3 #0,r0,-864(fp) - cmpl -864(fp),-868(fp) - bgequ noname.277 - addl2 #65536,r4 -noname.277: - movzwl -862(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-864(fp),r0 - ashl #16,r0,-868(fp) - addl2 -868(fp),r5 - bicl2 #0,r5 - cmpl r5,-868(fp) - bgequ noname.278 - incl r4 -noname.278: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.279 - incl r2 -noname.279: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.280 - incl r10 -noname.280: - - bicl3 #-65536,24(r6),r3 - movzwl 26(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,24(r7),r2 - movzwl 26(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-872(fp) - mull2 r2,r5 - mull3 r2,r4,-876(fp) - mull2 r0,r4 - addl3 -872(fp),-876(fp),r0 - bicl3 #0,r0,-872(fp) - cmpl -872(fp),-876(fp) - bgequ noname.281 - addl2 #65536,r4 -noname.281: - movzwl -870(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-872(fp),r0 - ashl #16,r0,-876(fp) - addl2 -876(fp),r5 - bicl2 #0,r5 - cmpl r5,-876(fp) - bgequ noname.282 - incl r4 -noname.282: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.283 - incl r2 -noname.283: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.284 - incl r10 -noname.284: - - bicl3 #-65536,20(r6),r3 - movzwl 22(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,28(r7),r2 - movzwl 30(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-880(fp) - mull2 r2,r5 - mull3 r2,r4,-884(fp) - mull2 r0,r4 - addl3 -880(fp),-884(fp),r0 - bicl3 #0,r0,-880(fp) - cmpl -880(fp),-884(fp) - bgequ noname.285 - addl2 #65536,r4 -noname.285: - movzwl -878(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-880(fp),r0 - ashl #16,r0,-884(fp) - addl2 -884(fp),r5 - bicl2 #0,r5 - cmpl r5,-884(fp) - bgequ noname.286 - incl r4 -noname.286: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.287 - incl r2 -noname.287: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.288 - incl r10 -noname.288: - - movl r9,48(r11) - - clrl r9 - - bicl3 #-65536,24(r6),r3 - movzwl 26(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,28(r7),r2 - movzwl 30(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-888(fp) - mull2 r2,r5 - mull3 r2,r4,-892(fp) - mull2 r0,r4 - addl3 -888(fp),-892(fp),r0 - bicl3 #0,r0,-888(fp) - cmpl -888(fp),-892(fp) - bgequ noname.289 - addl2 #65536,r4 -noname.289: - movzwl -886(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-888(fp),r0 - ashl #16,r0,-892(fp) - addl2 -892(fp),r5 - bicl2 #0,r5 - cmpl r5,-892(fp) - bgequ noname.290 - incl r4 -noname.290: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.291 - incl r2 -noname.291: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.292 - incl r9 -noname.292: - - movzwl 30(r6),r2 - bicl3 #-65536,24(r7),r3 - movzwl 26(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,28(r6),-904(fp) - bicl3 #-65536,r2,-908(fp) - mull3 r0,-904(fp),-896(fp) - mull2 r3,-904(fp) - mull3 r3,-908(fp),-900(fp) - mull2 r0,-908(fp) - addl3 -896(fp),-900(fp),r0 - bicl3 #0,r0,-896(fp) - cmpl -896(fp),-900(fp) - bgequ noname.293 - addl2 #65536,-908(fp) -noname.293: - movzwl -894(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-908(fp) - bicl3 #-65536,-896(fp),r0 - ashl #16,r0,-900(fp) - addl3 -900(fp),-904(fp),r0 - bicl3 #0,r0,-904(fp) - cmpl -904(fp),-900(fp) - bgequ noname.294 - incl -908(fp) -noname.294: - movl -904(fp),r1 - movl -908(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.295 - incl r2 -noname.295: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.296 - incl r9 -noname.296: - - movl r8,52(r11) - - clrl r8 - - movzwl 30(r6),r2 - bicl3 #-65536,28(r7),r3 - movzwl 30(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,28(r6),-920(fp) - bicl3 #-65536,r2,-924(fp) - mull3 r0,-920(fp),-912(fp) - mull2 r3,-920(fp) - mull3 r3,-924(fp),-916(fp) - mull2 r0,-924(fp) - addl3 -912(fp),-916(fp),r0 - bicl3 #0,r0,-912(fp) - cmpl -912(fp),-916(fp) - bgequ noname.297 - addl2 #65536,-924(fp) -noname.297: - movzwl -910(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-924(fp) - bicl3 #-65536,-912(fp),r0 - ashl #16,r0,-916(fp) - addl3 -916(fp),-920(fp),r0 - bicl3 #0,r0,-920(fp) - cmpl -920(fp),-916(fp) - bgequ noname.298 - incl -924(fp) -noname.298: - movl -920(fp),r1 - movl -924(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.299 - incl r2 -noname.299: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.300 - incl r8 -noname.300: - - movl r10,56(r11) - - movl r9,60(r11) - - ret - - - -;r=4 ;(AP) -;a=8 ;(AP) -;b=12 ;(AP) -;n=16 ;(AP) n by value (input) - - .psect code,nowrt - -.entry BN_MUL_COMBA4,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11> - movab -156(sp),sp - - clrq r9 - - clrl r8 - - movl 8(ap),r6 - bicl3 #-65536,(r6),r3 - movzwl 2(r6),r2 - bicl2 #-65536,r2 - movl 12(ap),r7 - bicl3 #-65536,(r7),r1 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r2,r4 - mull3 r0,r5,-4(fp) - mull2 r1,r5 - mull3 r1,r4,-8(fp) - mull2 r0,r4 - addl3 -4(fp),-8(fp),r0 - bicl3 #0,r0,-4(fp) - cmpl -4(fp),-8(fp) - bgequ noname.303 - addl2 #65536,r4 -noname.303: - movzwl -2(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-4(fp),r0 - ashl #16,r0,-8(fp) - addl2 -8(fp),r5 - bicl2 #0,r5 - cmpl r5,-8(fp) - bgequ noname.304 - incl r4 -noname.304: - movl r5,r1 - movl r4,r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.305 - incl r2 -noname.305: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.306 - incl r8 -noname.306: - - movl 4(ap),r11 - movl r10,(r11) - - clrl r10 - - bicl3 #-65536,(r6),r3 - movzwl 2(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,4(r7),r2 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-12(fp) - mull2 r2,r5 - mull3 r2,r4,-16(fp) - mull2 r0,r4 - addl3 -12(fp),-16(fp),r0 - bicl3 #0,r0,-12(fp) - cmpl -12(fp),-16(fp) - bgequ noname.307 - addl2 #65536,r4 -noname.307: - movzwl -10(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-12(fp),r0 - ashl #16,r0,-16(fp) - addl2 -16(fp),r5 - bicl2 #0,r5 - cmpl r5,-16(fp) - bgequ noname.308 - incl r4 -noname.308: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.309 - incl r2 -noname.309: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.310 - incl r10 -noname.310: - - bicl3 #-65536,4(r6),r3 - movzwl 6(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,(r7),r2 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-20(fp) - mull2 r2,r5 - mull3 r2,r4,-24(fp) - mull2 r0,r4 - addl3 -20(fp),-24(fp),r0 - bicl3 #0,r0,-20(fp) - cmpl -20(fp),-24(fp) - bgequ noname.311 - addl2 #65536,r4 -noname.311: - movzwl -18(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-20(fp),r0 - ashl #16,r0,-24(fp) - addl2 -24(fp),r5 - bicl2 #0,r5 - cmpl r5,-24(fp) - bgequ noname.312 - incl r4 -noname.312: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.313 - incl r2 -noname.313: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.314 - incl r10 -noname.314: - - movl r9,4(r11) - - clrl r9 - - bicl3 #-65536,8(r6),r3 - movzwl 10(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,(r7),r2 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-28(fp) - mull2 r2,r5 - mull3 r2,r4,-32(fp) - mull2 r0,r4 - addl3 -28(fp),-32(fp),r0 - bicl3 #0,r0,-28(fp) - cmpl -28(fp),-32(fp) - bgequ noname.315 - addl2 #65536,r4 -noname.315: - movzwl -26(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-28(fp),r0 - ashl #16,r0,-32(fp) - addl2 -32(fp),r5 - bicl2 #0,r5 - cmpl r5,-32(fp) - bgequ noname.316 - incl r4 -noname.316: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.317 - incl r2 -noname.317: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.318 - incl r9 -noname.318: - - bicl3 #-65536,4(r6),r3 - movzwl 6(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,4(r7),r2 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-36(fp) - mull2 r2,r5 - mull3 r2,r4,-40(fp) - mull2 r0,r4 - addl3 -36(fp),-40(fp),r0 - bicl3 #0,r0,-36(fp) - cmpl -36(fp),-40(fp) - bgequ noname.319 - addl2 #65536,r4 -noname.319: - movzwl -34(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-36(fp),r0 - ashl #16,r0,-40(fp) - addl2 -40(fp),r5 - bicl2 #0,r5 - cmpl r5,-40(fp) - bgequ noname.320 - incl r4 -noname.320: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.321 - incl r2 -noname.321: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.322 - incl r9 -noname.322: - - bicl3 #-65536,(r6),r3 - movzwl 2(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,8(r7),r2 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-44(fp) - mull2 r2,r5 - mull3 r2,r4,-48(fp) - mull2 r0,r4 - addl3 -44(fp),-48(fp),r0 - bicl3 #0,r0,-44(fp) - cmpl -44(fp),-48(fp) - bgequ noname.323 - addl2 #65536,r4 -noname.323: - movzwl -42(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-44(fp),r0 - ashl #16,r0,-48(fp) - addl2 -48(fp),r5 - bicl2 #0,r5 - cmpl r5,-48(fp) - bgequ noname.324 - incl r4 -noname.324: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.325 - incl r2 -noname.325: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.326 - incl r9 -noname.326: - - movl r8,8(r11) - - clrl r8 - - bicl3 #-65536,(r6),r3 - movzwl 2(r6),r2 - bicl3 #-65536,12(r7),r1 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - movl r3,r4 - bicl3 #-65536,r2,-60(fp) - mull3 r0,r4,-52(fp) - mull2 r1,r4 - mull3 r1,-60(fp),-56(fp) - mull2 r0,-60(fp) - addl3 -52(fp),-56(fp),r0 - bicl3 #0,r0,-52(fp) - cmpl -52(fp),-56(fp) - bgequ noname.327 - addl2 #65536,-60(fp) -noname.327: - movzwl -50(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-60(fp) - bicl3 #-65536,-52(fp),r0 - ashl #16,r0,-56(fp) - addl2 -56(fp),r4 - bicl2 #0,r4 - cmpl r4,-56(fp) - bgequ noname.328 - incl -60(fp) -noname.328: - movl r4,r1 - movl -60(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.329 - incl r2 -noname.329: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.330 - incl r8 -noname.330: - - movzwl 6(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r6),-72(fp) - bicl3 #-65536,r2,-76(fp) - mull3 r0,-72(fp),-64(fp) - mull2 r3,-72(fp) - mull3 r3,-76(fp),-68(fp) - mull2 r0,-76(fp) - addl3 -64(fp),-68(fp),r0 - bicl3 #0,r0,-64(fp) - cmpl -64(fp),-68(fp) - bgequ noname.331 - addl2 #65536,-76(fp) -noname.331: - movzwl -62(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-76(fp) - bicl3 #-65536,-64(fp),r0 - ashl #16,r0,-68(fp) - addl3 -68(fp),-72(fp),r0 - bicl3 #0,r0,-72(fp) - cmpl -72(fp),-68(fp) - bgequ noname.332 - incl -76(fp) -noname.332: - movl -72(fp),r1 - movl -76(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.333 - incl r2 -noname.333: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.334 - incl r8 -noname.334: - - bicl3 #-65536,8(r6),r3 - movzwl 10(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,4(r7),r2 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-80(fp) - mull2 r2,r5 - mull3 r2,r4,-84(fp) - mull2 r0,r4 - addl3 -80(fp),-84(fp),r0 - bicl3 #0,r0,-80(fp) - cmpl -80(fp),-84(fp) - bgequ noname.335 - addl2 #65536,r4 -noname.335: - movzwl -78(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-80(fp),r0 - ashl #16,r0,-84(fp) - addl2 -84(fp),r5 - bicl2 #0,r5 - cmpl r5,-84(fp) - bgequ noname.336 - incl r4 -noname.336: - movl r5,r1 - movl r4,r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.337 - incl r2 -noname.337: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.338 - incl r8 -noname.338: - - bicl3 #-65536,12(r6),r3 - movzwl 14(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,(r7),r2 - movzwl 2(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-88(fp) - mull2 r2,r5 - mull3 r2,r4,-92(fp) - mull2 r0,r4 - addl3 -88(fp),-92(fp),r0 - bicl3 #0,r0,-88(fp) - cmpl -88(fp),-92(fp) - bgequ noname.339 - addl2 #65536,r4 -noname.339: - movzwl -86(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-88(fp),r0 - ashl #16,r0,-92(fp) - addl2 -92(fp),r5 - bicl2 #0,r5 - cmpl r5,-92(fp) - bgequ noname.340 - incl r4 -noname.340: - movl r5,r1 - movl r4,r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.341 - incl r2 -noname.341: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.342 - incl r8 -noname.342: - - movl r10,12(r11) - - clrl r10 - - bicl3 #-65536,12(r6),r3 - movzwl 14(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,4(r7),r2 - movzwl 6(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-96(fp) - mull2 r2,r5 - mull3 r2,r4,-100(fp) - mull2 r0,r4 - addl3 -96(fp),-100(fp),r0 - bicl3 #0,r0,-96(fp) - cmpl -96(fp),-100(fp) - bgequ noname.343 - addl2 #65536,r4 -noname.343: - movzwl -94(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-96(fp),r0 - ashl #16,r0,-100(fp) - addl2 -100(fp),r5 - bicl2 #0,r5 - cmpl r5,-100(fp) - bgequ noname.344 - incl r4 -noname.344: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.345 - incl r2 -noname.345: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.346 - incl r10 -noname.346: - - bicl3 #-65536,8(r6),r3 - movzwl 10(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,8(r7),r2 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-104(fp) - mull2 r2,r5 - mull3 r2,r4,-108(fp) - mull2 r0,r4 - addl3 -104(fp),-108(fp),r0 - bicl3 #0,r0,-104(fp) - cmpl -104(fp),-108(fp) - bgequ noname.347 - addl2 #65536,r4 -noname.347: - movzwl -102(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-104(fp),r0 - ashl #16,r0,-108(fp) - addl2 -108(fp),r5 - bicl2 #0,r5 - cmpl r5,-108(fp) - bgequ noname.348 - incl r4 -noname.348: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.349 - incl r2 -noname.349: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.350 - incl r10 -noname.350: - - bicl3 #-65536,4(r6),r3 - movzwl 6(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,12(r7),r2 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-112(fp) - mull2 r2,r5 - mull3 r2,r4,-116(fp) - mull2 r0,r4 - addl3 -112(fp),-116(fp),r0 - bicl3 #0,r0,-112(fp) - cmpl -112(fp),-116(fp) - bgequ noname.351 - addl2 #65536,r4 -noname.351: - movzwl -110(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-112(fp),r0 - ashl #16,r0,-116(fp) - addl2 -116(fp),r5 - bicl2 #0,r5 - cmpl r5,-116(fp) - bgequ noname.352 - incl r4 -noname.352: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.353 - incl r2 -noname.353: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.354 - incl r10 -noname.354: - - movl r9,16(r11) - - clrl r9 - - bicl3 #-65536,8(r6),r3 - movzwl 10(r6),r1 - bicl2 #-65536,r1 - bicl3 #-65536,12(r7),r2 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-120(fp) - mull2 r2,r5 - mull3 r2,r4,-124(fp) - mull2 r0,r4 - addl3 -120(fp),-124(fp),r0 - bicl3 #0,r0,-120(fp) - cmpl -120(fp),-124(fp) - bgequ noname.355 - addl2 #65536,r4 -noname.355: - movzwl -118(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-120(fp),r0 - ashl #16,r0,-124(fp) - addl2 -124(fp),r5 - bicl2 #0,r5 - cmpl r5,-124(fp) - bgequ noname.356 - incl r4 -noname.356: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.357 - incl r2 -noname.357: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.358 - incl r9 -noname.358: - - movzwl 14(r6),r2 - bicl3 #-65536,8(r7),r3 - movzwl 10(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-136(fp) - bicl3 #-65536,r2,-140(fp) - mull3 r0,-136(fp),-128(fp) - mull2 r3,-136(fp) - mull3 r3,-140(fp),-132(fp) - mull2 r0,-140(fp) - addl3 -128(fp),-132(fp),r0 - bicl3 #0,r0,-128(fp) - cmpl -128(fp),-132(fp) - bgequ noname.359 - addl2 #65536,-140(fp) -noname.359: - movzwl -126(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-140(fp) - bicl3 #-65536,-128(fp),r0 - ashl #16,r0,-132(fp) - addl3 -132(fp),-136(fp),r0 - bicl3 #0,r0,-136(fp) - cmpl -136(fp),-132(fp) - bgequ noname.360 - incl -140(fp) -noname.360: - movl -136(fp),r1 - movl -140(fp),r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.361 - incl r2 -noname.361: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.362 - incl r9 -noname.362: - - movl r8,20(r11) - - clrl r8 - - movzwl 14(r6),r2 - bicl3 #-65536,12(r7),r3 - movzwl 14(r7),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r6),-152(fp) - bicl3 #-65536,r2,-156(fp) - mull3 r0,-152(fp),-144(fp) - mull2 r3,-152(fp) - mull3 r3,-156(fp),-148(fp) - mull2 r0,-156(fp) - addl3 -144(fp),-148(fp),r0 - bicl3 #0,r0,-144(fp) - cmpl -144(fp),-148(fp) - bgequ noname.363 - addl2 #65536,-156(fp) -noname.363: - movzwl -142(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-156(fp) - bicl3 #-65536,-144(fp),r0 - ashl #16,r0,-148(fp) - addl3 -148(fp),-152(fp),r0 - bicl3 #0,r0,-152(fp) - cmpl -152(fp),-148(fp) - bgequ noname.364 - incl -156(fp) -noname.364: - movl -152(fp),r1 - movl -156(fp),r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.365 - incl r2 -noname.365: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.366 - incl r8 -noname.366: - - movl r10,24(r11) - - movl r9,28(r11) - - ret - - - -;r=4 ;(AP) -;a=8 ;(AP) -;b=12 ;(AP) -;n=16 ;(AP) n by value (input) - - .psect code,nowrt - -.entry BN_SQR_COMBA8,^m<r2,r3,r4,r5,r6,r7,r8,r9> - movab -444(sp),sp - - clrq r8 - - clrl r7 - - movl 8(ap),r4 - movl (r4),r3 - bicl3 #-65536,r3,-4(fp) - extzv #16,#16,r3,r0 - bicl3 #-65536,r0,r3 - movl -4(fp),r0 - mull3 r0,r3,-8(fp) - mull3 r0,r0,-4(fp) - mull2 r3,r3 - bicl3 #32767,-8(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r3 - bicl3 #-65536,-8(fp),r0 - ashl #17,r0,-8(fp) - addl3 -4(fp),-8(fp),r0 - bicl3 #0,r0,-4(fp) - cmpl -4(fp),-8(fp) - bgequ noname.369 - incl r3 -noname.369: - movl -4(fp),r1 - movl r3,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.370 - incl r2 -noname.370: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.371 - incl r7 -noname.371: - - movl r9,@4(ap) - - clrl r9 - - movzwl 6(r4),r2 - bicl3 #-65536,(r4),r3 - movzwl 2(r4),r0 - bicl2 #-65536,r0 - bicl3 #-65536,4(r4),-20(fp) - bicl3 #-65536,r2,-24(fp) - mull3 r0,-20(fp),-12(fp) - mull2 r3,-20(fp) - mull3 r3,-24(fp),-16(fp) - mull2 r0,-24(fp) - addl3 -12(fp),-16(fp),r0 - bicl3 #0,r0,-12(fp) - cmpl -12(fp),-16(fp) - bgequ noname.372 - addl2 #65536,-24(fp) -noname.372: - movzwl -10(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-24(fp) - bicl3 #-65536,-12(fp),r0 - ashl #16,r0,-16(fp) - addl3 -16(fp),-20(fp),r0 - bicl3 #0,r0,-20(fp) - cmpl -20(fp),-16(fp) - bgequ noname.373 - incl -24(fp) -noname.373: - movl -20(fp),r3 - movl -24(fp),r2 - bbc #31,r2,noname.374 - incl r9 -noname.374: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.375 - incl r2 -noname.375: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.376 - incl r2 - bicl3 #0,r2,r0 - bneq noname.376 - incl r9 -noname.376: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.377 - incl r9 -noname.377: - - movl 4(ap),r0 - movl r8,4(r0) - - clrl r8 - - movl 8(ap),r4 - movl 4(r4),r3 - bicl3 #-65536,r3,-28(fp) - extzv #16,#16,r3,r0 - bicl3 #-65536,r0,r3 - movl -28(fp),r0 - mull3 r0,r3,-32(fp) - mull3 r0,r0,-28(fp) - mull2 r3,r3 - bicl3 #32767,-32(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r3 - bicl3 #-65536,-32(fp),r0 - ashl #17,r0,-32(fp) - addl3 -28(fp),-32(fp),r0 - bicl3 #0,r0,-28(fp) - cmpl -28(fp),-32(fp) - bgequ noname.378 - incl r3 -noname.378: - movl -28(fp),r1 - movl r3,r2 - addl2 r1,r7 - bicl2 #0,r7 - cmpl r7,r1 - bgequ noname.379 - incl r2 -noname.379: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.380 - incl r8 -noname.380: - - movzwl 10(r4),r2 - bicl3 #-65536,(r4),r3 - movzwl 2(r4),r0 - bicl2 #-65536,r0 - bicl3 #-65536,8(r4),-44(fp) - bicl3 #-65536,r2,-48(fp) - mull3 r0,-44(fp),-36(fp) - mull2 r3,-44(fp) - mull3 r3,-48(fp),-40(fp) - mull2 r0,-48(fp) - addl3 -36(fp),-40(fp),r0 - bicl3 #0,r0,-36(fp) - cmpl -36(fp),-40(fp) - bgequ noname.381 - addl2 #65536,-48(fp) -noname.381: - movzwl -34(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-48(fp) - bicl3 #-65536,-36(fp),r0 - ashl #16,r0,-40(fp) - addl3 -40(fp),-44(fp),r0 - bicl3 #0,r0,-44(fp) - cmpl -44(fp),-40(fp) - bgequ noname.382 - incl -48(fp) -noname.382: - movl -44(fp),r3 - movl -48(fp),r2 - bbc #31,r2,noname.383 - incl r8 -noname.383: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.384 - incl r2 -noname.384: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.385 - incl r2 - bicl3 #0,r2,r0 - bneq noname.385 - incl r8 -noname.385: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.386 - incl r8 -noname.386: - - movl 4(ap),r0 - movl r7,8(r0) - - clrl r7 - - movl 8(ap),r0 - movzwl 14(r0),r2 - bicl3 #-65536,(r0),r3 - movzwl 2(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,12(r0),-60(fp) - bicl3 #-65536,r2,-64(fp) - mull3 r1,-60(fp),-52(fp) - mull2 r3,-60(fp) - mull3 r3,-64(fp),-56(fp) - mull2 r1,-64(fp) - addl3 -52(fp),-56(fp),r0 - bicl3 #0,r0,-52(fp) - cmpl -52(fp),-56(fp) - bgequ noname.387 - addl2 #65536,-64(fp) -noname.387: - movzwl -50(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-64(fp) - bicl3 #-65536,-52(fp),r0 - ashl #16,r0,-56(fp) - addl3 -56(fp),-60(fp),r0 - bicl3 #0,r0,-60(fp) - cmpl -60(fp),-56(fp) - bgequ noname.388 - incl -64(fp) -noname.388: - movl -60(fp),r3 - movl -64(fp),r2 - bbc #31,r2,noname.389 - incl r7 -noname.389: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.390 - incl r2 -noname.390: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.391 - incl r2 - bicl3 #0,r2,r0 - bneq noname.391 - incl r7 -noname.391: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.392 - incl r7 -noname.392: - - movl 8(ap),r0 - movzwl 10(r0),r2 - bicl3 #-65536,4(r0),r3 - movzwl 6(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,8(r0),-76(fp) - bicl3 #-65536,r2,-80(fp) - mull3 r1,-76(fp),-68(fp) - mull2 r3,-76(fp) - mull3 r3,-80(fp),-72(fp) - mull2 r1,-80(fp) - addl3 -68(fp),-72(fp),r0 - bicl3 #0,r0,-68(fp) - cmpl -68(fp),-72(fp) - bgequ noname.393 - addl2 #65536,-80(fp) -noname.393: - movzwl -66(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-80(fp) - bicl3 #-65536,-68(fp),r0 - ashl #16,r0,-72(fp) - addl3 -72(fp),-76(fp),r0 - bicl3 #0,r0,-76(fp) - cmpl -76(fp),-72(fp) - bgequ noname.394 - incl -80(fp) -noname.394: - movl -76(fp),r3 - movl -80(fp),r2 - bbc #31,r2,noname.395 - incl r7 -noname.395: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.396 - incl r2 -noname.396: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.397 - incl r2 - bicl3 #0,r2,r0 - bneq noname.397 - incl r7 -noname.397: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.398 - incl r7 -noname.398: - - movl 4(ap),r0 - movl r9,12(r0) - - clrl r9 - - movl 8(ap),r2 - movl 8(r2),r4 - bicl3 #-65536,r4,-84(fp) - extzv #16,#16,r4,r0 - bicl3 #-65536,r0,r4 - movl -84(fp),r0 - mull3 r0,r4,-88(fp) - mull3 r0,r0,-84(fp) - mull2 r4,r4 - bicl3 #32767,-88(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r4 - bicl3 #-65536,-88(fp),r0 - ashl #17,r0,-88(fp) - addl3 -84(fp),-88(fp),r0 - bicl3 #0,r0,-84(fp) - cmpl -84(fp),-88(fp) - bgequ noname.399 - incl r4 -noname.399: - movl -84(fp),r1 - movl r4,r3 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.400 - incl r3 -noname.400: - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.401 - incl r9 -noname.401: - - movzwl 14(r2),r3 - bicl3 #-65536,4(r2),r1 - movzwl 6(r2),r0 - bicl2 #-65536,r0 - bicl3 #-65536,12(r2),-100(fp) - bicl3 #-65536,r3,-104(fp) - mull3 r0,-100(fp),-92(fp) - mull2 r1,-100(fp) - mull3 r1,-104(fp),-96(fp) - mull2 r0,-104(fp) - addl3 -92(fp),-96(fp),r0 - bicl3 #0,r0,-92(fp) - cmpl -92(fp),-96(fp) - bgequ noname.402 - addl2 #65536,-104(fp) -noname.402: - movzwl -90(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-104(fp) - bicl3 #-65536,-92(fp),r0 - ashl #16,r0,-96(fp) - addl3 -96(fp),-100(fp),r0 - bicl3 #0,r0,-100(fp) - cmpl -100(fp),-96(fp) - bgequ noname.403 - incl -104(fp) -noname.403: - movl -100(fp),r3 - movl -104(fp),r2 - bbc #31,r2,noname.404 - incl r9 -noname.404: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.405 - incl r2 -noname.405: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.406 - incl r2 - bicl3 #0,r2,r0 - bneq noname.406 - incl r9 -noname.406: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.407 - incl r9 -noname.407: - - movl 8(ap),r0 - movzwl 18(r0),r2 - bicl3 #-65536,(r0),r3 - movzwl 2(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,16(r0),-116(fp) - bicl3 #-65536,r2,-120(fp) - mull3 r1,-116(fp),-108(fp) - mull2 r3,-116(fp) - mull3 r3,-120(fp),-112(fp) - mull2 r1,-120(fp) - addl3 -108(fp),-112(fp),r0 - bicl3 #0,r0,-108(fp) - cmpl -108(fp),-112(fp) - bgequ noname.408 - addl2 #65536,-120(fp) -noname.408: - movzwl -106(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-120(fp) - bicl3 #-65536,-108(fp),r0 - ashl #16,r0,-112(fp) - addl3 -112(fp),-116(fp),r0 - bicl3 #0,r0,-116(fp) - cmpl -116(fp),-112(fp) - bgequ noname.409 - incl -120(fp) -noname.409: - movl -116(fp),r3 - movl -120(fp),r2 - bbc #31,r2,noname.410 - incl r9 -noname.410: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.411 - incl r2 -noname.411: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.412 - incl r2 - bicl3 #0,r2,r0 - bneq noname.412 - incl r9 -noname.412: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.413 - incl r9 -noname.413: - - movl 4(ap),r0 - movl r8,16(r0) - - clrl r8 - - movl 8(ap),r0 - movzwl 22(r0),r2 - bicl3 #-65536,(r0),r3 - movzwl 2(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,20(r0),-132(fp) - bicl3 #-65536,r2,-136(fp) - mull3 r1,-132(fp),-124(fp) - mull2 r3,-132(fp) - mull3 r3,-136(fp),-128(fp) - mull2 r1,-136(fp) - addl3 -124(fp),-128(fp),r0 - bicl3 #0,r0,-124(fp) - cmpl -124(fp),-128(fp) - bgequ noname.414 - addl2 #65536,-136(fp) -noname.414: - movzwl -122(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-136(fp) - bicl3 #-65536,-124(fp),r0 - ashl #16,r0,-128(fp) - addl3 -128(fp),-132(fp),r0 - bicl3 #0,r0,-132(fp) - cmpl -132(fp),-128(fp) - bgequ noname.415 - incl -136(fp) -noname.415: - movl -132(fp),r3 - movl -136(fp),r2 - bbc #31,r2,noname.416 - incl r8 -noname.416: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.417 - incl r2 -noname.417: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.418 - incl r2 - bicl3 #0,r2,r0 - bneq noname.418 - incl r8 -noname.418: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.419 - incl r8 -noname.419: - - movl 8(ap),r0 - movzwl 18(r0),r2 - bicl3 #-65536,4(r0),r3 - movzwl 6(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,16(r0),-148(fp) - bicl3 #-65536,r2,-152(fp) - mull3 r1,-148(fp),-140(fp) - mull2 r3,-148(fp) - mull3 r3,-152(fp),-144(fp) - mull2 r1,-152(fp) - addl3 -140(fp),-144(fp),r0 - bicl3 #0,r0,-140(fp) - cmpl -140(fp),-144(fp) - bgequ noname.420 - addl2 #65536,-152(fp) -noname.420: - movzwl -138(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-152(fp) - bicl3 #-65536,-140(fp),r0 - ashl #16,r0,-144(fp) - addl3 -144(fp),-148(fp),r0 - bicl3 #0,r0,-148(fp) - cmpl -148(fp),-144(fp) - bgequ noname.421 - incl -152(fp) -noname.421: - movl -148(fp),r3 - movl -152(fp),r2 - bbc #31,r2,noname.422 - incl r8 -noname.422: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.423 - incl r2 -noname.423: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.424 - incl r2 - bicl3 #0,r2,r0 - bneq noname.424 - incl r8 -noname.424: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.425 - incl r8 -noname.425: - - movl 8(ap),r0 - movzwl 14(r0),r2 - bicl3 #-65536,8(r0),r3 - movzwl 10(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,12(r0),-164(fp) - bicl3 #-65536,r2,-168(fp) - mull3 r1,-164(fp),-156(fp) - mull2 r3,-164(fp) - mull3 r3,-168(fp),-160(fp) - mull2 r1,-168(fp) - addl3 -156(fp),-160(fp),r0 - bicl3 #0,r0,-156(fp) - cmpl -156(fp),-160(fp) - bgequ noname.426 - addl2 #65536,-168(fp) -noname.426: - movzwl -154(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-168(fp) - bicl3 #-65536,-156(fp),r0 - ashl #16,r0,-160(fp) - addl3 -160(fp),-164(fp),r0 - bicl3 #0,r0,-164(fp) - cmpl -164(fp),-160(fp) - bgequ noname.427 - incl -168(fp) -noname.427: - movl -164(fp),r3 - movl -168(fp),r2 - bbc #31,r2,noname.428 - incl r8 -noname.428: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.429 - incl r2 -noname.429: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.430 - incl r2 - bicl3 #0,r2,r0 - bneq noname.430 - incl r8 -noname.430: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.431 - incl r8 -noname.431: - - movl 4(ap),r0 - movl r7,20(r0) - - clrl r7 - - movl 8(ap),r2 - movl 12(r2),r4 - bicl3 #-65536,r4,-172(fp) - extzv #16,#16,r4,r0 - bicl3 #-65536,r0,r4 - movl -172(fp),r0 - mull3 r0,r4,-176(fp) - mull3 r0,r0,-172(fp) - mull2 r4,r4 - bicl3 #32767,-176(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r4 - bicl3 #-65536,-176(fp),r0 - ashl #17,r0,-176(fp) - addl3 -172(fp),-176(fp),r0 - bicl3 #0,r0,-172(fp) - cmpl -172(fp),-176(fp) - bgequ noname.432 - incl r4 -noname.432: - movl -172(fp),r1 - movl r4,r3 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.433 - incl r3 -noname.433: - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.434 - incl r7 -noname.434: - - movzwl 18(r2),r3 - bicl3 #-65536,8(r2),r1 - movzwl 10(r2),r0 - bicl2 #-65536,r0 - bicl3 #-65536,16(r2),-188(fp) - bicl3 #-65536,r3,-192(fp) - mull3 r0,-188(fp),-180(fp) - mull2 r1,-188(fp) - mull3 r1,-192(fp),-184(fp) - mull2 r0,-192(fp) - addl3 -180(fp),-184(fp),r0 - bicl3 #0,r0,-180(fp) - cmpl -180(fp),-184(fp) - bgequ noname.435 - addl2 #65536,-192(fp) -noname.435: - movzwl -178(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-192(fp) - bicl3 #-65536,-180(fp),r0 - ashl #16,r0,-184(fp) - addl3 -184(fp),-188(fp),r0 - bicl3 #0,r0,-188(fp) - cmpl -188(fp),-184(fp) - bgequ noname.436 - incl -192(fp) -noname.436: - movl -188(fp),r3 - movl -192(fp),r2 - bbc #31,r2,noname.437 - incl r7 -noname.437: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.438 - incl r2 -noname.438: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.439 - incl r2 - bicl3 #0,r2,r0 - bneq noname.439 - incl r7 -noname.439: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.440 - incl r7 -noname.440: - - movl 8(ap),r0 - movzwl 22(r0),r2 - bicl3 #-65536,4(r0),r3 - movzwl 6(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,20(r0),-204(fp) - bicl3 #-65536,r2,-208(fp) - mull3 r1,-204(fp),-196(fp) - mull2 r3,-204(fp) - mull3 r3,-208(fp),-200(fp) - mull2 r1,-208(fp) - addl3 -196(fp),-200(fp),r0 - bicl3 #0,r0,-196(fp) - cmpl -196(fp),-200(fp) - bgequ noname.441 - addl2 #65536,-208(fp) -noname.441: - movzwl -194(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-208(fp) - bicl3 #-65536,-196(fp),r0 - ashl #16,r0,-200(fp) - addl3 -200(fp),-204(fp),r0 - bicl3 #0,r0,-204(fp) - cmpl -204(fp),-200(fp) - bgequ noname.442 - incl -208(fp) -noname.442: - movl -204(fp),r3 - movl -208(fp),r2 - bbc #31,r2,noname.443 - incl r7 -noname.443: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.444 - incl r2 -noname.444: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.445 - incl r2 - bicl3 #0,r2,r0 - bneq noname.445 - incl r7 -noname.445: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.446 - incl r7 -noname.446: - - movl 8(ap),r0 - movzwl 26(r0),r2 - bicl3 #-65536,(r0),r3 - movzwl 2(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,24(r0),-220(fp) - bicl3 #-65536,r2,-224(fp) - mull3 r1,-220(fp),-212(fp) - mull2 r3,-220(fp) - mull3 r3,-224(fp),-216(fp) - mull2 r1,-224(fp) - addl3 -212(fp),-216(fp),r0 - bicl3 #0,r0,-212(fp) - cmpl -212(fp),-216(fp) - bgequ noname.447 - addl2 #65536,-224(fp) -noname.447: - movzwl -210(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-224(fp) - bicl3 #-65536,-212(fp),r0 - ashl #16,r0,-216(fp) - addl3 -216(fp),-220(fp),r0 - bicl3 #0,r0,-220(fp) - cmpl -220(fp),-216(fp) - bgequ noname.448 - incl -224(fp) -noname.448: - movl -220(fp),r3 - movl -224(fp),r2 - bbc #31,r2,noname.449 - incl r7 -noname.449: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.450 - incl r2 -noname.450: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.451 - incl r2 - bicl3 #0,r2,r0 - bneq noname.451 - incl r7 -noname.451: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.452 - incl r7 -noname.452: - - movl 4(ap),r0 - movl r9,24(r0) - - clrl r9 - - movl 8(ap),r0 - movzwl 30(r0),r2 - bicl3 #-65536,(r0),r3 - movzwl 2(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,28(r0),-236(fp) - bicl3 #-65536,r2,-240(fp) - mull3 r1,-236(fp),-228(fp) - mull2 r3,-236(fp) - mull3 r3,-240(fp),-232(fp) - mull2 r1,-240(fp) - addl3 -228(fp),-232(fp),r0 - bicl3 #0,r0,-228(fp) - cmpl -228(fp),-232(fp) - bgequ noname.453 - addl2 #65536,-240(fp) -noname.453: - movzwl -226(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-240(fp) - bicl3 #-65536,-228(fp),r0 - ashl #16,r0,-232(fp) - addl3 -232(fp),-236(fp),r0 - bicl3 #0,r0,-236(fp) - cmpl -236(fp),-232(fp) - bgequ noname.454 - incl -240(fp) -noname.454: - movl -236(fp),r3 - movl -240(fp),r2 - bbc #31,r2,noname.455 - incl r9 -noname.455: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.456 - incl r2 -noname.456: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.457 - incl r2 - bicl3 #0,r2,r0 - bneq noname.457 - incl r9 -noname.457: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.458 - incl r9 -noname.458: - - movl 8(ap),r0 - movzwl 26(r0),r2 - bicl3 #-65536,4(r0),r3 - movzwl 6(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,24(r0),-252(fp) - bicl3 #-65536,r2,-256(fp) - mull3 r1,-252(fp),-244(fp) - mull2 r3,-252(fp) - mull3 r3,-256(fp),-248(fp) - mull2 r1,-256(fp) - addl3 -244(fp),-248(fp),r0 - bicl3 #0,r0,-244(fp) - cmpl -244(fp),-248(fp) - bgequ noname.459 - addl2 #65536,-256(fp) -noname.459: - movzwl -242(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-256(fp) - bicl3 #-65536,-244(fp),r0 - ashl #16,r0,-248(fp) - addl3 -248(fp),-252(fp),r0 - bicl3 #0,r0,-252(fp) - cmpl -252(fp),-248(fp) - bgequ noname.460 - incl -256(fp) -noname.460: - movl -252(fp),r3 - movl -256(fp),r2 - bbc #31,r2,noname.461 - incl r9 -noname.461: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.462 - incl r2 -noname.462: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.463 - incl r2 - bicl3 #0,r2,r0 - bneq noname.463 - incl r9 -noname.463: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.464 - incl r9 -noname.464: - - movl 8(ap),r0 - movzwl 22(r0),r2 - bicl3 #-65536,8(r0),r3 - movzwl 10(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,20(r0),-268(fp) - bicl3 #-65536,r2,-272(fp) - mull3 r1,-268(fp),-260(fp) - mull2 r3,-268(fp) - mull3 r3,-272(fp),-264(fp) - mull2 r1,-272(fp) - addl3 -260(fp),-264(fp),r0 - bicl3 #0,r0,-260(fp) - cmpl -260(fp),-264(fp) - bgequ noname.465 - addl2 #65536,-272(fp) -noname.465: - movzwl -258(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-272(fp) - bicl3 #-65536,-260(fp),r0 - ashl #16,r0,-264(fp) - addl3 -264(fp),-268(fp),r0 - bicl3 #0,r0,-268(fp) - cmpl -268(fp),-264(fp) - bgequ noname.466 - incl -272(fp) -noname.466: - movl -268(fp),r3 - movl -272(fp),r2 - bbc #31,r2,noname.467 - incl r9 -noname.467: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.468 - incl r2 -noname.468: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.469 - incl r2 - bicl3 #0,r2,r0 - bneq noname.469 - incl r9 -noname.469: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.470 - incl r9 -noname.470: - - movl 8(ap),r0 - movzwl 18(r0),r2 - bicl3 #-65536,12(r0),r3 - movzwl 14(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,16(r0),-284(fp) - bicl3 #-65536,r2,-288(fp) - mull3 r1,-284(fp),-276(fp) - mull2 r3,-284(fp) - mull3 r3,-288(fp),-280(fp) - mull2 r1,-288(fp) - addl3 -276(fp),-280(fp),r0 - bicl3 #0,r0,-276(fp) - cmpl -276(fp),-280(fp) - bgequ noname.471 - addl2 #65536,-288(fp) -noname.471: - movzwl -274(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-288(fp) - bicl3 #-65536,-276(fp),r0 - ashl #16,r0,-280(fp) - addl3 -280(fp),-284(fp),r0 - bicl3 #0,r0,-284(fp) - cmpl -284(fp),-280(fp) - bgequ noname.472 - incl -288(fp) -noname.472: - movl -284(fp),r3 - movl -288(fp),r2 - bbc #31,r2,noname.473 - incl r9 -noname.473: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.474 - incl r2 -noname.474: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.475 - incl r2 - bicl3 #0,r2,r0 - bneq noname.475 - incl r9 -noname.475: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.476 - incl r9 -noname.476: - - movl 4(ap),r0 - movl r8,28(r0) - - clrl r8 - - movl 8(ap),r3 - movl 16(r3),r4 - bicl3 #-65536,r4,r5 - extzv #16,#16,r4,r0 - bicl3 #-65536,r0,r4 - mull3 r5,r4,-292(fp) - mull2 r5,r5 - mull2 r4,r4 - bicl3 #32767,-292(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r4 - bicl3 #-65536,-292(fp),r0 - ashl #17,r0,-292(fp) - addl2 -292(fp),r5 - bicl2 #0,r5 - cmpl r5,-292(fp) - bgequ noname.477 - incl r4 -noname.477: - movl r5,r1 - movl r4,r2 - addl2 r1,r7 - bicl2 #0,r7 - cmpl r7,r1 - bgequ noname.478 - incl r2 -noname.478: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.479 - incl r8 -noname.479: - - bicl3 #-65536,20(r3),r4 - movzwl 22(r3),r1 - bicl2 #-65536,r1 - bicl3 #-65536,12(r3),r2 - movzwl 14(r3),r0 - bicl2 #-65536,r0 - movl r4,r6 - movl r1,r5 - mull3 r0,r6,-296(fp) - mull2 r2,r6 - mull3 r2,r5,-300(fp) - mull2 r0,r5 - addl3 -296(fp),-300(fp),r0 - bicl3 #0,r0,-296(fp) - cmpl -296(fp),-300(fp) - bgequ noname.480 - addl2 #65536,r5 -noname.480: - movzwl -294(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r5 - bicl3 #-65536,-296(fp),r0 - ashl #16,r0,-300(fp) - addl2 -300(fp),r6 - bicl2 #0,r6 - cmpl r6,-300(fp) - bgequ noname.481 - incl r5 -noname.481: - movl r6,r3 - movl r5,r2 - bbc #31,r2,noname.482 - incl r8 -noname.482: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.483 - incl r2 -noname.483: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.484 - incl r2 - bicl3 #0,r2,r0 - bneq noname.484 - incl r8 -noname.484: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.485 - incl r8 -noname.485: - - movl 8(ap),r0 - bicl3 #-65536,24(r0),r3 - movzwl 26(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,8(r0),r2 - movzwl 10(r0),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-304(fp) - mull2 r2,r5 - mull3 r2,r4,-308(fp) - mull2 r0,r4 - addl3 -304(fp),-308(fp),r0 - bicl3 #0,r0,-304(fp) - cmpl -304(fp),-308(fp) - bgequ noname.486 - addl2 #65536,r4 -noname.486: - movzwl -302(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-304(fp),r0 - ashl #16,r0,-308(fp) - addl2 -308(fp),r5 - bicl2 #0,r5 - cmpl r5,-308(fp) - bgequ noname.487 - incl r4 -noname.487: - movl r5,r3 - movl r4,r2 - bbc #31,r2,noname.488 - incl r8 -noname.488: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.489 - incl r2 -noname.489: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.490 - incl r2 - bicl3 #0,r2,r0 - bneq noname.490 - incl r8 -noname.490: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.491 - incl r8 -noname.491: - - movl 8(ap),r0 - bicl3 #-65536,28(r0),r3 - movzwl 30(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,4(r0),r2 - movzwl 6(r0),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-312(fp) - mull2 r2,r5 - mull3 r2,r4,-316(fp) - mull2 r0,r4 - addl3 -312(fp),-316(fp),r0 - bicl3 #0,r0,-312(fp) - cmpl -312(fp),-316(fp) - bgequ noname.492 - addl2 #65536,r4 -noname.492: - movzwl -310(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-312(fp),r0 - ashl #16,r0,-316(fp) - addl2 -316(fp),r5 - bicl2 #0,r5 - cmpl r5,-316(fp) - bgequ noname.493 - incl r4 -noname.493: - movl r5,r3 - movl r4,r2 - bbc #31,r2,noname.494 - incl r8 -noname.494: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.495 - incl r2 -noname.495: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.496 - incl r2 - bicl3 #0,r2,r0 - bneq noname.496 - incl r8 -noname.496: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.497 - incl r8 -noname.497: - - movl 4(ap),r0 - movl r7,32(r0) - - clrl r7 - - movl 8(ap),r0 - bicl3 #-65536,28(r0),r3 - movzwl 30(r0),r2 - bicl3 #-65536,8(r0),r1 - movzwl 10(r0),r0 - bicl2 #-65536,r0 - movl r3,r4 - bicl3 #-65536,r2,-328(fp) - mull3 r0,r4,-320(fp) - mull2 r1,r4 - mull3 r1,-328(fp),-324(fp) - mull2 r0,-328(fp) - addl3 -320(fp),-324(fp),r0 - bicl3 #0,r0,-320(fp) - cmpl -320(fp),-324(fp) - bgequ noname.498 - addl2 #65536,-328(fp) -noname.498: - movzwl -318(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-328(fp) - bicl3 #-65536,-320(fp),r0 - ashl #16,r0,-324(fp) - addl2 -324(fp),r4 - bicl2 #0,r4 - cmpl r4,-324(fp) - bgequ noname.499 - incl -328(fp) -noname.499: - movl r4,r3 - movl -328(fp),r2 - bbc #31,r2,noname.500 - incl r7 -noname.500: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.501 - incl r2 -noname.501: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.502 - incl r2 - bicl3 #0,r2,r0 - bneq noname.502 - incl r7 -noname.502: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.503 - incl r7 -noname.503: - - movl 8(ap),r0 - movzwl 26(r0),r2 - bicl3 #-65536,12(r0),r3 - movzwl 14(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,24(r0),-340(fp) - bicl3 #-65536,r2,-344(fp) - mull3 r1,-340(fp),-332(fp) - mull2 r3,-340(fp) - mull3 r3,-344(fp),-336(fp) - mull2 r1,-344(fp) - addl3 -332(fp),-336(fp),r0 - bicl3 #0,r0,-332(fp) - cmpl -332(fp),-336(fp) - bgequ noname.504 - addl2 #65536,-344(fp) -noname.504: - movzwl -330(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-344(fp) - bicl3 #-65536,-332(fp),r0 - ashl #16,r0,-336(fp) - addl3 -336(fp),-340(fp),r0 - bicl3 #0,r0,-340(fp) - cmpl -340(fp),-336(fp) - bgequ noname.505 - incl -344(fp) -noname.505: - movl -340(fp),r3 - movl -344(fp),r2 - bbc #31,r2,noname.506 - incl r7 -noname.506: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.507 - incl r2 -noname.507: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.508 - incl r2 - bicl3 #0,r2,r0 - bneq noname.508 - incl r7 -noname.508: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.509 - incl r7 -noname.509: - - movl 8(ap),r0 - movzwl 22(r0),r2 - bicl3 #-65536,16(r0),r3 - movzwl 18(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,20(r0),-356(fp) - bicl3 #-65536,r2,-360(fp) - mull3 r1,-356(fp),-348(fp) - mull2 r3,-356(fp) - mull3 r3,-360(fp),-352(fp) - mull2 r1,-360(fp) - addl3 -348(fp),-352(fp),r0 - bicl3 #0,r0,-348(fp) - cmpl -348(fp),-352(fp) - bgequ noname.510 - addl2 #65536,-360(fp) -noname.510: - movzwl -346(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-360(fp) - bicl3 #-65536,-348(fp),r0 - ashl #16,r0,-352(fp) - addl3 -352(fp),-356(fp),r0 - bicl3 #0,r0,-356(fp) - cmpl -356(fp),-352(fp) - bgequ noname.511 - incl -360(fp) -noname.511: - movl -356(fp),r3 - movl -360(fp),r2 - bbc #31,r2,noname.512 - incl r7 -noname.512: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.513 - incl r2 -noname.513: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.514 - incl r2 - bicl3 #0,r2,r0 - bneq noname.514 - incl r7 -noname.514: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.515 - incl r7 -noname.515: - - movl 4(ap),r0 - movl r9,36(r0) - - clrl r9 - - movl 8(ap),r3 - movl 20(r3),r4 - bicl3 #-65536,r4,-364(fp) - extzv #16,#16,r4,r0 - bicl3 #-65536,r0,r4 - movl -364(fp),r0 - mull3 r0,r4,-368(fp) - mull3 r0,r0,-364(fp) - mull2 r4,r4 - bicl3 #32767,-368(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r4 - bicl3 #-65536,-368(fp),r0 - ashl #17,r0,-368(fp) - addl3 -364(fp),-368(fp),r0 - bicl3 #0,r0,-364(fp) - cmpl -364(fp),-368(fp) - bgequ noname.516 - incl r4 -noname.516: - movl -364(fp),r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.517 - incl r2 -noname.517: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.518 - incl r9 -noname.518: - - bicl3 #-65536,24(r3),r4 - movzwl 26(r3),r1 - bicl2 #-65536,r1 - bicl3 #-65536,16(r3),r2 - movzwl 18(r3),r0 - bicl2 #-65536,r0 - movl r4,r6 - movl r1,r5 - mull3 r0,r6,-372(fp) - mull2 r2,r6 - mull3 r2,r5,-376(fp) - mull2 r0,r5 - addl3 -372(fp),-376(fp),r0 - bicl3 #0,r0,-372(fp) - cmpl -372(fp),-376(fp) - bgequ noname.519 - addl2 #65536,r5 -noname.519: - movzwl -370(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r5 - bicl3 #-65536,-372(fp),r0 - ashl #16,r0,-376(fp) - addl2 -376(fp),r6 - bicl2 #0,r6 - cmpl r6,-376(fp) - bgequ noname.520 - incl r5 -noname.520: - movl r6,r3 - movl r5,r2 - bbc #31,r2,noname.521 - incl r9 -noname.521: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.522 - incl r2 -noname.522: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.523 - incl r2 - bicl3 #0,r2,r0 - bneq noname.523 - incl r9 -noname.523: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.524 - incl r9 -noname.524: - - movl 8(ap),r0 - bicl3 #-65536,28(r0),r3 - movzwl 30(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,12(r0),r2 - movzwl 14(r0),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-380(fp) - mull2 r2,r5 - mull3 r2,r4,-384(fp) - mull2 r0,r4 - addl3 -380(fp),-384(fp),r0 - bicl3 #0,r0,-380(fp) - cmpl -380(fp),-384(fp) - bgequ noname.525 - addl2 #65536,r4 -noname.525: - movzwl -378(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-380(fp),r0 - ashl #16,r0,-384(fp) - addl2 -384(fp),r5 - bicl2 #0,r5 - cmpl r5,-384(fp) - bgequ noname.526 - incl r4 -noname.526: - movl r5,r3 - movl r4,r2 - bbc #31,r2,noname.527 - incl r9 -noname.527: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.528 - incl r2 -noname.528: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.529 - incl r2 - bicl3 #0,r2,r0 - bneq noname.529 - incl r9 -noname.529: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.530 - incl r9 -noname.530: - movl 4(ap),r0 - movl r8,40(r0) - - clrl r8 - - movl 8(ap),r0 - bicl3 #-65536,28(r0),r3 - movzwl 30(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,16(r0),r2 - movzwl 18(r0),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-388(fp) - mull2 r2,r5 - mull3 r2,r4,-392(fp) - mull2 r0,r4 - addl3 -388(fp),-392(fp),r0 - bicl3 #0,r0,-388(fp) - cmpl -388(fp),-392(fp) - bgequ noname.531 - addl2 #65536,r4 -noname.531: - movzwl -386(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-388(fp),r0 - ashl #16,r0,-392(fp) - addl2 -392(fp),r5 - bicl2 #0,r5 - cmpl r5,-392(fp) - bgequ noname.532 - incl r4 -noname.532: - movl r5,r3 - movl r4,r2 - bbc #31,r2,noname.533 - incl r8 -noname.533: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.534 - incl r2 -noname.534: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.535 - incl r2 - bicl3 #0,r2,r0 - bneq noname.535 - incl r8 -noname.535: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.536 - incl r8 -noname.536: - - movl 8(ap),r0 - bicl3 #-65536,24(r0),r3 - movzwl 26(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,20(r0),r2 - movzwl 22(r0),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-396(fp) - mull2 r2,r5 - mull3 r2,r4,-400(fp) - mull2 r0,r4 - addl3 -396(fp),-400(fp),r0 - bicl3 #0,r0,-396(fp) - cmpl -396(fp),-400(fp) - bgequ noname.537 - addl2 #65536,r4 -noname.537: - movzwl -394(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-396(fp),r0 - ashl #16,r0,-400(fp) - addl2 -400(fp),r5 - bicl2 #0,r5 - cmpl r5,-400(fp) - bgequ noname.538 - incl r4 -noname.538: - movl r5,r3 - movl r4,r2 - bbc #31,r2,noname.539 - incl r8 -noname.539: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.540 - incl r2 -noname.540: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r7 - bicl2 #0,r7 - cmpl r7,r3 - bgequ noname.541 - incl r2 - bicl3 #0,r2,r0 - bneq noname.541 - incl r8 -noname.541: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.542 - incl r8 -noname.542: - - movl 4(ap),r0 - movl r7,44(r0) - - clrl r7 - - movl 8(ap),r3 - movl 24(r3),r4 - bicl3 #-65536,r4,r5 - extzv #16,#16,r4,r0 - bicl3 #-65536,r0,r4 - mull3 r5,r4,-404(fp) - mull2 r5,r5 - mull2 r4,r4 - bicl3 #32767,-404(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r4 - bicl3 #-65536,-404(fp),r0 - ashl #17,r0,-404(fp) - addl2 -404(fp),r5 - bicl2 #0,r5 - cmpl r5,-404(fp) - bgequ noname.543 - incl r4 -noname.543: - movl r5,r1 - movl r4,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.544 - incl r2 -noname.544: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.545 - incl r7 -noname.545: - - movzwl 30(r3),r2 - bicl3 #-65536,20(r3),r1 - movzwl 22(r3),r0 - bicl2 #-65536,r0 - bicl3 #-65536,28(r3),-416(fp) - bicl3 #-65536,r2,-420(fp) - mull3 r0,-416(fp),-408(fp) - mull2 r1,-416(fp) - mull3 r1,-420(fp),-412(fp) - mull2 r0,-420(fp) - addl3 -408(fp),-412(fp),r0 - bicl3 #0,r0,-408(fp) - cmpl -408(fp),-412(fp) - bgequ noname.546 - addl2 #65536,-420(fp) -noname.546: - movzwl -406(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-420(fp) - bicl3 #-65536,-408(fp),r0 - ashl #16,r0,-412(fp) - addl3 -412(fp),-416(fp),r0 - bicl3 #0,r0,-416(fp) - cmpl -416(fp),-412(fp) - bgequ noname.547 - incl -420(fp) -noname.547: - movl -416(fp),r3 - movl -420(fp),r2 - bbc #31,r2,noname.548 - incl r7 -noname.548: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.549 - incl r2 -noname.549: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.550 - incl r2 - bicl3 #0,r2,r0 - bneq noname.550 - incl r7 -noname.550: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.551 - incl r7 -noname.551: - - movl 4(ap),r0 - movl r9,48(r0) - - clrl r9 - - movl 8(ap),r0 - movzwl 30(r0),r2 - bicl3 #-65536,24(r0),r3 - movzwl 26(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,28(r0),-432(fp) - bicl3 #-65536,r2,-436(fp) - mull3 r1,-432(fp),-424(fp) - mull2 r3,-432(fp) - mull3 r3,-436(fp),-428(fp) - mull2 r1,-436(fp) - addl3 -424(fp),-428(fp),r0 - bicl3 #0,r0,-424(fp) - cmpl -424(fp),-428(fp) - bgequ noname.552 - addl2 #65536,-436(fp) -noname.552: - movzwl -422(fp),r0 - bicl2 #-65536,r0 - addl2 r0,-436(fp) - bicl3 #-65536,-424(fp),r0 - ashl #16,r0,-428(fp) - addl3 -428(fp),-432(fp),r0 - bicl3 #0,r0,-432(fp) - cmpl -432(fp),-428(fp) - bgequ noname.553 - incl -436(fp) -noname.553: - movl -432(fp),r3 - movl -436(fp),r2 - bbc #31,r2,noname.554 - incl r9 -noname.554: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.555 - incl r2 -noname.555: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.556 - incl r2 - bicl3 #0,r2,r0 - bneq noname.556 - incl r9 -noname.556: - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.557 - incl r9 -noname.557: - - movl 4(ap),r4 - movl r8,52(r4) - - clrl r8 - - movl 8(ap),r0 - movl 28(r0),r3 - bicl3 #-65536,r3,-440(fp) - extzv #16,#16,r3,r0 - bicl3 #-65536,r0,r3 - movl -440(fp),r0 - mull3 r0,r3,-444(fp) - mull3 r0,r0,-440(fp) - mull2 r3,r3 - bicl3 #32767,-444(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r3 - bicl3 #-65536,-444(fp),r0 - ashl #17,r0,-444(fp) - addl3 -440(fp),-444(fp),r0 - bicl3 #0,r0,-440(fp) - cmpl -440(fp),-444(fp) - bgequ noname.558 - incl r3 -noname.558: - movl -440(fp),r1 - movl r3,r2 - addl2 r1,r7 - bicl2 #0,r7 - cmpl r7,r1 - bgequ noname.559 - incl r2 -noname.559: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.560 - incl r8 -noname.560: - - movl r7,56(r4) - - movl r9,60(r4) - - ret - - - -;r=4 ;(AP) -;a=8 ;(AP) -;b=12 ;(AP) -;n=16 ;(AP) n by value (input) - - .psect code,nowrt - -.entry BN_SQR_COMBA4,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10> - subl2 #44,sp - - clrq r8 - - clrl r10 - - movl 8(ap),r5 - movl (r5),r3 - bicl3 #-65536,r3,r4 - extzv #16,#16,r3,r0 - bicl3 #-65536,r0,r3 - mull3 r4,r3,-4(fp) - mull2 r4,r4 - mull2 r3,r3 - bicl3 #32767,-4(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r3 - bicl3 #-65536,-4(fp),r0 - ashl #17,r0,-4(fp) - addl2 -4(fp),r4 - bicl2 #0,r4 - cmpl r4,-4(fp) - bgequ noname.563 - incl r3 -noname.563: - movl r4,r1 - movl r3,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.564 - incl r2 -noname.564: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.565 - incl r10 -noname.565: - - movl r9,@4(ap) - - clrl r9 - - bicl3 #-65536,4(r5),r3 - movzwl 6(r5),r1 - bicl2 #-65536,r1 - bicl3 #-65536,(r5),r2 - movzwl 2(r5),r0 - bicl2 #-65536,r0 - movl r3,r6 - movl r1,r4 - mull3 r0,r6,-8(fp) - mull2 r2,r6 - mull2 r4,r2 - mull2 r0,r4 - addl3 -8(fp),r2,r0 - bicl3 #0,r0,-8(fp) - cmpl -8(fp),r2 - bgequ noname.566 - addl2 #65536,r4 -noname.566: - movzwl -6(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-8(fp),r0 - ashl #16,r0,r1 - addl2 r1,r6 - bicl2 #0,r6 - cmpl r6,r1 - bgequ noname.567 - incl r4 -noname.567: - movl r6,r3 - movl r4,r2 - bbc #31,r2,noname.568 - incl r9 -noname.568: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.569 - incl r2 -noname.569: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.570 - incl r2 - bicl3 #0,r2,r0 - bneq noname.570 - incl r9 -noname.570: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.571 - incl r9 -noname.571: - - movl 4(ap),r0 - movl r8,4(r0) - - clrl r8 - - movl 8(ap),r4 - movl 4(r4),r3 - bicl3 #-65536,r3,r5 - extzv #16,#16,r3,r0 - bicl3 #-65536,r0,r3 - mull3 r5,r3,r1 - mull2 r5,r5 - mull2 r3,r3 - bicl3 #32767,r1,r0 - extzv #15,#17,r0,r0 - addl2 r0,r3 - bicl2 #-65536,r1 - ashl #17,r1,r1 - addl2 r1,r5 - bicl2 #0,r5 - cmpl r5,r1 - bgequ noname.572 - incl r3 -noname.572: - movl r5,r1 - movl r3,r2 - addl2 r1,r10 - bicl2 #0,r10 - cmpl r10,r1 - bgequ noname.573 - incl r2 -noname.573: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.574 - incl r8 -noname.574: - - bicl3 #-65536,8(r4),r3 - movzwl 10(r4),r1 - bicl2 #-65536,r1 - bicl3 #-65536,(r4),r2 - movzwl 2(r4),r0 - bicl2 #-65536,r0 - movl r3,r6 - movl r1,r5 - mull3 r0,r6,r7 - mull2 r2,r6 - mull2 r5,r2 - mull2 r0,r5 - addl2 r2,r7 - bicl2 #0,r7 - cmpl r7,r2 - bgequ noname.575 - addl2 #65536,r5 -noname.575: - extzv #16,#16,r7,r0 - bicl2 #-65536,r0 - addl2 r0,r5 - bicl3 #-65536,r7,r0 - ashl #16,r0,r1 - addl2 r1,r6 - bicl2 #0,r6 - cmpl r6,r1 - bgequ noname.576 - incl r5 -noname.576: - movl r6,r3 - movl r5,r2 - bbc #31,r2,noname.577 - incl r8 -noname.577: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.578 - incl r2 -noname.578: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r10 - bicl2 #0,r10 - cmpl r10,r3 - bgequ noname.579 - incl r2 - bicl3 #0,r2,r0 - bneq noname.579 - incl r8 -noname.579: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.580 - incl r8 -noname.580: - - movl 4(ap),r0 - movl r10,8(r0) - - clrl r10 - - movl 8(ap),r0 - bicl3 #-65536,12(r0),r3 - movzwl 14(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,(r0),r2 - movzwl 2(r0),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,r6 - mull2 r2,r5 - mull3 r2,r4,-12(fp) - mull2 r0,r4 - addl2 -12(fp),r6 - bicl2 #0,r6 - cmpl r6,-12(fp) - bgequ noname.581 - addl2 #65536,r4 -noname.581: - extzv #16,#16,r6,r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,r6,r0 - ashl #16,r0,-12(fp) - addl2 -12(fp),r5 - bicl2 #0,r5 - cmpl r5,-12(fp) - bgequ noname.582 - incl r4 -noname.582: - movl r5,r3 - movl r4,r2 - bbc #31,r2,noname.583 - incl r10 -noname.583: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.584 - incl r2 -noname.584: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.585 - incl r2 - bicl3 #0,r2,r0 - bneq noname.585 - incl r10 -noname.585: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.586 - incl r10 -noname.586: - - movl 8(ap),r0 - bicl3 #-65536,8(r0),r3 - movzwl 10(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,4(r0),r2 - movzwl 6(r0),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-16(fp) - mull2 r2,r5 - mull3 r2,r4,-20(fp) - mull2 r0,r4 - addl3 -16(fp),-20(fp),r0 - bicl3 #0,r0,-16(fp) - cmpl -16(fp),-20(fp) - bgequ noname.587 - addl2 #65536,r4 -noname.587: - movzwl -14(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-16(fp),r0 - ashl #16,r0,-20(fp) - addl2 -20(fp),r5 - bicl2 #0,r5 - cmpl r5,-20(fp) - bgequ noname.588 - incl r4 -noname.588: - movl r5,r3 - movl r4,r2 - bbc #31,r2,noname.589 - incl r10 -noname.589: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.590 - incl r2 -noname.590: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r9 - bicl2 #0,r9 - cmpl r9,r3 - bgequ noname.591 - incl r2 - bicl3 #0,r2,r0 - bneq noname.591 - incl r10 -noname.591: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.592 - incl r10 -noname.592: - movl 4(ap),r0 - movl r9,12(r0) - - clrl r9 - - movl 8(ap),r3 - movl 8(r3),r4 - bicl3 #-65536,r4,r5 - extzv #16,#16,r4,r0 - bicl3 #-65536,r0,r4 - mull3 r5,r4,-24(fp) - mull2 r5,r5 - mull2 r4,r4 - bicl3 #32767,-24(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r4 - bicl3 #-65536,-24(fp),r0 - ashl #17,r0,-24(fp) - addl2 -24(fp),r5 - bicl2 #0,r5 - cmpl r5,-24(fp) - bgequ noname.593 - incl r4 -noname.593: - movl r5,r1 - movl r4,r2 - addl2 r1,r8 - bicl2 #0,r8 - cmpl r8,r1 - bgequ noname.594 - incl r2 -noname.594: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.595 - incl r9 -noname.595: - - bicl3 #-65536,12(r3),r4 - movzwl 14(r3),r1 - bicl2 #-65536,r1 - bicl3 #-65536,4(r3),r2 - movzwl 6(r3),r0 - bicl2 #-65536,r0 - movl r4,r6 - movl r1,r5 - mull3 r0,r6,-28(fp) - mull2 r2,r6 - mull3 r2,r5,-32(fp) - mull2 r0,r5 - addl3 -28(fp),-32(fp),r0 - bicl3 #0,r0,-28(fp) - cmpl -28(fp),-32(fp) - bgequ noname.596 - addl2 #65536,r5 -noname.596: - movzwl -26(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r5 - bicl3 #-65536,-28(fp),r0 - ashl #16,r0,-32(fp) - addl2 -32(fp),r6 - bicl2 #0,r6 - cmpl r6,-32(fp) - bgequ noname.597 - incl r5 -noname.597: - movl r6,r3 - movl r5,r2 - bbc #31,r2,noname.598 - incl r9 -noname.598: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.599 - incl r2 -noname.599: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r8 - bicl2 #0,r8 - cmpl r8,r3 - bgequ noname.600 - incl r2 - bicl3 #0,r2,r0 - bneq noname.600 - incl r9 -noname.600: - addl2 r2,r10 - bicl2 #0,r10 - cmpl r10,r2 - bgequ noname.601 - incl r9 -noname.601: - - movl 4(ap),r0 - movl r8,16(r0) - - clrl r8 - - movl 8(ap),r0 - bicl3 #-65536,12(r0),r3 - movzwl 14(r0),r1 - bicl2 #-65536,r1 - bicl3 #-65536,8(r0),r2 - movzwl 10(r0),r0 - bicl2 #-65536,r0 - movl r3,r5 - movl r1,r4 - mull3 r0,r5,-36(fp) - mull2 r2,r5 - mull3 r2,r4,-40(fp) - mull2 r0,r4 - addl3 -36(fp),-40(fp),r0 - bicl3 #0,r0,-36(fp) - cmpl -36(fp),-40(fp) - bgequ noname.602 - addl2 #65536,r4 -noname.602: - movzwl -34(fp),r0 - bicl2 #-65536,r0 - addl2 r0,r4 - bicl3 #-65536,-36(fp),r0 - ashl #16,r0,-40(fp) - addl2 -40(fp),r5 - bicl2 #0,r5 - cmpl r5,-40(fp) - bgequ noname.603 - incl r4 -noname.603: - movl r5,r3 - movl r4,r2 - bbc #31,r2,noname.604 - incl r8 -noname.604: - addl2 r2,r2 - bicl2 #0,r2 - bbc #31,r3,noname.605 - incl r2 -noname.605: - addl2 r3,r3 - bicl2 #0,r3 - addl2 r3,r10 - bicl2 #0,r10 - cmpl r10,r3 - bgequ noname.606 - incl r2 - bicl3 #0,r2,r0 - bneq noname.606 - incl r8 -noname.606: - addl2 r2,r9 - bicl2 #0,r9 - cmpl r9,r2 - bgequ noname.607 - incl r8 -noname.607: - - movl 4(ap),r4 - movl r10,20(r4) - - clrl r10 - - movl 8(ap),r0 - movl 12(r0),r3 - bicl3 #-65536,r3,r5 - extzv #16,#16,r3,r0 - bicl3 #-65536,r0,r3 - mull3 r5,r3,-44(fp) - mull2 r5,r5 - mull2 r3,r3 - bicl3 #32767,-44(fp),r0 - extzv #15,#17,r0,r0 - addl2 r0,r3 - bicl3 #-65536,-44(fp),r0 - ashl #17,r0,-44(fp) - addl2 -44(fp),r5 - bicl2 #0,r5 - cmpl r5,-44(fp) - bgequ noname.608 - incl r3 -noname.608: - movl r5,r1 - movl r3,r2 - addl2 r1,r9 - bicl2 #0,r9 - cmpl r9,r1 - bgequ noname.609 - incl r2 -noname.609: - addl2 r2,r8 - bicl2 #0,r8 - cmpl r8,r2 - bgequ noname.610 - incl r10 -noname.610: - - movl r9,24(r4) - - movl r8,28(r4) - - ret - -; For now, the code below doesn't work, so I end this prematurely. -.end diff --git a/deps/openssl/openssl/crypto/bn/asm/x86-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/x86-gf2m.pl index b579530272..f464368733 100644 --- a/deps/openssl/openssl/crypto/bn/asm/x86-gf2m.pl +++ b/deps/openssl/openssl/crypto/bn/asm/x86-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -36,6 +43,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output = pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); $sse2=0; @@ -311,3 +321,5 @@ if ($sse2) { &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86-mont.pl b/deps/openssl/openssl/crypto/bn/asm/x86-mont.pl index 1c4003efc2..a8b402d59b 100755 --- a/deps/openssl/openssl/crypto/bn/asm/x86-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/x86-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @@ -30,6 +37,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; +$output = pop; +open STDOUT,">$output"; + &asm_init($ARGV[0],$0); $sse2=0; @@ -84,7 +94,9 @@ $frame=32; # size of above frame rounded up to 16n &and ("ebp",-64); # align to cache line - # Some OSes, *cough*-dows, insist on stack being "wired" to + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on @@ -289,7 +301,7 @@ if (0) { &xor ("eax","eax"); # signal "not fast enough [yet]" &jmp (&label("just_leave")); # While the below code provides competitive performance for - # all key lengthes on modern Intel cores, it's still more + # all key lengths on modern Intel cores, it's still more # than 10% slower for 4096-bit key elsewhere:-( "Competitive" # means compared to the original integer-only assembler. # 512-bit RSA sign is better by ~40%, but that's about all @@ -613,3 +625,5 @@ $sbit=$num; &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86.pl b/deps/openssl/openssl/crypto/bn/asm/x86.pl deleted file mode 100644 index 1bc4f1bb27..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86.pl +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/local/bin/perl - -push(@INC,"perlasm","../../perlasm"); -require "x86asm.pl"; - -require("x86/mul_add.pl"); -require("x86/mul.pl"); -require("x86/sqr.pl"); -require("x86/div.pl"); -require("x86/add.pl"); -require("x86/sub.pl"); -require("x86/comba.pl"); - -&asm_init($ARGV[0],$0); - -&bn_mul_add_words("bn_mul_add_words"); -&bn_mul_words("bn_mul_words"); -&bn_sqr_words("bn_sqr_words"); -&bn_div_words("bn_div_words"); -&bn_add_words("bn_add_words"); -&bn_sub_words("bn_sub_words"); -&bn_mul_comba("bn_mul_comba8",8); -&bn_mul_comba("bn_mul_comba4",4); -&bn_sqr_comba("bn_sqr_comba8",8); -&bn_sqr_comba("bn_sqr_comba4",4); - -&asm_finish(); - diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/add.pl b/deps/openssl/openssl/crypto/bn/asm/x86/add.pl deleted file mode 100644 index 0b5cf583e3..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86/add.pl +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_add_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $a="esi"; - $b="edi"; - $c="eax"; - $r="ebx"; - $tmp1="ecx"; - $tmp2="edx"; - $num="ebp"; - - &mov($r,&wparam(0)); # get r - &mov($a,&wparam(1)); # get a - &mov($b,&wparam(2)); # get b - &mov($num,&wparam(3)); # get num - &xor($c,$c); # clear carry - &and($num,0xfffffff8); # num / 8 - - &jz(&label("aw_finish")); - - &set_label("aw_loop",0); - for ($i=0; $i<8; $i++) - { - &comment("Round $i"); - - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0)); # *b - &add($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &add($tmp1,$tmp2); - &adc($c,0); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - } - - &comment(""); - &add($a,32); - &add($b,32); - &add($r,32); - &sub($num,8); - &jnz(&label("aw_loop")); - - &set_label("aw_finish",0); - &mov($num,&wparam(3)); # get num - &and($num,7); - &jz(&label("aw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0));# *b - &add($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &add($tmp1,$tmp2); - &adc($c,0); - &dec($num) if ($i != 6); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *a - &jz(&label("aw_end")) if ($i != 6); - } - &set_label("aw_end",0); - -# &mov("eax",$c); # $c is "eax" - - &function_end($name); - } - -1; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/comba.pl b/deps/openssl/openssl/crypto/bn/asm/x86/comba.pl deleted file mode 100644 index 2291253629..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86/comba.pl +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub mul_add_c - { - local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; - - # pos == -1 if eax and edx are pre-loaded, 0 to load from next - # words, and 1 if load return value - - &comment("mul a[$ai]*b[$bi]"); - - # "eax" and "edx" will always be pre-loaded. - # &mov("eax",&DWP($ai*4,$a,"",0)) ; - # &mov("edx",&DWP($bi*4,$b,"",0)); - - &mul("edx"); - &add($c0,"eax"); - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a - &mov("eax",&wparam(0)) if $pos > 0; # load r[] - ### - &adc($c1,"edx"); - &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b - &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b - ### - &adc($c2,0); - # is pos > 1, it means it is the last loop - &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[]; - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a - } - -sub sqr_add_c - { - local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; - - # pos == -1 if eax and edx are pre-loaded, 0 to load from next - # words, and 1 if load return value - - &comment("sqr a[$ai]*a[$bi]"); - - # "eax" and "edx" will always be pre-loaded. - # &mov("eax",&DWP($ai*4,$a,"",0)) ; - # &mov("edx",&DWP($bi*4,$b,"",0)); - - if ($ai == $bi) - { &mul("eax");} - else - { &mul("edx");} - &add($c0,"eax"); - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a - ### - &adc($c1,"edx"); - &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb); - ### - &adc($c2,0); - # is pos > 1, it means it is the last loop - &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b - } - -sub sqr_add_c2 - { - local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; - - # pos == -1 if eax and edx are pre-loaded, 0 to load from next - # words, and 1 if load return value - - &comment("sqr a[$ai]*a[$bi]"); - - # "eax" and "edx" will always be pre-loaded. - # &mov("eax",&DWP($ai*4,$a,"",0)) ; - # &mov("edx",&DWP($bi*4,$a,"",0)); - - if ($ai == $bi) - { &mul("eax");} - else - { &mul("edx");} - &add("eax","eax"); - ### - &adc("edx","edx"); - ### - &adc($c2,0); - &add($c0,"eax"); - &adc($c1,"edx"); - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b - &adc($c2,0); - &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; - &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb); - ### - } - -sub bn_mul_comba - { - local($name,$num)=@_; - local($a,$b,$c0,$c1,$c2); - local($i,$as,$ae,$bs,$be,$ai,$bi); - local($tot,$end); - - &function_begin_B($name,""); - - $c0="ebx"; - $c1="ecx"; - $c2="ebp"; - $a="esi"; - $b="edi"; - - $as=0; - $ae=0; - $bs=0; - $be=0; - $tot=$num+$num-1; - - &push("esi"); - &mov($a,&wparam(1)); - &push("edi"); - &mov($b,&wparam(2)); - &push("ebp"); - &push("ebx"); - - &xor($c0,$c0); - &mov("eax",&DWP(0,$a,"",0)); # load the first word - &xor($c1,$c1); - &mov("edx",&DWP(0,$b,"",0)); # load the first second - - for ($i=0; $i<$tot; $i++) - { - $ai=$as; - $bi=$bs; - $end=$be+1; - - &comment("################## Calculate word $i"); - - for ($j=$bs; $j<$end; $j++) - { - &xor($c2,$c2) if ($j == $bs); - if (($j+1) == $end) - { - $v=1; - $v=2 if (($i+1) == $tot); - } - else - { $v=0; } - if (($j+1) != $end) - { - $na=($ai-1); - $nb=($bi+1); - } - else - { - $na=$as+($i < ($num-1)); - $nb=$bs+($i >= ($num-1)); - } -#printf STDERR "[$ai,$bi] -> [$na,$nb]\n"; - &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb); - if ($v) - { - &comment("saved r[$i]"); - # &mov("eax",&wparam(0)); - # &mov(&DWP($i*4,"eax","",0),$c0); - ($c0,$c1,$c2)=($c1,$c2,$c0); - } - $ai--; - $bi++; - } - $as++ if ($i < ($num-1)); - $ae++ if ($i >= ($num-1)); - - $bs++ if ($i >= ($num-1)); - $be++ if ($i < ($num-1)); - } - &comment("save r[$i]"); - # &mov("eax",&wparam(0)); - &mov(&DWP($i*4,"eax","",0),$c0); - - &pop("ebx"); - &pop("ebp"); - &pop("edi"); - &pop("esi"); - &ret(); - &function_end_B($name); - } - -sub bn_sqr_comba - { - local($name,$num)=@_; - local($r,$a,$c0,$c1,$c2)=@_; - local($i,$as,$ae,$bs,$be,$ai,$bi); - local($b,$tot,$end,$half); - - &function_begin_B($name,""); - - $c0="ebx"; - $c1="ecx"; - $c2="ebp"; - $a="esi"; - $r="edi"; - - &push("esi"); - &push("edi"); - &push("ebp"); - &push("ebx"); - &mov($r,&wparam(0)); - &mov($a,&wparam(1)); - &xor($c0,$c0); - &xor($c1,$c1); - &mov("eax",&DWP(0,$a,"",0)); # load the first word - - $as=0; - $ae=0; - $bs=0; - $be=0; - $tot=$num+$num-1; - - for ($i=0; $i<$tot; $i++) - { - $ai=$as; - $bi=$bs; - $end=$be+1; - - &comment("############### Calculate word $i"); - for ($j=$bs; $j<$end; $j++) - { - &xor($c2,$c2) if ($j == $bs); - if (($ai-1) < ($bi+1)) - { - $v=1; - $v=2 if ($i+1) == $tot; - } - else - { $v=0; } - if (!$v) - { - $na=$ai-1; - $nb=$bi+1; - } - else - { - $na=$as+($i < ($num-1)); - $nb=$bs+($i >= ($num-1)); - } - if ($ai == $bi) - { - &sqr_add_c($r,$a,$ai,$bi, - $c0,$c1,$c2,$v,$i,$na,$nb); - } - else - { - &sqr_add_c2($r,$a,$ai,$bi, - $c0,$c1,$c2,$v,$i,$na,$nb); - } - if ($v) - { - &comment("saved r[$i]"); - #&mov(&DWP($i*4,$r,"",0),$c0); - ($c0,$c1,$c2)=($c1,$c2,$c0); - last; - } - $ai--; - $bi++; - } - $as++ if ($i < ($num-1)); - $ae++ if ($i >= ($num-1)); - - $bs++ if ($i >= ($num-1)); - $be++ if ($i < ($num-1)); - } - &mov(&DWP($i*4,$r,"",0),$c0); - &pop("ebx"); - &pop("ebp"); - &pop("edi"); - &pop("esi"); - &ret(); - &function_end_B($name); - } - -1; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/div.pl b/deps/openssl/openssl/crypto/bn/asm/x86/div.pl deleted file mode 100644 index 0e90152caa..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86/div.pl +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_div_words - { - local($name)=@_; - - &function_begin($name,""); - &mov("edx",&wparam(0)); # - &mov("eax",&wparam(1)); # - &mov("ebx",&wparam(2)); # - &div("ebx"); - &function_end($name); - } -1; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/f b/deps/openssl/openssl/crypto/bn/asm/x86/f deleted file mode 100644 index 22e4112224..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86/f +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/mul.pl b/deps/openssl/openssl/crypto/bn/asm/x86/mul.pl deleted file mode 100644 index 674cb9b055..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86/mul.pl +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_mul_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $Low="eax"; - $High="edx"; - $a="ebx"; - $w="ecx"; - $r="edi"; - $c="esi"; - $num="ebp"; - - &xor($c,$c); # clear carry - &mov($r,&wparam(0)); # - &mov($a,&wparam(1)); # - &mov($num,&wparam(2)); # - &mov($w,&wparam(3)); # - - &and($num,0xfffffff8); # num / 8 - &jz(&label("mw_finish")); - - &set_label("mw_loop",0); - for ($i=0; $i<32; $i+=4) - { - &comment("Round $i"); - - &mov("eax",&DWP($i,$a,"",0)); # *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+=c - # XXX - - &adc("edx",0); # H(t)+=carry - &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); - - &mov($c,"edx"); # c= H(t); - } - - &comment(""); - &add($a,32); - &add($r,32); - &sub($num,8); - &jz(&label("mw_finish")); - &jmp(&label("mw_loop")); - - &set_label("mw_finish",0); - &mov($num,&wparam(2)); # get num - &and($num,7); - &jnz(&label("mw_finish2")); - &jmp(&label("mw_end")); - - &set_label("mw_finish2",1); - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov("eax",&DWP($i*4,$a,"",0));# *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+=c - # XXX - &adc("edx",0); # H(t)+=carry - &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t); - &mov($c,"edx"); # c= H(t); - &dec($num) if ($i != 7-1); - &jz(&label("mw_end")) if ($i != 7-1); - } - &set_label("mw_end",0); - &mov("eax",$c); - - &function_end($name); - } - -1; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/mul_add.pl b/deps/openssl/openssl/crypto/bn/asm/x86/mul_add.pl deleted file mode 100644 index 61830d3a90..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86/mul_add.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_mul_add_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $Low="eax"; - $High="edx"; - $a="ebx"; - $w="ebp"; - $r="edi"; - $c="esi"; - - &xor($c,$c); # clear carry - &mov($r,&wparam(0)); # - - &mov("ecx",&wparam(2)); # - &mov($a,&wparam(1)); # - - &and("ecx",0xfffffff8); # num / 8 - &mov($w,&wparam(3)); # - - &push("ecx"); # Up the stack for a tmp variable - - &jz(&label("maw_finish")); - - &set_label("maw_loop",0); - - &mov(&swtmp(0),"ecx"); # - - for ($i=0; $i<32; $i+=4) - { - &comment("Round $i"); - - &mov("eax",&DWP($i,$a,"",0)); # *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+= *r - &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r - &adc("edx",0); # H(t)+=carry - &add("eax",$c); # L(t)+=c - &adc("edx",0); # H(t)+=carry - &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); - &mov($c,"edx"); # c= H(t); - } - - &comment(""); - &mov("ecx",&swtmp(0)); # - &add($a,32); - &add($r,32); - &sub("ecx",8); - &jnz(&label("maw_loop")); - - &set_label("maw_finish",0); - &mov("ecx",&wparam(2)); # get num - &and("ecx",7); - &jnz(&label("maw_finish2")); # helps branch prediction - &jmp(&label("maw_end")); - - &set_label("maw_finish2",1); - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov("eax",&DWP($i*4,$a,"",0));# *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+=c - &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r - &adc("edx",0); # H(t)+=carry - &add("eax",$c); - &adc("edx",0); # H(t)+=carry - &dec("ecx") if ($i != 7-1); - &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t); - &mov($c,"edx"); # c= H(t); - &jz(&label("maw_end")) if ($i != 7-1); - } - &set_label("maw_end",0); - &mov("eax",$c); - - &pop("ecx"); # clear variable from - - &function_end($name); - } - -1; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/sqr.pl b/deps/openssl/openssl/crypto/bn/asm/x86/sqr.pl deleted file mode 100644 index 1f90993cf6..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86/sqr.pl +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_sqr_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $r="esi"; - $a="edi"; - $num="ebx"; - - &mov($r,&wparam(0)); # - &mov($a,&wparam(1)); # - &mov($num,&wparam(2)); # - - &and($num,0xfffffff8); # num / 8 - &jz(&label("sw_finish")); - - &set_label("sw_loop",0); - for ($i=0; $i<32; $i+=4) - { - &comment("Round $i"); - &mov("eax",&DWP($i,$a,"",0)); # *a - # XXX - &mul("eax"); # *a * *a - &mov(&DWP($i*2,$r,"",0),"eax"); # - &mov(&DWP($i*2+4,$r,"",0),"edx");# - } - - &comment(""); - &add($a,32); - &add($r,64); - &sub($num,8); - &jnz(&label("sw_loop")); - - &set_label("sw_finish",0); - &mov($num,&wparam(2)); # get num - &and($num,7); - &jz(&label("sw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov("eax",&DWP($i*4,$a,"",0)); # *a - # XXX - &mul("eax"); # *a * *a - &mov(&DWP($i*8,$r,"",0),"eax"); # - &dec($num) if ($i != 7-1); - &mov(&DWP($i*8+4,$r,"",0),"edx"); - &jz(&label("sw_end")) if ($i != 7-1); - } - &set_label("sw_end",0); - - &function_end($name); - } - -1; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/sub.pl b/deps/openssl/openssl/crypto/bn/asm/x86/sub.pl deleted file mode 100644 index 837b0e1b07..0000000000 --- a/deps/openssl/openssl/crypto/bn/asm/x86/sub.pl +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_sub_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $a="esi"; - $b="edi"; - $c="eax"; - $r="ebx"; - $tmp1="ecx"; - $tmp2="edx"; - $num="ebp"; - - &mov($r,&wparam(0)); # get r - &mov($a,&wparam(1)); # get a - &mov($b,&wparam(2)); # get b - &mov($num,&wparam(3)); # get num - &xor($c,$c); # clear carry - &and($num,0xfffffff8); # num / 8 - - &jz(&label("aw_finish")); - - &set_label("aw_loop",0); - for ($i=0; $i<8; $i++) - { - &comment("Round $i"); - - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0)); # *b - &sub($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &sub($tmp1,$tmp2); - &adc($c,0); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - } - - &comment(""); - &add($a,32); - &add($b,32); - &add($r,32); - &sub($num,8); - &jnz(&label("aw_loop")); - - &set_label("aw_finish",0); - &mov($num,&wparam(3)); # get num - &and($num,7); - &jz(&label("aw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0));# *b - &sub($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &sub($tmp1,$tmp2); - &adc($c,0); - &dec($num) if ($i != 6); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *a - &jz(&label("aw_end")) if ($i != 6); - } - &set_label("aw_end",0); - -# &mov("eax",$c); # $c is "eax" - - &function_end($name); - } - -1; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-gcc.c b/deps/openssl/openssl/crypto/bn/asm/x86_64-gcc.c index 1729b479d4..0ff3805a61 100644 --- a/deps/openssl/openssl/crypto/bn/asm/x86_64-gcc.c +++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-gcc.c @@ -1,3 +1,12 @@ +/* + * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + #include "../bn_lcl.h" #if !(defined(__GNUC__) && __GNUC__>=2) # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ @@ -216,9 +225,10 @@ BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, " adcq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" " lea 1(%2),%2 \n" - " loop 1b \n" - " sbbq %0,%0 \n":"=&r" (ret), "+c"(n), - "+r"(i) + " dec %1 \n" + " jnz 1b \n" + " sbbq %0,%0 \n" + :"=&r" (ret), "+c"(n), "+r"(i) :"r"(rp), "r"(ap), "r"(bp) :"cc", "memory"); @@ -242,9 +252,10 @@ BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, " sbbq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" " lea 1(%2),%2 \n" - " loop 1b \n" - " sbbq %0,%0 \n":"=&r" (ret), "+c"(n), - "+r"(i) + " dec %1 \n" + " jnz 1b \n" + " sbbq %0,%0 \n" + :"=&r" (ret), "+c"(n), "+r"(i) :"r"(rp), "r"(ap), "r"(bp) :"cc", "memory"); diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-gf2m.pl index 42bbec2fb7..d962f62033 100644 --- a/deps/openssl/openssl/crypto/bn/asm/x86_64-gf2m.pl +++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -31,7 +38,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; ($lo,$hi)=("%rax","%rdx"); $a=$lo; diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl index 80492d8e63..df4cca5bfe 100755 --- a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -50,7 +57,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -129,7 +136,9 @@ $code.=<<___; neg $num # restore $num and \$-1024,%r10 # minimize TLB usage - # Some OSes, *cough*-dows, insist on stack being "wired" to + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl index 42178e455a..5779059ea2 100755 --- a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl +++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -35,7 +42,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -113,7 +120,9 @@ $code.=<<___; neg $num # restore $num and \$-1024,%r10 # minimize TLB usage - # Some OSes, *cough*-dows, insist on stack being "wired" to + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on |