50 files changed, 2845 insertions, 10110 deletions
diff --git a/deps/openssl/openssl/crypto/bn/asm/README b/deps/openssl/openssl/crypto/bn/asm/README
deleted file mode 100644
index b0f3a68a06..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/README
+++ /dev/null
@@ -1,27 +0,0 @@
-<OBSOLETE>
-
-All assember in this directory are just version of the file
-crypto/bn/bn_asm.c.
-
-Quite a few of these files are just the assember output from gcc since on 
-quite a few machines they are 2 times faster than the system compiler.
-
-For the x86, I have hand written assember because of the bad job all
-compilers seem to do on it.  This normally gives a 2 time speed up in the RSA
-routines.
-
-For the DEC alpha, I also hand wrote the assember (except the division which
-is just the output from the C compiler pasted on the end of the file).
-On the 2 alpha C compilers I had access to, it was not possible to do
-64b x 64b -> 128b calculations (both long and the long long data types
-were 64 bits).  So the hand assember gives access to the 128 bit result and
-a 2 times speedup :-).
-
-There are 3 versions of assember for the HP PA-RISC.
-
-pa-risc.s is the origional one which works fine and generated using gcc :-)
-
-pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations
-by Chris Ruemmler from HP (with some help from the HP C compiler).
-
-</OBSOLETE>
diff --git a/deps/openssl/openssl/crypto/bn/asm/alpha-mont.pl b/deps/openssl/openssl/crypto/bn/asm/alpha-mont.pl
index 03596e2014..1d68d6d072 100644
--- a/deps/openssl/openssl/crypto/bn/asm/alpha-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/alpha-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -15,6 +22,9 @@
 # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
 # difference.
 
+$output=pop;		# the last command-line argument names the output file
+open STDOUT,">$output";	# redirect STDOUT into that file (the result of open is not checked)
+
 # int bn_mul_mont(
 $rp="a0";	# BN_ULONG *rp,
 $ap="a1";	# const BN_ULONG *ap,
diff --git a/deps/openssl/openssl/crypto/bn/asm/armv4-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/armv4-gf2m.pl
index 72381a7724..0bb5433075 100644
--- a/deps/openssl/openssl/crypto/bn/asm/armv4-gf2m.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/armv4-gf2m.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -32,14 +39,31 @@
 # 
 # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;	# 1st argument: perlasm flavour, or directly the output file
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }	# "name.ext" => it is the output file, no flavour given
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }	# else skip arguments until one looks like a file name
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's own path
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+    # Pipe the output through arm-xlate.pl; both paths are quoted so that directories with spaces survive the shell.
+    open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+} else {
+    open STDOUT,">$output";	# no flavour (or "void"): write the code to the file unfiltered
+}
 
 $code=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
 .code	32
+#endif
 ___
 ################
 # private interface to mul_1x1_ialu
@@ -120,11 +144,17 @@ mul_1x1_ialu:
 	eor	$hi,$hi,$t0,lsr#8
 	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]
 
+#ifdef	__thumb2__
+	itt	ne
+#endif
 	eorne	$lo,$lo,$b,lsl#30
 	eorne	$hi,$hi,$b,lsr#2
 	tst	$a,#1<<31
 	eor	$lo,$lo,$t1,lsl#27
 	eor	$hi,$hi,$t1,lsr#5
+#ifdef	__thumb2__
+	itt	ne
+#endif
 	eorne	$lo,$lo,$b,lsl#31
 	eorne	$hi,$hi,$b,lsr#1
 	eor	$lo,$lo,$t0,lsl#30
@@ -144,20 +174,33 @@ $code.=<<___;
 .align	5
 bn_GF2m_mul_2x2:
 #if __ARM_MAX_ARCH__>=7
+	stmdb	sp!,{r10,lr}
 	ldr	r12,.LOPENSSL_armcap
-.Lpic:	ldr	r12,[pc,r12]
-	tst	r12,#1
+	adr	r10,.LOPENSSL_armcap
+	ldr	r12,[r12,r10]
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV7_NEON
+	itt	ne
+	ldrne	r10,[sp],#8
 	bne	.LNEON
+	stmdb	sp!,{r4-r9}
+#else
+	stmdb	sp!,{r4-r10,lr}
 #endif
 ___
 $ret="r10";	# reassigned 1st argument
 $code.=<<___;
-	stmdb	sp!,{r4-r10,lr}
 	mov	$ret,r0			@ reassign 1st argument
 	mov	$b,r3			@ $b=b1
+	sub	r7,sp,#36
+	mov	r8,sp
+	and	r7,r7,#-32
 	ldr	r3,[sp,#32]		@ load b0
 	mov	$mask,#7<<2
-	sub	sp,sp,#32		@ allocate tab[8]
+	mov	sp,r7			@ allocate tab[8]
+	str	r8,[r7,#32]
 
 	bl	mul_1x1_ialu		@ a1·b1
 	str	$lo,[$ret,#8]
@@ -181,6 +224,7 @@ ___
 $code.=<<___;
 	ldmia	$ret,{@r[0]-@r[3]}
 	eor	$lo,$lo,$hi
+	ldr	sp,[sp,#32]		@ destroy tab[8]
 	eor	$hi,$hi,@r[1]
 	eor	$lo,$lo,@r[0]
 	eor	$hi,$hi,@r[2]
@@ -188,7 +232,6 @@ $code.=<<___;
 	eor	$hi,$hi,@r[3]
 	str	$hi,[$ret,#8]
 	eor	$lo,$lo,$hi
-	add	sp,sp,#32		@ destroy tab[8]
 	str	$lo,[$ret,#4]
 
 #if __ARM_ARCH__>=5
@@ -213,8 +256,8 @@ $code.=<<___;
 .align	5
 .LNEON:
 	ldr		r12, [sp]		@ 5th argument
-	vmov.32		$a, r2, r1
-	vmov.32		$b, r12, r3
+	vmov		$a, r2, r1
+	vmov		$b, r12, r3
 	vmov.i64	$k48, #0x0000ffffffffffff
 	vmov.i64	$k32, #0x00000000ffffffff
 	vmov.i64	$k16, #0x000000000000ffff
@@ -267,7 +310,7 @@ $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
 .align	5
 .LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-(.Lpic+8)
+.word	OPENSSL_armcap_P-.
 #endif
 .asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	5
diff --git a/deps/openssl/openssl/crypto/bn/asm/armv4-mont.pl b/deps/openssl/openssl/crypto/bn/asm/armv4-mont.pl
index 1d330e9f8a..0dc4fe95e4 100644
--- a/deps/openssl/openssl/crypto/bn/asm/armv4-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/armv4-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -38,8 +45,29 @@
 # for execution on all NEON-capable processors, because gain on
 # others outweighs the marginal loss on Cortex-A9.
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+# September 2015
+#
+# Align Cortex-A9 performance with November 2013 improvements, i.e.
+# NEON code is now ~20-105% faster than integer-only one on this
+# processor. But this optimization further improved performance even
+# on other processors: NEON code path is ~45-180% faster than original
+# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
+# Snapdragon S4.
+
+$flavour = shift;	# 1st argument: perlasm flavour, or directly the output file
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }	# "name.ext" => it is the output file, no flavour given
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }	# else skip arguments until one looks like a file name
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's own path
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+    # Pipe the output through arm-xlate.pl; both paths are quoted so that directories with spaces survive the shell.
+    open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+} else {
+    open STDOUT,">$output";	# no flavour (or "void"): write the code to the file unfiltered
+}
 
 $num="r0";	# starts as num argument, but holds &tp[num-1]
 $ap="r1";
@@ -70,12 +98,17 @@ $code=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
 .code	32
+#endif
 
 #if __ARM_MAX_ARCH__>=7
 .align	5
 .LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-bn_mul_mont
+.word	OPENSSL_armcap_P-.Lbn_mul_mont
 #endif
 
 .global	bn_mul_mont
@@ -83,15 +116,19 @@ $code=<<___;
 
 .align	5
 bn_mul_mont:
+.Lbn_mul_mont:
 	ldr	ip,[sp,#4]		@ load num
 	stmdb	sp!,{r0,r2}		@ sp points at argument block
 #if __ARM_MAX_ARCH__>=7
 	tst	ip,#7
 	bne	.Lialu
-	adr	r0,bn_mul_mont
+	adr	r0,.Lbn_mul_mont
 	ldr	r2,.LOPENSSL_armcap
 	ldr	r0,[r0,r2]
-	tst	r0,#1			@ NEON available?
+#ifdef	__APPLE__
+	ldr	r0,[r0]
+#endif
+	tst	r0,#ARMV7_NEON		@ NEON available?
 	ldmia	sp, {r0,r2}
 	beq	.Lialu
 	add	sp,sp,#8
@@ -101,6 +138,9 @@ bn_mul_mont:
 #endif
 	cmp	ip,#2
 	mov	$num,ip			@ load num
+#ifdef	__thumb2__
+	ittt	lt
+#endif
 	movlt	r0,#0
 	addlt	sp,sp,#2*4
 	blt	.Labrt
@@ -148,10 +188,11 @@ bn_mul_mont:
 	ldr	$n0,[$_n0]		@ restore n0
 	adc	$nhi,$nhi,#0
 	str	$nlo,[$num]		@ tp[num-1]=
+	mov	$tj,sp
 	str	$nhi,[$num,#4]		@ tp[num]=
 
 .Louter:
-	sub	$tj,$num,sp		@ "original" $num-1 value
+	sub	$tj,$num,$tj		@ "original" $num-1 value
 	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
 	ldr	$bi,[$tp,#4]!		@ *(++bp)
 	sub	$np,$np,$tj		@ "rewind" np to &np[1]
@@ -196,11 +237,16 @@ bn_mul_mont:
 	str	$nhi,[$num,#4]		@ tp[num]=
 
 	cmp	$tp,$tj
+#ifdef	__thumb2__
+	itt	ne
+#endif
+	movne	$tj,sp
 	bne	.Louter
 
 	ldr	$rp,[$_rp]		@ pull rp
+	mov	$aj,sp
 	add	$num,$num,#4		@ $num to point at &tp[num]
-	sub	$aj,$num,sp		@ "original" num value
+	sub	$aj,$num,$aj		@ "original" num value
 	mov	$tp,sp			@ "rewind" $tp
 	mov	$ap,$tp			@ "borrow" $ap
 	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
@@ -226,7 +272,8 @@ bn_mul_mont:
 	cmp	$tp,$num
 	bne	.Lcopy
 
-	add	sp,$num,#4		@ skip over tp[num+1]
+	mov	sp,$num
+	add	sp,sp,#4		@ skip over tp[num+1]
 	ldmia	sp!,{r4-r12,lr}		@ restore registers
 	add	sp,sp,#2*4		@ skip over {r0,r2}
 	mov	r0,#1
@@ -241,19 +288,16 @@ bn_mul_mont:
 .size	bn_mul_mont,.-bn_mul_mont
 ___
 {
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-
 my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
 my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
 my ($Z,$Temp)=("q4","q5");
-my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
+my @ACC=map("q$_",(6..13));
 my ($Bi,$Ni,$M0)=map("d$_",(28..31));
-my $zero=&Dlo($Z);
-my $temp=&Dlo($Temp);
+my $zero="$Z#lo";
+my $temp="$Temp#lo";
 
 my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
-my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
 
 $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
@@ -267,60 +311,60 @@ bn_mul8x_mont_neon:
 	stmdb	sp!,{r4-r11}
 	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 	ldmia	ip,{r4-r5}		@ load rest of parameter block
+	mov	ip,sp
+
+	cmp	$num,#8
+	bhi	.LNEON_8n
+
+	@ special case for $num==8, everything is in register bank...
 
-	sub		$toutptr,sp,#16
 	vld1.32		{${Bi}[0]}, [$bptr,:32]!
-	sub		$toutptr,$toutptr,$num,lsl#4
+	veor		$zero,$zero,$zero
+	sub		$toutptr,sp,$num,lsl#4
 	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
 	and		$toutptr,$toutptr,#-64
 	vld1.32		{${M0}[0]}, [$n0,:32]
 	mov		sp,$toutptr			@ alloca
-	veor		$zero,$zero,$zero
-	subs		$inner,$num,#8
 	vzip.16		$Bi,$zero
 
-	vmull.u32	$A0xB,$Bi,${A0}[0]
-	vmull.u32	$A1xB,$Bi,${A0}[1]
-	vmull.u32	$A2xB,$Bi,${A1}[0]
-	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
-	vmull.u32	$A3xB,$Bi,${A1}[1]
+	vmull.u32	@ACC[0],$Bi,${A0}[0]
+	vmull.u32	@ACC[1],$Bi,${A0}[1]
+	vmull.u32	@ACC[2],$Bi,${A1}[0]
+	vshl.i64	$Ni,@ACC[0]#hi,#16
+	vmull.u32	@ACC[3],$Bi,${A1}[1]
 
-	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	vadd.u64	$Ni,$Ni,@ACC[0]#lo
 	veor		$zero,$zero,$zero
-	vmul.u32	$Ni,$temp,$M0
+	vmul.u32	$Ni,$Ni,$M0
 
-	vmull.u32	$A4xB,$Bi,${A2}[0]
+	vmull.u32	@ACC[4],$Bi,${A2}[0]
 	 vld1.32	{$N0-$N3}, [$nptr]!
-	vmull.u32	$A5xB,$Bi,${A2}[1]
-	vmull.u32	$A6xB,$Bi,${A3}[0]
+	vmull.u32	@ACC[5],$Bi,${A2}[1]
+	vmull.u32	@ACC[6],$Bi,${A3}[0]
 	vzip.16		$Ni,$zero
-	vmull.u32	$A7xB,$Bi,${A3}[1]
-
-	bne	.LNEON_1st
-
-	@ special case for num=8, everything is in register bank...
+	vmull.u32	@ACC[7],$Bi,${A3}[1]
 
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
 	sub		$outer,$num,#1
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	vmov		$Temp,$A0xB
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	vmov		$A0xB,$A1xB
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	vmov		$A1xB,$A2xB
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	vmov		$A2xB,$A3xB
-	vmov		$A3xB,$A4xB
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	vmov		$Temp,@ACC[0]
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	vmov		@ACC[0],@ACC[1]
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmov		@ACC[1],@ACC[2]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	vmov		@ACC[2],@ACC[3]
+	vmov		@ACC[3],@ACC[4]
 	vshr.u64	$temp,$temp,#16
-	vmov		$A4xB,$A5xB
-	vmov		$A5xB,$A6xB
-	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
-	vmov		$A6xB,$A7xB
-	veor		$A7xB,$A7xB
+	vmov		@ACC[4],@ACC[5]
+	vmov		@ACC[5],@ACC[6]
+	vadd.u64	$temp,$temp,$Temp#hi
+	vmov		@ACC[6],@ACC[7]
+	veor		@ACC[7],@ACC[7]
 	vshr.u64	$temp,$temp,#16
 
 	b	.LNEON_outer8
@@ -330,279 +374,302 @@ bn_mul8x_mont_neon:
 	vld1.32		{${Bi}[0]}, [$bptr,:32]!
 	veor		$zero,$zero,$zero
 	vzip.16		$Bi,$zero
-	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
 
-	vmlal.u32	$A0xB,$Bi,${A0}[0]
-	vmlal.u32	$A1xB,$Bi,${A0}[1]
-	vmlal.u32	$A2xB,$Bi,${A1}[0]
-	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
-	vmlal.u32	$A3xB,$Bi,${A1}[1]
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	vshl.i64	$Ni,@ACC[0]#hi,#16
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
 
-	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	vadd.u64	$Ni,$Ni,@ACC[0]#lo
 	veor		$zero,$zero,$zero
 	subs		$outer,$outer,#1
-	vmul.u32	$Ni,$temp,$M0
+	vmul.u32	$Ni,$Ni,$M0
 
-	vmlal.u32	$A4xB,$Bi,${A2}[0]
-	vmlal.u32	$A5xB,$Bi,${A2}[1]
-	vmlal.u32	$A6xB,$Bi,${A3}[0]
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
 	vzip.16		$Ni,$zero
-	vmlal.u32	$A7xB,$Bi,${A3}[1]
-
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	vmov		$Temp,$A0xB
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	vmov		$A0xB,$A1xB
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	vmov		$A1xB,$A2xB
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	vmov		$A2xB,$A3xB
-	vmov		$A3xB,$A4xB
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	vmov		$Temp,@ACC[0]
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	vmov		@ACC[0],@ACC[1]
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmov		@ACC[1],@ACC[2]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	vmov		@ACC[2],@ACC[3]
+	vmov		@ACC[3],@ACC[4]
 	vshr.u64	$temp,$temp,#16
-	vmov		$A4xB,$A5xB
-	vmov		$A5xB,$A6xB
-	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
-	vmov		$A6xB,$A7xB
-	veor		$A7xB,$A7xB
+	vmov		@ACC[4],@ACC[5]
+	vmov		@ACC[5],@ACC[6]
+	vadd.u64	$temp,$temp,$Temp#hi
+	vmov		@ACC[6],@ACC[7]
+	veor		@ACC[7],@ACC[7]
 	vshr.u64	$temp,$temp,#16
 
 	bne	.LNEON_outer8
 
-	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
 	mov		$toutptr,sp
-	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
+	vshr.u64	$temp,@ACC[0]#lo,#16
 	mov		$inner,$num
-	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
-	add		$tinptr,sp,#16
-	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
-	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
+	add		$tinptr,sp,#96
+	vshr.u64	$temp,@ACC[0]#hi,#16
+	vzip.16		@ACC[0]#lo,@ACC[0]#hi
 
-	b	.LNEON_tail2
+	b	.LNEON_tail_entry
 
 .align	4
-.LNEON_1st:
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	 vld1.32	{$A0-$A3}, [$aptr]!
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
+.LNEON_8n:
+	veor		@ACC[0],@ACC[0],@ACC[0]
+	 sub		$toutptr,sp,#128
+	veor		@ACC[1],@ACC[1],@ACC[1]
+	 sub		$toutptr,$toutptr,$num,lsl#4
+	veor		@ACC[2],@ACC[2],@ACC[2]
+	 and		$toutptr,$toutptr,#-64
+	veor		@ACC[3],@ACC[3],@ACC[3]
+	 mov		sp,$toutptr			@ alloca
+	veor		@ACC[4],@ACC[4],@ACC[4]
+	 add		$toutptr,$toutptr,#256
+	veor		@ACC[5],@ACC[5],@ACC[5]
+	 sub		$inner,$num,#8
+	veor		@ACC[6],@ACC[6],@ACC[6]
+	veor		@ACC[7],@ACC[7],@ACC[7]
+
+.LNEON_8n_init:
+	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
 	subs		$inner,$inner,#8
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	 vld1.32	{$N0-$N1}, [$nptr]!
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
-
-	vmull.u32	$A0xB,$Bi,${A0}[0]
-	 vld1.32	{$N2-$N3}, [$nptr]!
-	vmull.u32	$A1xB,$Bi,${A0}[1]
-	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
-	vmull.u32	$A2xB,$Bi,${A1}[0]
-	vmull.u32	$A3xB,$Bi,${A1}[1]
-	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
-
-	vmull.u32	$A4xB,$Bi,${A2}[0]
-	vmull.u32	$A5xB,$Bi,${A2}[1]
-	vmull.u32	$A6xB,$Bi,${A3}[0]
-	vmull.u32	$A7xB,$Bi,${A3}[1]
-
-	bne	.LNEON_1st
-
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	add		$tinptr,sp,#16
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	 vld1.64	{$Temp}, [sp,:128]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-	sub		$outer,$num,#1
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	vshr.u64	$temp,$temp,#16
-	 vld1.64	{$A0xB},       [$tinptr, :128]!
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-
-	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
-	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
-	veor		$Z,$Z,$Z
-	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
-	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
-	vst1.64		{$Z},          [$toutptr,:128]
-	vshr.u64	$temp,$temp,#16
-
-	b		.LNEON_outer
+	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
+	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
+	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
+	bne		.LNEON_8n_init
+
+	add		$tinptr,sp,#256
+	vld1.32		{$A0-$A3},[$aptr]!
+	add		$bnptr,sp,#8
+	vld1.32		{${M0}[0]},[$n0,:32]
+	mov		$outer,$num
+	b		.LNEON_8n_outer
 
 .align	4
-.LNEON_outer:
-	vld1.32		{${Bi}[0]}, [$bptr,:32]!
-	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
-	vld1.32		{$A0-$A3},  [$aptr]!
+.LNEON_8n_outer:
+	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
 	veor		$zero,$zero,$zero
-	mov		$toutptr,sp
 	vzip.16		$Bi,$zero
+	add		$toutptr,sp,#128
+	vld1.32		{$N0-$N3},[$nptr]!
+
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	 veor		$zero,$zero,$zero
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	 vshl.i64	$Ni,@ACC[0]#hi,#16
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
+	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	 vmul.u32	$Ni,$Ni,$M0
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
+	 vzip.16	$Ni,$zero
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	veor		$temp,$temp,$temp
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vzip.16		$Bi,$temp
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
+	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
+___
+	push(@ACC,shift(@ACC));	$i++;
+$code.=<<___;
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vld1.64		{@ACC[7]},[$tinptr,:128]!
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	 veor		$zero,$zero,$zero
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	 vshl.i64	$Ni,@ACC[0]#hi,#16
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
+	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	 vmul.u32	$Ni,$Ni,$M0
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
+	 vzip.16	$Ni,$zero
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+___
+}
+$code.=<<___;
+	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	vld1.32		{$A0-$A3},[$aptr]!
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
+	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
+	add		$bnptr,sp,#8		@ rewind
+___
+	push(@ACC,shift(@ACC));
+$code.=<<___;
 	sub		$inner,$num,#8
-	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
-
-	vmlal.u32	$A0xB,$Bi,${A0}[0]
-	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
-	vmlal.u32	$A1xB,$Bi,${A0}[1]
-	vmlal.u32	$A2xB,$Bi,${A1}[0]
-	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
-	vmlal.u32	$A3xB,$Bi,${A1}[1]
+	b		.LNEON_8n_inner
 
-	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
-	veor		$zero,$zero,$zero
-	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
-	 vld1.64	{$A7xB},[$tinptr,:128]!
-	vmul.u32	$Ni,$temp,$M0
-
-	vmlal.u32	$A4xB,$Bi,${A2}[0]
-	 vld1.32	{$N0-$N3}, [$nptr]!
-	vmlal.u32	$A5xB,$Bi,${A2}[1]
-	vmlal.u32	$A6xB,$Bi,${A3}[0]
-	vzip.16		$Ni,$zero
-	vmlal.u32	$A7xB,$Bi,${A3}[1]
-
-.LNEON_inner:
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	 vld1.32	{$A0-$A3}, [$aptr]!
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	 subs		$inner,$inner,#8
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	 vld1.64	{$A0xB},       [$tinptr, :128]!
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
-
-	vmlal.u32	$A0xB,$Bi,${A0}[0]
-	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
-	vmlal.u32	$A1xB,$Bi,${A0}[1]
-	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
-	vmlal.u32	$A2xB,$Bi,${A1}[0]
-	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
-	vmlal.u32	$A3xB,$Bi,${A1}[1]
-	 vld1.32	{$N0-$N3}, [$nptr]!
-
-	vmlal.u32	$A4xB,$Bi,${A2}[0]
-	 vld1.64	{$A7xB},       [$tinptr, :128]!
-	vmlal.u32	$A5xB,$Bi,${A2}[1]
-	vmlal.u32	$A6xB,$Bi,${A3}[0]
-	vmlal.u32	$A7xB,$Bi,${A3}[1]
-
-	bne	.LNEON_inner
-
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	add		$tinptr,sp,#16
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	 vld1.64	{$Temp}, [sp,:128]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-	subs		$outer,$outer,#1
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	 vld1.64	{$A0xB},       [$tinptr, :128]!
-	vshr.u64	$temp,$temp,#16
-	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-
-	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
-	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
-	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
-	vshr.u64	$temp,$temp,#16
-
-	bne	.LNEON_outer
+.align	4
+.LNEON_8n_inner:
+	subs		$inner,$inner,#8
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vld1.64		{@ACC[7]},[$tinptr,:128]
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	vld1.32		{$N0-$N3},[$nptr]!
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
+	it		ne
+	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	vst1.64		{@ACC[0]},[$toutptr,:128]!
+___
+	push(@ACC,shift(@ACC));
+$code.=<<___;
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vld1.64		{@ACC[7]},[$tinptr,:128]
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	it		ne
+	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+___
+}
+$code.=<<___;
+	it		eq
+	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vld1.32		{$A0-$A3},[$aptr]!
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	add		$bnptr,sp,#8		@ rewind
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vst1.64		{@ACC[0]},[$toutptr,:128]!
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+
+	bne		.LNEON_8n_inner
+___
+	push(@ACC,shift(@ACC));
+$code.=<<___;
+	add		$tinptr,sp,#128
+	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
+	veor		q2,q2,q2		@ $N0-$N1
+	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
+	veor		q3,q3,q3		@ $N2-$N3
+	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
+	vst1.64		{@ACC[6]},[$toutptr,:128]
+
+	subs		$outer,$outer,#8
+	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
+	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
+	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
+	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!
+
+	itt		ne
+	subne		$nptr,$nptr,$num,lsl#2	@ rewind
+	bne		.LNEON_8n_outer
+
+	add		$toutptr,sp,#128
+	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
+	vshr.u64	$temp,@ACC[0]#lo,#16
+	vst1.64		{q2-q3},[sp,:256]!
+	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
+	vst1.64		{q2-q3}, [sp,:256]!
+	vshr.u64	$temp,@ACC[0]#hi,#16
+	vst1.64		{q2-q3}, [sp,:256]!
+	vzip.16		@ACC[0]#lo,@ACC[0]#hi
 
-	mov		$toutptr,sp
 	mov		$inner,$num
+	b		.LNEON_tail_entry
 
+.align	4
 .LNEON_tail:
-	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
-	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
-	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
-	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
-	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
-	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
-	vld1.64		{$A7xB},       [$tinptr, :128]!
-	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
-
-.LNEON_tail2:
-	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
-	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
-	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
-	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`
-
-	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
-	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
-	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
-	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`
-
-	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
-	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
-	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
-	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`
-
-	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
-	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
-	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
-	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`
-
-	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
-	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
-	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
-	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`
-
-	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
-	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
-	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
-	vld1.64		{$A0xB}, [$tinptr, :128]!
-	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
-	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`
-
-	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
-	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
-	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
-	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
-	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
-	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
+	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
+	vshr.u64	$temp,@ACC[0]#lo,#16
+	vld1.64		{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
+	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
+	vld1.64		{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
+	vshr.u64	$temp,@ACC[0]#hi,#16
+	vld1.64		{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
+	vzip.16		@ACC[0]#lo,@ACC[0]#hi
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
+	vst1.32		{@ACC[0]#lo[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,@ACC[1]#lo,#16
+	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
+	vshr.u64	$temp,@ACC[1]#hi,#16
+	vzip.16		@ACC[1]#lo,@ACC[1]#hi
+___
+	push(@ACC,shift(@ACC));
+}
+	push(@ACC,shift(@ACC));
+$code.=<<___;
+	vld1.64		{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
 	subs		$inner,$inner,#8
-	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
-
+	vst1.32		{@ACC[7]#lo[0]},   [$toutptr, :32]!
 	bne	.LNEON_tail
 
 	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
@@ -622,8 +689,9 @@ bn_mul8x_mont_neon:
 	bne	.LNEON_sub
 
 	ldr	r10, [$aptr]				@ load top-most bit
+	mov	r11,sp
 	veor	q0,q0,q0
-	sub	r11,$bptr,sp				@ this is num*4
+	sub	r11,$bptr,r11				@ this is num*4
 	veor	q1,q1,q1
 	mov	$aptr,sp
 	sub	$rptr,$rptr,r11				@ rewind $rptr
@@ -633,27 +701,33 @@ bn_mul8x_mont_neon:
 .LNEON_copy_n_zap:
 	ldmia	$aptr!, {r4-r7}
 	ldmia	$rptr,  {r8-r11}
+	it	cc
 	movcc	r8, r4
 	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	itt	cc
 	movcc	r9, r5
 	movcc	r10,r6
 	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	it	cc
 	movcc	r11,r7
 	ldmia	$aptr, {r4-r7}
 	stmia	$rptr!, {r8-r11}
 	sub	$aptr,$aptr,#16
 	ldmia	$rptr, {r8-r11}
+	it	cc
 	movcc	r8, r4
 	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
+	itt	cc
 	movcc	r9, r5
 	movcc	r10,r6
 	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	it	cc
 	movcc	r11,r7
 	teq	$aptr,$bptr				@ preserves carry
 	stmia	$rptr!, {r8-r11}
 	bne	.LNEON_copy_n_zap
 
-	sub	sp,ip,#96
+	mov	sp,ip
         vldmia  sp!,{d8-d15}
         ldmia   sp!,{r4-r11}
 	ret						@ bx lr
@@ -669,8 +743,14 @@ $code.=<<___;
 #endif
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx	lr/gm;
-print $code;
+foreach (split("\n",$code)) {	# post-process the generated code one line at a time
+	s/\`([^\`]*)\`/eval $1/ge;	# replace each back-quoted span with the result of evaluating it as Perl
+	# The substitutions below are chained with "or": the first one that matches ends the chain for this line.
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or	# qN#lo -> d(2N), qN#hi -> d(2N+1)
+	s/\bret\b/bx    lr/g						or	# "ret" becomes a plain "bx lr" (not re-encoded below)
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+
 close STDOUT;
diff --git a/deps/openssl/openssl/crypto/bn/asm/armv8-mont.pl b/deps/openssl/openssl/crypto/bn/asm/armv8-mont.pl
new file mode 100755
index 0000000000..5d5af1b6be
--- /dev/null
+++ b/deps/openssl/openssl/crypto/bn/asm/armv8-mont.pl
@@ -0,0 +1,1510 @@
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2015
+#
+# "Teaser" Montgomery multiplication module for ARMv8. Needs more
+# work. While it does improve RSA sign performance by 20-30% (less for
+# longer keys) on most processors, for some reason RSA2048 is not
+# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
+# instruction issue rate is limited on processor in question, meaning
+# that dedicated squaring procedure is a must. Well, actually all
+# contemporary AArch64 processors seem to have limited multiplication
+# issue rate, i.e. they can't issue multiplication every cycle, which
+# explains moderate improvement coefficients in comparison to
+# compiler-generated code. Recall that compiler is instructed to use
+# umulh and therefore uses same amount of multiplication instructions
+# to do the job. Assembly's edge is to minimize number of "collateral"
+# instructions and of course instruction scheduling.
+#
+# April 2015
+#
+# Squaring procedure that handles lengths divisible by 8 improves
+# RSA/DSA performance by 25-40-60% depending on processor and key
+# length. Overall improvement coefficients are always positive in
+# comparison to compiler-generated code. On Cortex-A57 improvement
+# is still modest on longest key lengths, while others exhibit e.g.
+# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
+# on Cortex-A57 and ~60-100% faster on others.
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($lo0,$hi0,$aj,$m0,$alo,$ahi,
+ $lo1,$hi1,$nj,$m1,$nlo,$nhi,
+ $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
+
+# int bn_mul_mont(
+$rp="x0";	# BN_ULONG *rp,
+$ap="x1";	# const BN_ULONG *ap,
+$bp="x2";	# const BN_ULONG *bp,
+$np="x3";	# const BN_ULONG *np,
+$n0="x4";	# const BN_ULONG *n0,
+$num="x5";	# int num);
+
+$code.=<<___;
+.text
+
+.globl	bn_mul_mont
+.type	bn_mul_mont,%function
+.align	5
+bn_mul_mont:
+	tst	$num,#7
+	b.eq	__bn_sqr8x_mont
+	tst	$num,#3
+	b.eq	__bn_mul4x_mont
+.Lmul_mont:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldr	$m0,[$bp],#8		// bp[0]
+	sub	$tp,sp,$num,lsl#3
+	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
+	lsl	$num,$num,#3
+	ldr	$n0,[$n0]		// *n0
+	and	$tp,$tp,#-16		// ABI says so
+	ldp	$hi1,$nj,[$np],#16	// np[0..1]
+
+	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
+	sub	$j,$num,#16		// j=num-2
+	umulh	$hi0,$hi0,$m0
+	mul	$alo,$aj,$m0		// ap[1]*bp[0]
+	umulh	$ahi,$aj,$m0
+
+	mul	$m1,$lo0,$n0		// "tp[0]"*n0
+	mov	sp,$tp			// alloca
+
+	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
+	umulh	$hi1,$hi1,$m1
+	mul	$nlo,$nj,$m1		// np[1]*m1
+	// (*)	adds	$lo1,$lo1,$lo0	// discarded
+	// (*)	As for removal of first multiplication and addition
+	//	instructions. The outcome of first addition is
+	//	guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or not. Then
+	//	question is when does it carry? Is there alternative
+	//	way to deduce it? If you follow operations, you can
+	//	observe that condition for carry is quite simple:
+	//	$lo0 being non-zero. So that carry can be calculated
+	//	by adding -1 to $lo0. That's what next instruction does.
+	subs	xzr,$lo0,#1		// (*)
+	umulh	$nhi,$nj,$m1
+	adc	$hi1,$hi1,xzr
+	cbz	$j,.L1st_skip
+
+.L1st:
+	ldr	$aj,[$ap],#8
+	adds	$lo0,$alo,$hi0
+	sub	$j,$j,#8		// j--
+	adc	$hi0,$ahi,xzr
+
+	ldr	$nj,[$np],#8
+	adds	$lo1,$nlo,$hi1
+	mul	$alo,$aj,$m0		// ap[j]*bp[0]
+	adc	$hi1,$nhi,xzr
+	umulh	$ahi,$aj,$m0
+
+	adds	$lo1,$lo1,$lo0
+	mul	$nlo,$nj,$m1		// np[j]*m1
+	adc	$hi1,$hi1,xzr
+	umulh	$nhi,$nj,$m1
+	str	$lo1,[$tp],#8		// tp[j-1]
+	cbnz	$j,.L1st
+
+.L1st_skip:
+	adds	$lo0,$alo,$hi0
+	sub	$ap,$ap,$num		// rewind $ap
+	adc	$hi0,$ahi,xzr
+
+	adds	$lo1,$nlo,$hi1
+	sub	$np,$np,$num		// rewind $np
+	adc	$hi1,$nhi,xzr
+
+	adds	$lo1,$lo1,$lo0
+	sub	$i,$num,#8		// i=num-1
+	adcs	$hi1,$hi1,$hi0
+
+	adc	$ovf,xzr,xzr		// upmost overflow bit
+	stp	$lo1,$hi1,[$tp]
+
+.Louter:
+	ldr	$m0,[$bp],#8		// bp[i]
+	ldp	$hi0,$aj,[$ap],#16
+	ldr	$tj,[sp]		// tp[0]
+	add	$tp,sp,#8
+
+	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
+	sub	$j,$num,#16		// j=num-2
+	umulh	$hi0,$hi0,$m0
+	ldp	$hi1,$nj,[$np],#16
+	mul	$alo,$aj,$m0		// ap[1]*bp[i]
+	adds	$lo0,$lo0,$tj
+	umulh	$ahi,$aj,$m0
+	adc	$hi0,$hi0,xzr
+
+	mul	$m1,$lo0,$n0
+	sub	$i,$i,#8		// i--
+
+	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
+	umulh	$hi1,$hi1,$m1
+	mul	$nlo,$nj,$m1		// np[1]*m1
+	// (*)	adds	$lo1,$lo1,$lo0
+	subs	xzr,$lo0,#1		// (*)
+	umulh	$nhi,$nj,$m1
+	cbz	$j,.Linner_skip
+
+.Linner:
+	ldr	$aj,[$ap],#8
+	adc	$hi1,$hi1,xzr
+	ldr	$tj,[$tp],#8		// tp[j]
+	adds	$lo0,$alo,$hi0
+	sub	$j,$j,#8		// j--
+	adc	$hi0,$ahi,xzr
+
+	adds	$lo1,$nlo,$hi1
+	ldr	$nj,[$np],#8
+	adc	$hi1,$nhi,xzr
+
+	mul	$alo,$aj,$m0		// ap[j]*bp[i]
+	adds	$lo0,$lo0,$tj
+	umulh	$ahi,$aj,$m0
+	adc	$hi0,$hi0,xzr
+
+	mul	$nlo,$nj,$m1		// np[j]*m1
+	adds	$lo1,$lo1,$lo0
+	umulh	$nhi,$nj,$m1
+	str	$lo1,[$tp,#-16]		// tp[j-1]
+	cbnz	$j,.Linner
+
+.Linner_skip:
+	ldr	$tj,[$tp],#8		// tp[j]
+	adc	$hi1,$hi1,xzr
+	adds	$lo0,$alo,$hi0
+	sub	$ap,$ap,$num		// rewind $ap
+	adc	$hi0,$ahi,xzr
+
+	adds	$lo1,$nlo,$hi1
+	sub	$np,$np,$num		// rewind $np
+	adcs	$hi1,$nhi,$ovf
+	adc	$ovf,xzr,xzr
+
+	adds	$lo0,$lo0,$tj
+	adc	$hi0,$hi0,xzr
+
+	adds	$lo1,$lo1,$lo0
+	adcs	$hi1,$hi1,$hi0
+	adc	$ovf,$ovf,xzr		// upmost overflow bit
+	stp	$lo1,$hi1,[$tp,#-16]
+
+	cbnz	$i,.Louter
+
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
+	ldr	$tj,[sp]		// tp[0]
+	add	$tp,sp,#8
+	ldr	$nj,[$np],#8		// np[0]
+	subs	$j,$num,#8		// j=num-1 and clear borrow
+	mov	$ap,$rp
+.Lsub:
+	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
+	ldr	$tj,[$tp],#8
+	sub	$j,$j,#8		// j--
+	ldr	$nj,[$np],#8
+	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
+	cbnz	$j,.Lsub
+
+	sbcs	$aj,$tj,$nj
+	sbcs	$ovf,$ovf,xzr		// did it borrow?
+	str	$aj,[$ap],#8		// rp[num-1]
+
+	ldr	$tj,[sp]		// tp[0]
+	add	$tp,sp,#8
+	ldr	$aj,[$rp],#8		// rp[0]
+	sub	$num,$num,#8		// num--
+	nop
+.Lcond_copy:
+	sub	$num,$num,#8		// num--
+	csel	$nj,$tj,$aj,lo		// did it borrow?
+	ldr	$tj,[$tp],#8
+	ldr	$aj,[$rp],#8
+	str	xzr,[$tp,#-16]		// wipe tp
+	str	$nj,[$rp,#-16]
+	cbnz	$num,.Lcond_copy
+
+	csel	$nj,$tj,$aj,lo
+	str	xzr,[$tp,#-8]		// wipe tp
+	str	$nj,[$rp,#-8]
+
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+___
+{
+########################################################################
+# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
+
+my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
+my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
+my ($cnt,$carry,$topmost)=("x27","x28","x30");
+my ($tp,$ap_end,$na0)=($bp,$np,$carry);
+
+$code.=<<___;
+.type	__bn_sqr8x_mont,%function
+.align	5
+__bn_sqr8x_mont:
+	cmp	$ap,$bp
+	b.ne	__bn_mul4x_mont
+.Lsqr8x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	$rp,$np,[sp,#96]	// offload rp and np
+
+	ldp	$a0,$a1,[$ap,#8*0]
+	ldp	$a2,$a3,[$ap,#8*2]
+	ldp	$a4,$a5,[$ap,#8*4]
+	ldp	$a6,$a7,[$ap,#8*6]
+
+	sub	$tp,sp,$num,lsl#4
+	lsl	$num,$num,#3
+	ldr	$n0,[$n0]		// *n0
+	mov	sp,$tp			// alloca
+	sub	$cnt,$num,#8*8
+	b	.Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+	sub	$cnt,$cnt,#8*8
+	stp	xzr,xzr,[$tp,#8*0]
+	stp	xzr,xzr,[$tp,#8*2]
+	stp	xzr,xzr,[$tp,#8*4]
+	stp	xzr,xzr,[$tp,#8*6]
+.Lsqr8x_zero_start:
+	stp	xzr,xzr,[$tp,#8*8]
+	stp	xzr,xzr,[$tp,#8*10]
+	stp	xzr,xzr,[$tp,#8*12]
+	stp	xzr,xzr,[$tp,#8*14]
+	add	$tp,$tp,#8*16
+	cbnz	$cnt,.Lsqr8x_zero
+
+	add	$ap_end,$ap,$num
+	add	$ap,$ap,#8*8
+	mov	$acc0,xzr
+	mov	$acc1,xzr
+	mov	$acc2,xzr
+	mov	$acc3,xzr
+	mov	$acc4,xzr
+	mov	$acc5,xzr
+	mov	$acc6,xzr
+	mov	$acc7,xzr
+	mov	$tp,sp
+	str	$n0,[x29,#112]		// offload n0
+
+	// Multiply everything but a[i]*a[i]
+.align	4
+.Lsqr8x_outer_loop:
+        //                                                 a[1]a[0]	(i)
+        //                                             a[2]a[0]
+        //                                         a[3]a[0]
+        //                                     a[4]a[0]
+        //                                 a[5]a[0]
+        //                             a[6]a[0]
+        //                         a[7]a[0]
+        //                                         a[2]a[1]		(ii)
+        //                                     a[3]a[1]
+        //                                 a[4]a[1]
+        //                             a[5]a[1]
+        //                         a[6]a[1]
+        //                     a[7]a[1]
+        //                                 a[3]a[2]			(iii)
+        //                             a[4]a[2]
+        //                         a[5]a[2]
+        //                     a[6]a[2]
+        //                 a[7]a[2]
+        //                         a[4]a[3]				(iv)
+        //                     a[5]a[3]
+        //                 a[6]a[3]
+        //             a[7]a[3]
+        //                 a[5]a[4]					(v)
+        //             a[6]a[4]
+        //         a[7]a[4]
+        //         a[6]a[5]						(vi)
+        //     a[7]a[5]
+        // a[7]a[6]							(vii)
+
+	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
+	mul	$t1,$a2,$a0
+	mul	$t2,$a3,$a0
+	mul	$t3,$a4,$a0
+	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
+	mul	$t0,$a5,$a0
+	adcs	$acc2,$acc2,$t1
+	mul	$t1,$a6,$a0
+	adcs	$acc3,$acc3,$t2
+	mul	$t2,$a7,$a0
+	adcs	$acc4,$acc4,$t3
+	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
+	adcs	$acc5,$acc5,$t0
+	umulh	$t0,$a2,$a0
+	adcs	$acc6,$acc6,$t1
+	umulh	$t1,$a3,$a0
+	adcs	$acc7,$acc7,$t2
+	umulh	$t2,$a4,$a0
+	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
+	adc	$acc0,xzr,xzr		// t[8]
+	adds	$acc2,$acc2,$t3		// t[2]+hi(a[1]*a[0])
+	umulh	$t3,$a5,$a0
+	adcs	$acc3,$acc3,$t0
+	umulh	$t0,$a6,$a0
+	adcs	$acc4,$acc4,$t1
+	umulh	$t1,$a7,$a0
+	adcs	$acc5,$acc5,$t2
+	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
+	adcs	$acc6,$acc6,$t3
+	 mul	$t3,$a3,$a1
+	adcs	$acc7,$acc7,$t0
+	 mul	$t0,$a4,$a1
+	adc	$acc0,$acc0,$t1
+
+	mul	$t1,$a5,$a1
+	adds	$acc3,$acc3,$t2
+	mul	$t2,$a6,$a1
+	adcs	$acc4,$acc4,$t3
+	mul	$t3,$a7,$a1
+	adcs	$acc5,$acc5,$t0
+	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
+	adcs	$acc6,$acc6,$t1
+	umulh	$t1,$a3,$a1
+	adcs	$acc7,$acc7,$t2
+	umulh	$t2,$a4,$a1
+	adcs	$acc0,$acc0,$t3
+	umulh	$t3,$a5,$a1
+	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
+	adc	$acc1,xzr,xzr		// t[9]
+	adds	$acc4,$acc4,$t0
+	umulh	$t0,$a6,$a1
+	adcs	$acc5,$acc5,$t1
+	umulh	$t1,$a7,$a1
+	adcs	$acc6,$acc6,$t2
+	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
+	adcs	$acc7,$acc7,$t3
+	 mul	$t3,$a4,$a2
+	adcs	$acc0,$acc0,$t0
+	 mul	$t0,$a5,$a2
+	adc	$acc1,$acc1,$t1
+
+	mul	$t1,$a6,$a2
+	adds	$acc5,$acc5,$t2
+	mul	$t2,$a7,$a2
+	adcs	$acc6,$acc6,$t3
+	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
+	adcs	$acc7,$acc7,$t0
+	umulh	$t0,$a4,$a2
+	adcs	$acc0,$acc0,$t1
+	umulh	$t1,$a5,$a2
+	adcs	$acc1,$acc1,$t2
+	umulh	$t2,$a6,$a2
+	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
+	adc	$acc2,xzr,xzr		// t[10]
+	adds	$acc6,$acc6,$t3
+	umulh	$t3,$a7,$a2
+	adcs	$acc7,$acc7,$t0
+	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
+	adcs	$acc0,$acc0,$t1
+	 mul	$t1,$a5,$a3
+	adcs	$acc1,$acc1,$t2
+	 mul	$t2,$a6,$a3
+	adc	$acc2,$acc2,$t3
+
+	mul	$t3,$a7,$a3
+	adds	$acc7,$acc7,$t0
+	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
+	adcs	$acc0,$acc0,$t1
+	umulh	$t1,$a5,$a3
+	adcs	$acc1,$acc1,$t2
+	umulh	$t2,$a6,$a3
+	adcs	$acc2,$acc2,$t3
+	umulh	$t3,$a7,$a3
+	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
+	adc	$acc3,xzr,xzr		// t[11]
+	adds	$acc0,$acc0,$t0
+	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
+	adcs	$acc1,$acc1,$t1
+	 mul	$t1,$a6,$a4
+	adcs	$acc2,$acc2,$t2
+	 mul	$t2,$a7,$a4
+	adc	$acc3,$acc3,$t3
+
+	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
+	adds	$acc1,$acc1,$t0
+	umulh	$t0,$a6,$a4
+	adcs	$acc2,$acc2,$t1
+	umulh	$t1,$a7,$a4
+	adcs	$acc3,$acc3,$t2
+	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
+	adc	$acc4,xzr,xzr		// t[12]
+	adds	$acc2,$acc2,$t3
+	 mul	$t3,$a7,$a5
+	adcs	$acc3,$acc3,$t0
+	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
+	adc	$acc4,$acc4,$t1
+
+	umulh	$t1,$a7,$a5
+	adds	$acc3,$acc3,$t2
+	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
+	adcs	$acc4,$acc4,$t3
+	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
+	adc	$acc5,xzr,xzr		// t[13]
+	adds	$acc4,$acc4,$t0
+	sub	$cnt,$ap_end,$ap	// done yet?
+	adc	$acc5,$acc5,$t1
+
+	adds	$acc5,$acc5,$t2
+	sub	$t0,$ap_end,$num	// rewound ap
+	adc	$acc6,xzr,xzr		// t[14]
+	add	$acc6,$acc6,$t3
+
+	cbz	$cnt,.Lsqr8x_outer_break
+
+	mov	$n0,$a0
+	ldp	$a0,$a1,[$tp,#8*0]
+	ldp	$a2,$a3,[$tp,#8*2]
+	ldp	$a4,$a5,[$tp,#8*4]
+	ldp	$a6,$a7,[$tp,#8*6]
+	adds	$acc0,$acc0,$a0
+	adcs	$acc1,$acc1,$a1
+	ldp	$a0,$a1,[$ap,#8*0]
+	adcs	$acc2,$acc2,$a2
+	adcs	$acc3,$acc3,$a3
+	ldp	$a2,$a3,[$ap,#8*2]
+	adcs	$acc4,$acc4,$a4
+	adcs	$acc5,$acc5,$a5
+	ldp	$a4,$a5,[$ap,#8*4]
+	adcs	$acc6,$acc6,$a6
+	mov	$rp,$ap
+	adcs	$acc7,xzr,$a7
+	ldp	$a6,$a7,[$ap,#8*6]
+	add	$ap,$ap,#8*8
+	//adc	$carry,xzr,xzr		// moved below
+	mov	$cnt,#-8*8
+
+	//                                                         a[8]a[0]
+	//                                                     a[9]a[0]
+	//                                                 a[a]a[0]
+	//                                             a[b]a[0]
+	//                                         a[c]a[0]
+	//                                     a[d]a[0]
+	//                                 a[e]a[0]
+	//                             a[f]a[0]
+	//                                                     a[8]a[1]
+	//                         a[f]a[1]........................
+	//                                                 a[8]a[2]
+	//                     a[f]a[2]........................
+	//                                             a[8]a[3]
+	//                 a[f]a[3]........................
+	//                                         a[8]a[4]
+	//             a[f]a[4]........................
+	//                                     a[8]a[5]
+	//         a[f]a[5]........................
+	//                                 a[8]a[6]
+	//     a[f]a[6]........................
+	//                             a[8]a[7]
+	// a[f]a[7]........................
+.Lsqr8x_mul:
+	mul	$t0,$a0,$n0
+	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
+	mul	$t1,$a1,$n0
+	add	$cnt,$cnt,#8
+	mul	$t2,$a2,$n0
+	mul	$t3,$a3,$n0
+	adds	$acc0,$acc0,$t0
+	mul	$t0,$a4,$n0
+	adcs	$acc1,$acc1,$t1
+	mul	$t1,$a5,$n0
+	adcs	$acc2,$acc2,$t2
+	mul	$t2,$a6,$n0
+	adcs	$acc3,$acc3,$t3
+	mul	$t3,$a7,$n0
+	adcs	$acc4,$acc4,$t0
+	umulh	$t0,$a0,$n0
+	adcs	$acc5,$acc5,$t1
+	umulh	$t1,$a1,$n0
+	adcs	$acc6,$acc6,$t2
+	umulh	$t2,$a2,$n0
+	adcs	$acc7,$acc7,$t3
+	umulh	$t3,$a3,$n0
+	adc	$carry,$carry,xzr
+	str	$acc0,[$tp],#8
+	adds	$acc0,$acc1,$t0
+	umulh	$t0,$a4,$n0
+	adcs	$acc1,$acc2,$t1
+	umulh	$t1,$a5,$n0
+	adcs	$acc2,$acc3,$t2
+	umulh	$t2,$a6,$n0
+	adcs	$acc3,$acc4,$t3
+	umulh	$t3,$a7,$n0
+	ldr	$n0,[$rp,$cnt]
+	adcs	$acc4,$acc5,$t0
+	adcs	$acc5,$acc6,$t1
+	adcs	$acc6,$acc7,$t2
+	adcs	$acc7,$carry,$t3
+	//adc	$carry,xzr,xzr		// moved above
+	cbnz	$cnt,.Lsqr8x_mul
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	cmp	$ap,$ap_end		// done yet?
+	b.eq	.Lsqr8x_break
+
+	ldp	$a0,$a1,[$tp,#8*0]
+	ldp	$a2,$a3,[$tp,#8*2]
+	ldp	$a4,$a5,[$tp,#8*4]
+	ldp	$a6,$a7,[$tp,#8*6]
+	adds	$acc0,$acc0,$a0
+	ldr	$n0,[$rp,#-8*8]
+	adcs	$acc1,$acc1,$a1
+	ldp	$a0,$a1,[$ap,#8*0]
+	adcs	$acc2,$acc2,$a2
+	adcs	$acc3,$acc3,$a3
+	ldp	$a2,$a3,[$ap,#8*2]
+	adcs	$acc4,$acc4,$a4
+	adcs	$acc5,$acc5,$a5
+	ldp	$a4,$a5,[$ap,#8*4]
+	adcs	$acc6,$acc6,$a6
+	mov	$cnt,#-8*8
+	adcs	$acc7,$acc7,$a7
+	ldp	$a6,$a7,[$ap,#8*6]
+	add	$ap,$ap,#8*8
+	//adc	$carry,xzr,xzr		// moved above
+	b	.Lsqr8x_mul
+
+.align	4
+.Lsqr8x_break:
+	ldp	$a0,$a1,[$rp,#8*0]
+	add	$ap,$rp,#8*8
+	ldp	$a2,$a3,[$rp,#8*2]
+	sub	$t0,$ap_end,$ap		// is it last iteration?
+	ldp	$a4,$a5,[$rp,#8*4]
+	sub	$t1,$tp,$t0
+	ldp	$a6,$a7,[$rp,#8*6]
+	cbz	$t0,.Lsqr8x_outer_loop
+
+	stp	$acc0,$acc1,[$tp,#8*0]
+	ldp	$acc0,$acc1,[$t1,#8*0]
+	stp	$acc2,$acc3,[$tp,#8*2]
+	ldp	$acc2,$acc3,[$t1,#8*2]
+	stp	$acc4,$acc5,[$tp,#8*4]
+	ldp	$acc4,$acc5,[$t1,#8*4]
+	stp	$acc6,$acc7,[$tp,#8*6]
+	mov	$tp,$t1
+	ldp	$acc6,$acc7,[$t1,#8*6]
+	b	.Lsqr8x_outer_loop
+
+.align	4
+.Lsqr8x_outer_break:
+	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
+	ldp	$t1,$t2,[sp,#8*1]
+	ldp	$a5,$a7,[$t0,#8*2]
+	add	$ap,$t0,#8*4
+	ldp	$t3,$t0,[sp,#8*3]
+
+	stp	$acc0,$acc1,[$tp,#8*0]
+	mul	$acc0,$a1,$a1
+	stp	$acc2,$acc3,[$tp,#8*2]
+	umulh	$a1,$a1,$a1
+	stp	$acc4,$acc5,[$tp,#8*4]
+	mul	$a2,$a3,$a3
+	stp	$acc6,$acc7,[$tp,#8*6]
+	mov	$tp,sp
+	umulh	$a3,$a3,$a3
+	adds	$acc1,$a1,$t1,lsl#1
+	extr	$t1,$t2,$t1,#63
+	sub	$cnt,$num,#8*4
+
+.Lsqr4x_shift_n_add:
+	adcs	$acc2,$a2,$t1
+	extr	$t2,$t3,$t2,#63
+	sub	$cnt,$cnt,#8*4
+	adcs	$acc3,$a3,$t2
+	ldp	$t1,$t2,[$tp,#8*5]
+	mul	$a4,$a5,$a5
+	ldp	$a1,$a3,[$ap],#8*2
+	umulh	$a5,$a5,$a5
+	mul	$a6,$a7,$a7
+	umulh	$a7,$a7,$a7
+	extr	$t3,$t0,$t3,#63
+	stp	$acc0,$acc1,[$tp,#8*0]
+	adcs	$acc4,$a4,$t3
+	extr	$t0,$t1,$t0,#63
+	stp	$acc2,$acc3,[$tp,#8*2]
+	adcs	$acc5,$a5,$t0
+	ldp	$t3,$t0,[$tp,#8*7]
+	extr	$t1,$t2,$t1,#63
+	adcs	$acc6,$a6,$t1
+	extr	$t2,$t3,$t2,#63
+	adcs	$acc7,$a7,$t2
+	ldp	$t1,$t2,[$tp,#8*9]
+	mul	$a0,$a1,$a1
+	ldp	$a5,$a7,[$ap],#8*2
+	umulh	$a1,$a1,$a1
+	mul	$a2,$a3,$a3
+	umulh	$a3,$a3,$a3
+	stp	$acc4,$acc5,[$tp,#8*4]
+	extr	$t3,$t0,$t3,#63
+	stp	$acc6,$acc7,[$tp,#8*6]
+	add	$tp,$tp,#8*8
+	adcs	$acc0,$a0,$t3
+	extr	$t0,$t1,$t0,#63
+	adcs	$acc1,$a1,$t0
+	ldp	$t3,$t0,[$tp,#8*3]
+	extr	$t1,$t2,$t1,#63
+	cbnz	$cnt,.Lsqr4x_shift_n_add
+___
+my ($np,$np_end)=($ap,$ap_end);
+$code.=<<___;
+	 ldp	$np,$n0,[x29,#104]	// pull np and n0
+
+	adcs	$acc2,$a2,$t1
+	extr	$t2,$t3,$t2,#63
+	adcs	$acc3,$a3,$t2
+	ldp	$t1,$t2,[$tp,#8*5]
+	mul	$a4,$a5,$a5
+	umulh	$a5,$a5,$a5
+	stp	$acc0,$acc1,[$tp,#8*0]
+	mul	$a6,$a7,$a7
+	umulh	$a7,$a7,$a7
+	stp	$acc2,$acc3,[$tp,#8*2]
+	extr	$t3,$t0,$t3,#63
+	adcs	$acc4,$a4,$t3
+	extr	$t0,$t1,$t0,#63
+	 ldp	$acc0,$acc1,[sp,#8*0]
+	adcs	$acc5,$a5,$t0
+	extr	$t1,$t2,$t1,#63
+	 ldp	$a0,$a1,[$np,#8*0]
+	adcs	$acc6,$a6,$t1
+	extr	$t2,xzr,$t2,#63
+	 ldp	$a2,$a3,[$np,#8*2]
+	adc	$acc7,$a7,$t2
+	 ldp	$a4,$a5,[$np,#8*4]
+
+	// Reduce by 512 bits per iteration
+	mul	$na0,$n0,$acc0		// t[0]*n0
+	ldp	$a6,$a7,[$np,#8*6]
+	add	$np_end,$np,$num
+	ldp	$acc2,$acc3,[sp,#8*2]
+	stp	$acc4,$acc5,[$tp,#8*4]
+	ldp	$acc4,$acc5,[sp,#8*4]
+	stp	$acc6,$acc7,[$tp,#8*6]
+	ldp	$acc6,$acc7,[sp,#8*6]
+	add	$np,$np,#8*8
+	mov	$topmost,xzr		// initial top-most carry
+	mov	$tp,sp
+	mov	$cnt,#8
+
+.Lsqr8x_reduction:
+	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
+	mul	$t1,$a1,$na0
+	sub	$cnt,$cnt,#1
+	mul	$t2,$a2,$na0
+	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
+	mul	$t3,$a3,$na0
+	// (*)	adds	xzr,$acc0,$t0
+	subs	xzr,$acc0,#1		// (*)
+	mul	$t0,$a4,$na0
+	adcs	$acc0,$acc1,$t1
+	mul	$t1,$a5,$na0
+	adcs	$acc1,$acc2,$t2
+	mul	$t2,$a6,$na0
+	adcs	$acc2,$acc3,$t3
+	mul	$t3,$a7,$na0
+	adcs	$acc3,$acc4,$t0
+	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
+	adcs	$acc4,$acc5,$t1
+	umulh	$t1,$a1,$na0
+	adcs	$acc5,$acc6,$t2
+	umulh	$t2,$a2,$na0
+	adcs	$acc6,$acc7,$t3
+	umulh	$t3,$a3,$na0
+	adc	$acc7,xzr,xzr
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$a4,$na0
+	adcs	$acc1,$acc1,$t1
+	umulh	$t1,$a5,$na0
+	adcs	$acc2,$acc2,$t2
+	umulh	$t2,$a6,$na0
+	adcs	$acc3,$acc3,$t3
+	umulh	$t3,$a7,$na0
+	mul	$na0,$n0,$acc0		// next t[0]*n0
+	adcs	$acc4,$acc4,$t0
+	adcs	$acc5,$acc5,$t1
+	adcs	$acc6,$acc6,$t2
+	adc	$acc7,$acc7,$t3
+	cbnz	$cnt,.Lsqr8x_reduction
+
+	ldp	$t0,$t1,[$tp,#8*0]
+	ldp	$t2,$t3,[$tp,#8*2]
+	mov	$rp,$tp
+	sub	$cnt,$np_end,$np	// done yet?
+	adds	$acc0,$acc0,$t0
+	adcs	$acc1,$acc1,$t1
+	ldp	$t0,$t1,[$tp,#8*4]
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	ldp	$t2,$t3,[$tp,#8*6]
+	adcs	$acc4,$acc4,$t0
+	adcs	$acc5,$acc5,$t1
+	adcs	$acc6,$acc6,$t2
+	adcs	$acc7,$acc7,$t3
+	//adc	$carry,xzr,xzr		// moved below
+	cbz	$cnt,.Lsqr8x8_post_condition
+
+	ldr	$n0,[$tp,#-8*8]
+	ldp	$a0,$a1,[$np,#8*0]
+	ldp	$a2,$a3,[$np,#8*2]
+	ldp	$a4,$a5,[$np,#8*4]
+	mov	$cnt,#-8*8
+	ldp	$a6,$a7,[$np,#8*6]
+	add	$np,$np,#8*8
+
+.Lsqr8x_tail:
+	mul	$t0,$a0,$n0
+	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
+	mul	$t1,$a1,$n0
+	add	$cnt,$cnt,#8
+	mul	$t2,$a2,$n0
+	mul	$t3,$a3,$n0
+	adds	$acc0,$acc0,$t0
+	mul	$t0,$a4,$n0
+	adcs	$acc1,$acc1,$t1
+	mul	$t1,$a5,$n0
+	adcs	$acc2,$acc2,$t2
+	mul	$t2,$a6,$n0
+	adcs	$acc3,$acc3,$t3
+	mul	$t3,$a7,$n0
+	adcs	$acc4,$acc4,$t0
+	umulh	$t0,$a0,$n0
+	adcs	$acc5,$acc5,$t1
+	umulh	$t1,$a1,$n0
+	adcs	$acc6,$acc6,$t2
+	umulh	$t2,$a2,$n0
+	adcs	$acc7,$acc7,$t3
+	umulh	$t3,$a3,$n0
+	adc	$carry,$carry,xzr
+	str	$acc0,[$tp],#8
+	adds	$acc0,$acc1,$t0
+	umulh	$t0,$a4,$n0
+	adcs	$acc1,$acc2,$t1
+	umulh	$t1,$a5,$n0
+	adcs	$acc2,$acc3,$t2
+	umulh	$t2,$a6,$n0
+	adcs	$acc3,$acc4,$t3
+	umulh	$t3,$a7,$n0
+	ldr	$n0,[$rp,$cnt]
+	adcs	$acc4,$acc5,$t0
+	adcs	$acc5,$acc6,$t1
+	adcs	$acc6,$acc7,$t2
+	adcs	$acc7,$carry,$t3
+	//adc	$carry,xzr,xzr		// moved above
+	cbnz	$cnt,.Lsqr8x_tail
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	ldp	$a0,$a1,[$tp,#8*0]
+	sub	$cnt,$np_end,$np	// done yet?
+	sub	$t2,$np_end,$num	// rewound np
+	ldp	$a2,$a3,[$tp,#8*2]
+	ldp	$a4,$a5,[$tp,#8*4]
+	ldp	$a6,$a7,[$tp,#8*6]
+	cbz	$cnt,.Lsqr8x_tail_break
+
+	ldr	$n0,[$rp,#-8*8]
+	adds	$acc0,$acc0,$a0
+	adcs	$acc1,$acc1,$a1
+	ldp	$a0,$a1,[$np,#8*0]
+	adcs	$acc2,$acc2,$a2
+	adcs	$acc3,$acc3,$a3
+	ldp	$a2,$a3,[$np,#8*2]
+	adcs	$acc4,$acc4,$a4
+	adcs	$acc5,$acc5,$a5
+	ldp	$a4,$a5,[$np,#8*4]
+	adcs	$acc6,$acc6,$a6
+	mov	$cnt,#-8*8
+	adcs	$acc7,$acc7,$a7
+	ldp	$a6,$a7,[$np,#8*6]
+	add	$np,$np,#8*8
+	//adc	$carry,xzr,xzr		// moved above
+	b	.Lsqr8x_tail
+
+.align	4
+.Lsqr8x_tail_break:
+	ldr	$n0,[x29,#112]		// pull n0
+	add	$cnt,$tp,#8*8		// end of current t[num] window
+
+	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
+	adcs	$t0,$acc0,$a0
+	adcs	$t1,$acc1,$a1
+	ldp	$acc0,$acc1,[$rp,#8*0]
+	adcs	$acc2,$acc2,$a2
+	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
+	adcs	$acc3,$acc3,$a3
+	ldp	$a2,$a3,[$t2,#8*2]
+	adcs	$acc4,$acc4,$a4
+	adcs	$acc5,$acc5,$a5
+	ldp	$a4,$a5,[$t2,#8*4]
+	adcs	$acc6,$acc6,$a6
+	adcs	$acc7,$acc7,$a7
+	ldp	$a6,$a7,[$t2,#8*6]
+	add	$np,$t2,#8*8
+	adc	$topmost,xzr,xzr	// top-most carry
+	mul	$na0,$n0,$acc0
+	stp	$t0,$t1,[$tp,#8*0]
+	stp	$acc2,$acc3,[$tp,#8*2]
+	ldp	$acc2,$acc3,[$rp,#8*2]
+	stp	$acc4,$acc5,[$tp,#8*4]
+	ldp	$acc4,$acc5,[$rp,#8*4]
+	cmp	$cnt,x29		// did we hit the bottom?
+	stp	$acc6,$acc7,[$tp,#8*6]
+	mov	$tp,$rp			// slide the window
+	ldp	$acc6,$acc7,[$rp,#8*6]
+	mov	$cnt,#8
+	b.ne	.Lsqr8x_reduction
+
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
+	ldr	$rp,[x29,#96]		// pull rp
+	add	$tp,$tp,#8*8
+	subs	$t0,$acc0,$a0
+	sbcs	$t1,$acc1,$a1
+	sub	$cnt,$num,#8*8
+	mov	$ap_end,$rp		// $rp copy
+
+.Lsqr8x_sub:
+	sbcs	$t2,$acc2,$a2
+	ldp	$a0,$a1,[$np,#8*0]
+	sbcs	$t3,$acc3,$a3
+	stp	$t0,$t1,[$rp,#8*0]
+	sbcs	$t0,$acc4,$a4
+	ldp	$a2,$a3,[$np,#8*2]
+	sbcs	$t1,$acc5,$a5
+	stp	$t2,$t3,[$rp,#8*2]
+	sbcs	$t2,$acc6,$a6
+	ldp	$a4,$a5,[$np,#8*4]
+	sbcs	$t3,$acc7,$a7
+	ldp	$a6,$a7,[$np,#8*6]
+	add	$np,$np,#8*8
+	ldp	$acc0,$acc1,[$tp,#8*0]
+	sub	$cnt,$cnt,#8*8
+	ldp	$acc2,$acc3,[$tp,#8*2]
+	ldp	$acc4,$acc5,[$tp,#8*4]
+	ldp	$acc6,$acc7,[$tp,#8*6]
+	add	$tp,$tp,#8*8
+	stp	$t0,$t1,[$rp,#8*4]
+	sbcs	$t0,$acc0,$a0
+	stp	$t2,$t3,[$rp,#8*6]
+	add	$rp,$rp,#8*8
+	sbcs	$t1,$acc1,$a1
+	cbnz	$cnt,.Lsqr8x_sub
+
+	sbcs	$t2,$acc2,$a2
+	 mov	$tp,sp
+	 add	$ap,sp,$num
+	 ldp	$a0,$a1,[$ap_end,#8*0]
+	sbcs	$t3,$acc3,$a3
+	stp	$t0,$t1,[$rp,#8*0]
+	sbcs	$t0,$acc4,$a4
+	 ldp	$a2,$a3,[$ap_end,#8*2]
+	sbcs	$t1,$acc5,$a5
+	stp	$t2,$t3,[$rp,#8*2]
+	sbcs	$t2,$acc6,$a6
+	 ldp	$acc0,$acc1,[$ap,#8*0]
+	sbcs	$t3,$acc7,$a7
+	 ldp	$acc2,$acc3,[$ap,#8*2]
+	sbcs	xzr,$topmost,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+	stp	$t0,$t1,[$rp,#8*4]
+	stp	$t2,$t3,[$rp,#8*6]
+
+	sub	$cnt,$num,#8*4
+.Lsqr4x_cond_copy:
+	sub	$cnt,$cnt,#8*4
+	csel	$t0,$acc0,$a0,lo
+	 stp	xzr,xzr,[$tp,#8*0]
+	csel	$t1,$acc1,$a1,lo
+	ldp	$a0,$a1,[$ap_end,#8*4]
+	ldp	$acc0,$acc1,[$ap,#8*4]
+	csel	$t2,$acc2,$a2,lo
+	 stp	xzr,xzr,[$tp,#8*2]
+	 add	$tp,$tp,#8*4
+	csel	$t3,$acc3,$a3,lo
+	ldp	$a2,$a3,[$ap_end,#8*6]
+	ldp	$acc2,$acc3,[$ap,#8*6]
+	add	$ap,$ap,#8*4
+	stp	$t0,$t1,[$ap_end,#8*0]
+	stp	$t2,$t3,[$ap_end,#8*2]
+	add	$ap_end,$ap_end,#8*4
+	 stp	xzr,xzr,[$ap,#8*0]
+	 stp	xzr,xzr,[$ap,#8*2]
+	cbnz	$cnt,.Lsqr4x_cond_copy
+
+	csel	$t0,$acc0,$a0,lo
+	 stp	xzr,xzr,[$tp,#8*0]
+	csel	$t1,$acc1,$a1,lo
+	 stp	xzr,xzr,[$tp,#8*2]
+	csel	$t2,$acc2,$a2,lo
+	csel	$t3,$acc3,$a3,lo
+	stp	$t0,$t1,[$ap_end,#8*0]
+	stp	$t2,$t3,[$ap_end,#8*2]
+
+	b	.Lsqr8x_done
+
+.align	4
+.Lsqr8x8_post_condition:
+	adc	$carry,xzr,xzr
+	ldr	x30,[x29,#8]		// pull return address
+	// $acc0-7,$carry hold result, $a0-7 hold modulus
+	subs	$a0,$acc0,$a0
+	ldr	$ap,[x29,#96]		// pull rp
+	sbcs	$a1,$acc1,$a1
+	 stp	xzr,xzr,[sp,#8*0]
+	sbcs	$a2,$acc2,$a2
+	 stp	xzr,xzr,[sp,#8*2]
+	sbcs	$a3,$acc3,$a3
+	 stp	xzr,xzr,[sp,#8*4]
+	sbcs	$a4,$acc4,$a4
+	 stp	xzr,xzr,[sp,#8*6]
+	sbcs	$a5,$acc5,$a5
+	 stp	xzr,xzr,[sp,#8*8]
+	sbcs	$a6,$acc6,$a6
+	 stp	xzr,xzr,[sp,#8*10]
+	sbcs	$a7,$acc7,$a7
+	 stp	xzr,xzr,[sp,#8*12]
+	sbcs	$carry,$carry,xzr	// did it borrow?
+	 stp	xzr,xzr,[sp,#8*14]
+
+	// $a0-7 hold result-modulus
+	csel	$a0,$acc0,$a0,lo
+	csel	$a1,$acc1,$a1,lo
+	csel	$a2,$acc2,$a2,lo
+	csel	$a3,$acc3,$a3,lo
+	stp	$a0,$a1,[$ap,#8*0]
+	csel	$a4,$acc4,$a4,lo
+	csel	$a5,$acc5,$a5,lo
+	stp	$a2,$a3,[$ap,#8*2]
+	csel	$a6,$acc6,$a6,lo
+	csel	$a7,$acc7,$a7,lo
+	stp	$a4,$a5,[$ap,#8*4]
+	stp	$a6,$a7,[$ap,#8*6]
+
+.Lsqr8x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	ret
+.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
+___
+}
+
+{
+########################################################################
+# Even though this might look like an ARMv8 adaptation of mulx4x_mont
+# from x86_64-mont5 module, it's different in the sense that it performs
+# reduction 256 bits at a time.
+
+my ($a0,$a1,$a2,$a3,
+    $t0,$t1,$t2,$t3,
+    $m0,$m1,$m2,$m3,
+    $acc0,$acc1,$acc2,$acc3,$acc4,
+    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
+my  $bp_end=$rp;
+my  ($carry,$topmost) = ($rp,"x30");
+
+$code.=<<___;
+.type	__bn_mul4x_mont,%function
+.align	5
+__bn_mul4x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	sub	$tp,sp,$num,lsl#3
+	lsl	$num,$num,#3
+	ldr	$n0,[$n0]		// *n0
+	sub	sp,$tp,#8*4		// alloca
+
+	add	$t0,$bp,$num
+	add	$ap_end,$ap,$num
+	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]
+
+	ldr	$bi,[$bp,#8*0]		// b[0]
+	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
+	ldp	$a2,$a3,[$ap,#8*2]
+	add	$ap,$ap,#8*4
+	mov	$acc0,xzr
+	mov	$acc1,xzr
+	mov	$acc2,xzr
+	mov	$acc3,xzr
+	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
+	ldp	$m2,$m3,[$np,#8*2]
+	adds	$np,$np,#8*4		// clear carry bit
+	mov	$carry,xzr
+	mov	$cnt,#0
+	mov	$tp,sp
+
+.Loop_mul4x_1st_reduction:
+	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
+	adc	$carry,$carry,xzr	// modulo-scheduled
+	mul	$t1,$a1,$bi
+	add	$cnt,$cnt,#8
+	mul	$t2,$a2,$bi
+	and	$cnt,$cnt,#31
+	mul	$t3,$a3,$bi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
+	adcs	$acc1,$acc1,$t1
+	mul	$mi,$acc0,$n0		// t[0]*n0
+	adcs	$acc2,$acc2,$t2
+	umulh	$t1,$a1,$bi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t2,$a2,$bi
+	adc	$acc4,xzr,xzr
+	umulh	$t3,$a3,$bi
+	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
+	adds	$acc1,$acc1,$t0
+	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
+	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
+	adcs	$acc2,$acc2,$t1
+	mul	$t1,$m1,$mi
+	adcs	$acc3,$acc3,$t2
+	mul	$t2,$m2,$mi
+	adc	$acc4,$acc4,$t3		// can't overflow
+	mul	$t3,$m3,$mi
+	// (*)	adds	xzr,$acc0,$t0
+	subs	xzr,$acc0,#1		// (*)
+	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
+	adcs	$acc0,$acc1,$t1
+	umulh	$t1,$m1,$mi
+	adcs	$acc1,$acc2,$t2
+	umulh	$t2,$m2,$mi
+	adcs	$acc2,$acc3,$t3
+	umulh	$t3,$m3,$mi
+	adcs	$acc3,$acc4,$carry
+	adc	$carry,xzr,xzr
+	adds	$acc0,$acc0,$t0
+	sub	$t0,$ap_end,$ap
+	adcs	$acc1,$acc1,$t1
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	//adc	$carry,$carry,xzr
+	cbnz	$cnt,.Loop_mul4x_1st_reduction
+
+	cbz	$t0,.Lmul4x4_post_condition
+
+	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
+	ldp	$a2,$a3,[$ap,#8*2]
+	add	$ap,$ap,#8*4
+	ldr	$mi,[sp]		// t[0]*n0
+	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
+	ldp	$m2,$m3,[$np,#8*2]
+	add	$np,$np,#8*4
+
+.Loop_mul4x_1st_tail:
+	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
+	adc	$carry,$carry,xzr	// modulo-scheduled
+	mul	$t1,$a1,$bi
+	add	$cnt,$cnt,#8
+	mul	$t2,$a2,$bi
+	and	$cnt,$cnt,#31
+	mul	$t3,$a3,$bi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
+	adcs	$acc1,$acc1,$t1
+	umulh	$t1,$a1,$bi
+	adcs	$acc2,$acc2,$t2
+	umulh	$t2,$a2,$bi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t3,$a3,$bi
+	adc	$acc4,xzr,xzr
+	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
+	adds	$acc1,$acc1,$t0
+	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
+	adcs	$acc2,$acc2,$t1
+	mul	$t1,$m1,$mi
+	adcs	$acc3,$acc3,$t2
+	mul	$t2,$m2,$mi
+	adc	$acc4,$acc4,$t3		// can't overflow
+	mul	$t3,$m3,$mi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
+	adcs	$acc1,$acc1,$t1
+	umulh	$t1,$m1,$mi
+	adcs	$acc2,$acc2,$t2
+	umulh	$t2,$m2,$mi
+	adcs	$acc3,$acc3,$t3
+	adcs	$acc4,$acc4,$carry
+	umulh	$t3,$m3,$mi
+	adc	$carry,xzr,xzr
+	ldr	$mi,[sp,$cnt]		// next t[0]*n0
+	str	$acc0,[$tp],#8		// result!!!
+	adds	$acc0,$acc1,$t0
+	sub	$t0,$ap_end,$ap		// done yet?
+	adcs	$acc1,$acc2,$t1
+	adcs	$acc2,$acc3,$t2
+	adcs	$acc3,$acc4,$t3
+	//adc	$carry,$carry,xzr
+	cbnz	$cnt,.Loop_mul4x_1st_tail
+
+	sub	$t1,$ap_end,$num	// rewound $ap
+	cbz	$t0,.Lmul4x_proceed
+
+	ldp	$a0,$a1,[$ap,#8*0]
+	ldp	$a2,$a3,[$ap,#8*2]
+	add	$ap,$ap,#8*4
+	ldp	$m0,$m1,[$np,#8*0]
+	ldp	$m2,$m3,[$np,#8*2]
+	add	$np,$np,#8*4
+	b	.Loop_mul4x_1st_tail
+
+.align	5
+.Lmul4x_proceed:
+	ldr	$bi,[$bp,#8*4]!		// *++b
+	adc	$topmost,$carry,xzr
+	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
+	sub	$np,$np,$num		// rewind np
+	ldp	$a2,$a3,[$t1,#8*2]
+	add	$ap,$t1,#8*4
+
+	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
+	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
+	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
+	ldp	$acc2,$acc3,[sp,#8*6]
+
+	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
+	mov	$tp,sp
+	ldp	$m2,$m3,[$np,#8*2]
+	adds	$np,$np,#8*4		// clear carry bit
+	mov	$carry,xzr
+
+.align	4
+.Loop_mul4x_reduction:
+	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
+	adc	$carry,$carry,xzr	// modulo-scheduled
+	mul	$t1,$a1,$bi
+	add	$cnt,$cnt,#8
+	mul	$t2,$a2,$bi
+	and	$cnt,$cnt,#31
+	mul	$t3,$a3,$bi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
+	adcs	$acc1,$acc1,$t1
+	mul	$mi,$acc0,$n0		// t[0]*n0
+	adcs	$acc2,$acc2,$t2
+	umulh	$t1,$a1,$bi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t2,$a2,$bi
+	adc	$acc4,xzr,xzr
+	umulh	$t3,$a3,$bi
+	ldr	$bi,[$bp,$cnt]		// next b[i]
+	adds	$acc1,$acc1,$t0
+	// (*)	mul	$t0,$m0,$mi
+	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
+	adcs	$acc2,$acc2,$t1
+	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0
+	adcs	$acc3,$acc3,$t2
+	mul	$t2,$m2,$mi
+	adc	$acc4,$acc4,$t3		// can't overflow
+	mul	$t3,$m3,$mi
+	// (*)	adds	xzr,$acc0,$t0
+	subs	xzr,$acc0,#1		// (*)
+	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0
+	adcs	$acc0,$acc1,$t1
+	umulh	$t1,$m1,$mi
+	adcs	$acc1,$acc2,$t2
+	umulh	$t2,$m2,$mi
+	adcs	$acc2,$acc3,$t3
+	umulh	$t3,$m3,$mi
+	adcs	$acc3,$acc4,$carry
+	adc	$carry,xzr,xzr
+	adds	$acc0,$acc0,$t0
+	adcs	$acc1,$acc1,$t1
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	//adc	$carry,$carry,xzr
+	cbnz	$cnt,.Loop_mul4x_reduction
+
+	adc	$carry,$carry,xzr
+	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
+	ldp	$t2,$t3,[$tp,#8*6]
+	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
+	ldp	$a2,$a3,[$ap,#8*2]
+	add	$ap,$ap,#8*4
+	adds	$acc0,$acc0,$t0
+	adcs	$acc1,$acc1,$t1
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	//adc	$carry,$carry,xzr
+
+	ldr	$mi,[sp]		// t[0]*n0
+	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
+	ldp	$m2,$m3,[$np,#8*2]
+	add	$np,$np,#8*4
+
+.align	4
+.Loop_mul4x_tail:
+	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
+	adc	$carry,$carry,xzr	// modulo-scheduled
+	mul	$t1,$a1,$bi
+	add	$cnt,$cnt,#8
+	mul	$t2,$a2,$bi
+	and	$cnt,$cnt,#31
+	mul	$t3,$a3,$bi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
+	adcs	$acc1,$acc1,$t1
+	umulh	$t1,$a1,$bi
+	adcs	$acc2,$acc2,$t2
+	umulh	$t2,$a2,$bi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t3,$a3,$bi
+	adc	$acc4,xzr,xzr
+	ldr	$bi,[$bp,$cnt]		// next b[i]
+	adds	$acc1,$acc1,$t0
+	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
+	adcs	$acc2,$acc2,$t1
+	mul	$t1,$m1,$mi
+	adcs	$acc3,$acc3,$t2
+	mul	$t2,$m2,$mi
+	adc	$acc4,$acc4,$t3		// can't overflow
+	mul	$t3,$m3,$mi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
+	adcs	$acc1,$acc1,$t1
+	umulh	$t1,$m1,$mi
+	adcs	$acc2,$acc2,$t2
+	umulh	$t2,$m2,$mi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t3,$m3,$mi
+	adcs	$acc4,$acc4,$carry
+	ldr	$mi,[sp,$cnt]		// next a[0]*n0
+	adc	$carry,xzr,xzr
+	str	$acc0,[$tp],#8		// result!!!
+	adds	$acc0,$acc1,$t0
+	sub	$t0,$ap_end,$ap		// done yet?
+	adcs	$acc1,$acc2,$t1
+	adcs	$acc2,$acc3,$t2
+	adcs	$acc3,$acc4,$t3
+	//adc	$carry,$carry,xzr
+	cbnz	$cnt,.Loop_mul4x_tail
+
+	sub	$t1,$np,$num		// rewinded np?
+	adc	$carry,$carry,xzr
+	cbz	$t0,.Loop_mul4x_break
+
+	ldp	$t0,$t1,[$tp,#8*4]
+	ldp	$t2,$t3,[$tp,#8*6]
+	ldp	$a0,$a1,[$ap,#8*0]
+	ldp	$a2,$a3,[$ap,#8*2]
+	add	$ap,$ap,#8*4
+	adds	$acc0,$acc0,$t0
+	adcs	$acc1,$acc1,$t1
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	//adc	$carry,$carry,xzr
+	ldp	$m0,$m1,[$np,#8*0]
+	ldp	$m2,$m3,[$np,#8*2]
+	add	$np,$np,#8*4
+	b	.Loop_mul4x_tail
+
+.align	4
+.Loop_mul4x_break:
+	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
+	adds	$acc0,$acc0,$topmost
+	add	$bp,$bp,#8*4		// bp++
+	adcs	$acc1,$acc1,xzr
+	sub	$ap,$ap,$num		// rewind ap
+	adcs	$acc2,$acc2,xzr
+	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
+	adcs	$acc3,$acc3,xzr
+	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
+	adc	$topmost,$carry,xzr
+	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
+	cmp	$bp,$t3			// done yet?
+	ldp	$acc2,$acc3,[sp,#8*6]
+	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
+	ldp	$m2,$m3,[$t1,#8*2]
+	add	$np,$t1,#8*4
+	b.eq	.Lmul4x_post
+
+	ldr	$bi,[$bp]
+	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
+	ldp	$a2,$a3,[$ap,#8*2]
+	adds	$ap,$ap,#8*4		// clear carry bit
+	mov	$carry,xzr
+	mov	$tp,sp
+	b	.Loop_mul4x_reduction
+
+.align	4
+.Lmul4x_post:
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
+	mov	$rp,$t2
+	mov	$ap_end,$t2		// $rp copy
+	subs	$t0,$acc0,$m0
+	add	$tp,sp,#8*8
+	sbcs	$t1,$acc1,$m1
+	sub	$cnt,$num,#8*4
+
+.Lmul4x_sub:
+	sbcs	$t2,$acc2,$m2
+	ldp	$m0,$m1,[$np,#8*0]
+	sub	$cnt,$cnt,#8*4
+	ldp	$acc0,$acc1,[$tp,#8*0]
+	sbcs	$t3,$acc3,$m3
+	ldp	$m2,$m3,[$np,#8*2]
+	add	$np,$np,#8*4
+	ldp	$acc2,$acc3,[$tp,#8*2]
+	add	$tp,$tp,#8*4
+	stp	$t0,$t1,[$rp,#8*0]
+	sbcs	$t0,$acc0,$m0
+	stp	$t2,$t3,[$rp,#8*2]
+	add	$rp,$rp,#8*4
+	sbcs	$t1,$acc1,$m1
+	cbnz	$cnt,.Lmul4x_sub
+
+	sbcs	$t2,$acc2,$m2
+	 mov	$tp,sp
+	 add	$ap,sp,#8*4
+	 ldp	$a0,$a1,[$ap_end,#8*0]
+	sbcs	$t3,$acc3,$m3
+	stp	$t0,$t1,[$rp,#8*0]
+	 ldp	$a2,$a3,[$ap_end,#8*2]
+	stp	$t2,$t3,[$rp,#8*2]
+	 ldp	$acc0,$acc1,[$ap,#8*0]
+	 ldp	$acc2,$acc3,[$ap,#8*2]
+	sbcs	xzr,$topmost,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+
+	sub	$cnt,$num,#8*4
+.Lmul4x_cond_copy:
+	sub	$cnt,$cnt,#8*4
+	csel	$t0,$acc0,$a0,lo
+	 stp	xzr,xzr,[$tp,#8*0]
+	csel	$t1,$acc1,$a1,lo
+	ldp	$a0,$a1,[$ap_end,#8*4]
+	ldp	$acc0,$acc1,[$ap,#8*4]
+	csel	$t2,$acc2,$a2,lo
+	 stp	xzr,xzr,[$tp,#8*2]
+	 add	$tp,$tp,#8*4
+	csel	$t3,$acc3,$a3,lo
+	ldp	$a2,$a3,[$ap_end,#8*6]
+	ldp	$acc2,$acc3,[$ap,#8*6]
+	add	$ap,$ap,#8*4
+	stp	$t0,$t1,[$ap_end,#8*0]
+	stp	$t2,$t3,[$ap_end,#8*2]
+	add	$ap_end,$ap_end,#8*4
+	cbnz	$cnt,.Lmul4x_cond_copy
+
+	csel	$t0,$acc0,$a0,lo
+	 stp	xzr,xzr,[$tp,#8*0]
+	csel	$t1,$acc1,$a1,lo
+	 stp	xzr,xzr,[$tp,#8*2]
+	csel	$t2,$acc2,$a2,lo
+	 stp	xzr,xzr,[$tp,#8*3]
+	csel	$t3,$acc3,$a3,lo
+	 stp	xzr,xzr,[$tp,#8*4]
+	stp	$t0,$t1,[$ap_end,#8*0]
+	stp	$t2,$t3,[$ap_end,#8*2]
+
+	b	.Lmul4x_done
+
+.align	4
+.Lmul4x4_post_condition:
+	adc	$carry,$carry,xzr
+	ldr	$ap,[x29,#96]		// pull rp
+	// $acc0-3,$carry hold result, $m0-7 hold modulus
+	subs	$a0,$acc0,$m0
+	ldr	x30,[x29,#8]		// pull return address
+	sbcs	$a1,$acc1,$m1
+	 stp	xzr,xzr,[sp,#8*0]
+	sbcs	$a2,$acc2,$m2
+	 stp	xzr,xzr,[sp,#8*2]
+	sbcs	$a3,$acc3,$m3
+	 stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,$carry,xzr		// did it borrow?
+	 stp	xzr,xzr,[sp,#8*6]
+
+	// $a0-3 hold result-modulus
+	csel	$a0,$acc0,$a0,lo
+	csel	$a1,$acc1,$a1,lo
+	csel	$a2,$acc2,$a2,lo
+	csel	$a3,$acc3,$a3,lo
+	stp	$a0,$a1,[$ap,#8*0]
+	stp	$a2,$a3,[$ap,#8*2]
+
+.Lmul4x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	ret
+.size	__bn_mul4x_mont,.-__bn_mul4x_mont
+___
+}
+$code.=<<___;
+.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+print $code;
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/bn/asm/bn-586.pl b/deps/openssl/openssl/crypto/bn/asm/bn-586.pl
index 332ef3e91d..1ca1bbf7d4 100644
--- a/deps/openssl/openssl/crypto/bn/asm/bn-586.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/bn-586.pl
@@ -1,9 +1,19 @@
-#!/usr/local/bin/perl
+#! /usr/bin/env perl
+# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output = pop;
+open STDOUT,">$output";
+
 &asm_init($ARGV[0],$0);
 
 $sse2=0;
@@ -21,6 +31,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 
 &asm_finish();
 
+close STDOUT;
+
 sub bn_mul_add_words
 	{
 	local($name)=@_;
@@ -771,4 +783,3 @@ sub bn_sub_part_words
 
 	&function_end($name);
 	}
-
diff --git a/deps/openssl/openssl/crypto/bn/asm/bn-c64xplus.asm b/deps/openssl/openssl/crypto/bn/asm/bn-c64xplus.asm
new file mode 100644
index 0000000000..de6d37728f
--- /dev/null
+++ b/deps/openssl/openssl/crypto/bn/asm/bn-c64xplus.asm
@@ -0,0 +1,382 @@
+;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+;;
+;; Licensed under the OpenSSL license (the "License").  You may not use
+;; this file except in compliance with the License.  You can obtain a copy
+;; in the file LICENSE in the source distribution or at
+;; https://www.openssl.org/source/license.html
+;;
+;;====================================================================
+;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+;; project.
+;;
+;; Rights for redistribution and usage in source and binary forms are
+;; granted according to the OpenSSL license. Warranty of any kind is
+;; disclaimed.
+;;====================================================================
+;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
+;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
+;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
+;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
+;;====================================================================
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000	; NOTE(review): presumably older tools do not predefine __TI_EABI__ - confirm
+	.asg	0,__TI_EABI__		; ...so treat the build as non-EABI
+	.endif
+	.if	__TI_EABI__		; EABI: make the _-prefixed names used below resolve to unprefixed ones
+	.asg	bn_mul_add_words,_bn_mul_add_words
+	.asg	bn_mul_words,_bn_mul_words
+	.asg	bn_sqr_words,_bn_sqr_words
+	.asg	bn_add_words,_bn_add_words
+	.asg	bn_sub_words,_bn_sub_words
+	.asg	bn_div_words,_bn_div_words
+	.asg	bn_sqr_comba8,_bn_sqr_comba8
+	.asg	bn_mul_comba8,_bn_mul_comba8
+	.asg	bn_sqr_comba4,_bn_sqr_comba4
+	.asg	bn_mul_comba4,_bn_mul_comba4
+	.endif
+
+	.asg	B3,RA			; register every routine branches to in order to return
+	.asg	A4,ARG0			; 1st argument (shares A4 with RET)
+	.asg	B4,ARG1			; 2nd argument
+	.asg	A6,ARG2			; 3rd argument
+	.asg	B6,ARG3			; 4th argument
+	.asg	A8,ARG4			; 5th argument (not referenced in this file)
+	.asg	B8,ARG5			; 6th argument (not referenced in this file)
+	.asg	A4,RET			; return value register
+	.asg	A15,FP			; not referenced in this file
+	.asg	B14,DP			; not referenced in this file
+	.asg	B15,SP			; not referenced in this file
+
+	.global	_bn_mul_add_words	; rp[i] += ap[i]*ARG3 for ARG2 words; returns the final carry word
+_bn_mul_add_words:
+	.asmfunc
+	MV	ARG2,B0			; B0 = word count, doubles as predicate
+  [!B0]	BNOP	RA			; count==0: return straight away...
+||[!B0]	MVK	0,RET			; ...with a zero carry
+   [B0]	MVC	B0,ILC			; inner loop count = word count
+   [B0]	ZERO	A19		; high part of accumulator
+|| [B0]	MV	ARG0,A2			; A2 = store pointer (ARG0 itself is the load pointer)
+|| [B0]	MV	ARG3,A3			; A3 = multiplier
+	NOP	3			; NOTE(review): presumably covers the ILC write latency before SPLOOP - confirm
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	3
+	LDW	*ARG0++,A7	; rp[i]
+	MPY32U	B7,A3,A17:A16		; A17:A16 = ap[i]*multiplier
+	NOP	3		; [2,0] in epilogue
+	ADDU	A16,A7,A21:A20		; lo(product) + rp[i]
+	ADDU	A19,A21:A20,A19:A18	; + carry-in; A18 = result word, A19 = carry out of the low word
+||	MV.S	A17,A23			; keep hi(product) for the carry update
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*A2++	; rp[i]
+||	ADD	A19,A23,A19		; next carry = carry out + hi(product)
+;;====================================================================
+	BNOP	RA,4			; return; one delay slot is left for the MV below
+	MV	A19,RET		; return value
+	.endasmfunc
+
+	.global	_bn_mul_words		; rp[i] = ap[i]*ARG3 (+carry) for ARG2 words; returns the final carry word
+_bn_mul_words:
+	.asmfunc
+	MV	ARG2,B0			; B0 = word count, doubles as predicate
+  [!B0]	BNOP	RA			; count==0: return straight away...
+||[!B0]	MVK	0,RET			; ...with a zero carry
+   [B0]	MVC	B0,ILC			; inner loop count = word count
+   [B0]	ZERO	A19		; high part of accumulator
+	NOP	3			; NOTE(review): presumably covers the ILC write latency before SPLOOP - confirm
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,A7	; ap[i]
+	NOP	4
+	MPY32U	A7,ARG3,A17:A16		; A17:A16 = ap[i]*multiplier
+	NOP	4		; [2,0] in epilogue
+	ADDU	A19,A16,A19:A18		; carry-in + lo(product); A18 = result word
+||	MV.S	A17,A21			; keep hi(product) for the carry update
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*ARG0++	; rp[i]
+||	ADD.L	A19,A21,A19		; next carry = carry out + hi(product)
+;;====================================================================
+	BNOP	RA,4			; return; one delay slot is left for the MV below
+	MV	A19,RET		; return value
+	.endasmfunc
+
+	.global	_bn_sqr_words		; rp[2*i],rp[2*i+1] = lo,hi of ap[i]^2 for ARG2 words
+_bn_sqr_words:
+	.asmfunc
+	MV	ARG2,B0			; B0 = word count, doubles as predicate
+  [!B0]	BNOP	RA			; count==0: nothing to do
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC			; inner loop count = word count
+   [B0]	MV	ARG0,B2			; B2 -> even result words (low halves)
+|| [B0]	ADD	4,ARG0,ARG0		; ARG0 -> odd result words (high halves)
+	NOP	3			; NOTE(review): presumably covers the ILC write latency before SPLOOP - confirm
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	MPY32U	B7,B7,B1:B0		; B1:B0 = ap[i]*ap[i]
+	NOP	3		; [2,0] in epilogue
+	STW	B0,*B2++(8)	; rp[2*i]
+	MV	B1,A1			; move the high half to the A side for the second store
+	SPKERNEL 2,0		; fully overlap BNOP RA,5
+||	STW	A1,*ARG0++(8)	; rp[2*i+1]
+;;====================================================================
+	BNOP	RA,5			; return
+	.endasmfunc
+
+	.global	_bn_add_words		; rp[i] = ap[i]+bp[i]+carry for ARG3 words; returns the final carry
+_bn_add_words:
+	.asmfunc
+	MV	ARG3,B0			; B0 = word count, doubles as predicate
+  [!B0]	BNOP	RA			; count==0: return straight away...
+||[!B0]	MVK	0,RET			; ...with a zero carry
+   [B0]	MVC	B0,ILC			; inner loop count = word count
+   [B0]	ZERO	A1		; carry flag
+|| [B0]	MV	ARG0,A3			; A3 = store pointer; ARG0 shares A4 with RET, which the loop overwrites
+	NOP	3			; NOTE(review): presumably covers the ILC write latency before SPLOOP - confirm
+
+	SPLOOP	2		; 2*n+6
+;;====================================================================
+	LDW	*ARG2++,A7	; bp[i]
+||	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	ADDU	A7,B7,A9:A8		; bp[i] + ap[i]
+	ADDU	A1,A9:A8,A1:A0		; + carry-in; A0 = result word, A1 = carry out
+	SPKERNEL 0,0		; fully overlap BNOP RA,5
+||	STW	A0,*A3++	; write result
+||	MV	A1,RET		; keep carry flag in RET
+;;====================================================================
+	BNOP	RA,5			; return
+	.endasmfunc
+
+	.global	_bn_sub_words		; rp[i] = ap[i]-bp[i]-borrow for ARG3 words; returns the final borrow
+_bn_sub_words:
+	.asmfunc
+	MV	ARG3,B0			; B0 = word count, doubles as predicate
+  [!B0]	BNOP	RA			; count==0: return straight away...
+||[!B0]	MVK	0,RET			; ...with a zero borrow
+   [B0]	MVC	B0,ILC			; inner loop count = word count
+   [B0]	ZERO	A2		; borrow flag
+|| [B0]	MV	ARG0,A3			; A3 = store pointer
+	NOP	3			; NOTE(review): presumably covers the ILC write latency before SPLOOP - confirm
+
+	SPLOOP	2		; 2*n+6
+;;====================================================================
+	LDW	*ARG2++,A7	; bp[i]
+||	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	SUBU	B7,A7,A1:A0		; ap[i] - bp[i], widened into A1:A0
+  [A2]	SUB	A1:A0,1,A1:A0		; apply borrow-in from the previous word
+	SPKERNEL 0,1		; leave slot for "return borrow flag"
+||	STW	A0,*A3++	; write result
+||	AND	1,A1,A2		; pass on borrow flag
+;;====================================================================
+	BNOP	RA,4			; return; one delay slot is left for the AND below
+	AND	1,A1,RET	; return borrow flag
+	.endasmfunc
+
+	.global	_bn_div_words		; shift-and-subtract division of hi:lo (A4:B4) by dv (A6); quotient in A4
+_bn_div_words:
+	.asmfunc
+	LMBD	1,A6,A0		; leading zero bits in dv
+	LMBD	1,A4,A1		; leading zero bits in hi
+||	MVK	32,B0
+	CMPLTU	A1,A0,A2		; A2 = hi has fewer leading zeros than dv
+||	ADD	A0,B0,B0		; B0 = 32 + leading zeros in dv, loaded into ILC below
+  [ A2]	BNOP	RA			; A2 set: bail out early
+||[ A2]	MVK	-1,A4		; return overflow
+||[!A2]	MV	A4,A3		; reassign hi
+  [!A2]	MV	B4,A4		; reassign lo, will be quotient
+||[!A2]	MVC	B0,ILC
+  [!A2]	SHL	A6,A0,A6	; normalize dv
+||	MVK	1,A1			; NOTE(review): looks like this keeps the [!A1] ops below idle on the overflow path - confirm
+
+  [!A2]	CMPLTU	A3,A6,A1	; hi<dv?
+||[!A2]	SHL	A4,1,A5:A4	; lo<<1
+  [!A1]	SUB	A3,A6,A3	; hi-=dv
+||[!A1]	OR	1,A4,A4			; record a quotient bit
+  [!A2]	SHRU	A3,31,A1	; upper bit
+||[!A2]	ADDAH	A5,A3,A3	; hi<<1|lo>>31
+
+	SPLOOP	3			; same step as above, repeated ILC times
+  [!A1]	CMPLTU	A3,A6,A1	; hi<dv?
+||[ A1]	ZERO	A1			; bit shifted out of hi was set: force the subtraction
+||	SHL	A4,1,A5:A4	; lo<<1
+  [!A1]	SUB	A3,A6,A3	; hi-=dv
+||[!A1]	OR	1,A4,A4		; quotient
+	SHRU	A3,31,A1	; upper bit
+||	ADDAH	A5,A3,A3	; hi<<1|lo>>31
+	SPKERNEL
+
+	BNOP	RA,5			; return, A4 holds the quotient
+	.endasmfunc
+
+;;====================================================================
+;; Not really Comba algorithm, just straightforward NxM... Dedicated
+;; fully unrolled real Comba implementations are asymptotically 2x
+;; faster, but naturally larger undertaking. Purpose of this exercise
+;; was rather to learn to master nested SPLOOPs...
+;;====================================================================
+	.global	_bn_sqr_comba8		; rp = ap*ap, 8 words in, 16 words out
+	.global	_bn_mul_comba8		; rp = ap*bp, 8 words in, 16 words out
+_bn_sqr_comba8:
+	MV	ARG1,ARG2		; squaring: alias bp to ap and fall through
+_bn_mul_comba8:
+	.asmfunc
+	MVK	8,B0		; N, RILC
+||	MVK	8,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; N-2, initial ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+sploopNxM?:			; for best performance arrange M<=N
+   [A0]	SPLOOPD	2		; 2*n+10
+||	MVC	B1,ILC
+||	ADDAW	B4,B0,B5		; B5 = rp+N words; rp[i] load pointer once rewound
+||	ZERO	B7			; first pass adds zero instead of rp[i]
+||	LDW	*A5++,A9	; pre-fetch ap[1]
+||	ZERO	A1			; A1=0: rp[i] loads are suppressed during the first pass
+||	SUB	A0,1,A0
+;;====================================================================
+;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
+;; This is because of Advisory 15 from TI publication SPRZ247I.
+	LDW	*ARG2++,A7	; bp[i]
+	NOP	3
+   [A1]	LDW	*B5++,B7	; rp[i]
+	MPY32U	A7,B6,B17:B16		; B17:B16 = bp[i]*ap[j], B6 holds the current ap word
+	NOP	3
+	ADDU	B16,B7,B21:B20		; lo(product) + rp[i]
+	ADDU	B19,B21:B20,B19:B18	; + carry-in; B18 = result word
+||	MV.S	B17,B23			; keep hi(product) for the carry update
+	SPKERNEL
+||	STW	B18,*B4++	; rp[i]
+||	ADD.S	B19,B23,B19		; next carry = carry out + hi(product)
+;;====================================================================
+outer?:				; m*2*(n+1)+10
+	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
+	SPMASKR
+||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
+	MVD	A9,B6		; move through .M unit(*)
+   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
+	SUBAW	B5,B2,B5	; rewind rp to rp[1]
+	MVK	1,A1			; from now on the inner loop loads rp[i]
+   [A0]	BNOP.S1	outer?,4
+|| [A0]	SUB.L	A0,1,A0
+	STW	B19,*B4--[B2]	; rewind rp to rp[1]
+||	ZERO.S	B19		; high part of accumulator
+;; end of outer?
+	BNOP	RA,5		; return
+	.endasmfunc
+;; (*)	It should be noted that B6 is used as input to MPY32U in
+;;	chronologically next cycle in *preceding* SPLOOP iteration.
+;;	Normally such arrangement would require DINT, but at this
+;;	point SPLOOP is draining and interrupts are disabled
+;;	implicitly.
+
+	.global	_bn_sqr_comba4		; rp = ap*ap, 4 words in, 8 words out
+	.global	_bn_mul_comba4		; rp = ap*bp, 4 words in, 8 words out
+_bn_sqr_comba4:
+	MV	ARG1,ARG2		; squaring: alias bp to ap and fall through
+_bn_mul_comba4:
+	.asmfunc
+	.if	0			; disabled variant: reuse the comba8 SPLOOP with N=M=4
+	BNOP	sploopNxM?,3
+	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
+	;; because of low-counter effect, when prologue phase finishes
+	;; before SPKERNEL instruction is reached. As result it's 25%
+	;; slower than expected...
+	MVK	4,B0		; N, RILC
+||	MVK	4,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; first ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+	.else				; active variant: straight-line code, no loops
+	;; This alternative is an exercise in fully unrolled Comba
+	;; algorithm implementation that operates at n*(n+1)+12, or
+	;; as little as 32 cycles...
+	LDW	*ARG1[0],B16	; a[0]
+||	LDW	*ARG2[0],A16	; b[0]
+	LDW	*ARG1[1],B17	; a[1]
+||	LDW	*ARG2[1],A17	; b[1]
+	LDW	*ARG1[2],B18	; a[2]
+||	LDW	*ARG2[2],A18	; b[2]
+	LDW	*ARG1[3],B19	; a[3]
+||	LDW	*ARG2[3],A19	; b[3]
+	NOP
+	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
+	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
+	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
+	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
+	STW	A0,*ARG0[0]		; rp[0]
+||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
+	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
+||	ADDU	A22,A1,A1:A0
+	MV	A23,B0
+||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B0,B1:B0
+||	STW	A0,*ARG0[1]		; rp[1]
+||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
+||	ADDU	A26,A1,A9:A8
+	ADDU	A27,B1,B9:B8
+||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
+||	ADDU	A28,A9:A8,A9:A8
+	ADDU	A29,B9:B8,B9:B8
+||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
+||	ADDU	A30,A9:A8,A9:A8
+	ADDU	A31,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[2]		; rp[2]
+||	ADDU	A20,A9,A1:A0
+	ADDU	A21,B9,B1:B0
+||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
+||	ADDU	A22,A1:A0,A1:A0
+	ADDU	A23,B1:B0,B1:B0
+||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B1:B0,B1:B0
+||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
+||	ADDU	A26,A1:A0,A1:A0
+	ADDU	A27,B1:B0,B1:B0
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[3]		; rp[3]
+||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
+||	ADDU	A20,A1,A9:A8
+	ADDU	A21,B1,B9:B8
+||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
+||	ADDU	A22,A9:A8,A9:A8
+	ADDU	A23,B9:B8,B9:B8
+||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
+||	ADDU	A24,A9:A8,A9:A8
+	ADDU	A25,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[4]		; rp[4]
+||	ADDU	A26,A9,A1:A0
+	ADDU	A27,B9,B1:B0
+||	ADDU	A28,A1:A0,A1:A0
+	ADDU	A29,B1:B0,B1:B0
+||	BNOP	RA			; return issued here; the packets below run before it takes effect
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[5]		; rp[5]
+||	ADDU	A30,A1,A9:A8
+	ADD	A31,B1,B8
+	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
+	ADD	B8,A9,A9
+||	STW	A8,*ARG0[6]		; rp[6]
+	STW	A9,*ARG0[7]		; rp[7]
+	.endif
+	.endasmfunc
diff --git a/deps/openssl/openssl/crypto/bn/asm/c64xplus-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/c64xplus-gf2m.pl
new file mode 100644
index 0000000000..c0e5400807
--- /dev/null
+++ b/deps/openssl/openssl/crypto/bn/asm/c64xplus-gf2m.pl
@@ -0,0 +1,160 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# February 2012
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... The subroutine runs in 37 cycles, which is
+# 4.5x faster than compiler-generated code. Though comparison is
+# totally unfair, because this module utilizes Galois Field Multiply
+# instruction.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# skip arguments until one ends in name.ext
+open STDOUT,">$output";	# everything printed from here on lands in that file
+
+($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector
+
+($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));	# A16..A20: low half of A and its four partial products
+($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));	# B16..B20: high half of A and its four partial products
+($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");	# the four bytes of B, least significant first
+($A,$B)=($Alo,$B_1);	# working registers holding the operands of the next 1x1 multiply
+$xFF="B1";	# register loaded with the 0xFF byte mask
+
+sub mul_1x1_upper {	# emit the issue half of a 32x32 carry-less multiply: split operands, start 8 XORMPYs
+my ($A,$B)=@_;		# register names holding the two 32-bit operands
+$code.=<<___;
+	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
+||	AND	$B,$xFF,$B_0
+||	SHRU	$B,24,$B_3
+	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
+||	EXTU	$A,16,16,$Alo
+
+	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits muliplication
+||	XORMPY	$Ahi,$B_2,$Ahix2
+||	EXTU	$B,16,24,$B_1
+	XORMPY	$Alo,$B_0,$Alox0
+||	XORMPY	$Ahi,$B_0,$Ahix0
+	XORMPY	$Alo,$B_3,$Alox3
+||	XORMPY	$Ahi,$B_3,$Ahix3
+	XORMPY	$Alo,$B_1,$Alox1
+||	XORMPY	$Ahi,$B_1,$Ahix1
+___
+}
+sub mul_1x1_merged {	# emit the combine half of the previous product interleaved with the issue half of the next
+my ($OUTlo,$OUThi,$A,$B)=@_;	# 64-bit result register pair for the previous product; operands of the next one
+$code.=<<___;
+	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
+||	 AND	$B,$xFF,$B_0
+||	 SHRU	$B,24,$B_3
+	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
+||	 EXTU	$A,16,16,$Alo
+
+	XOR	$Ahix0,$Alox2,$Ahix0
+||	MV	$Ahix2,$OUThi
+||	 XORMPY	$Alo,$B_2,$Alox2
+	 XORMPY	$Ahi,$B_2,$Ahix2
+||	 EXTU	$B,16,24,$B_1
+||	 XORMPY	$Alo,$B_0,A1		; $Alox0
+	XOR	$Ahix1,$Alox3,$Ahix1
+||	SHL	$Ahix0,16,$OUTlo
+||	SHRU	$Ahix0,16,$Ahix0
+	XOR	$Alox0,$OUTlo,$OUTlo
+||	XOR	$Ahix0,$OUThi,$OUThi
+||	 XORMPY	$Ahi,$B_0,$Ahix0
+||	 XORMPY	$Alo,$B_3,$Alox3
+||	SHL	$Alox1,8,$Alox1
+||	SHL	$Ahix3,8,$Ahix3
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix3,$OUThi,$OUThi
+||	 XORMPY	$Ahi,$B_3,$Ahix3
+||	SHL	$Ahix1,24,$Alox1
+||	SHRU	$Ahix1,8, $Ahix1
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix1,$OUThi,$OUThi
+||	 XORMPY	$Alo,$B_1,$Alox1
+||	 XORMPY	$Ahi,$B_1,$Ahix1
+||	 MV	A1,$Alox0
+___
+}
+sub mul_1x1_lower {	# emit the combine half only: fold the eight partial products into a 64-bit result
+my ($OUTlo,$OUThi)=@_;	# register names receiving the low and high result words
+$code.=<<___;
+	;NOP
+	XOR	$Ahix0,$Alox2,$Ahix0
+||	MV	$Ahix2,$OUThi
+	NOP
+	XOR	$Ahix1,$Alox3,$Ahix1
+||	SHL	$Ahix0,16,$OUTlo
+||	SHRU	$Ahix0,16,$Ahix0
+	XOR	$Alox0,$OUTlo,$OUTlo
+||	XOR	$Ahix0,$OUThi,$OUThi
+||	SHL	$Alox1,8,$Alox1
+||	SHL	$Ahix3,8,$Ahix3
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix3,$OUThi,$OUThi
+||	SHL	$Ahix1,24,$Alox1
+||	SHRU	$Ahix1,8, $Ahix1
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix1,$OUThi,$OUThi
+___
+}
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
+	.endif
+
+	.global	_bn_GF2m_mul_2x2
+_bn_GF2m_mul_2x2:
+	.asmfunc
+	MVK	0xFF,$xFF
+___
+	&mul_1x1_upper($a0,$b0);		# a0·b0
+$code.=<<___;	# the leading || joins the last packet emitted by the call above
+||	MV	$b1,$B
+	MV	$a1,$A
+___
+	&mul_1x1_merged("A28","B28",$A,$B);	# a0·b0/a1·b1
+$code.=<<___;	# the leading || joins the last packet emitted by the call above
+||	XOR	$b0,$b1,$B
+	XOR	$a0,$a1,$A
+___
+	&mul_1x1_merged("A31","B31",$A,$B);	# a1·b1/(a0+a1)·(b0+b1)
+$code.=<<___;
+	XOR	A28,A31,A29
+||	XOR	B28,B31,B29			; a0·b0+a1·b1
+___
+	&mul_1x1_lower("A30","B30");		# (a0+a1)·(b0+b1)
+$code.=<<___;	# the leading || joins the last packet emitted by the call above
+||	BNOP	B3
+	XOR	A29,A30,A30
+||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	XOR	B28,A30,A30
+||	STW	A28,*${rp}[0]
+	XOR	B30,A31,A31
+||	STW	A30,*${rp}[1]
+	STW	A31,*${rp}[2]
+	STW	B31,*${rp}[3]
+	.endasmfunc
+___
+
+print $code;	# write the accumulated assembly text to STDOUT
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/bn/asm/co-586.pl b/deps/openssl/openssl/crypto/bn/asm/co-586.pl
index 57101a6bd7..60d0363660 100644
--- a/deps/openssl/openssl/crypto/bn/asm/co-586.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/co-586.pl
@@ -1,9 +1,18 @@
-#!/usr/local/bin/perl
+#! /usr/bin/env perl
+# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output = pop;
+open STDOUT,">$output";
+
 &asm_init($ARGV[0],$0);
 
 &bn_mul_comba("bn_mul_comba8",8);
@@ -13,6 +22,8 @@ require "x86asm.pl";
 
 &asm_finish();
 
+close STDOUT;
+
 sub mul_add_c
 	{
 	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
diff --git a/deps/openssl/openssl/crypto/bn/asm/ia64-mont.pl b/deps/openssl/openssl/crypto/bn/asm/ia64-mont.pl
index e258658428..5cc5c599f9 100644
--- a/deps/openssl/openssl/crypto/bn/asm/ia64-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/ia64-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -60,6 +67,8 @@
 # hereafter less for longer keys, while verify - by 74-13%.
 # DSA performance improves by 115-30%.
 
+$output=pop;
+
 if ($^O eq "hpux") {
     $ADDP="addp4";
     for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
@@ -846,6 +855,6 @@ copyright:
 stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
-$output=shift and open STDOUT,">$output";
+open STDOUT,">$output" if $output;
 print $code;
 close STDOUT;
diff --git a/deps/openssl/openssl/crypto/bn/asm/ia64.S b/deps/openssl/openssl/crypto/bn/asm/ia64.S
index a9a42abfc3..f2404a3c1e 100644
--- a/deps/openssl/openssl/crypto/bn/asm/ia64.S
+++ b/deps/openssl/openssl/crypto/bn/asm/ia64.S
@@ -3,6 +3,13 @@
 .ident	"ia64.S, Version 2.1"
 .ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
 
+// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
 //
 // ====================================================================
 // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -22,7 +29,7 @@
 // ports is the same, i.e. 2, while I need 4. In other words, to this
 // module Itanium2 remains effectively as "wide" as Itanium. Yet it's
 // essentially different in respect to this module, and a re-tune was
-// required. Well, because some intruction latencies has changed. Most
+// required. Well, because some instruction latencies have changed. Most
 // noticeably those intensively used:
 //
 //			Itanium	Itanium2
@@ -363,7 +370,7 @@ bn_mul_words:
 // The loop therefore spins at the latency of xma minus 1, or in other
 // words at 6*(n+4) ticks:-( Compare to the "production" loop above
 // that runs in 2*(n+11) where the low latency problem is worked around
-// by moving the dependency to one-tick latent interger ALU. Note that
+// by moving the dependency to one-tick latent integer ALU. Note that
 // "distance" between ldf8 and xma is not latency of ldf8, but the
 // *difference* between xma and ldf8 latencies.
 .L_bn_mul_words_ctop:
@@ -425,7 +432,7 @@ bn_mul_add_words:
 // version was performing *all* additions in IALU and was starving
 // for those even on Itanium 2. In this version one addition is
 // moved to FPU and is folded with multiplication. This is at cost
-// of propogating the result from previous call to this subroutine
+// of propagating the result from previous call to this subroutine
 // to L2 cache... In other words negligible even for shorter keys.
 // *Overall* performance improvement [over previous version] varies
 // from 11 to 22 percent depending on key length.
@@ -495,7 +502,7 @@ bn_sqr_words:
 // scalability. The decision will very likely be reconsidered after the
 // benchmark program is profiled. I.e. if perfomance gain on Itanium
 // will appear larger than loss on "wider" IA-64, then the loop should
-// be explicitely split and the epilogue compressed.
+// be explicitly split and the epilogue compressed.
 .L_bn_sqr_words_ctop:
 { .mfi;	(p16)	ldf8		f32=[r33],8
 	(p25)	xmpy.lu		f42=f41,f41
diff --git a/deps/openssl/openssl/crypto/bn/asm/mips-mont.pl b/deps/openssl/openssl/crypto/bn/asm/mips-mont.pl
index a33cdf4111..a907571bec 100644
--- a/deps/openssl/openssl/crypto/bn/asm/mips-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/mips-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -67,7 +74,7 @@ $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
 #
 ######################################################################
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 if ($flavour =~ /64|n32/i) {
diff --git a/deps/openssl/openssl/crypto/bn/asm/mips.pl b/deps/openssl/openssl/crypto/bn/asm/mips.pl
index acafde5e56..420f01f3a4 100644
--- a/deps/openssl/openssl/crypto/bn/asm/mips.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/mips.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -15,7 +22,7 @@
 # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
 #
 # The module is designed to work with either of the "new" MIPS ABI(5),
-# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
+# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
 # IRIX 5.x not only because it doesn't support new ABIs but also
 # because 5.x kernels put R4x00 CPU into 32-bit mode and all those
 # 64-bit instructions (daddu, dmultu, etc.) found below gonna only
@@ -49,7 +56,7 @@
 # key length, more for longer keys.
 
 $flavour = shift || "o32";
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 if ($flavour =~ /64|n32/i) {
diff --git a/deps/openssl/openssl/crypto/bn/asm/mips3-mont.pl b/deps/openssl/openssl/crypto/bn/asm/mips3-mont.pl
deleted file mode 100644
index 8f9156e02a..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/mips3-mont.pl
+++ /dev/null
@@ -1,327 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# This module doesn't present direct interest for OpenSSL, because it
-# doesn't provide better performance for longer keys. While 512-bit
-# RSA private key operations are 40% faster, 1024-bit ones are hardly
-# faster at all, while longer key operations are slower by up to 20%.
-# It might be of interest to embedded system developers though, as
-# it's smaller than 1KB, yet offers ~3x improvement over compiler
-# generated code.
-#
-# The module targets N32 and N64 MIPS ABIs and currently is a bit
-# IRIX-centric, i.e. is likely to require adaptation for other OSes.
-
-# int bn_mul_mont(
-$rp="a0";	# BN_ULONG *rp,
-$ap="a1";	# const BN_ULONG *ap,
-$bp="a2";	# const BN_ULONG *bp,
-$np="a3";	# const BN_ULONG *np,
-$n0="a4";	# const BN_ULONG *n0,
-$num="a5";	# int num);
-
-$lo0="a6";
-$hi0="a7";
-$lo1="v0";
-$hi1="v1";
-$aj="t0";
-$bi="t1";
-$nj="t2";
-$tp="t3";
-$alo="s0";
-$ahi="s1";
-$nlo="s2";
-$nhi="s3";
-$tj="s4";
-$i="s5";
-$j="s6";
-$fp="t8";
-$m1="t9";
-
-$FRAME=8*(2+8);
-
-$code=<<___;
-#include <asm.h>
-#include <regdef.h>
-
-.text
-
-.set	noat
-.set	reorder
-
-.align	5
-.globl	bn_mul_mont
-.ent	bn_mul_mont
-bn_mul_mont:
-	.set	noreorder
-	PTR_SUB	sp,64
-	move	$fp,sp
-	.frame	$fp,64,ra
-	slt	AT,$num,4
-	li	v0,0
-	beqzl	AT,.Lproceed
-	nop
-	jr	ra
-	PTR_ADD	sp,$fp,64
-	.set	reorder
-.align	5
-.Lproceed:
-	ld	$n0,0($n0)
-	ld	$bi,0($bp)	# bp[0]
-	ld	$aj,0($ap)	# ap[0]
-	ld	$nj,0($np)	# np[0]
-	PTR_SUB	sp,16		# place for two extra words
-	sll	$num,3
-	li	AT,-4096
-	PTR_SUB	sp,$num
-	and	sp,AT
-
-	sd	s0,0($fp)
-	sd	s1,8($fp)
-	sd	s2,16($fp)
-	sd	s3,24($fp)
-	sd	s4,32($fp)
-	sd	s5,40($fp)
-	sd	s6,48($fp)
-	sd	s7,56($fp)
-
-	dmultu	$aj,$bi
-	ld	$alo,8($ap)
-	ld	$nlo,8($np)
-	mflo	$lo0
-	mfhi	$hi0
-	dmultu	$lo0,$n0
-	mflo	$m1
-
-	dmultu	$alo,$bi
-	mflo	$alo
-	mfhi	$ahi
-
-	dmultu	$nj,$m1
-	mflo	$lo1
-	mfhi	$hi1
-	dmultu	$nlo,$m1
-	daddu	$lo1,$lo0
-	sltu	AT,$lo1,$lo0
-	daddu	$hi1,AT
-	mflo	$nlo
-	mfhi	$nhi
-
-	move	$tp,sp
-	li	$j,16
-.align	4
-.L1st:
-	.set	noreorder
-	PTR_ADD	$aj,$ap,$j
-	ld	$aj,($aj)
-	PTR_ADD	$nj,$np,$j
-	ld	$nj,($nj)
-
-	dmultu	$aj,$bi
-	daddu	$lo0,$alo,$hi0
-	daddu	$lo1,$nlo,$hi1
-	sltu	AT,$lo0,$hi0
-	sltu	s7,$lo1,$hi1
-	daddu	$hi0,$ahi,AT
-	daddu	$hi1,$nhi,s7
-	mflo	$alo
-	mfhi	$ahi
-
-	daddu	$lo1,$lo0
-	sltu	AT,$lo1,$lo0
-	dmultu	$nj,$m1
-	daddu	$hi1,AT
-	addu	$j,8
-	sd	$lo1,($tp)
-	sltu	s7,$j,$num
-	mflo	$nlo
-	mfhi	$nhi
-
-	bnez	s7,.L1st
-	PTR_ADD	$tp,8
-	.set	reorder
-
-	daddu	$lo0,$alo,$hi0
-	sltu	AT,$lo0,$hi0
-	daddu	$hi0,$ahi,AT
-
-	daddu	$lo1,$nlo,$hi1
-	sltu	s7,$lo1,$hi1
-	daddu	$hi1,$nhi,s7
-	daddu	$lo1,$lo0
-	sltu	AT,$lo1,$lo0
-	daddu	$hi1,AT
-
-	sd	$lo1,($tp)
-
-	daddu	$hi1,$hi0
-	sltu	AT,$hi1,$hi0
-	sd	$hi1,8($tp)
-	sd	AT,16($tp)
-
-	li	$i,8
-.align	4
-.Louter:
-	PTR_ADD	$bi,$bp,$i
-	ld	$bi,($bi)
-	ld	$aj,($ap)
-	ld	$alo,8($ap)
-	ld	$tj,(sp)
-
-	dmultu	$aj,$bi
-	ld	$nj,($np)
-	ld	$nlo,8($np)
-	mflo	$lo0
-	mfhi	$hi0
-	daddu	$lo0,$tj
-	dmultu	$lo0,$n0
-	sltu	AT,$lo0,$tj
-	daddu	$hi0,AT
-	mflo	$m1
-
-	dmultu	$alo,$bi
-	mflo	$alo
-	mfhi	$ahi
-
-	dmultu	$nj,$m1
-	mflo	$lo1
-	mfhi	$hi1
-
-	dmultu	$nlo,$m1
-	daddu	$lo1,$lo0
-	sltu	AT,$lo1,$lo0
-	daddu	$hi1,AT
-	mflo	$nlo
-	mfhi	$nhi
-
-	move	$tp,sp
-	li	$j,16
-	ld	$tj,8($tp)
-.align	4
-.Linner:
-	.set	noreorder
-	PTR_ADD	$aj,$ap,$j
-	ld	$aj,($aj)
-	PTR_ADD	$nj,$np,$j
-	ld	$nj,($nj)
-
-	dmultu	$aj,$bi
-	daddu	$lo0,$alo,$hi0
-	daddu	$lo1,$nlo,$hi1
-	sltu	AT,$lo0,$hi0
-	sltu	s7,$lo1,$hi1
-	daddu	$hi0,$ahi,AT
-	daddu	$hi1,$nhi,s7
-	mflo	$alo
-	mfhi	$ahi
-
-	daddu	$lo0,$tj
-	addu	$j,8
-	dmultu	$nj,$m1
-	sltu	AT,$lo0,$tj
-	daddu	$lo1,$lo0
-	daddu	$hi0,AT
-	sltu	s7,$lo1,$lo0
-	ld	$tj,16($tp)
-	daddu	$hi1,s7
-	sltu	AT,$j,$num
-	mflo	$nlo
-	mfhi	$nhi
-	sd	$lo1,($tp)
-	bnez	AT,.Linner
-	PTR_ADD	$tp,8
-	.set	reorder
-
-	daddu	$lo0,$alo,$hi0
-	sltu	AT,$lo0,$hi0
-	daddu	$hi0,$ahi,AT
-	daddu	$lo0,$tj
-	sltu	s7,$lo0,$tj
-	daddu	$hi0,s7
-
-	ld	$tj,16($tp)
-	daddu	$lo1,$nlo,$hi1
-	sltu	AT,$lo1,$hi1
-	daddu	$hi1,$nhi,AT
-	daddu	$lo1,$lo0
-	sltu	s7,$lo1,$lo0
-	daddu	$hi1,s7
-	sd	$lo1,($tp)
-
-	daddu	$lo1,$hi1,$hi0
-	sltu	$hi1,$lo1,$hi0
-	daddu	$lo1,$tj
-	sltu	AT,$lo1,$tj
-	daddu	$hi1,AT
-	sd	$lo1,8($tp)
-	sd	$hi1,16($tp)
-
-	addu	$i,8
-	sltu	s7,$i,$num
-	bnez	s7,.Louter
-
-	.set	noreorder
-	PTR_ADD	$tj,sp,$num	# &tp[num]
-	move	$tp,sp
-	move	$ap,sp
-	li	$hi0,0		# clear borrow bit
-
-.align	4
-.Lsub:	ld	$lo0,($tp)
-	ld	$lo1,($np)
-	PTR_ADD	$tp,8
-	PTR_ADD	$np,8
-	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
-	sgtu	AT,$lo1,$lo0
-	dsubu	$lo0,$lo1,$hi0
-	sgtu	$hi0,$lo0,$lo1
-	sd	$lo0,($rp)
-	or	$hi0,AT
-	sltu	AT,$tp,$tj
-	bnez	AT,.Lsub
-	PTR_ADD	$rp,8
-
-	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
-	move	$tp,sp
-	PTR_SUB	$rp,$num	# restore rp
-	not	$hi1,$hi0
-
-	and	$ap,$hi0,sp
-	and	$bp,$hi1,$rp
-	or	$ap,$ap,$bp	# ap=borrow?tp:rp
-
-.align	4
-.Lcopy:	ld	$aj,($ap)
-	PTR_ADD	$ap,8
-	PTR_ADD	$tp,8
-	sd	zero,-8($tp)
-	sltu	AT,$tp,$tj
-	sd	$aj,($rp)
-	bnez	AT,.Lcopy
-	PTR_ADD	$rp,8
-
-	ld	s0,0($fp)
-	ld	s1,8($fp)
-	ld	s2,16($fp)
-	ld	s3,24($fp)
-	ld	s4,32($fp)
-	ld	s5,40($fp)
-	ld	s6,48($fp)
-	ld	s7,56($fp)
-	li	v0,1
-	jr	ra
-	PTR_ADD	sp,$fp,64
-	.set	reorder
-END(bn_mul_mont)
-.rdata
-.asciiz	"Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
-___
-
-print $code;
-close STDOUT;
diff --git a/deps/openssl/openssl/crypto/bn/asm/mips3.s b/deps/openssl/openssl/crypto/bn/asm/mips3.s
deleted file mode 100644
index dca4105c7d..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/mips3.s
+++ /dev/null
@@ -1,2201 +0,0 @@
-.rdata
-.asciiz	"mips3.s, Version 1.1"
-.asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
-
-/*
- * ====================================================================
- * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
- * project.
- *
- * Rights for redistribution and usage in source and binary forms are
- * granted according to the OpenSSL license. Warranty of any kind is
- * disclaimed.
- * ====================================================================
- */
-
-/*
- * This is my modest contributon to the OpenSSL project (see
- * http://www.openssl.org/ for more information about it) and is
- * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
- * module. For updates see http://fy.chalmers.se/~appro/hpe/.
- *
- * The module is designed to work with either of the "new" MIPS ABI(5),
- * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
- * IRIX 5.x not only because it doesn't support new ABIs but also
- * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
- * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
- * cause illegal instruction exception:-(
- *
- * In addition the code depends on preprocessor flags set up by MIPSpro
- * compiler driver (either as or cc) and therefore (probably?) can't be
- * compiled by the GNU assembler. GNU C driver manages fine though...
- * I mean as long as -mmips-as is specified or is the default option,
- * because then it simply invokes /usr/bin/as which in turn takes
- * perfect care of the preprocessor definitions. Another neat feature
- * offered by the MIPSpro assembler is an optimization pass. This gave
- * me the opportunity to have the code looking more regular as all those
- * architecture dependent instruction rescheduling details were left to
- * the assembler. Cool, huh?
- *
- * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
- * goes way over 3 times faster!
- *
- *					<appro@fy.chalmers.se>
- */
-#include <asm.h>
-#include <regdef.h>
-
-#if _MIPS_ISA>=4
-#define	MOVNZ(cond,dst,src)	\
-	movn	dst,src,cond
-#else
-#define	MOVNZ(cond,dst,src)	\
-	.set	noreorder;	\
-	bnezl	cond,.+8;	\
-	move	dst,src;	\
-	.set	reorder
-#endif
-
-.text
-
-.set	noat
-.set	reorder
-
-#define	MINUS4	v1
-
-.align	5
-LEAF(bn_mul_add_words)
-	.set	noreorder
-	bgtzl	a2,.L_bn_mul_add_words_proceed
-	ld	t0,0(a1)
-	jr	ra
-	move	v0,zero
-	.set	reorder
-
-.L_bn_mul_add_words_proceed:
-	li	MINUS4,-4
-	and	ta0,a2,MINUS4
-	move	v0,zero
-	beqz	ta0,.L_bn_mul_add_words_tail
-
-.L_bn_mul_add_words_loop:
-	dmultu	t0,a3
-	ld	t1,0(a0)
-	ld	t2,8(a1)
-	ld	t3,8(a0)
-	ld	ta0,16(a1)
-	ld	ta1,16(a0)
-	daddu	t1,v0
-	sltu	v0,t1,v0	/* All manuals say it "compares 32-bit
-				 * values", but it seems to work fine
-				 * even on 64-bit registers. */
-	mflo	AT
-	mfhi	t0
-	daddu	t1,AT
-	daddu	v0,t0
-	sltu	AT,t1,AT
-	sd	t1,0(a0)
-	daddu	v0,AT
-
-	dmultu	t2,a3
-	ld	ta2,24(a1)
-	ld	ta3,24(a0)
-	daddu	t3,v0
-	sltu	v0,t3,v0
-	mflo	AT
-	mfhi	t2
-	daddu	t3,AT
-	daddu	v0,t2
-	sltu	AT,t3,AT
-	sd	t3,8(a0)
-	daddu	v0,AT
-
-	dmultu	ta0,a3
-	subu	a2,4
-	PTR_ADD	a0,32
-	PTR_ADD	a1,32
-	daddu	ta1,v0
-	sltu	v0,ta1,v0
-	mflo	AT
-	mfhi	ta0
-	daddu	ta1,AT
-	daddu	v0,ta0
-	sltu	AT,ta1,AT
-	sd	ta1,-16(a0)
-	daddu	v0,AT
-
-
-	dmultu	ta2,a3
-	and	ta0,a2,MINUS4
-	daddu	ta3,v0
-	sltu	v0,ta3,v0
-	mflo	AT
-	mfhi	ta2
-	daddu	ta3,AT
-	daddu	v0,ta2
-	sltu	AT,ta3,AT
-	sd	ta3,-8(a0)
-	daddu	v0,AT
-	.set	noreorder
-	bgtzl	ta0,.L_bn_mul_add_words_loop
-	ld	t0,0(a1)
-
-	bnezl	a2,.L_bn_mul_add_words_tail
-	ld	t0,0(a1)
-	.set	reorder
-
-.L_bn_mul_add_words_return:
-	jr	ra
-
-.L_bn_mul_add_words_tail:
-	dmultu	t0,a3
-	ld	t1,0(a0)
-	subu	a2,1
-	daddu	t1,v0
-	sltu	v0,t1,v0
-	mflo	AT
-	mfhi	t0
-	daddu	t1,AT
-	daddu	v0,t0
-	sltu	AT,t1,AT
-	sd	t1,0(a0)
-	daddu	v0,AT
-	beqz	a2,.L_bn_mul_add_words_return
-
-	ld	t0,8(a1)
-	dmultu	t0,a3
-	ld	t1,8(a0)
-	subu	a2,1
-	daddu	t1,v0
-	sltu	v0,t1,v0
-	mflo	AT
-	mfhi	t0
-	daddu	t1,AT
-	daddu	v0,t0
-	sltu	AT,t1,AT
-	sd	t1,8(a0)
-	daddu	v0,AT
-	beqz	a2,.L_bn_mul_add_words_return
-
-	ld	t0,16(a1)
-	dmultu	t0,a3
-	ld	t1,16(a0)
-	daddu	t1,v0
-	sltu	v0,t1,v0
-	mflo	AT
-	mfhi	t0
-	daddu	t1,AT
-	daddu	v0,t0
-	sltu	AT,t1,AT
-	sd	t1,16(a0)
-	daddu	v0,AT
-	jr	ra
-END(bn_mul_add_words)
-
-.align	5
-LEAF(bn_mul_words)
-	.set	noreorder
-	bgtzl	a2,.L_bn_mul_words_proceed
-	ld	t0,0(a1)
-	jr	ra
-	move	v0,zero
-	.set	reorder
-
-.L_bn_mul_words_proceed:
-	li	MINUS4,-4
-	and	ta0,a2,MINUS4
-	move	v0,zero
-	beqz	ta0,.L_bn_mul_words_tail
-
-.L_bn_mul_words_loop:
-	dmultu	t0,a3
-	ld	t2,8(a1)
-	ld	ta0,16(a1)
-	ld	ta2,24(a1)
-	mflo	AT
-	mfhi	t0
-	daddu	v0,AT
-	sltu	t1,v0,AT
-	sd	v0,0(a0)
-	daddu	v0,t1,t0
-
-	dmultu	t2,a3
-	subu	a2,4
-	PTR_ADD	a0,32
-	PTR_ADD	a1,32
-	mflo	AT
-	mfhi	t2
-	daddu	v0,AT
-	sltu	t3,v0,AT
-	sd	v0,-24(a0)
-	daddu	v0,t3,t2
-
-	dmultu	ta0,a3
-	mflo	AT
-	mfhi	ta0
-	daddu	v0,AT
-	sltu	ta1,v0,AT
-	sd	v0,-16(a0)
-	daddu	v0,ta1,ta0
-
-
-	dmultu	ta2,a3
-	and	ta0,a2,MINUS4
-	mflo	AT
-	mfhi	ta2
-	daddu	v0,AT
-	sltu	ta3,v0,AT
-	sd	v0,-8(a0)
-	daddu	v0,ta3,ta2
-	.set	noreorder
-	bgtzl	ta0,.L_bn_mul_words_loop
-	ld	t0,0(a1)
-
-	bnezl	a2,.L_bn_mul_words_tail
-	ld	t0,0(a1)
-	.set	reorder
-
-.L_bn_mul_words_return:
-	jr	ra
-
-.L_bn_mul_words_tail:
-	dmultu	t0,a3
-	subu	a2,1
-	mflo	AT
-	mfhi	t0
-	daddu	v0,AT
-	sltu	t1,v0,AT
-	sd	v0,0(a0)
-	daddu	v0,t1,t0
-	beqz	a2,.L_bn_mul_words_return
-
-	ld	t0,8(a1)
-	dmultu	t0,a3
-	subu	a2,1
-	mflo	AT
-	mfhi	t0
-	daddu	v0,AT
-	sltu	t1,v0,AT
-	sd	v0,8(a0)
-	daddu	v0,t1,t0
-	beqz	a2,.L_bn_mul_words_return
-
-	ld	t0,16(a1)
-	dmultu	t0,a3
-	mflo	AT
-	mfhi	t0
-	daddu	v0,AT
-	sltu	t1,v0,AT
-	sd	v0,16(a0)
-	daddu	v0,t1,t0
-	jr	ra
-END(bn_mul_words)
-
-.align	5
-LEAF(bn_sqr_words)
-	.set	noreorder
-	bgtzl	a2,.L_bn_sqr_words_proceed
-	ld	t0,0(a1)
-	jr	ra
-	move	v0,zero
-	.set	reorder
-
-.L_bn_sqr_words_proceed:
-	li	MINUS4,-4
-	and	ta0,a2,MINUS4
-	move	v0,zero
-	beqz	ta0,.L_bn_sqr_words_tail
-
-.L_bn_sqr_words_loop:
-	dmultu	t0,t0
-	ld	t2,8(a1)
-	ld	ta0,16(a1)
-	ld	ta2,24(a1)
-	mflo	t1
-	mfhi	t0
-	sd	t1,0(a0)
-	sd	t0,8(a0)
-
-	dmultu	t2,t2
-	subu	a2,4
-	PTR_ADD	a0,64
-	PTR_ADD	a1,32
-	mflo	t3
-	mfhi	t2
-	sd	t3,-48(a0)
-	sd	t2,-40(a0)
-
-	dmultu	ta0,ta0
-	mflo	ta1
-	mfhi	ta0
-	sd	ta1,-32(a0)
-	sd	ta0,-24(a0)
-
-
-	dmultu	ta2,ta2
-	and	ta0,a2,MINUS4
-	mflo	ta3
-	mfhi	ta2
-	sd	ta3,-16(a0)
-	sd	ta2,-8(a0)
-
-	.set	noreorder
-	bgtzl	ta0,.L_bn_sqr_words_loop
-	ld	t0,0(a1)
-
-	bnezl	a2,.L_bn_sqr_words_tail
-	ld	t0,0(a1)
-	.set	reorder
-
-.L_bn_sqr_words_return:
-	move	v0,zero
-	jr	ra
-
-.L_bn_sqr_words_tail:
-	dmultu	t0,t0
-	subu	a2,1
-	mflo	t1
-	mfhi	t0
-	sd	t1,0(a0)
-	sd	t0,8(a0)
-	beqz	a2,.L_bn_sqr_words_return
-
-	ld	t0,8(a1)
-	dmultu	t0,t0
-	subu	a2,1
-	mflo	t1
-	mfhi	t0
-	sd	t1,16(a0)
-	sd	t0,24(a0)
-	beqz	a2,.L_bn_sqr_words_return
-
-	ld	t0,16(a1)
-	dmultu	t0,t0
-	mflo	t1
-	mfhi	t0
-	sd	t1,32(a0)
-	sd	t0,40(a0)
-	jr	ra
-END(bn_sqr_words)
-
-.align	5
-LEAF(bn_add_words)
-	.set	noreorder
-	bgtzl	a3,.L_bn_add_words_proceed
-	ld	t0,0(a1)
-	jr	ra
-	move	v0,zero
-	.set	reorder
-
-.L_bn_add_words_proceed:
-	li	MINUS4,-4
-	and	AT,a3,MINUS4
-	move	v0,zero
-	beqz	AT,.L_bn_add_words_tail
-
-.L_bn_add_words_loop:
-	ld	ta0,0(a2)
-	subu	a3,4
-	ld	t1,8(a1)
-	and	AT,a3,MINUS4
-	ld	t2,16(a1)
-	PTR_ADD	a2,32
-	ld	t3,24(a1)
-	PTR_ADD	a0,32
-	ld	ta1,-24(a2)
-	PTR_ADD	a1,32
-	ld	ta2,-16(a2)
-	ld	ta3,-8(a2)
-	daddu	ta0,t0
-	sltu	t8,ta0,t0
-	daddu	t0,ta0,v0
-	sltu	v0,t0,ta0
-	sd	t0,-32(a0)
-	daddu	v0,t8
-
-	daddu	ta1,t1
-	sltu	t9,ta1,t1
-	daddu	t1,ta1,v0
-	sltu	v0,t1,ta1
-	sd	t1,-24(a0)
-	daddu	v0,t9
-
-	daddu	ta2,t2
-	sltu	t8,ta2,t2
-	daddu	t2,ta2,v0
-	sltu	v0,t2,ta2
-	sd	t2,-16(a0)
-	daddu	v0,t8
-	
-	daddu	ta3,t3
-	sltu	t9,ta3,t3
-	daddu	t3,ta3,v0
-	sltu	v0,t3,ta3
-	sd	t3,-8(a0)
-	daddu	v0,t9
-	
-	.set	noreorder
-	bgtzl	AT,.L_bn_add_words_loop
-	ld	t0,0(a1)
-
-	bnezl	a3,.L_bn_add_words_tail
-	ld	t0,0(a1)
-	.set	reorder
-
-.L_bn_add_words_return:
-	jr	ra
-
-.L_bn_add_words_tail:
-	ld	ta0,0(a2)
-	daddu	ta0,t0
-	subu	a3,1
-	sltu	t8,ta0,t0
-	daddu	t0,ta0,v0
-	sltu	v0,t0,ta0
-	sd	t0,0(a0)
-	daddu	v0,t8
-	beqz	a3,.L_bn_add_words_return
-
-	ld	t1,8(a1)
-	ld	ta1,8(a2)
-	daddu	ta1,t1
-	subu	a3,1
-	sltu	t9,ta1,t1
-	daddu	t1,ta1,v0
-	sltu	v0,t1,ta1
-	sd	t1,8(a0)
-	daddu	v0,t9
-	beqz	a3,.L_bn_add_words_return
-
-	ld	t2,16(a1)
-	ld	ta2,16(a2)
-	daddu	ta2,t2
-	sltu	t8,ta2,t2
-	daddu	t2,ta2,v0
-	sltu	v0,t2,ta2
-	sd	t2,16(a0)
-	daddu	v0,t8
-	jr	ra
-END(bn_add_words)
-
-.align	5
-LEAF(bn_sub_words)
-	.set	noreorder
-	bgtzl	a3,.L_bn_sub_words_proceed
-	ld	t0,0(a1)
-	jr	ra
-	move	v0,zero
-	.set	reorder
-
-.L_bn_sub_words_proceed:
-	li	MINUS4,-4
-	and	AT,a3,MINUS4
-	move	v0,zero
-	beqz	AT,.L_bn_sub_words_tail
-
-.L_bn_sub_words_loop:
-	ld	ta0,0(a2)
-	subu	a3,4
-	ld	t1,8(a1)
-	and	AT,a3,MINUS4
-	ld	t2,16(a1)
-	PTR_ADD	a2,32
-	ld	t3,24(a1)
-	PTR_ADD	a0,32
-	ld	ta1,-24(a2)
-	PTR_ADD	a1,32
-	ld	ta2,-16(a2)
-	ld	ta3,-8(a2)
-	sltu	t8,t0,ta0
-	dsubu	t0,ta0
-	dsubu	ta0,t0,v0
-	sd	ta0,-32(a0)
-	MOVNZ	(t0,v0,t8)
-
-	sltu	t9,t1,ta1
-	dsubu	t1,ta1
-	dsubu	ta1,t1,v0
-	sd	ta1,-24(a0)
-	MOVNZ	(t1,v0,t9)
-
-
-	sltu	t8,t2,ta2
-	dsubu	t2,ta2
-	dsubu	ta2,t2,v0
-	sd	ta2,-16(a0)
-	MOVNZ	(t2,v0,t8)
-
-	sltu	t9,t3,ta3
-	dsubu	t3,ta3
-	dsubu	ta3,t3,v0
-	sd	ta3,-8(a0)
-	MOVNZ	(t3,v0,t9)
-
-	.set	noreorder
-	bgtzl	AT,.L_bn_sub_words_loop
-	ld	t0,0(a1)
-
-	bnezl	a3,.L_bn_sub_words_tail
-	ld	t0,0(a1)
-	.set	reorder
-
-.L_bn_sub_words_return:
-	jr	ra
-
-.L_bn_sub_words_tail:
-	ld	ta0,0(a2)
-	subu	a3,1
-	sltu	t8,t0,ta0
-	dsubu	t0,ta0
-	dsubu	ta0,t0,v0
-	MOVNZ	(t0,v0,t8)
-	sd	ta0,0(a0)
-	beqz	a3,.L_bn_sub_words_return
-
-	ld	t1,8(a1)
-	subu	a3,1
-	ld	ta1,8(a2)
-	sltu	t9,t1,ta1
-	dsubu	t1,ta1
-	dsubu	ta1,t1,v0
-	MOVNZ	(t1,v0,t9)
-	sd	ta1,8(a0)
-	beqz	a3,.L_bn_sub_words_return
-
-	ld	t2,16(a1)
-	ld	ta2,16(a2)
-	sltu	t8,t2,ta2
-	dsubu	t2,ta2
-	dsubu	ta2,t2,v0
-	MOVNZ	(t2,v0,t8)
-	sd	ta2,16(a0)
-	jr	ra
-END(bn_sub_words)
-
-#undef	MINUS4
-
-.align 5
-LEAF(bn_div_3_words)
-	.set	reorder
-	move	a3,a0		/* we know that bn_div_words doesn't
-				 * touch a3, ta2, ta3 and preserves a2
-				 * so that we can save two arguments
-				 * and return address in registers
-				 * instead of stack:-)
-				 */
-	ld	a0,(a3)
-	move	ta2,a1
-	ld	a1,-8(a3)
-	bne	a0,a2,.L_bn_div_3_words_proceed
-	li	v0,-1
-	jr	ra
-.L_bn_div_3_words_proceed:
-	move	ta3,ra
-	bal	bn_div_words
-	move	ra,ta3
-	dmultu	ta2,v0
-	ld	t2,-16(a3)
-	move	ta0,zero
-	mfhi	t1
-	mflo	t0
-	sltu	t8,t1,v1
-.L_bn_div_3_words_inner_loop:
-	bnez	t8,.L_bn_div_3_words_inner_loop_done
-	sgeu	AT,t2,t0
-	seq	t9,t1,v1
-	and	AT,t9
-	sltu	t3,t0,ta2
-	daddu	v1,a2
-	dsubu	t1,t3
-	dsubu	t0,ta2
-	sltu	t8,t1,v1
-	sltu	ta0,v1,a2
-	or	t8,ta0
-	.set	noreorder
-	beqzl	AT,.L_bn_div_3_words_inner_loop
-	dsubu	v0,1
-	.set	reorder
-.L_bn_div_3_words_inner_loop_done:
-	jr	ra
-END(bn_div_3_words)
-
-.align	5
-LEAF(bn_div_words)
-	.set	noreorder
-	bnezl	a2,.L_bn_div_words_proceed
-	move	v1,zero
-	jr	ra
-	li	v0,-1		/* I'd rather signal div-by-zero
-				 * which can be done with 'break 7' */
-
-.L_bn_div_words_proceed:
-	bltz	a2,.L_bn_div_words_body
-	move	t9,v1
-	dsll	a2,1
-	bgtz	a2,.-4
-	addu	t9,1
-
-	.set	reorder
-	negu	t1,t9
-	li	t2,-1
-	dsll	t2,t1
-	and	t2,a0
-	dsrl	AT,a1,t1
-	.set	noreorder
-	bnezl	t2,.+8
-	break	6		/* signal overflow */
-	.set	reorder
-	dsll	a0,t9
-	dsll	a1,t9
-	or	a0,AT
-
-#define	QT	ta0
-#define	HH	ta1
-#define	DH	v1
-.L_bn_div_words_body:
-	dsrl	DH,a2,32
-	sgeu	AT,a0,a2
-	.set	noreorder
-	bnezl	AT,.+8
-	dsubu	a0,a2
-	.set	reorder
-
-	li	QT,-1
-	dsrl	HH,a0,32
-	dsrl	QT,32	/* q=0xffffffff */
-	beq	DH,HH,.L_bn_div_words_skip_div1
-	ddivu	zero,a0,DH
-	mflo	QT
-.L_bn_div_words_skip_div1:
-	dmultu	a2,QT
-	dsll	t3,a0,32
-	dsrl	AT,a1,32
-	or	t3,AT
-	mflo	t0
-	mfhi	t1
-.L_bn_div_words_inner_loop1:
-	sltu	t2,t3,t0
-	seq	t8,HH,t1
-	sltu	AT,HH,t1
-	and	t2,t8
-	sltu	v0,t0,a2
-	or	AT,t2
-	.set	noreorder
-	beqz	AT,.L_bn_div_words_inner_loop1_done
-	dsubu	t1,v0
-	dsubu	t0,a2
-	b	.L_bn_div_words_inner_loop1
-	dsubu	QT,1
-	.set	reorder
-.L_bn_div_words_inner_loop1_done:
-
-	dsll	a1,32
-	dsubu	a0,t3,t0
-	dsll	v0,QT,32
-
-	li	QT,-1
-	dsrl	HH,a0,32
-	dsrl	QT,32	/* q=0xffffffff */
-	beq	DH,HH,.L_bn_div_words_skip_div2
-	ddivu	zero,a0,DH
-	mflo	QT
-.L_bn_div_words_skip_div2:
-#undef	DH
-	dmultu	a2,QT
-	dsll	t3,a0,32
-	dsrl	AT,a1,32
-	or	t3,AT
-	mflo	t0
-	mfhi	t1
-.L_bn_div_words_inner_loop2:
-	sltu	t2,t3,t0
-	seq	t8,HH,t1
-	sltu	AT,HH,t1
-	and	t2,t8
-	sltu	v1,t0,a2
-	or	AT,t2
-	.set	noreorder
-	beqz	AT,.L_bn_div_words_inner_loop2_done
-	dsubu	t1,v1
-	dsubu	t0,a2
-	b	.L_bn_div_words_inner_loop2
-	dsubu	QT,1
-	.set	reorder
-.L_bn_div_words_inner_loop2_done:	
-#undef	HH
-
-	dsubu	a0,t3,t0
-	or	v0,QT
-	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
-	dsrl	a2,t9		/* restore a2 */
-	jr	ra
-#undef	QT
-END(bn_div_words)
-
-#define	a_0	t0
-#define	a_1	t1
-#define	a_2	t2
-#define	a_3	t3
-#define	b_0	ta0
-#define	b_1	ta1
-#define	b_2	ta2
-#define	b_3	ta3
-
-#define	a_4	s0
-#define	a_5	s2
-#define	a_6	s4
-#define	a_7	a1	/* once we load a[7] we don't need a anymore */
-#define	b_4	s1
-#define	b_5	s3
-#define	b_6	s5
-#define	b_7	a2	/* once we load b[7] we don't need b anymore */
-
-#define	t_1	t8
-#define	t_2	t9
-
-#define	c_1	v0
-#define	c_2	v1
-#define	c_3	a3
-
-#define	FRAME_SIZE	48
-
-.align	5
-LEAF(bn_mul_comba8)
-	.set	noreorder
-	PTR_SUB	sp,FRAME_SIZE
-	.frame	sp,64,ra
-	.set	reorder
-	ld	a_0,0(a1)	/* If compiled with -mips3 option on
-				 * R5000 box assembler barks on this
-				 * line with "shouldn't have mult/div
-				 * as last instruction in bb (R10K
-				 * bug)" warning. If anybody out there
-				 * has a clue about how to circumvent
-				 * this do send me a note.
-				 *		<appro@fy.chalmers.se>
-				 */
-	ld	b_0,0(a2)
-	ld	a_1,8(a1)
-	ld	a_2,16(a1)
-	ld	a_3,24(a1)
-	ld	b_1,8(a2)
-	ld	b_2,16(a2)
-	ld	b_3,24(a2)
-	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
-	sd	s0,0(sp)
-	sd	s1,8(sp)
-	sd	s2,16(sp)
-	sd	s3,24(sp)
-	sd	s4,32(sp)
-	sd	s5,40(sp)
-	mflo	c_1
-	mfhi	c_2
-
-	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
-	ld	a_4,32(a1)
-	ld	a_5,40(a1)
-	ld	a_6,48(a1)
-	ld	a_7,56(a1)
-	ld	b_4,32(a2)
-	ld	b_5,40(a2)
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	c_3,t_2,AT
-	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
-	ld	b_6,48(a2)
-	ld	b_7,56(a2)
-	sd	c_1,0(a0)	/* r[0]=c1; */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	c_1,c_3,t_2
-	sd	c_2,8(a0)	/* r[1]=c2; */
-
-	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	c_2,c_1,t_2
-	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,16(a0)	/* r[2]=c3; */
-
-	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	c_3,c_2,t_2
-	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,24(a0)	/* r[3]=c1; */
-
-	dmultu	a_4,b_0		/* mul_add_c(a[4],b[0],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	c_1,c_3,t_2
-	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_0,b_4		/* mul_add_c(a[0],b[4],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,32(a0)	/* r[4]=c2; */
-
-	dmultu	a_0,b_5		/* mul_add_c(a[0],b[5],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	c_2,c_1,t_2
-	dmultu	a_1,b_4		/* mul_add_c(a[1],b[4],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_4,b_1		/* mul_add_c(a[4],b[1],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_5,b_0		/* mul_add_c(a[5],b[0],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,40(a0)	/* r[5]=c3; */
-
-	dmultu	a_6,b_0		/* mul_add_c(a[6],b[0],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	c_3,c_2,t_2
-	dmultu	a_5,b_1		/* mul_add_c(a[5],b[1],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_4,b_2		/* mul_add_c(a[4],b[2],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_2,b_4		/* mul_add_c(a[2],b[4],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_1,b_5		/* mul_add_c(a[1],b[5],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_0,b_6		/* mul_add_c(a[0],b[6],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,48(a0)	/* r[6]=c1; */
-
-	dmultu	a_0,b_7		/* mul_add_c(a[0],b[7],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	c_1,c_3,t_2
-	dmultu	a_1,b_6		/* mul_add_c(a[1],b[6],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_2,b_5		/* mul_add_c(a[2],b[5],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_3,b_4		/* mul_add_c(a[3],b[4],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_4,b_3		/* mul_add_c(a[4],b[3],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_5,b_2		/* mul_add_c(a[5],b[2],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_6,b_1		/* mul_add_c(a[6],b[1],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_7,b_0		/* mul_add_c(a[7],b[0],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,56(a0)	/* r[7]=c2; */
-
-	dmultu	a_7,b_1		/* mul_add_c(a[7],b[1],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	c_2,c_1,t_2
-	dmultu	a_6,b_2		/* mul_add_c(a[6],b[2],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_5,b_3		/* mul_add_c(a[5],b[3],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_4,b_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_3,b_5		/* mul_add_c(a[3],b[5],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_2,b_6		/* mul_add_c(a[2],b[6],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_1,b_7		/* mul_add_c(a[1],b[7],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,64(a0)	/* r[8]=c3; */
-
-	dmultu	a_2,b_7		/* mul_add_c(a[2],b[7],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	c_3,c_2,t_2
-	dmultu	a_3,b_6		/* mul_add_c(a[3],b[6],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_4,b_5		/* mul_add_c(a[4],b[5],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_5,b_4		/* mul_add_c(a[5],b[4],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_6,b_3		/* mul_add_c(a[6],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_7,b_2		/* mul_add_c(a[7],b[2],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,72(a0)	/* r[9]=c1; */
-
-	dmultu	a_7,b_3		/* mul_add_c(a[7],b[3],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	c_1,c_3,t_2
-	dmultu	a_6,b_4		/* mul_add_c(a[6],b[4],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_5,b_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_4,b_6		/* mul_add_c(a[4],b[6],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_3,b_7		/* mul_add_c(a[3],b[7],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,80(a0)	/* r[10]=c2; */
-
-	dmultu	a_4,b_7		/* mul_add_c(a[4],b[7],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	c_2,c_1,t_2
-	dmultu	a_5,b_6		/* mul_add_c(a[5],b[6],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_6,b_5		/* mul_add_c(a[6],b[5],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_7,b_4		/* mul_add_c(a[7],b[4],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,88(a0)	/* r[11]=c3; */
-
-	dmultu	a_7,b_5		/* mul_add_c(a[7],b[5],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	c_3,c_2,t_2
-	dmultu	a_6,b_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_5,b_7		/* mul_add_c(a[5],b[7],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,96(a0)	/* r[12]=c1; */
-
-	dmultu	a_6,b_7		/* mul_add_c(a[6],b[7],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	c_1,c_3,t_2
-	dmultu	a_7,b_6		/* mul_add_c(a[7],b[6],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,104(a0)	/* r[13]=c2; */
-
-	dmultu	a_7,b_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
-	ld	s0,0(sp)
-	ld	s1,8(sp)
-	ld	s2,16(sp)
-	ld	s3,24(sp)
-	ld	s4,32(sp)
-	ld	s5,40(sp)
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sd	c_3,112(a0)	/* r[14]=c3; */
-	sd	c_1,120(a0)	/* r[15]=c1; */
-
-	PTR_ADD	sp,FRAME_SIZE
-
-	jr	ra
-END(bn_mul_comba8)
-
-.align	5
-LEAF(bn_mul_comba4)
-	.set	reorder
-	ld	a_0,0(a1)
-	ld	b_0,0(a2)
-	ld	a_1,8(a1)
-	ld	a_2,16(a1)
-	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
-	ld	a_3,24(a1)
-	ld	b_1,8(a2)
-	ld	b_2,16(a2)
-	ld	b_3,24(a2)
-	mflo	c_1
-	mfhi	c_2
-	sd	c_1,0(a0)
-
-	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	c_3,t_2,AT
-	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	c_1,c_3,t_2
-	sd	c_2,8(a0)
-
-	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	c_2,c_1,t_2
-	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,16(a0)
-
-	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	c_3,c_2,t_2
-	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,24(a0)
-
-	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	c_1,c_3,t_2
-	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,32(a0)
-
-	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	c_2,c_1,t_2
-	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,40(a0)
-
-	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sd	c_1,48(a0)
-	sd	c_2,56(a0)
-
-	jr	ra
-END(bn_mul_comba4)
-
-#undef	a_4
-#undef	a_5
-#undef	a_6
-#undef	a_7
-#define	a_4	b_0
-#define	a_5	b_1
-#define	a_6	b_2
-#define	a_7	b_3
-
-.align	5
-LEAF(bn_sqr_comba8)
-	.set	reorder
-	ld	a_0,0(a1)
-	ld	a_1,8(a1)
-	ld	a_2,16(a1)
-	ld	a_3,24(a1)
-
-	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
-	ld	a_4,32(a1)
-	ld	a_5,40(a1)
-	ld	a_6,48(a1)
-	ld	a_7,56(a1)
-	mflo	c_1
-	mfhi	c_2
-	sd	c_1,0(a0)
-
-	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_1,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	c_3,t_2,AT
-	sd	c_2,8(a0)
-
-	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_2,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,16(a0)
-
-	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_3,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_1,a_2		/* mul_add_c2(a[1],b[2],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_3,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,24(a0)
-
-	dmultu	a_4,a_0		/* mul_add_c2(a[4],b[0],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_1,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_1,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,32(a0)
-
-	dmultu	a_0,a_5		/* mul_add_c2(a[0],b[5],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_2,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_1,a_4		/* mul_add_c2(a[1],b[4],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_2,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_2,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,40(a0)
-
-	dmultu	a_6,a_0		/* mul_add_c2(a[6],b[0],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_3,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_5,a_1		/* mul_add_c2(a[5],b[1],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_3,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_4,a_2		/* mul_add_c2(a[4],b[2],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_3,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,48(a0)
-
-	dmultu	a_0,a_7		/* mul_add_c2(a[0],b[7],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_1,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_1,a_6		/* mul_add_c2(a[1],b[6],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_1,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_2,a_5		/* mul_add_c2(a[2],b[5],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_1,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_3,a_4		/* mul_add_c2(a[3],b[4],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_1,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,56(a0)
-
-	dmultu	a_7,a_1		/* mul_add_c2(a[7],b[1],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_2,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_6,a_2		/* mul_add_c2(a[6],b[2],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_2,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_5,a_3		/* mul_add_c2(a[5],b[3],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_2,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_4,a_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,64(a0)
-
-	dmultu	a_2,a_7		/* mul_add_c2(a[2],b[7],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_3,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_3,a_6		/* mul_add_c2(a[3],b[6],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_3,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_4,a_5		/* mul_add_c2(a[4],b[5],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_3,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,72(a0)
-
-	dmultu	a_7,a_3		/* mul_add_c2(a[7],b[3],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_1,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_6,a_4		/* mul_add_c2(a[6],b[4],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_1,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_5,a_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,80(a0)
-
-	dmultu	a_4,a_7		/* mul_add_c2(a[4],b[7],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_2,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_5,a_6		/* mul_add_c2(a[5],b[6],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_2,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,88(a0)
-
-	dmultu	a_7,a_5		/* mul_add_c2(a[7],b[5],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_3,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_6,a_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,96(a0)
-
-	dmultu	a_6,a_7		/* mul_add_c2(a[6],b[7],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_1,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,104(a0)
-
-	dmultu	a_7,a_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sd	c_3,112(a0)
-	sd	c_1,120(a0)
-
-	jr	ra
-END(bn_sqr_comba8)
-
-.align	5
-LEAF(bn_sqr_comba4)
-	.set	reorder
-	ld	a_0,0(a1)
-	ld	a_1,8(a1)
-	ld	a_2,16(a1)
-	ld	a_3,24(a1)
-	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
-	mflo	c_1
-	mfhi	c_2
-	sd	c_1,0(a0)
-
-	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_1,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	c_3,t_2,AT
-	sd	c_2,8(a0)
-
-	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_2,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,16(a0)
-
-	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_3,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	dmultu	a_1,a_2		/* mul_add_c(a2[1],b[2],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	slt	AT,t_2,zero
-	daddu	c_3,AT
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sltu	AT,c_2,t_2
-	daddu	c_3,AT
-	sd	c_1,24(a0)
-
-	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_1,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_2,t_1
-	sltu	AT,c_2,t_1
-	daddu	t_2,AT
-	daddu	c_3,t_2
-	sltu	AT,c_3,t_2
-	daddu	c_1,AT
-	sd	c_2,32(a0)
-
-	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
-	mflo	t_1
-	mfhi	t_2
-	slt	c_2,t_2,zero
-	dsll	t_2,1
-	slt	a2,t_1,zero
-	daddu	t_2,a2
-	dsll	t_1,1
-	daddu	c_3,t_1
-	sltu	AT,c_3,t_1
-	daddu	t_2,AT
-	daddu	c_1,t_2
-	sltu	AT,c_1,t_2
-	daddu	c_2,AT
-	sd	c_3,40(a0)
-
-	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
-	mflo	t_1
-	mfhi	t_2
-	daddu	c_1,t_1
-	sltu	AT,c_1,t_1
-	daddu	t_2,AT
-	daddu	c_2,t_2
-	sd	c_1,48(a0)
-	sd	c_2,56(a0)
-
-	jr	ra
-END(bn_sqr_comba4)
diff --git a/deps/openssl/openssl/crypto/bn/asm/pa-risc2.s b/deps/openssl/openssl/crypto/bn/asm/pa-risc2.s
index f3b16290eb..413eac7123 100644
--- a/deps/openssl/openssl/crypto/bn/asm/pa-risc2.s
+++ b/deps/openssl/openssl/crypto/bn/asm/pa-risc2.s
@@ -1,3 +1,9 @@
+; Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved.
+;
+; Licensed under the OpenSSL license (the "License").  You may not use
+; this file except in compliance with the License.  You can obtain a copy
+; in the file LICENSE in the source distribution or at
+; https://www.openssl.org/source/license.html
 ;
 ; PA-RISC 2.0 implementation of bn_asm code, based on the
 ; 64-bit version of the code.  This code is effectively the
diff --git a/deps/openssl/openssl/crypto/bn/asm/pa-risc2W.s b/deps/openssl/openssl/crypto/bn/asm/pa-risc2W.s
index a99545754d..97381172e7 100644
--- a/deps/openssl/openssl/crypto/bn/asm/pa-risc2W.s
+++ b/deps/openssl/openssl/crypto/bn/asm/pa-risc2W.s
@@ -1,3 +1,10 @@
+; Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+;
+; Licensed under the OpenSSL license (the "License").  You may not use
+; this file except in compliance with the License.  You can obtain a copy
+; in the file LICENSE in the source distribution or at
+; https://www.openssl.org/source/license.html
+
 ;
 ; PA-RISC 64-bit implementation of bn_asm code
 ;
diff --git a/deps/openssl/openssl/crypto/bn/asm/parisc-mont.pl b/deps/openssl/openssl/crypto/bn/asm/parisc-mont.pl
index c02ef6f014..8aa94e8511 100644
--- a/deps/openssl/openssl/crypto/bn/asm/parisc-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/parisc-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -126,7 +133,7 @@ $fp="%r3";
 $hi1="%r2";
 $hi0="%r1";
 
-$xfer=$n0;	# accomodates [-16..15] offset in fld[dw]s
+$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s
 
 $fm0="%fr4";	$fti=$fm0;
 $fbi="%fr5L";
diff --git a/deps/openssl/openssl/crypto/bn/asm/ppc-mont.pl b/deps/openssl/openssl/crypto/bn/asm/ppc-mont.pl
index 6930a3aceb..5802260ca6 100644
--- a/deps/openssl/openssl/crypto/bn/asm/ppc-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/ppc-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
diff --git a/deps/openssl/openssl/crypto/bn/asm/ppc.pl b/deps/openssl/openssl/crypto/bn/asm/ppc.pl
index 446d8ba949..4ea534a1c7 100644
--- a/deps/openssl/openssl/crypto/bn/asm/ppc.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/ppc.pl
@@ -1,5 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
 #
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 # Implemented as a Perl wrapper as we want to support several different
 # architectures with single file. We pick up the target based on the
 # file name we are asked to generate.
@@ -419,7 +425,7 @@ $data=<<EOF;
 # r9,r10, r11 are the equivalents of c1,c2, c3.
 #
 # Possible optimization of loading all 8 longs of a into registers
-# doesnt provide any speedup
+# doesn't provide any speedup
 # 
 
 	xor		r0,r0,r0		#set r0 = 0.Used in addze
@@ -1009,7 +1015,7 @@ $data=<<EOF;
 	$UMULL	r8,r6,r7
 	$UMULH	r9,r6,r7
 	addc	r11,r11,r8
-	addze	r12,r9			# since we didnt set r12 to zero before.
+	addze	r12,r9			# since we didn't set r12 to zero before.
 	addze	r10,r0
 					#mul_add_c(a[1],b[0],c2,c3,c1);
 	$LD	r6,`1*$BNSZ`(r4)
diff --git a/deps/openssl/openssl/crypto/bn/asm/ppc64-mont.pl b/deps/openssl/openssl/crypto/bn/asm/ppc64-mont.pl
index 595fc6d31f..1e19c958a1 100644
--- a/deps/openssl/openssl/crypto/bn/asm/ppc64-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/ppc64-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
diff --git a/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl b/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl
index 2b3f8b0e21..46d746b7d0 100755
--- a/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 ##############################################################################
 #                                                                            #
@@ -103,7 +110,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-
 	$addx = ($ver>=3.03);
 }
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT = *OUT;
 
 if ($avx>1) {{{
diff --git a/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl b/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl
index 87ce2c34d9..6f3b664f7a 100755
--- a/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 ##############################################################################
 #                                                                            #
@@ -95,7 +102,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
@@ -1767,7 +1774,7 @@ ___
 {	# __rsaz_512_mul
 	#
 	# input: %rsi - ap, %rbp - bp
-	# ouput:
+	# output:
 	# clobbers: everything
 my ($ap,$bp) = ("%rsi","%rbp");
 $code.=<<___;
@@ -1919,7 +1926,7 @@ if ($addx) {
 	# __rsaz_512_mulx
 	#
 	# input: %rsi - ap, %rbp - bp
-	# ouput:
+	# output:
 	# clobbers: everything
 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
 $code.=<<___;
diff --git a/deps/openssl/openssl/crypto/bn/asm/s390x-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/s390x-gf2m.pl
index 9d18d40e77..cbd16f4214 100644
--- a/deps/openssl/openssl/crypto/bn/asm/s390x-gf2m.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/s390x-gf2m.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -35,7 +42,7 @@ if ($flavour =~ /3[12]/) {
         $g="g";
 }
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $stdframe=16*$SIZE_T+4*8;
diff --git a/deps/openssl/openssl/crypto/bn/asm/s390x-mont.pl b/deps/openssl/openssl/crypto/bn/asm/s390x-mont.pl
index 9fd64e81ee..2205bc2ca0 100644
--- a/deps/openssl/openssl/crypto/bn/asm/s390x-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/s390x-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -54,7 +61,7 @@ if ($flavour =~ /3[12]/) {
 	$g="g";
 }
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $stdframe=16*$SIZE_T+4*8;
diff --git a/deps/openssl/openssl/crypto/bn/asm/s390x.S b/deps/openssl/openssl/crypto/bn/asm/s390x.S
index f5eebe413a..292a7a9998 100755..100644
--- a/deps/openssl/openssl/crypto/bn/asm/s390x.S
+++ b/deps/openssl/openssl/crypto/bn/asm/s390x.S
@@ -1,11 +1,11 @@
 .ident "s390x.S, version 1.1"
 // ====================================================================
-// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-// project.
+// Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
 //
-// Rights for redistribution and usage in source and binary forms are
-// granted according to the OpenSSL license. Warranty of any kind is
-// disclaimed.
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
 // ====================================================================
 
 .text
diff --git a/deps/openssl/openssl/crypto/bn/asm/sparct4-mont.pl b/deps/openssl/openssl/crypto/bn/asm/sparct4-mont.pl
index 71b45002a4..4faf66f10a 100755
--- a/deps/openssl/openssl/crypto/bn/asm/sparct4-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/sparct4-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
@@ -76,6 +83,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "sparcv9_modes.pl";
 
+$output = pop;
+open STDOUT,">$output";
+
 $code.=<<___;
 #include "sparc_arch.h"
 
diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv8.S b/deps/openssl/openssl/crypto/bn/asm/sparcv8.S
index 88c5dc480a..9c31073b24 100644
--- a/deps/openssl/openssl/crypto/bn/asm/sparcv8.S
+++ b/deps/openssl/openssl/crypto/bn/asm/sparcv8.S
@@ -3,12 +3,12 @@
 
 /*
  * ====================================================================
- * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
- * project.
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
- * Rights for redistribution and usage in source and binary forms are
- * granted according to the OpenSSL license. Warranty of any kind is
- * disclaimed.
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  * ====================================================================
  */
 
diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv8plus.S b/deps/openssl/openssl/crypto/bn/asm/sparcv8plus.S
index 63de1860f2..714a136675 100644
--- a/deps/openssl/openssl/crypto/bn/asm/sparcv8plus.S
+++ b/deps/openssl/openssl/crypto/bn/asm/sparcv8plus.S
@@ -3,12 +3,12 @@
 
 /*
  * ====================================================================
- * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
- * project.
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
  *
- * Rights for redistribution and usage in source and binary forms are
- * granted according to the OpenSSL license. Warranty of any kind is
- * disclaimed.
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
  * ====================================================================
  */
 
@@ -52,7 +52,7 @@
  *	# cd ../..
  *	# make; make test
  *
- * Q. V8plus achitecture? What kind of beast is that?
+ * Q. V8plus architecture? What kind of beast is that?
  * A. Well, it's rather a programming model than an architecture...
  *    It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
  *    special conditions, namely when kernel doesn't preserve upper
@@ -71,7 +71,7 @@
  *
  * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
  *    doesn't work?
- * A. You can't adress *all* registers as 64-bit wide:-( The catch is
+ * A. You can't address *all* registers as 64-bit wide:-( The catch is
  *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
  *    preserved if you're in a leaf function, i.e. such never calling
  *    any other functions. All functions in this module are leaf and
@@ -144,6 +144,10 @@
  *	    }
  */
 
+#ifdef OPENSSL_FIPSCANISTER
+#include <openssl/fipssyms.h>
+#endif
+
 #if defined(__SUNPRO_C) && defined(__sparcv9)
   /* They've said -xarch=v9 at command line */
   .register	%g2,#scratch
diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv9-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/sparcv9-gf2m.pl
index ab94cd917c..dcf11a87a1 100644
--- a/deps/openssl/openssl/crypto/bn/asm/sparcv9-gf2m.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/sparcv9-gf2m.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -18,6 +25,9 @@
 # ~100-230% faster than gcc-generated code and ~35-90% faster than
 # the pure SPARCv9 code path.
 
+$output = pop;
+open STDOUT,">$output";
+
 $locals=16*8;
 
 $tab="%l0";
diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv9-mont.pl b/deps/openssl/openssl/crypto/bn/asm/sparcv9-mont.pl
index d866287800..6807c8b6e0 100644
--- a/deps/openssl/openssl/crypto/bn/asm/sparcv9-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/sparcv9-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -13,7 +20,7 @@
 # for undertaken effort are multiple. First of all, UltraSPARC is not
 # the whole SPARCv9 universe and other VIS-free implementations deserve
 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
-# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
 # several integrated RSA/DSA accelerator circuits accessible through
 # kernel driver [only(*)], but having decent user-land software
@@ -23,7 +30,7 @@
 # instructions...
 
 # (*)	Engine accessing the driver in question is on my TODO list.
-#	For reference, acceleator is estimated to give 6 to 10 times
+#	For reference, accelerator is estimated to give 6 to 10 times
 #	improvement on single-threaded RSA sign. It should be noted
 #	that 6-10x improvement coefficient does not actually mean
 #	something extraordinary in terms of absolute [single-threaded]
@@ -42,6 +49,9 @@
 # module still have hidden potential [see TODO list there], which is
 # estimated to be larger than 20%...
 
+$output = pop;
+open STDOUT,">$output";
+
 # int bn_mul_mont(
 $rp="%i0";	# BN_ULONG *rp,
 $ap="%i1";	# const BN_ULONG *ap,
@@ -50,10 +60,8 @@ $np="%i3";	# const BN_ULONG *np,
 $n0="%i4";	# const BN_ULONG *n0,
 $num="%i5";	# int num);
 
-$bits=32;
-for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64)	{ $bias=2047; $frame=192; }
-else		{ $bias=0;    $frame=128; }
+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
 
 $car0="%o0";
 $car1="%o1";
@@ -76,6 +84,8 @@ $tpj="%l7";
 $fname="bn_mul_mont_int";
 
 $code=<<___;
+#include "sparc_arch.h"
+
 .section	".text",#alloc,#execinstr
 
 .global	$fname
@@ -105,7 +115,7 @@ $fname:
 	ld	[$np],$car1		! np[0]
 	sub	%o7,$bias,%sp		! alloca
 	ld	[$np+4],$npj		! np[1]
-	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+	be,pt	SIZE_T_CC,.Lbn_sqr_mont
 	mov	12,$j
 
 	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
diff --git a/deps/openssl/openssl/crypto/bn/asm/sparcv9a-mont.pl b/deps/openssl/openssl/crypto/bn/asm/sparcv9a-mont.pl
index a14205f2f0..50b690653f 100755
--- a/deps/openssl/openssl/crypto/bn/asm/sparcv9a-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/sparcv9a-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -51,21 +58,17 @@
 #
 # Modulo-scheduled inner loops allow to interleave floating point and
 # integer instructions and minimize Read-After-Write penalties. This
-# results in *further* 20-50% perfromance improvement [depending on
+# results in *further* 20-50% performance improvement [depending on
 # key length, more for longer keys] on USI&II cores and 30-80% - on
 # USIII&IV.
 
+$output = pop;	# last command-line argument names the output file
+open STDOUT,">$output" or die "can't open $output: $!";	# fail loudly instead of emitting nothing
+
 $fname="bn_mul_mont_fpu";
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-
-if ($bits==64) {
-	$bias=2047;
-	$frame=192;
-} else {
-	$bias=0;
-	$frame=128;	# 96 rounded up to largest known cache-line
-}
+
+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
 $locals=64;
 
 # In order to provide for 32-/64-bit ABI duality, I keep integers wider
@@ -121,6 +124,8 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
 $ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load
 
 $code=<<___;
+#include "sparc_arch.h"
+
 .section	".text",#alloc,#execinstr
 
 .global $fname
@@ -867,7 +872,7 @@ ___
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 
 # Below substitution makes it possible to compile without demanding
-# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
+# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
 # dare to do this, because VIS capability is detected at run-time now
 # and this routine is not called on CPU not capable to execute it. Do
 # note that fzeros is not the only VIS dependency! Another dependency
diff --git a/deps/openssl/openssl/crypto/bn/asm/via-mont.pl b/deps/openssl/openssl/crypto/bn/asm/via-mont.pl
index c046a514c8..9f81bc822e 100644
--- a/deps/openssl/openssl/crypto/bn/asm/via-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/via-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -81,6 +88,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output = pop;	# last command-line argument names the output file
+open STDOUT,">$output" or die "can't open $output: $!";	# fail loudly instead of emitting nothing
+
 &asm_init($ARGV[0],"via-mont.pl");
 
 # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
@@ -240,3 +250,5 @@ $sp=&DWP(28,"esp");
 &asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
 
 &asm_finish();
+
+close STDOUT or die "error closing STDOUT: $!";	# surface flush/write errors so a truncated file fails the build
diff --git a/deps/openssl/openssl/crypto/bn/asm/vis3-mont.pl b/deps/openssl/openssl/crypto/bn/asm/vis3-mont.pl
index 263ac02b6f..64dba4480f 100644
--- a/deps/openssl/openssl/crypto/bn/asm/vis3-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/vis3-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -18,16 +25,20 @@
 # for reference purposes, because T4 has dedicated Montgomery
 # multiplication and squaring *instructions* that deliver even more.
 
-$bits=32;
-for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64)  { $bias=2047; $frame=192; }
-else            { $bias=0;    $frame=112; }
+$output = pop;	# last command-line argument names the output file
+open STDOUT,">$output" or die "can't open $output: $!";	# fail loudly instead of emitting nothing
+
+$frame = "STACK_FRAME";
+$bias = "STACK_BIAS";
+
+$code.=<<___;
+#include "sparc_arch.h"
 
-$code.=<<___ if ($bits==64);
+#ifdef	__arch64__
 .register	%g2,#scratch
 .register	%g3,#scratch
-___
-$code.=<<___;
+#endif
+
 .section	".text",#alloc,#execinstr
 ___
 
@@ -333,7 +344,7 @@ ___
 
 # Purpose of these subroutines is to explicitly encode VIS instructions,
 # so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 # Idea is to reserve for option to produce "universal" binary and let
 # programmer detect if current CPU is VIS capable at run-time.
 sub unvis3 {
diff --git a/deps/openssl/openssl/crypto/bn/asm/vms.mar b/deps/openssl/openssl/crypto/bn/asm/vms.mar
deleted file mode 100644
index aefab15cdb..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/vms.mar
+++ /dev/null
@@ -1,6440 +0,0 @@
-	.title	vax_bn_mul_add_words  unsigned multiply & add, 32*32+32+32=>64
-;
-; w.j.m. 15-jan-1999
-;
-; it's magic ...
-;
-; ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) {
-;	ULONG c = 0;
-;	int i;
-;	for(i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w ;
-;	return c;
-; }
-
-r=4 ;(AP)
-a=8 ;(AP)
-n=12 ;(AP)	n	by value (input)
-w=16 ;(AP)	w	by value (input)
-
-
-	.psect	code,nowrt
-
-.entry	bn_mul_add_words,^m<r2,r3,r4,r5,r6>
-
-	moval	@r(ap),r2
-	moval	@a(ap),r3
-	movl	n(ap),r4	; assumed >0 by C code
-	movl	w(ap),r5
-	clrl	r6		; c
-
-0$:
-	emul	r5,(r3),(r2),r0		; w, a[], r[] considered signed
-
-	; fixup for "negative" r[]
-	tstl	(r2)
-	bgeq	10$
-	incl	r1
-10$:
-
-	; add in c
-	addl2	r6,r0
-	adwc	#0,r1
-
-	; combined fixup for "negative" w, a[]
-	tstl	r5
-	bgeq	20$
-	addl2	(r3),r1
-20$:
-	tstl	(r3)
-	bgeq	30$
-	addl2	r5,r1
-30$:
-
-	movl	r0,(r2)+		; store lo result in r[] & advance
-	addl	#4,r3			; advance a[]
-	movl	r1,r6			; store hi result => c
-
-	sobgtr	r4,0$
-
-	movl	r6,r0			; return c
-	ret
-
-	.title	vax_bn_mul_words  unsigned multiply & add, 32*32+32=>64
-;
-; w.j.m. 15-jan-1999
-;
-; it's magic ...
-;
-; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) {
-;	ULONG c = 0;
-;	int i;
-;	for(i = 0; i < num; i++) <c,r[i]> := a[i] * w + c ;
-;	return(c);
-; }
-
-r=4 ;(AP)
-a=8 ;(AP)
-n=12 ;(AP)	n	by value (input)
-w=16 ;(AP)	w	by value (input)
-
-
-	.psect	code,nowrt
-
-.entry	bn_mul_words,^m<r2,r3,r4,r5,r6>
-
-	moval	@r(ap),r2	; r2 -> r[]
-	moval	@a(ap),r3	; r3 -> a[]
-	movl	n(ap),r4	; r4 = loop count (assumed >0 by C code)
-	movl	w(ap),r5	; r5 = w
-	clrl	r6		; r6 = c
-
-0$:
-	; <r1,r0> := w * a[] + c
-	emul	r5,(r3),r6,r0		; w, a[], c considered signed
-
-	; fixup for "negative" c
-	tstl	r6			; c
-	bgeq	10$
-	incl	r1
-10$:
-
-	; combined fixup for "negative" w, a[]
-	tstl	r5			; w
-	bgeq	20$
-	addl2	(r3),r1			; a[]
-20$:
-	tstl	(r3)			; a[]
-	bgeq	30$
-	addl2	r5,r1			; w
-30$:
-
-	movl	r0,(r2)+		; store lo result in r[] & advance
-	addl	#4,r3			; advance a[]
-	movl	r1,r6			; store hi result => c
-
-	sobgtr	r4,0$
-
-	movl	r6,r0			; return c
-	ret
-
-	.title	vax_bn_sqr_words  unsigned square, 32*32=>64
-;
-; w.j.m. 15-jan-1999
-;
-; it's magic ...
-;
-; void bn_sqr_words(ULONG r[],ULONG a[],int n) {
-;	int i;
-;	for(i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i] ;
-; }
-
-r=4 ;(AP)
-a=8 ;(AP)
-n=12 ;(AP)	n	by value (input)
-
-
-	.psect	code,nowrt
-
-.entry	bn_sqr_words,^m<r2,r3,r4,r5>
-
-	moval	@r(ap),r2	; r2 -> r[]
-	moval	@a(ap),r3	; r3 -> a[]
-	movl	n(ap),r4	; r4 = n (assumed >0 by C code)
-
-0$:
-	movl	(r3)+,r5		; r5 = a[] & advance
-
-	; <r1,r0> := a[] * a[]
-	emul	r5,r5,#0,r0		; a[] considered signed
-
-	; fixup for "negative" a[]
-	tstl	r5			; a[]
-	bgeq	30$
-	addl2	r5,r1			; a[]
-	addl2	r5,r1			; a[]
-30$:
-
-	movl	r0,(r2)+		; store lo result in r[] & advance
-	movl	r1,(r2)+		; store hi result in r[] & advance
-
-	sobgtr	r4,0$
-
-	movl	#1,r0			; return SS$_NORMAL
-	ret
-
-	.title	vax_bn_div_words  unsigned divide
-;
-; Richard Levitte 20-Nov-2000
-;
-; ULONG bn_div_words(ULONG h, ULONG l, ULONG d)
-; {
-;	return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d);
-; }
-;
-; Using EDIV would be very easy, if it didn't do signed calculations.
-; Any time any of the input numbers are signed, there are problems,
-; usually with integer overflow, at which point it returns useless
-; data (the quotient gets the value of l, and the remainder becomes 0).
-;
-; If it was just for the dividend, it would be very easy, just divide
-; it by 2 (unsigned), do the division, multiply the resulting quotient
-; and remainder by 2, add the bit that was dropped when dividing by 2
-; to the remainder, and do some adjustment so the remainder doesn't
-; end up larger than the divisor.  For some cases when the divisor is
-; negative (from EDIV's point of view, i.e. when the highest bit is set),
-; dividing the dividend by 2 isn't enough, and since some operations
-; might generate integer overflows even when the dividend is divided by
-; 4 (when the high part of the shifted down dividend ends up being exactly
-; half of the divisor, the result is the quotient 0x80000000, which is
-; negative...) it needs to be divided by 8.  Furthermore, the divisor needs
-; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
-; In this case, a little extra fiddling with the remainder is required.
-;
-; So, the simplest way to handle this is always to divide the dividend
-; by 8, and to divide the divisor by 2 if it's highest bit is set.
-; After EDIV has been used, the quotient gets multiplied by 8 if the
-; original divisor was positive, otherwise 4.  The remainder, oddly
-; enough, is *always* multiplied by 8.
-; NOTE: in the case mentioned above, where the high part of the shifted
-; down dividend ends up being exactly half the shifted down divisor, we
-; end up with a 33 bit quotient.  That's no problem however, it usually
-; means we have ended up with a too large remainder as well, and the
-; problem is fixed by the last part of the algorithm (next paragraph).
-;
-; The routine ends with comparing the resulting remainder with the
-; original divisor and if the remainder is larger, subtract the
-; original divisor from it, and increase the quotient by 1.  This is
-; done until the remainder is smaller than the divisor.
-;
-; The complete algorithm looks like this:
-;
-; d'    = d
-; l'    = l & 7
-; [h,l] = [h,l] >> 3
-; [q,r] = floor([h,l] / d)	# This is the EDIV operation
-; if (q < 0) q = -q		# I doubt this is necessary any more
-;
-; r'    = r >> 29
-; if (d' >= 0)
-;   q'  = q >> 29
-;   q   = q << 3
-; else
-;   q'  = q >> 30
-;   q   = q << 2
-; r     = (r << 3) + l'
-;
-; if (d' < 0)
-;   {
-;     [r',r] = [r',r] - q
-;     while ([r',r] < 0)
-;       {
-;         [r',r] = [r',r] + d
-;         [q',q] = [q',q] - 1
-;       }
-;   }
-;
-; while ([r',r] >= d')
-;   {
-;     [r',r] = [r',r] - d'
-;     [q',q] = [q',q] + 1
-;   }
-;
-; return q
-
-h=4 ;(AP)	h	by value (input)
-l=8 ;(AP)	l	by value (input)
-d=12 ;(AP)	d	by value (input)
-
-;r2 = l, q
-;r3 = h, r
-;r4 = d
-;r5 = l'
-;r6 = r'
-;r7 = d'
-;r8 = q'
-
-	.psect	code,nowrt
-
-.entry	bn_div_words,^m<r2,r3,r4,r5,r6,r7,r8>
-	movl	l(ap),r2
-	movl	h(ap),r3
-	movl	d(ap),r4
-
-	bicl3	#^XFFFFFFF8,r2,r5 ; l' = l & 7
-	bicl3	#^X00000007,r2,r2
-
-	bicl3	#^XFFFFFFF8,r3,r6
-	bicl3	#^X00000007,r3,r3
-        
-	addl	r6,r2
-
-	rotl	#-3,r2,r2	; l = l >> 3
-	rotl	#-3,r3,r3	; h = h >> 3
-                
-	movl	r4,r7		; d' = d
-
-	movl	#0,r6		; r' = 0
-	movl	#0,r8		; q' = 0
-
-	tstl	r4
-	beql	666$		; Uh-oh, the divisor is 0...
-	bgtr	1$
-	rotl	#-1,r4,r4	; If d is negative, shift it right.
-	bicl2	#^X80000000,r4	; Since d is then a large number, the
-				; lowest bit is insignificant
-				; (contradict that, and I'll fix the problem!)
-1$:     
-	ediv	r4,r2,r2,r3	; Do the actual division
-
-	tstl	r2
-	bgeq	3$
-	mnegl	r2,r2		; if q < 0, negate it
-3$:     
-	tstl	r7
-	blss	4$
-	rotl	#3,r2,r2	;   q = q << 3
-	bicl3	#^XFFFFFFF8,r2,r8 ;    q' gets the high bits from q
-	bicl3	#^X00000007,r2,r2
-	bsb	41$
-4$:				; else
-	rotl	#2,r2,r2	;   q = q << 2
-	bicl3	#^XFFFFFFFC,r2,r8 ;   q' gets the high bits from q
-	bicl3	#^X00000003,r2,r2
-41$:
-	rotl	#3,r3,r3	; r = r << 3
-	bicl3	#^XFFFFFFF8,r3,r6 ; r' gets the high bits from r
-	bicl3	#^X00000007,r3,r3
-	addl	r5,r3		; r = r + l'
-
-	tstl	r7
-	bgeq	5$
-	bitl	#1,r7
-	beql	5$		; if d' < 0 && d' & 1
-	subl	r2,r3		;   [r',r] = [r',r] - [q',q]
-	sbwc	r8,r6
-45$:
-	bgeq	5$		;   while r < 0
-	decl	r2		;     [q',q] = [q',q] - 1
-	sbwc	#0,r8
-	addl	r7,r3		;     [r',r] = [r',r] + d'
-	adwc	#0,r6
-	brb	45$
-
-; The return points are placed in the middle to keep a short distance from
-; all the branch points
-42$:
-;	movl	r3,r1
-	movl	r2,r0
-	ret
-666$:
-	movl	#^XFFFFFFFF,r0
-	ret
-
-5$:
-	tstl	r6
-	bneq	6$
-	cmpl	r3,r7
-	blssu	42$		; while [r',r] >= d'
-6$:
-	subl	r7,r3		;   [r',r] = [r',r] - d'
-	sbwc	#0,r6
-	incl	r2		;   [q',q] = [q',q] + 1
-	adwc	#0,r8
-	brb	5$	
-
-	.title	vax_bn_add_words  unsigned add of two arrays
-;
-; Richard Levitte 20-Nov-2000
-;
-; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) {
-;	ULONG c = 0;
-;	int i;
-;	for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c;
-;	return(c);
-; }
-
-r=4 ;(AP)	r	by reference (output)
-a=8 ;(AP)	a	by reference (input)
-b=12 ;(AP)	b	by reference (input)
-n=16 ;(AP)	n	by value (input)
-
-
-	.psect	code,nowrt
-
-.entry	bn_add_words,^m<r2,r3,r4,r5,r6>
-
-	moval	@r(ap),r2
-	moval	@a(ap),r3
-	moval	@b(ap),r4
-	movl	n(ap),r5	; assumed >0 by C code
-	clrl	r0		; c
-
-	tstl	r5		; carry = 0
-	bleq	666$
-
-0$:
-	movl	(r3)+,r6	; carry untouched
-	adwc	(r4)+,r6	; carry used and touched
-	movl	r6,(r2)+	; carry untouched
-	sobgtr	r5,0$		; carry untouched
-
-	adwc	#0,r0
-666$:
-	ret
-
-	.title	vax_bn_sub_words  unsigned add of two arrays
-;
-; Richard Levitte 20-Nov-2000
-;
-; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) {
-;	ULONG c = 0;
-;	int i;
-;	for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c;
-;	return(c);
-; }
-
-r=4 ;(AP)	r	by reference (output)
-a=8 ;(AP)	a	by reference (input)
-b=12 ;(AP)	b	by reference (input)
-n=16 ;(AP)	n	by value (input)
-
-
-	.psect	code,nowrt
-
-.entry	bn_sub_words,^m<r2,r3,r4,r5,r6>
-
-	moval	@r(ap),r2
-	moval	@a(ap),r3
-	moval	@b(ap),r4
-	movl	n(ap),r5	; assumed >0 by C code
-	clrl	r0		; c
-
-	tstl	r5		; carry = 0
-	bleq	666$
-
-0$:
-	movl	(r3)+,r6	; carry untouched
-	sbwc	(r4)+,r6	; carry used and touched
-	movl	r6,(r2)+	; carry untouched
-	sobgtr	r5,0$		; carry untouched
-
-	adwc	#0,r0
-666$:
-	ret
-
-
-;r=4 ;(AP)
-;a=8 ;(AP)
-;b=12 ;(AP)
-;n=16 ;(AP)	n	by value (input)
-
-	.psect	code,nowrt
-
-.entry	BN_MUL_COMBA8,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11>
-	movab	-924(sp),sp
-	clrq	r8
-
-	clrl	r10
-
-	movl	8(ap),r6
-	movzwl	2(r6),r3
-	movl	12(ap),r7
-	bicl3	#-65536,(r7),r2
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,(r6),-12(fp)
-	bicl3	#-65536,r3,-16(fp)
-	mull3	r0,-12(fp),-4(fp)
-	mull2	r2,-12(fp)
-	mull3	r2,-16(fp),-8(fp)
-	mull2	r0,-16(fp)
-	addl3	-4(fp),-8(fp),r0
-	bicl3	#0,r0,-4(fp)
-	cmpl	-4(fp),-8(fp)
-	bgequ	noname.45
-	addl2	#65536,-16(fp)
-noname.45:
-	movzwl	-2(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-16(fp)
-	bicl3	#-65536,-4(fp),r0
-	ashl	#16,r0,-8(fp)
-	addl3	-8(fp),-12(fp),r0
-	bicl3	#0,r0,-12(fp)
-	cmpl	-12(fp),-8(fp)
-	bgequ	noname.46
-	incl	-16(fp)
-noname.46:
-	movl	-12(fp),r1
-	movl	-16(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.47
-	incl	r2
-noname.47:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.48
-	incl	r10
-noname.48:
-
-	movl	4(ap),r11
-	movl	r9,(r11)
-
-	clrl	r9
-
-	movzwl	2(r6),r2
-	bicl3	#-65536,4(r7),r3
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,(r6),-28(fp)
-	bicl3	#-65536,r2,-32(fp)
-	mull3	r0,-28(fp),-20(fp)
-	mull2	r3,-28(fp)
-	mull3	r3,-32(fp),-24(fp)
-	mull2	r0,-32(fp)
-	addl3	-20(fp),-24(fp),r0
-	bicl3	#0,r0,-20(fp)
-	cmpl	-20(fp),-24(fp)
-	bgequ	noname.49
-	addl2	#65536,-32(fp)
-noname.49:
-	movzwl	-18(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-32(fp)
-	bicl3	#-65536,-20(fp),r0
-	ashl	#16,r0,-24(fp)
-	addl3	-24(fp),-28(fp),r0
-	bicl3	#0,r0,-28(fp)
-	cmpl	-28(fp),-24(fp)
-	bgequ	noname.50
-	incl	-32(fp)
-noname.50:
-	movl	-28(fp),r1
-	movl	-32(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.51
-	incl	r2
-noname.51:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.52
-	incl	r9
-noname.52:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,(r7),r3
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-44(fp)
-	bicl3	#-65536,r2,-48(fp)
-	mull3	r0,-44(fp),-36(fp)
-	mull2	r3,-44(fp)
-	mull3	r3,-48(fp),-40(fp)
-	mull2	r0,-48(fp)
-	addl3	-36(fp),-40(fp),r0
-	bicl3	#0,r0,-36(fp)
-	cmpl	-36(fp),-40(fp)
-	bgequ	noname.53
-	addl2	#65536,-48(fp)
-noname.53:
-	movzwl	-34(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-48(fp)
-	bicl3	#-65536,-36(fp),r0
-	ashl	#16,r0,-40(fp)
-	addl3	-40(fp),-44(fp),r0
-	bicl3	#0,r0,-44(fp)
-	cmpl	-44(fp),-40(fp)
-	bgequ	noname.54
-	incl	-48(fp)
-noname.54:
-	movl	-44(fp),r1
-	movl	-48(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.55
-	incl	r2
-noname.55:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.56
-	incl	r9
-noname.56:
-
-	movl	r8,4(r11)
-
-	clrl	r8
-
-	movzwl	10(r6),r2
-	bicl3	#-65536,(r7),r3
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r6),-60(fp)
-	bicl3	#-65536,r2,-64(fp)
-	mull3	r0,-60(fp),-52(fp)
-	mull2	r3,-60(fp)
-	mull3	r3,-64(fp),-56(fp)
-	mull2	r0,-64(fp)
-	addl3	-52(fp),-56(fp),r0
-	bicl3	#0,r0,-52(fp)
-	cmpl	-52(fp),-56(fp)
-	bgequ	noname.57
-	addl2	#65536,-64(fp)
-noname.57:
-	movzwl	-50(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-64(fp)
-	bicl3	#-65536,-52(fp),r0
-	ashl	#16,r0,-56(fp)
-	addl3	-56(fp),-60(fp),r0
-	bicl3	#0,r0,-60(fp)
-	cmpl	-60(fp),-56(fp)
-	bgequ	noname.58
-	incl	-64(fp)
-noname.58:
-	movl	-60(fp),r1
-	movl	-64(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.59
-	incl	r2
-noname.59:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.60
-	incl	r8
-noname.60:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,4(r7),r3
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-76(fp)
-	bicl3	#-65536,r2,-80(fp)
-	mull3	r0,-76(fp),-68(fp)
-	mull2	r3,-76(fp)
-	mull3	r3,-80(fp),-72(fp)
-	mull2	r0,-80(fp)
-	addl3	-68(fp),-72(fp),r0
-	bicl3	#0,r0,-68(fp)
-	cmpl	-68(fp),-72(fp)
-	bgequ	noname.61
-	addl2	#65536,-80(fp)
-noname.61:
-	movzwl	-66(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-80(fp)
-	bicl3	#-65536,-68(fp),r0
-	ashl	#16,r0,-72(fp)
-	addl3	-72(fp),-76(fp),r0
-	bicl3	#0,r0,-76(fp)
-	cmpl	-76(fp),-72(fp)
-	bgequ	noname.62
-	incl	-80(fp)
-noname.62:
-	movl	-76(fp),r1
-	movl	-80(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.63
-	incl	r2
-noname.63:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.64
-	incl	r8
-noname.64:
-
-	movzwl	2(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,(r6),-92(fp)
-	bicl3	#-65536,r2,-96(fp)
-	mull3	r0,-92(fp),-84(fp)
-	mull2	r3,-92(fp)
-	mull3	r3,-96(fp),-88(fp)
-	mull2	r0,-96(fp)
-	addl3	-84(fp),-88(fp),r0
-	bicl3	#0,r0,-84(fp)
-	cmpl	-84(fp),-88(fp)
-	bgequ	noname.65
-	addl2	#65536,-96(fp)
-noname.65:
-	movzwl	-82(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-96(fp)
-	bicl3	#-65536,-84(fp),r0
-	ashl	#16,r0,-88(fp)
-	addl3	-88(fp),-92(fp),r0
-	bicl3	#0,r0,-92(fp)
-	cmpl	-92(fp),-88(fp)
-	bgequ	noname.66
-	incl	-96(fp)
-noname.66:
-	movl	-92(fp),r1
-	movl	-96(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.67
-	incl	r2
-noname.67:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.68
-	incl	r8
-noname.68:
-
-	movl	r10,8(r11)
-
-	clrl	r10
-
-	movzwl	2(r6),r2
-	bicl3	#-65536,12(r7),r3
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,(r6),-108(fp)
-	bicl3	#-65536,r2,-112(fp)
-	mull3	r0,-108(fp),-100(fp)
-	mull2	r3,-108(fp)
-	mull3	r3,-112(fp),-104(fp)
-	mull2	r0,-112(fp)
-	addl3	-100(fp),-104(fp),r0
-	bicl3	#0,r0,-100(fp)
-	cmpl	-100(fp),-104(fp)
-	bgequ	noname.69
-	addl2	#65536,-112(fp)
-noname.69:
-	movzwl	-98(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-112(fp)
-	bicl3	#-65536,-100(fp),r0
-	ashl	#16,r0,-104(fp)
-	addl3	-104(fp),-108(fp),r0
-	bicl3	#0,r0,-108(fp)
-	cmpl	-108(fp),-104(fp)
-	bgequ	noname.70
-	incl	-112(fp)
-noname.70:
-	movl	-108(fp),r1
-	movl	-112(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.71
-	incl	r2
-noname.71:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.72
-	incl	r10
-noname.72:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-124(fp)
-	bicl3	#-65536,r2,-128(fp)
-	mull3	r0,-124(fp),-116(fp)
-	mull2	r3,-124(fp)
-	mull3	r3,-128(fp),-120(fp)
-	mull2	r0,-128(fp)
-	addl3	-116(fp),-120(fp),r0
-	bicl3	#0,r0,-116(fp)
-	cmpl	-116(fp),-120(fp)
-	bgequ	noname.73
-	addl2	#65536,-128(fp)
-noname.73:
-	movzwl	-114(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-128(fp)
-	bicl3	#-65536,-116(fp),r0
-	ashl	#16,r0,-120(fp)
-	addl3	-120(fp),-124(fp),r0
-	bicl3	#0,r0,-124(fp)
-	cmpl	-124(fp),-120(fp)
-	bgequ	noname.74
-	incl	-128(fp)
-noname.74:
-	movl	-124(fp),r1
-	movl	-128(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.75
-	incl	r2
-noname.75:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.76
-	incl	r10
-noname.76:
-
-	movzwl	10(r6),r2
-	bicl3	#-65536,4(r7),r3
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r6),-140(fp)
-	bicl3	#-65536,r2,-144(fp)
-	mull3	r0,-140(fp),-132(fp)
-	mull2	r3,-140(fp)
-	mull3	r3,-144(fp),-136(fp)
-	mull2	r0,-144(fp)
-	addl3	-132(fp),-136(fp),r0
-	bicl3	#0,r0,-132(fp)
-	cmpl	-132(fp),-136(fp)
-	bgequ	noname.77
-	addl2	#65536,-144(fp)
-noname.77:
-	movzwl	-130(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-144(fp)
-	bicl3	#-65536,-132(fp),r0
-	ashl	#16,r0,-136(fp)
-	addl3	-136(fp),-140(fp),r0
-	bicl3	#0,r0,-140(fp)
-	cmpl	-140(fp),-136(fp)
-	bgequ	noname.78
-	incl	-144(fp)
-noname.78:
-	movl	-140(fp),r1
-	movl	-144(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.79
-	incl	r2
-noname.79:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.80
-	incl	r10
-noname.80:
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,(r7),r3
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-156(fp)
-	bicl3	#-65536,r2,-160(fp)
-	mull3	r0,-156(fp),-148(fp)
-	mull2	r3,-156(fp)
-	mull3	r3,-160(fp),-152(fp)
-	mull2	r0,-160(fp)
-	addl3	-148(fp),-152(fp),r0
-	bicl3	#0,r0,-148(fp)
-	cmpl	-148(fp),-152(fp)
-	bgequ	noname.81
-	addl2	#65536,-160(fp)
-noname.81:
-	movzwl	-146(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-160(fp)
-	bicl3	#-65536,-148(fp),r0
-	ashl	#16,r0,-152(fp)
-	addl3	-152(fp),-156(fp),r0
-	bicl3	#0,r0,-156(fp)
-	cmpl	-156(fp),-152(fp)
-	bgequ	noname.82
-	incl	-160(fp)
-noname.82:
-	movl	-156(fp),r1
-	movl	-160(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.83
-	incl	r2
-noname.83:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.84
-	incl	r10
-noname.84:
-
-	movl	r9,12(r11)
-
-	clrl	r9
-
-	movzwl	18(r6),r2
-	bicl3	#-65536,(r7),r3
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,16(r6),-172(fp)
-	bicl3	#-65536,r2,-176(fp)
-	mull3	r0,-172(fp),-164(fp)
-	mull2	r3,-172(fp)
-	mull3	r3,-176(fp),-168(fp)
-	mull2	r0,-176(fp)
-	addl3	-164(fp),-168(fp),r0
-	bicl3	#0,r0,-164(fp)
-	cmpl	-164(fp),-168(fp)
-	bgequ	noname.85
-	addl2	#65536,-176(fp)
-noname.85:
-	movzwl	-162(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-176(fp)
-	bicl3	#-65536,-164(fp),r0
-	ashl	#16,r0,-168(fp)
-	addl3	-168(fp),-172(fp),r0
-	bicl3	#0,r0,-172(fp)
-	cmpl	-172(fp),-168(fp)
-	bgequ	noname.86
-	incl	-176(fp)
-noname.86:
-	movl	-172(fp),r1
-	movl	-176(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.87
-	incl	r2
-noname.87:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.88
-	incl	r9
-noname.88:
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,4(r7),r3
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-188(fp)
-	bicl3	#-65536,r2,-192(fp)
-	mull3	r0,-188(fp),-180(fp)
-	mull2	r3,-188(fp)
-	mull3	r3,-192(fp),-184(fp)
-	mull2	r0,-192(fp)
-	addl3	-180(fp),-184(fp),r0
-	bicl3	#0,r0,-180(fp)
-	cmpl	-180(fp),-184(fp)
-	bgequ	noname.89
-	addl2	#65536,-192(fp)
-noname.89:
-	movzwl	-178(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-192(fp)
-	bicl3	#-65536,-180(fp),r0
-	ashl	#16,r0,-184(fp)
-	addl3	-184(fp),-188(fp),r0
-	bicl3	#0,r0,-188(fp)
-	cmpl	-188(fp),-184(fp)
-	bgequ	noname.90
-	incl	-192(fp)
-noname.90:
-	movl	-188(fp),r1
-	movl	-192(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.91
-	incl	r2
-noname.91:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.92
-	incl	r9
-noname.92:
-
-	movzwl	10(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r6),-204(fp)
-	bicl3	#-65536,r2,-208(fp)
-	mull3	r0,-204(fp),-196(fp)
-	mull2	r3,-204(fp)
-	mull3	r3,-208(fp),-200(fp)
-	mull2	r0,-208(fp)
-	addl3	-196(fp),-200(fp),r0
-	bicl3	#0,r0,-196(fp)
-	cmpl	-196(fp),-200(fp)
-	bgequ	noname.93
-	addl2	#65536,-208(fp)
-noname.93:
-	movzwl	-194(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-208(fp)
-	bicl3	#-65536,-196(fp),r0
-	ashl	#16,r0,-200(fp)
-	addl3	-200(fp),-204(fp),r0
-	bicl3	#0,r0,-204(fp)
-	cmpl	-204(fp),-200(fp)
-	bgequ	noname.94
-	incl	-208(fp)
-noname.94:
-	movl	-204(fp),r1
-	movl	-208(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.95
-	incl	r2
-noname.95:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.96
-	incl	r9
-noname.96:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,12(r7),r3
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-220(fp)
-	bicl3	#-65536,r2,-224(fp)
-	mull3	r0,-220(fp),-212(fp)
-	mull2	r3,-220(fp)
-	mull3	r3,-224(fp),-216(fp)
-	mull2	r0,-224(fp)
-	addl3	-212(fp),-216(fp),r0
-	bicl3	#0,r0,-212(fp)
-	cmpl	-212(fp),-216(fp)
-	bgequ	noname.97
-	addl2	#65536,-224(fp)
-noname.97:
-	movzwl	-210(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-224(fp)
-	bicl3	#-65536,-212(fp),r0
-	ashl	#16,r0,-216(fp)
-	addl3	-216(fp),-220(fp),r0
-	bicl3	#0,r0,-220(fp)
-	cmpl	-220(fp),-216(fp)
-	bgequ	noname.98
-	incl	-224(fp)
-noname.98:
-	movl	-220(fp),r1
-	movl	-224(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.99
-	incl	r2
-noname.99:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.100
-	incl	r9
-noname.100:
-
-	movzwl	2(r6),r2
-	bicl3	#-65536,16(r7),r3
-	movzwl	18(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,(r6),-236(fp)
-	bicl3	#-65536,r2,-240(fp)
-	mull3	r0,-236(fp),-228(fp)
-	mull2	r3,-236(fp)
-	mull3	r3,-240(fp),-232(fp)
-	mull2	r0,-240(fp)
-	addl3	-228(fp),-232(fp),r0
-	bicl3	#0,r0,-228(fp)
-	cmpl	-228(fp),-232(fp)
-	bgequ	noname.101
-	addl2	#65536,-240(fp)
-noname.101:
-	movzwl	-226(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-240(fp)
-	bicl3	#-65536,-228(fp),r0
-	ashl	#16,r0,-232(fp)
-	addl3	-232(fp),-236(fp),r0
-	bicl3	#0,r0,-236(fp)
-	cmpl	-236(fp),-232(fp)
-	bgequ	noname.102
-	incl	-240(fp)
-noname.102:
-	movl	-236(fp),r1
-	movl	-240(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.103
-	incl	r2
-noname.103:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.104
-	incl	r9
-noname.104:
-
-	movl	r8,16(r11)
-
-	clrl	r8
-
-	movzwl	2(r6),r2
-	bicl3	#-65536,20(r7),r3
-	movzwl	22(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,(r6),-252(fp)
-	bicl3	#-65536,r2,-256(fp)
-	mull3	r0,-252(fp),-244(fp)
-	mull2	r3,-252(fp)
-	mull3	r3,-256(fp),-248(fp)
-	mull2	r0,-256(fp)
-	addl3	-244(fp),-248(fp),r0
-	bicl3	#0,r0,-244(fp)
-	cmpl	-244(fp),-248(fp)
-	bgequ	noname.105
-	addl2	#65536,-256(fp)
-noname.105:
-	movzwl	-242(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-256(fp)
-	bicl3	#-65536,-244(fp),r0
-	ashl	#16,r0,-248(fp)
-	addl3	-248(fp),-252(fp),r0
-	bicl3	#0,r0,-252(fp)
-	cmpl	-252(fp),-248(fp)
-	bgequ	noname.106
-	incl	-256(fp)
-noname.106:
-	movl	-252(fp),r1
-	movl	-256(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.107
-	incl	r2
-noname.107:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.108
-	incl	r8
-noname.108:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,16(r7),r3
-	movzwl	18(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-268(fp)
-	bicl3	#-65536,r2,-272(fp)
-	mull3	r0,-268(fp),-260(fp)
-	mull2	r3,-268(fp)
-	mull3	r3,-272(fp),-264(fp)
-	mull2	r0,-272(fp)
-	addl3	-260(fp),-264(fp),r0
-	bicl3	#0,r0,-260(fp)
-	cmpl	-260(fp),-264(fp)
-	bgequ	noname.109
-	addl2	#65536,-272(fp)
-noname.109:
-	movzwl	-258(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-272(fp)
-	bicl3	#-65536,-260(fp),r0
-	ashl	#16,r0,-264(fp)
-	addl3	-264(fp),-268(fp),r0
-	bicl3	#0,r0,-268(fp)
-	cmpl	-268(fp),-264(fp)
-	bgequ	noname.110
-	incl	-272(fp)
-noname.110:
-	movl	-268(fp),r1
-	movl	-272(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.111
-	incl	r2
-noname.111:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.112
-	incl	r8
-noname.112:
-
-	movzwl	10(r6),r2
-	bicl3	#-65536,12(r7),r3
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r6),-284(fp)
-	bicl3	#-65536,r2,-288(fp)
-	mull3	r0,-284(fp),-276(fp)
-	mull2	r3,-284(fp)
-	mull3	r3,-288(fp),-280(fp)
-	mull2	r0,-288(fp)
-	addl3	-276(fp),-280(fp),r0
-	bicl3	#0,r0,-276(fp)
-	cmpl	-276(fp),-280(fp)
-	bgequ	noname.113
-	addl2	#65536,-288(fp)
-noname.113:
-	movzwl	-274(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-288(fp)
-	bicl3	#-65536,-276(fp),r0
-	ashl	#16,r0,-280(fp)
-	addl3	-280(fp),-284(fp),r0
-	bicl3	#0,r0,-284(fp)
-	cmpl	-284(fp),-280(fp)
-	bgequ	noname.114
-	incl	-288(fp)
-noname.114:
-	movl	-284(fp),r1
-	movl	-288(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.115
-	incl	r2
-noname.115:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.116
-	incl	r8
-noname.116:
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-300(fp)
-	bicl3	#-65536,r2,-304(fp)
-	mull3	r0,-300(fp),-292(fp)
-	mull2	r3,-300(fp)
-	mull3	r3,-304(fp),-296(fp)
-	mull2	r0,-304(fp)
-	addl3	-292(fp),-296(fp),r0
-	bicl3	#0,r0,-292(fp)
-	cmpl	-292(fp),-296(fp)
-	bgequ	noname.117
-	addl2	#65536,-304(fp)
-noname.117:
-	movzwl	-290(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-304(fp)
-	bicl3	#-65536,-292(fp),r0
-	ashl	#16,r0,-296(fp)
-	addl3	-296(fp),-300(fp),r0
-	bicl3	#0,r0,-300(fp)
-	cmpl	-300(fp),-296(fp)
-	bgequ	noname.118
-	incl	-304(fp)
-noname.118:
-	movl	-300(fp),r1
-	movl	-304(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.119
-	incl	r2
-noname.119:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.120
-	incl	r8
-noname.120:
-
-	movzwl	18(r6),r2
-	bicl3	#-65536,4(r7),r3
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,16(r6),-316(fp)
-	bicl3	#-65536,r2,-320(fp)
-	mull3	r0,-316(fp),-308(fp)
-	mull2	r3,-316(fp)
-	mull3	r3,-320(fp),-312(fp)
-	mull2	r0,-320(fp)
-	addl3	-308(fp),-312(fp),r0
-	bicl3	#0,r0,-308(fp)
-	cmpl	-308(fp),-312(fp)
-	bgequ	noname.121
-	addl2	#65536,-320(fp)
-noname.121:
-	movzwl	-306(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-320(fp)
-	bicl3	#-65536,-308(fp),r0
-	ashl	#16,r0,-312(fp)
-	addl3	-312(fp),-316(fp),r0
-	bicl3	#0,r0,-316(fp)
-	cmpl	-316(fp),-312(fp)
-	bgequ	noname.122
-	incl	-320(fp)
-noname.122:
-	movl	-316(fp),r1
-	movl	-320(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.123
-	incl	r2
-
-noname.123:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.124
-	incl	r8
-noname.124:
-
-	movzwl	22(r6),r2
-	bicl3	#-65536,(r7),r3
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,20(r6),-332(fp)
-	bicl3	#-65536,r2,-336(fp)
-	mull3	r0,-332(fp),-324(fp)
-	mull2	r3,-332(fp)
-	mull3	r3,-336(fp),-328(fp)
-	mull2	r0,-336(fp)
-	addl3	-324(fp),-328(fp),r0
-	bicl3	#0,r0,-324(fp)
-	cmpl	-324(fp),-328(fp)
-	bgequ	noname.125
-	addl2	#65536,-336(fp)
-noname.125:
-	movzwl	-322(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-336(fp)
-	bicl3	#-65536,-324(fp),r0
-	ashl	#16,r0,-328(fp)
-	addl3	-328(fp),-332(fp),r0
-	bicl3	#0,r0,-332(fp)
-	cmpl	-332(fp),-328(fp)
-	bgequ	noname.126
-	incl	-336(fp)
-noname.126:
-	movl	-332(fp),r1
-	movl	-336(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.127
-	incl	r2
-noname.127:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.128
-	incl	r8
-noname.128:
-
-	movl	r10,20(r11)
-
-	clrl	r10
-
-	movzwl	26(r6),r2
-	bicl3	#-65536,(r7),r3
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,24(r6),-348(fp)
-	bicl3	#-65536,r2,-352(fp)
-	mull3	r0,-348(fp),-340(fp)
-	mull2	r3,-348(fp)
-	mull3	r3,-352(fp),-344(fp)
-	mull2	r0,-352(fp)
-	addl3	-340(fp),-344(fp),r0
-	bicl3	#0,r0,-340(fp)
-	cmpl	-340(fp),-344(fp)
-	bgequ	noname.129
-	addl2	#65536,-352(fp)
-noname.129:
-	movzwl	-338(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-352(fp)
-	bicl3	#-65536,-340(fp),r0
-	ashl	#16,r0,-344(fp)
-	addl3	-344(fp),-348(fp),r0
-	bicl3	#0,r0,-348(fp)
-	cmpl	-348(fp),-344(fp)
-	bgequ	noname.130
-	incl	-352(fp)
-noname.130:
-	movl	-348(fp),r1
-	movl	-352(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.131
-	incl	r2
-noname.131:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.132
-	incl	r10
-noname.132:
-
-	movzwl	22(r6),r2
-	bicl3	#-65536,4(r7),r3
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,20(r6),-364(fp)
-	bicl3	#-65536,r2,-368(fp)
-	mull3	r0,-364(fp),-356(fp)
-	mull2	r3,-364(fp)
-	mull3	r3,-368(fp),-360(fp)
-	mull2	r0,-368(fp)
-	addl3	-356(fp),-360(fp),r0
-	bicl3	#0,r0,-356(fp)
-	cmpl	-356(fp),-360(fp)
-	bgequ	noname.133
-	addl2	#65536,-368(fp)
-noname.133:
-	movzwl	-354(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-368(fp)
-	bicl3	#-65536,-356(fp),r0
-	ashl	#16,r0,-360(fp)
-	addl3	-360(fp),-364(fp),r0
-	bicl3	#0,r0,-364(fp)
-	cmpl	-364(fp),-360(fp)
-	bgequ	noname.134
-	incl	-368(fp)
-noname.134:
-	movl	-364(fp),r1
-	movl	-368(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.135
-	incl	r2
-noname.135:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.136
-	incl	r10
-noname.136:
-
-	movzwl	18(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,16(r6),-380(fp)
-	bicl3	#-65536,r2,-384(fp)
-	mull3	r0,-380(fp),-372(fp)
-	mull2	r3,-380(fp)
-	mull3	r3,-384(fp),-376(fp)
-	mull2	r0,-384(fp)
-	addl3	-372(fp),-376(fp),r0
-	bicl3	#0,r0,-372(fp)
-	cmpl	-372(fp),-376(fp)
-	bgequ	noname.137
-	addl2	#65536,-384(fp)
-noname.137:
-	movzwl	-370(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-384(fp)
-	bicl3	#-65536,-372(fp),r0
-	ashl	#16,r0,-376(fp)
-	addl3	-376(fp),-380(fp),r0
-	bicl3	#0,r0,-380(fp)
-	cmpl	-380(fp),-376(fp)
-	bgequ	noname.138
-	incl	-384(fp)
-noname.138:
-	movl	-380(fp),r1
-	movl	-384(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.139
-	incl	r2
-noname.139:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.140
-	incl	r10
-noname.140:
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,12(r7),r3
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-396(fp)
-	bicl3	#-65536,r2,-400(fp)
-	mull3	r0,-396(fp),-388(fp)
-	mull2	r3,-396(fp)
-	mull3	r3,-400(fp),-392(fp)
-	mull2	r0,-400(fp)
-	addl3	-388(fp),-392(fp),r0
-	bicl3	#0,r0,-388(fp)
-	cmpl	-388(fp),-392(fp)
-	bgequ	noname.141
-	addl2	#65536,-400(fp)
-noname.141:
-	movzwl	-386(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-400(fp)
-	bicl3	#-65536,-388(fp),r0
-	ashl	#16,r0,-392(fp)
-	addl3	-392(fp),-396(fp),r0
-	bicl3	#0,r0,-396(fp)
-	cmpl	-396(fp),-392(fp)
-	bgequ	noname.142
-	incl	-400(fp)
-noname.142:
-	movl	-396(fp),r1
-	movl	-400(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.143
-	incl	r2
-noname.143:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.144
-	incl	r10
-noname.144:
-
-	movzwl	10(r6),r2
-	bicl3	#-65536,16(r7),r3
-	movzwl	18(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r6),-412(fp)
-	bicl3	#-65536,r2,-416(fp)
-	mull3	r0,-412(fp),-404(fp)
-	mull2	r3,-412(fp)
-	mull3	r3,-416(fp),-408(fp)
-	mull2	r0,-416(fp)
-	addl3	-404(fp),-408(fp),r0
-	bicl3	#0,r0,-404(fp)
-	cmpl	-404(fp),-408(fp)
-	bgequ	noname.145
-	addl2	#65536,-416(fp)
-noname.145:
-	movzwl	-402(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-416(fp)
-	bicl3	#-65536,-404(fp),r0
-	ashl	#16,r0,-408(fp)
-	addl3	-408(fp),-412(fp),r0
-	bicl3	#0,r0,-412(fp)
-	cmpl	-412(fp),-408(fp)
-	bgequ	noname.146
-	incl	-416(fp)
-noname.146:
-	movl	-412(fp),r1
-	movl	-416(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.147
-	incl	r2
-noname.147:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.148
-	incl	r10
-noname.148:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,20(r7),r3
-	movzwl	22(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-428(fp)
-	bicl3	#-65536,r2,-432(fp)
-	mull3	r0,-428(fp),-420(fp)
-	mull2	r3,-428(fp)
-	mull3	r3,-432(fp),-424(fp)
-	mull2	r0,-432(fp)
-	addl3	-420(fp),-424(fp),r0
-	bicl3	#0,r0,-420(fp)
-	cmpl	-420(fp),-424(fp)
-	bgequ	noname.149
-	addl2	#65536,-432(fp)
-noname.149:
-	movzwl	-418(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-432(fp)
-	bicl3	#-65536,-420(fp),r0
-	ashl	#16,r0,-424(fp)
-	addl3	-424(fp),-428(fp),r0
-	bicl3	#0,r0,-428(fp)
-	cmpl	-428(fp),-424(fp)
-	bgequ	noname.150
-	incl	-432(fp)
-noname.150:
-	movl	-428(fp),r1
-	movl	-432(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.151
-	incl	r2
-noname.151:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.152
-	incl	r10
-noname.152:
-
-	movzwl	2(r6),r2
-	bicl3	#-65536,24(r7),r3
-	movzwl	26(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,(r6),-444(fp)
-	bicl3	#-65536,r2,-448(fp)
-	mull3	r0,-444(fp),-436(fp)
-	mull2	r3,-444(fp)
-	mull3	r3,-448(fp),-440(fp)
-	mull2	r0,-448(fp)
-	addl3	-436(fp),-440(fp),r0
-	bicl3	#0,r0,-436(fp)
-	cmpl	-436(fp),-440(fp)
-	bgequ	noname.153
-	addl2	#65536,-448(fp)
-noname.153:
-	movzwl	-434(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-448(fp)
-	bicl3	#-65536,-436(fp),r0
-	ashl	#16,r0,-440(fp)
-	addl3	-440(fp),-444(fp),r0
-	bicl3	#0,r0,-444(fp)
-	cmpl	-444(fp),-440(fp)
-	bgequ	noname.154
-	incl	-448(fp)
-noname.154:
-	movl	-444(fp),r1
-	movl	-448(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.155
-	incl	r2
-noname.155:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.156
-	incl	r10
-noname.156:
-
-	movl	r9,24(r11)
-
-	clrl	r9
-
-	movzwl	2(r6),r2
-	bicl3	#-65536,28(r7),r3
-	movzwl	30(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,(r6),-460(fp)
-	bicl3	#-65536,r2,-464(fp)
-	mull3	r0,-460(fp),-452(fp)
-	mull2	r3,-460(fp)
-	mull3	r3,-464(fp),-456(fp)
-	mull2	r0,-464(fp)
-	addl3	-452(fp),-456(fp),r0
-	bicl3	#0,r0,-452(fp)
-	cmpl	-452(fp),-456(fp)
-	bgequ	noname.157
-	addl2	#65536,-464(fp)
-noname.157:
-	movzwl	-450(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-464(fp)
-	bicl3	#-65536,-452(fp),r0
-	ashl	#16,r0,-456(fp)
-	addl3	-456(fp),-460(fp),r0
-	bicl3	#0,r0,-460(fp)
-	cmpl	-460(fp),-456(fp)
-	bgequ	noname.158
-	incl	-464(fp)
-noname.158:
-	movl	-460(fp),r1
-	movl	-464(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.159
-	incl	r2
-noname.159:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.160
-	incl	r9
-noname.160:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,24(r7),r3
-	movzwl	26(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-476(fp)
-	bicl3	#-65536,r2,-480(fp)
-	mull3	r0,-476(fp),-468(fp)
-	mull2	r3,-476(fp)
-	mull3	r3,-480(fp),-472(fp)
-	mull2	r0,-480(fp)
-	addl3	-468(fp),-472(fp),r0
-	bicl3	#0,r0,-468(fp)
-	cmpl	-468(fp),-472(fp)
-	bgequ	noname.161
-	addl2	#65536,-480(fp)
-noname.161:
-	movzwl	-466(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-480(fp)
-	bicl3	#-65536,-468(fp),r0
-	ashl	#16,r0,-472(fp)
-	addl3	-472(fp),-476(fp),r0
-	bicl3	#0,r0,-476(fp)
-	cmpl	-476(fp),-472(fp)
-	bgequ	noname.162
-	incl	-480(fp)
-noname.162:
-	movl	-476(fp),r1
-	movl	-480(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.163
-	incl	r2
-noname.163:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.164
-	incl	r9
-noname.164:
-
-	movzwl	10(r6),r2
-	bicl3	#-65536,20(r7),r3
-	movzwl	22(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r6),-492(fp)
-	bicl3	#-65536,r2,-496(fp)
-	mull3	r0,-492(fp),-484(fp)
-	mull2	r3,-492(fp)
-	mull3	r3,-496(fp),-488(fp)
-	mull2	r0,-496(fp)
-	addl3	-484(fp),-488(fp),r0
-	bicl3	#0,r0,-484(fp)
-	cmpl	-484(fp),-488(fp)
-	bgequ	noname.165
-	addl2	#65536,-496(fp)
-noname.165:
-	movzwl	-482(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-496(fp)
-	bicl3	#-65536,-484(fp),r0
-	ashl	#16,r0,-488(fp)
-	addl3	-488(fp),-492(fp),r0
-	bicl3	#0,r0,-492(fp)
-	cmpl	-492(fp),-488(fp)
-	bgequ	noname.166
-	incl	-496(fp)
-noname.166:
-	movl	-492(fp),r1
-	movl	-496(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.167
-	incl	r2
-noname.167:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.168
-	incl	r9
-noname.168:
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,16(r7),r3
-	movzwl	18(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-508(fp)
-	bicl3	#-65536,r2,-512(fp)
-	mull3	r0,-508(fp),-500(fp)
-	mull2	r3,-508(fp)
-	mull3	r3,-512(fp),-504(fp)
-	mull2	r0,-512(fp)
-	addl3	-500(fp),-504(fp),r0
-	bicl3	#0,r0,-500(fp)
-	cmpl	-500(fp),-504(fp)
-	bgequ	noname.169
-	addl2	#65536,-512(fp)
-noname.169:
-	movzwl	-498(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-512(fp)
-	bicl3	#-65536,-500(fp),r0
-	ashl	#16,r0,-504(fp)
-	addl3	-504(fp),-508(fp),r0
-	bicl3	#0,r0,-508(fp)
-	cmpl	-508(fp),-504(fp)
-	bgequ	noname.170
-	incl	-512(fp)
-noname.170:
-	movl	-508(fp),r1
-	movl	-512(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.171
-	incl	r2
-noname.171:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.172
-	incl	r9
-noname.172:
-
-	movzwl	18(r6),r2
-	bicl3	#-65536,12(r7),r3
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,16(r6),-524(fp)
-	bicl3	#-65536,r2,-528(fp)
-	mull3	r0,-524(fp),-516(fp)
-	mull2	r3,-524(fp)
-	mull3	r3,-528(fp),-520(fp)
-	mull2	r0,-528(fp)
-	addl3	-516(fp),-520(fp),r0
-	bicl3	#0,r0,-516(fp)
-	cmpl	-516(fp),-520(fp)
-	bgequ	noname.173
-	addl2	#65536,-528(fp)
-noname.173:
-	movzwl	-514(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-528(fp)
-	bicl3	#-65536,-516(fp),r0
-	ashl	#16,r0,-520(fp)
-	addl3	-520(fp),-524(fp),r0
-	bicl3	#0,r0,-524(fp)
-	cmpl	-524(fp),-520(fp)
-	bgequ	noname.174
-	incl	-528(fp)
-noname.174:
-	movl	-524(fp),r1
-	movl	-528(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.175
-	incl	r2
-noname.175:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.176
-	incl	r9
-noname.176:
-
-	movzwl	22(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,20(r6),-540(fp)
-	bicl3	#-65536,r2,-544(fp)
-	mull3	r0,-540(fp),-532(fp)
-	mull2	r3,-540(fp)
-	mull3	r3,-544(fp),-536(fp)
-	mull2	r0,-544(fp)
-	addl3	-532(fp),-536(fp),r0
-	bicl3	#0,r0,-532(fp)
-	cmpl	-532(fp),-536(fp)
-	bgequ	noname.177
-	addl2	#65536,-544(fp)
-noname.177:
-	movzwl	-530(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-544(fp)
-	bicl3	#-65536,-532(fp),r0
-	ashl	#16,r0,-536(fp)
-	addl3	-536(fp),-540(fp),r0
-	bicl3	#0,r0,-540(fp)
-	cmpl	-540(fp),-536(fp)
-	bgequ	noname.178
-	incl	-544(fp)
-noname.178:
-	movl	-540(fp),r1
-	movl	-544(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.179
-	incl	r2
-noname.179:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.180
-	incl	r9
-noname.180:
-
-	movzwl	26(r6),r2
-	bicl3	#-65536,4(r7),r3
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,24(r6),-556(fp)
-	bicl3	#-65536,r2,-560(fp)
-	mull3	r0,-556(fp),-548(fp)
-	mull2	r3,-556(fp)
-	mull3	r3,-560(fp),-552(fp)
-	mull2	r0,-560(fp)
-	addl3	-548(fp),-552(fp),r0
-	bicl3	#0,r0,-548(fp)
-	cmpl	-548(fp),-552(fp)
-	bgequ	noname.181
-	addl2	#65536,-560(fp)
-noname.181:
-	movzwl	-546(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-560(fp)
-	bicl3	#-65536,-548(fp),r0
-	ashl	#16,r0,-552(fp)
-	addl3	-552(fp),-556(fp),r0
-	bicl3	#0,r0,-556(fp)
-	cmpl	-556(fp),-552(fp)
-	bgequ	noname.182
-	incl	-560(fp)
-noname.182:
-	movl	-556(fp),r1
-	movl	-560(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.183
-	incl	r2
-noname.183:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.184
-	incl	r9
-noname.184:
-
-	movzwl	30(r6),r2
-	bicl3	#-65536,(r7),r3
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,28(r6),-572(fp)
-	bicl3	#-65536,r2,-576(fp)
-	mull3	r0,-572(fp),-564(fp)
-	mull2	r3,-572(fp)
-	mull3	r3,-576(fp),-568(fp)
-	mull2	r0,-576(fp)
-	addl3	-564(fp),-568(fp),r0
-	bicl3	#0,r0,-564(fp)
-	cmpl	-564(fp),-568(fp)
-	bgequ	noname.185
-	addl2	#65536,-576(fp)
-noname.185:
-	movzwl	-562(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-576(fp)
-	bicl3	#-65536,-564(fp),r0
-	ashl	#16,r0,-568(fp)
-	addl3	-568(fp),-572(fp),r0
-	bicl3	#0,r0,-572(fp)
-	cmpl	-572(fp),-568(fp)
-	bgequ	noname.186
-	incl	-576(fp)
-noname.186:
-	movl	-572(fp),r1
-	movl	-576(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.187
-	incl	r2
-noname.187:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.188
-	incl	r9
-noname.188:
-
-	movl	r8,28(r11)
-
-	clrl	r8
-
-	movzwl	30(r6),r2
-	bicl3	#-65536,4(r7),r3
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,28(r6),-588(fp)
-	bicl3	#-65536,r2,-592(fp)
-	mull3	r0,-588(fp),-580(fp)
-	mull2	r3,-588(fp)
-	mull3	r3,-592(fp),-584(fp)
-	mull2	r0,-592(fp)
-	addl3	-580(fp),-584(fp),r0
-	bicl3	#0,r0,-580(fp)
-	cmpl	-580(fp),-584(fp)
-	bgequ	noname.189
-	addl2	#65536,-592(fp)
-noname.189:
-	movzwl	-578(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-592(fp)
-	bicl3	#-65536,-580(fp),r0
-	ashl	#16,r0,-584(fp)
-	addl3	-584(fp),-588(fp),r0
-	bicl3	#0,r0,-588(fp)
-	cmpl	-588(fp),-584(fp)
-	bgequ	noname.190
-	incl	-592(fp)
-noname.190:
-	movl	-588(fp),r1
-	movl	-592(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.191
-	incl	r2
-noname.191:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.192
-	incl	r8
-noname.192:
-
-	movzwl	26(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,24(r6),-604(fp)
-	bicl3	#-65536,r2,-608(fp)
-	mull3	r0,-604(fp),-596(fp)
-	mull2	r3,-604(fp)
-	mull3	r3,-608(fp),-600(fp)
-	mull2	r0,-608(fp)
-	addl3	-596(fp),-600(fp),r0
-	bicl3	#0,r0,-596(fp)
-	cmpl	-596(fp),-600(fp)
-	bgequ	noname.193
-	addl2	#65536,-608(fp)
-noname.193:
-	movzwl	-594(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-608(fp)
-	bicl3	#-65536,-596(fp),r0
-	ashl	#16,r0,-600(fp)
-	addl3	-600(fp),-604(fp),r0
-	bicl3	#0,r0,-604(fp)
-	cmpl	-604(fp),-600(fp)
-	bgequ	noname.194
-	incl	-608(fp)
-noname.194:
-	movl	-604(fp),r1
-	movl	-608(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.195
-	incl	r2
-noname.195:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.196
-	incl	r8
-noname.196:
-
-	movzwl	22(r6),r2
-	bicl3	#-65536,12(r7),r3
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,20(r6),-620(fp)
-	bicl3	#-65536,r2,-624(fp)
-	mull3	r0,-620(fp),-612(fp)
-	mull2	r3,-620(fp)
-	mull3	r3,-624(fp),-616(fp)
-	mull2	r0,-624(fp)
-	addl3	-612(fp),-616(fp),r0
-	bicl3	#0,r0,-612(fp)
-	cmpl	-612(fp),-616(fp)
-	bgequ	noname.197
-	addl2	#65536,-624(fp)
-noname.197:
-	movzwl	-610(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-624(fp)
-	bicl3	#-65536,-612(fp),r0
-	ashl	#16,r0,-616(fp)
-	addl3	-616(fp),-620(fp),r0
-	bicl3	#0,r0,-620(fp)
-	cmpl	-620(fp),-616(fp)
-	bgequ	noname.198
-	incl	-624(fp)
-noname.198:
-	movl	-620(fp),r1
-	movl	-624(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.199
-	incl	r2
-noname.199:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.200
-	incl	r8
-noname.200:
-
-	movzwl	18(r6),r2
-	bicl3	#-65536,16(r7),r3
-	movzwl	18(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,16(r6),-636(fp)
-	bicl3	#-65536,r2,-640(fp)
-	mull3	r0,-636(fp),-628(fp)
-	mull2	r3,-636(fp)
-	mull3	r3,-640(fp),-632(fp)
-	mull2	r0,-640(fp)
-	addl3	-628(fp),-632(fp),r0
-	bicl3	#0,r0,-628(fp)
-	cmpl	-628(fp),-632(fp)
-	bgequ	noname.201
-	addl2	#65536,-640(fp)
-noname.201:
-	movzwl	-626(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-640(fp)
-	bicl3	#-65536,-628(fp),r0
-	ashl	#16,r0,-632(fp)
-	addl3	-632(fp),-636(fp),r0
-	bicl3	#0,r0,-636(fp)
-	cmpl	-636(fp),-632(fp)
-	bgequ	noname.202
-	incl	-640(fp)
-noname.202:
-	movl	-636(fp),r1
-	movl	-640(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.203
-	incl	r2
-noname.203:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.204
-	incl	r8
-noname.204:
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,20(r7),r3
-	movzwl	22(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-652(fp)
-	bicl3	#-65536,r2,-656(fp)
-	mull3	r0,-652(fp),-644(fp)
-	mull2	r3,-652(fp)
-	mull3	r3,-656(fp),-648(fp)
-	mull2	r0,-656(fp)
-	addl3	-644(fp),-648(fp),r0
-	bicl3	#0,r0,-644(fp)
-	cmpl	-644(fp),-648(fp)
-	bgequ	noname.205
-	addl2	#65536,-656(fp)
-noname.205:
-	movzwl	-642(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-656(fp)
-	bicl3	#-65536,-644(fp),r0
-	ashl	#16,r0,-648(fp)
-	addl3	-648(fp),-652(fp),r0
-	bicl3	#0,r0,-652(fp)
-	cmpl	-652(fp),-648(fp)
-	bgequ	noname.206
-	incl	-656(fp)
-noname.206:
-	movl	-652(fp),r1
-	movl	-656(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.207
-	incl	r2
-noname.207:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.208
-	incl	r8
-noname.208:
-
-	movzwl	10(r6),r2
-	bicl3	#-65536,24(r7),r3
-	movzwl	26(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r6),-668(fp)
-	bicl3	#-65536,r2,-672(fp)
-	mull3	r0,-668(fp),-660(fp)
-	mull2	r3,-668(fp)
-	mull3	r3,-672(fp),-664(fp)
-	mull2	r0,-672(fp)
-	addl3	-660(fp),-664(fp),r0
-	bicl3	#0,r0,-660(fp)
-	cmpl	-660(fp),-664(fp)
-	bgequ	noname.209
-	addl2	#65536,-672(fp)
-noname.209:
-	movzwl	-658(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-672(fp)
-	bicl3	#-65536,-660(fp),r0
-	ashl	#16,r0,-664(fp)
-	addl3	-664(fp),-668(fp),r0
-	bicl3	#0,r0,-668(fp)
-	cmpl	-668(fp),-664(fp)
-	bgequ	noname.210
-	incl	-672(fp)
-noname.210:
-	movl	-668(fp),r1
-	movl	-672(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.211
-	incl	r2
-noname.211:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.212
-	incl	r8
-noname.212:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,28(r7),r3
-	movzwl	30(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-684(fp)
-	bicl3	#-65536,r2,-688(fp)
-	mull3	r0,-684(fp),-676(fp)
-	mull2	r3,-684(fp)
-	mull3	r3,-688(fp),-680(fp)
-	mull2	r0,-688(fp)
-	addl3	-676(fp),-680(fp),r0
-	bicl3	#0,r0,-676(fp)
-	cmpl	-676(fp),-680(fp)
-	bgequ	noname.213
-	addl2	#65536,-688(fp)
-noname.213:
-	movzwl	-674(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-688(fp)
-	bicl3	#-65536,-676(fp),r0
-	ashl	#16,r0,-680(fp)
-	addl3	-680(fp),-684(fp),r0
-	bicl3	#0,r0,-684(fp)
-	cmpl	-684(fp),-680(fp)
-	bgequ	noname.214
-	incl	-688(fp)
-noname.214:
-	movl	-684(fp),r1
-	movl	-688(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.215
-	incl	r2
-noname.215:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.216
-	incl	r8
-noname.216:
-
-	movl	r10,32(r11)
-
-	clrl	r10
-
-	movzwl	10(r6),r2
-	bicl3	#-65536,28(r7),r3
-	movzwl	30(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r6),-700(fp)
-	bicl3	#-65536,r2,-704(fp)
-	mull3	r0,-700(fp),-692(fp)
-	mull2	r3,-700(fp)
-	mull3	r3,-704(fp),-696(fp)
-	mull2	r0,-704(fp)
-	addl3	-692(fp),-696(fp),r0
-	bicl3	#0,r0,-692(fp)
-	cmpl	-692(fp),-696(fp)
-	bgequ	noname.217
-	addl2	#65536,-704(fp)
-noname.217:
-	movzwl	-690(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-704(fp)
-	bicl3	#-65536,-692(fp),r0
-	ashl	#16,r0,-696(fp)
-	addl3	-696(fp),-700(fp),r0
-	bicl3	#0,r0,-700(fp)
-	cmpl	-700(fp),-696(fp)
-	bgequ	noname.218
-	incl	-704(fp)
-noname.218:
-	movl	-700(fp),r1
-	movl	-704(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.219
-	incl	r2
-noname.219:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.220
-	incl	r10
-noname.220:
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,24(r7),r3
-	movzwl	26(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-716(fp)
-	bicl3	#-65536,r2,-720(fp)
-	mull3	r0,-716(fp),-708(fp)
-	mull2	r3,-716(fp)
-	mull3	r3,-720(fp),-712(fp)
-	mull2	r0,-720(fp)
-	addl3	-708(fp),-712(fp),r0
-	bicl3	#0,r0,-708(fp)
-	cmpl	-708(fp),-712(fp)
-	bgequ	noname.221
-	addl2	#65536,-720(fp)
-noname.221:
-	movzwl	-706(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-720(fp)
-	bicl3	#-65536,-708(fp),r0
-	ashl	#16,r0,-712(fp)
-	addl3	-712(fp),-716(fp),r0
-	bicl3	#0,r0,-716(fp)
-	cmpl	-716(fp),-712(fp)
-	bgequ	noname.222
-	incl	-720(fp)
-noname.222:
-	movl	-716(fp),r1
-	movl	-720(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.223
-	incl	r2
-noname.223:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.224
-	incl	r10
-noname.224:
-
-	movzwl	18(r6),r2
-	bicl3	#-65536,20(r7),r3
-	movzwl	22(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,16(r6),-732(fp)
-	bicl3	#-65536,r2,-736(fp)
-	mull3	r0,-732(fp),-724(fp)
-	mull2	r3,-732(fp)
-	mull3	r3,-736(fp),-728(fp)
-	mull2	r0,-736(fp)
-	addl3	-724(fp),-728(fp),r0
-	bicl3	#0,r0,-724(fp)
-	cmpl	-724(fp),-728(fp)
-	bgequ	noname.225
-	addl2	#65536,-736(fp)
-noname.225:
-	movzwl	-722(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-736(fp)
-	bicl3	#-65536,-724(fp),r0
-	ashl	#16,r0,-728(fp)
-	addl3	-728(fp),-732(fp),r0
-	bicl3	#0,r0,-732(fp)
-	cmpl	-732(fp),-728(fp)
-	bgequ	noname.226
-	incl	-736(fp)
-noname.226:
-	movl	-732(fp),r1
-	movl	-736(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.227
-	incl	r2
-noname.227:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.228
-	incl	r10
-noname.228:
-
-	movzwl	22(r6),r2
-	bicl3	#-65536,16(r7),r3
-	movzwl	18(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,20(r6),-748(fp)
-	bicl3	#-65536,r2,-752(fp)
-	mull3	r0,-748(fp),-740(fp)
-	mull2	r3,-748(fp)
-	mull3	r3,-752(fp),-744(fp)
-	mull2	r0,-752(fp)
-	addl3	-740(fp),-744(fp),r0
-	bicl3	#0,r0,-740(fp)
-	cmpl	-740(fp),-744(fp)
-	bgequ	noname.229
-	addl2	#65536,-752(fp)
-noname.229:
-	movzwl	-738(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-752(fp)
-	bicl3	#-65536,-740(fp),r0
-	ashl	#16,r0,-744(fp)
-	addl3	-744(fp),-748(fp),r0
-	bicl3	#0,r0,-748(fp)
-	cmpl	-748(fp),-744(fp)
-	bgequ	noname.230
-	incl	-752(fp)
-noname.230:
-	movl	-748(fp),r1
-	movl	-752(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.231
-	incl	r2
-noname.231:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.232
-	incl	r10
-noname.232:
-
-	movzwl	26(r6),r2
-	bicl3	#-65536,12(r7),r3
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,24(r6),-764(fp)
-	bicl3	#-65536,r2,-768(fp)
-	mull3	r0,-764(fp),-756(fp)
-	mull2	r3,-764(fp)
-	mull3	r3,-768(fp),-760(fp)
-	mull2	r0,-768(fp)
-	addl3	-756(fp),-760(fp),r0
-	bicl3	#0,r0,-756(fp)
-	cmpl	-756(fp),-760(fp)
-	bgequ	noname.233
-	addl2	#65536,-768(fp)
-noname.233:
-	movzwl	-754(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-768(fp)
-	bicl3	#-65536,-756(fp),r0
-	ashl	#16,r0,-760(fp)
-	addl3	-760(fp),-764(fp),r0
-	bicl3	#0,r0,-764(fp)
-	cmpl	-764(fp),-760(fp)
-	bgequ	noname.234
-	incl	-768(fp)
-noname.234:
-	movl	-764(fp),r1
-	movl	-768(fp),r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.235
-	incl	r2
-noname.235:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.236
-	incl	r10
-noname.236:
-
-	bicl3	#-65536,28(r6),r3
-	movzwl	30(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,8(r7),r2
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-772(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-776(fp)
-	mull2	r0,r4
-	addl3	-772(fp),-776(fp),r0
-	bicl3	#0,r0,-772(fp)
-	cmpl	-772(fp),-776(fp)
-	bgequ	noname.237
-	addl2	#65536,r4
-noname.237:
-	movzwl	-770(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-772(fp),r0
-	ashl	#16,r0,-776(fp)
-	addl2	-776(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-776(fp)
-	bgequ	noname.238
-	incl	r4
-noname.238:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.239
-	incl	r2
-noname.239:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.240
-	incl	r10
-noname.240:
-
-	movl	r9,36(r11)
-
-	clrl	r9
-
-	bicl3	#-65536,28(r6),r3
-	movzwl	30(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,12(r7),r2
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-780(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-784(fp)
-	mull2	r0,r4
-	addl3	-780(fp),-784(fp),r0
-	bicl3	#0,r0,-780(fp)
-	cmpl	-780(fp),-784(fp)
-	bgequ	noname.241
-	addl2	#65536,r4
-noname.241:
-	movzwl	-778(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-780(fp),r0
-	ashl	#16,r0,-784(fp)
-	addl2	-784(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-784(fp)
-	bgequ	noname.242
-	incl	r4
-noname.242:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.243
-	incl	r2
-noname.243:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.244
-	incl	r9
-noname.244:
-
-	bicl3	#-65536,24(r6),r3
-	movzwl	26(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,16(r7),r2
-	movzwl	18(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-788(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-792(fp)
-	mull2	r0,r4
-	addl3	-788(fp),-792(fp),r0
-	bicl3	#0,r0,-788(fp)
-	cmpl	-788(fp),-792(fp)
-	bgequ	noname.245
-	addl2	#65536,r4
-noname.245:
-	movzwl	-786(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-788(fp),r0
-	ashl	#16,r0,-792(fp)
-	addl2	-792(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-792(fp)
-	bgequ	noname.246
-	incl	r4
-noname.246:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.247
-	incl	r2
-noname.247:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.248
-	incl	r9
-noname.248:
-
-	bicl3	#-65536,20(r6),r3
-	movzwl	22(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,20(r7),r2
-	movzwl	22(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-796(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-800(fp)
-	mull2	r0,r4
-	addl3	-796(fp),-800(fp),r0
-	bicl3	#0,r0,-796(fp)
-	cmpl	-796(fp),-800(fp)
-	bgequ	noname.249
-	addl2	#65536,r4
-noname.249:
-	movzwl	-794(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-796(fp),r0
-	ashl	#16,r0,-800(fp)
-	addl2	-800(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-800(fp)
-	bgequ	noname.250
-	incl	r4
-noname.250:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.251
-	incl	r2
-noname.251:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.252
-	incl	r9
-noname.252:
-
-	bicl3	#-65536,16(r6),r3
-	movzwl	18(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,24(r7),r2
-	movzwl	26(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-804(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-808(fp)
-	mull2	r0,r4
-	addl3	-804(fp),-808(fp),r0
-	bicl3	#0,r0,-804(fp)
-	cmpl	-804(fp),-808(fp)
-	bgequ	noname.253
-	addl2	#65536,r4
-noname.253:
-	movzwl	-802(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-804(fp),r0
-	ashl	#16,r0,-808(fp)
-	addl2	-808(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-808(fp)
-	bgequ	noname.254
-	incl	r4
-noname.254:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.255
-	incl	r2
-noname.255:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.256
-	incl	r9
-noname.256:
-
-	bicl3	#-65536,12(r6),r3
-	movzwl	14(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,28(r7),r2
-	movzwl	30(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-812(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-816(fp)
-	mull2	r0,r4
-	addl3	-812(fp),-816(fp),r0
-	bicl3	#0,r0,-812(fp)
-	cmpl	-812(fp),-816(fp)
-	bgequ	noname.257
-	addl2	#65536,r4
-noname.257:
-	movzwl	-810(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-812(fp),r0
-	ashl	#16,r0,-816(fp)
-	addl2	-816(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-816(fp)
-	bgequ	noname.258
-	incl	r4
-noname.258:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.259
-	incl	r2
-noname.259:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.260
-	incl	r9
-noname.260:
-
-	movl	r8,40(r11)
-
-	clrl	r8
-
-	bicl3	#-65536,16(r6),r3
-	movzwl	18(r6),r2
-	bicl3	#-65536,28(r7),r1
-	movzwl	30(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r4
-	bicl3	#-65536,r2,-828(fp)
-	mull3	r0,r4,-820(fp)
-	mull2	r1,r4
-	mull3	r1,-828(fp),-824(fp)
-	mull2	r0,-828(fp)
-	addl3	-820(fp),-824(fp),r0
-	bicl3	#0,r0,-820(fp)
-	cmpl	-820(fp),-824(fp)
-	bgequ	noname.261
-	addl2	#65536,-828(fp)
-noname.261:
-	movzwl	-818(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-828(fp)
-	bicl3	#-65536,-820(fp),r0
-	ashl	#16,r0,-824(fp)
-	addl2	-824(fp),r4
-	bicl2	#0,r4
-	cmpl	r4,-824(fp)
-	bgequ	noname.262
-	incl	-828(fp)
-noname.262:
-	movl	r4,r1
-	movl	-828(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.263
-	incl	r2
-noname.263:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.264
-	incl	r8
-noname.264:
-
-	movzwl	22(r6),r2
-	bicl3	#-65536,24(r7),r3
-	movzwl	26(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,20(r6),-840(fp)
-	bicl3	#-65536,r2,-844(fp)
-	mull3	r0,-840(fp),-832(fp)
-	mull2	r3,-840(fp)
-	mull3	r3,-844(fp),-836(fp)
-	mull2	r0,-844(fp)
-	addl3	-832(fp),-836(fp),r0
-	bicl3	#0,r0,-832(fp)
-	cmpl	-832(fp),-836(fp)
-	bgequ	noname.265
-	addl2	#65536,-844(fp)
-noname.265:
-	movzwl	-830(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-844(fp)
-	bicl3	#-65536,-832(fp),r0
-	ashl	#16,r0,-836(fp)
-	addl3	-836(fp),-840(fp),r0
-	bicl3	#0,r0,-840(fp)
-	cmpl	-840(fp),-836(fp)
-	bgequ	noname.266
-	incl	-844(fp)
-noname.266:
-	movl	-840(fp),r1
-	movl	-844(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.267
-	incl	r2
-noname.267:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.268
-	incl	r8
-noname.268:
-
-	bicl3	#-65536,24(r6),r3
-	movzwl	26(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,20(r7),r2
-	movzwl	22(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-848(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-852(fp)
-	mull2	r0,r4
-	addl3	-848(fp),-852(fp),r0
-	bicl3	#0,r0,-848(fp)
-	cmpl	-848(fp),-852(fp)
-	bgequ	noname.269
-	addl2	#65536,r4
-noname.269:
-	movzwl	-846(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-848(fp),r0
-	ashl	#16,r0,-852(fp)
-	addl2	-852(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-852(fp)
-	bgequ	noname.270
-	incl	r4
-noname.270:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.271
-	incl	r2
-noname.271:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.272
-	incl	r8
-noname.272:
-
-	bicl3	#-65536,28(r6),r3
-	movzwl	30(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,16(r7),r2
-	movzwl	18(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-856(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-860(fp)
-	mull2	r0,r4
-	addl3	-856(fp),-860(fp),r0
-	bicl3	#0,r0,-856(fp)
-	cmpl	-856(fp),-860(fp)
-	bgequ	noname.273
-	addl2	#65536,r4
-noname.273:
-	movzwl	-854(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-856(fp),r0
-	ashl	#16,r0,-860(fp)
-	addl2	-860(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-860(fp)
-	bgequ	noname.274
-	incl	r4
-noname.274:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.275
-	incl	r2
-noname.275:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.276
-	incl	r8
-noname.276:
-
-	movl	r10,44(r11)
-
-	clrl	r10
-
-	bicl3	#-65536,28(r6),r3
-	movzwl	30(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,20(r7),r2
-	movzwl	22(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-864(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-868(fp)
-	mull2	r0,r4
-	addl3	-864(fp),-868(fp),r0
-	bicl3	#0,r0,-864(fp)
-	cmpl	-864(fp),-868(fp)
-	bgequ	noname.277
-	addl2	#65536,r4
-noname.277:
-	movzwl	-862(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-864(fp),r0
-	ashl	#16,r0,-868(fp)
-	addl2	-868(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-868(fp)
-	bgequ	noname.278
-	incl	r4
-noname.278:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.279
-	incl	r2
-noname.279:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.280
-	incl	r10
-noname.280:
-
-	bicl3	#-65536,24(r6),r3
-	movzwl	26(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,24(r7),r2
-	movzwl	26(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-872(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-876(fp)
-	mull2	r0,r4
-	addl3	-872(fp),-876(fp),r0
-	bicl3	#0,r0,-872(fp)
-	cmpl	-872(fp),-876(fp)
-	bgequ	noname.281
-	addl2	#65536,r4
-noname.281:
-	movzwl	-870(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-872(fp),r0
-	ashl	#16,r0,-876(fp)
-	addl2	-876(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-876(fp)
-	bgequ	noname.282
-	incl	r4
-noname.282:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.283
-	incl	r2
-noname.283:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.284
-	incl	r10
-noname.284:
-
-	bicl3	#-65536,20(r6),r3
-	movzwl	22(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,28(r7),r2
-	movzwl	30(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-880(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-884(fp)
-	mull2	r0,r4
-	addl3	-880(fp),-884(fp),r0
-	bicl3	#0,r0,-880(fp)
-	cmpl	-880(fp),-884(fp)
-	bgequ	noname.285
-	addl2	#65536,r4
-noname.285:
-	movzwl	-878(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-880(fp),r0
-	ashl	#16,r0,-884(fp)
-	addl2	-884(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-884(fp)
-	bgequ	noname.286
-	incl	r4
-noname.286:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.287
-	incl	r2
-noname.287:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.288
-	incl	r10
-noname.288:
-
-	movl	r9,48(r11)
-
-	clrl	r9
-
-	bicl3	#-65536,24(r6),r3
-	movzwl	26(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,28(r7),r2
-	movzwl	30(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-888(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-892(fp)
-	mull2	r0,r4
-	addl3	-888(fp),-892(fp),r0
-	bicl3	#0,r0,-888(fp)
-	cmpl	-888(fp),-892(fp)
-	bgequ	noname.289
-	addl2	#65536,r4
-noname.289:
-	movzwl	-886(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-888(fp),r0
-	ashl	#16,r0,-892(fp)
-	addl2	-892(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-892(fp)
-	bgequ	noname.290
-	incl	r4
-noname.290:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.291
-	incl	r2
-noname.291:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.292
-	incl	r9
-noname.292:
-
-	movzwl	30(r6),r2
-	bicl3	#-65536,24(r7),r3
-	movzwl	26(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,28(r6),-904(fp)
-	bicl3	#-65536,r2,-908(fp)
-	mull3	r0,-904(fp),-896(fp)
-	mull2	r3,-904(fp)
-	mull3	r3,-908(fp),-900(fp)
-	mull2	r0,-908(fp)
-	addl3	-896(fp),-900(fp),r0
-	bicl3	#0,r0,-896(fp)
-	cmpl	-896(fp),-900(fp)
-	bgequ	noname.293
-	addl2	#65536,-908(fp)
-noname.293:
-	movzwl	-894(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-908(fp)
-	bicl3	#-65536,-896(fp),r0
-	ashl	#16,r0,-900(fp)
-	addl3	-900(fp),-904(fp),r0
-	bicl3	#0,r0,-904(fp)
-	cmpl	-904(fp),-900(fp)
-	bgequ	noname.294
-	incl	-908(fp)
-noname.294:
-	movl	-904(fp),r1
-	movl	-908(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.295
-	incl	r2
-noname.295:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.296
-	incl	r9
-noname.296:
-
-	movl	r8,52(r11)
-
-	clrl	r8
-
-	movzwl	30(r6),r2
-	bicl3	#-65536,28(r7),r3
-	movzwl	30(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,28(r6),-920(fp)
-	bicl3	#-65536,r2,-924(fp)
-	mull3	r0,-920(fp),-912(fp)
-	mull2	r3,-920(fp)
-	mull3	r3,-924(fp),-916(fp)
-	mull2	r0,-924(fp)
-	addl3	-912(fp),-916(fp),r0
-	bicl3	#0,r0,-912(fp)
-	cmpl	-912(fp),-916(fp)
-	bgequ	noname.297
-	addl2	#65536,-924(fp)
-noname.297:
-	movzwl	-910(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-924(fp)
-	bicl3	#-65536,-912(fp),r0
-	ashl	#16,r0,-916(fp)
-	addl3	-916(fp),-920(fp),r0
-	bicl3	#0,r0,-920(fp)
-	cmpl	-920(fp),-916(fp)
-	bgequ	noname.298
-	incl	-924(fp)
-noname.298:
-	movl	-920(fp),r1
-	movl	-924(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.299
-	incl	r2
-noname.299:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.300
-	incl	r8
-noname.300:
-
-	movl	r10,56(r11)
-
-	movl	r9,60(r11)
-
-	ret	
-
-
-
-;r=4 ;(AP)
-;a=8 ;(AP)
-;b=12 ;(AP)
-;n=16 ;(AP)	n	by value (input)
-
-	.psect	code,nowrt
-
-.entry	BN_MUL_COMBA4,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11>
-	movab	-156(sp),sp
-
-	clrq	r9
-
-	clrl	r8
-
-	movl	8(ap),r6
-	bicl3	#-65536,(r6),r3
-	movzwl	2(r6),r2
-	bicl2	#-65536,r2
-	movl	12(ap),r7
-	bicl3	#-65536,(r7),r1
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r2,r4
-	mull3	r0,r5,-4(fp)
-	mull2	r1,r5
-	mull3	r1,r4,-8(fp)
-	mull2	r0,r4
-	addl3	-4(fp),-8(fp),r0
-	bicl3	#0,r0,-4(fp)
-	cmpl	-4(fp),-8(fp)
-	bgequ	noname.303
-	addl2	#65536,r4
-noname.303:
-	movzwl	-2(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-4(fp),r0
-	ashl	#16,r0,-8(fp)
-	addl2	-8(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-8(fp)
-	bgequ	noname.304
-	incl	r4
-noname.304:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.305
-	incl	r2
-noname.305:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.306
-	incl	r8
-noname.306:
-
-	movl	4(ap),r11
-	movl	r10,(r11)
-
-	clrl	r10
-
-	bicl3	#-65536,(r6),r3
-	movzwl	2(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,4(r7),r2
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-12(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-16(fp)
-	mull2	r0,r4
-	addl3	-12(fp),-16(fp),r0
-	bicl3	#0,r0,-12(fp)
-	cmpl	-12(fp),-16(fp)
-	bgequ	noname.307
-	addl2	#65536,r4
-noname.307:
-	movzwl	-10(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-12(fp),r0
-	ashl	#16,r0,-16(fp)
-	addl2	-16(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-16(fp)
-	bgequ	noname.308
-	incl	r4
-noname.308:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.309
-	incl	r2
-noname.309:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.310
-	incl	r10
-noname.310:
-
-	bicl3	#-65536,4(r6),r3
-	movzwl	6(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,(r7),r2
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-20(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-24(fp)
-	mull2	r0,r4
-	addl3	-20(fp),-24(fp),r0
-	bicl3	#0,r0,-20(fp)
-	cmpl	-20(fp),-24(fp)
-	bgequ	noname.311
-	addl2	#65536,r4
-noname.311:
-	movzwl	-18(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-20(fp),r0
-	ashl	#16,r0,-24(fp)
-	addl2	-24(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-24(fp)
-	bgequ	noname.312
-	incl	r4
-noname.312:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.313
-	incl	r2
-noname.313:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.314
-	incl	r10
-noname.314:
-
-	movl	r9,4(r11)
-
-	clrl	r9
-
-	bicl3	#-65536,8(r6),r3
-	movzwl	10(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,(r7),r2
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-28(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-32(fp)
-	mull2	r0,r4
-	addl3	-28(fp),-32(fp),r0
-	bicl3	#0,r0,-28(fp)
-	cmpl	-28(fp),-32(fp)
-	bgequ	noname.315
-	addl2	#65536,r4
-noname.315:
-	movzwl	-26(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-28(fp),r0
-	ashl	#16,r0,-32(fp)
-	addl2	-32(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-32(fp)
-	bgequ	noname.316
-	incl	r4
-noname.316:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.317
-	incl	r2
-noname.317:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.318
-	incl	r9
-noname.318:
-
-	bicl3	#-65536,4(r6),r3
-	movzwl	6(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,4(r7),r2
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-36(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-40(fp)
-	mull2	r0,r4
-	addl3	-36(fp),-40(fp),r0
-	bicl3	#0,r0,-36(fp)
-	cmpl	-36(fp),-40(fp)
-	bgequ	noname.319
-	addl2	#65536,r4
-noname.319:
-	movzwl	-34(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-36(fp),r0
-	ashl	#16,r0,-40(fp)
-	addl2	-40(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-40(fp)
-	bgequ	noname.320
-	incl	r4
-noname.320:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.321
-	incl	r2
-noname.321:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.322
-	incl	r9
-noname.322:
-
-	bicl3	#-65536,(r6),r3
-	movzwl	2(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,8(r7),r2
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-44(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-48(fp)
-	mull2	r0,r4
-	addl3	-44(fp),-48(fp),r0
-	bicl3	#0,r0,-44(fp)
-	cmpl	-44(fp),-48(fp)
-	bgequ	noname.323
-	addl2	#65536,r4
-noname.323:
-	movzwl	-42(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-44(fp),r0
-	ashl	#16,r0,-48(fp)
-	addl2	-48(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-48(fp)
-	bgequ	noname.324
-	incl	r4
-noname.324:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.325
-	incl	r2
-noname.325:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.326
-	incl	r9
-noname.326:
-
-	movl	r8,8(r11)
-
-	clrl	r8
-
-	bicl3	#-65536,(r6),r3
-	movzwl	2(r6),r2
-	bicl3	#-65536,12(r7),r1
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r4
-	bicl3	#-65536,r2,-60(fp)
-	mull3	r0,r4,-52(fp)
-	mull2	r1,r4
-	mull3	r1,-60(fp),-56(fp)
-	mull2	r0,-60(fp)
-	addl3	-52(fp),-56(fp),r0
-	bicl3	#0,r0,-52(fp)
-	cmpl	-52(fp),-56(fp)
-	bgequ	noname.327
-	addl2	#65536,-60(fp)
-noname.327:
-	movzwl	-50(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-60(fp)
-	bicl3	#-65536,-52(fp),r0
-	ashl	#16,r0,-56(fp)
-	addl2	-56(fp),r4
-	bicl2	#0,r4
-	cmpl	r4,-56(fp)
-	bgequ	noname.328
-	incl	-60(fp)
-noname.328:
-	movl	r4,r1
-	movl	-60(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.329
-	incl	r2
-noname.329:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.330
-	incl	r8
-noname.330:
-
-	movzwl	6(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r6),-72(fp)
-	bicl3	#-65536,r2,-76(fp)
-	mull3	r0,-72(fp),-64(fp)
-	mull2	r3,-72(fp)
-	mull3	r3,-76(fp),-68(fp)
-	mull2	r0,-76(fp)
-	addl3	-64(fp),-68(fp),r0
-	bicl3	#0,r0,-64(fp)
-	cmpl	-64(fp),-68(fp)
-	bgequ	noname.331
-	addl2	#65536,-76(fp)
-noname.331:
-	movzwl	-62(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-76(fp)
-	bicl3	#-65536,-64(fp),r0
-	ashl	#16,r0,-68(fp)
-	addl3	-68(fp),-72(fp),r0
-	bicl3	#0,r0,-72(fp)
-	cmpl	-72(fp),-68(fp)
-	bgequ	noname.332
-	incl	-76(fp)
-noname.332:
-	movl	-72(fp),r1
-	movl	-76(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.333
-	incl	r2
-noname.333:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.334
-	incl	r8
-noname.334:
-
-	bicl3	#-65536,8(r6),r3
-	movzwl	10(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,4(r7),r2
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-80(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-84(fp)
-	mull2	r0,r4
-	addl3	-80(fp),-84(fp),r0
-	bicl3	#0,r0,-80(fp)
-	cmpl	-80(fp),-84(fp)
-	bgequ	noname.335
-	addl2	#65536,r4
-noname.335:
-	movzwl	-78(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-80(fp),r0
-	ashl	#16,r0,-84(fp)
-	addl2	-84(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-84(fp)
-	bgequ	noname.336
-	incl	r4
-noname.336:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.337
-	incl	r2
-noname.337:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.338
-	incl	r8
-noname.338:
-
-	bicl3	#-65536,12(r6),r3
-	movzwl	14(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,(r7),r2
-	movzwl	2(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-88(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-92(fp)
-	mull2	r0,r4
-	addl3	-88(fp),-92(fp),r0
-	bicl3	#0,r0,-88(fp)
-	cmpl	-88(fp),-92(fp)
-	bgequ	noname.339
-	addl2	#65536,r4
-noname.339:
-	movzwl	-86(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-88(fp),r0
-	ashl	#16,r0,-92(fp)
-	addl2	-92(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-92(fp)
-	bgequ	noname.340
-	incl	r4
-noname.340:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.341
-	incl	r2
-noname.341:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.342
-	incl	r8
-noname.342:
-
-	movl	r10,12(r11)
-
-	clrl	r10
-
-	bicl3	#-65536,12(r6),r3
-	movzwl	14(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,4(r7),r2
-	movzwl	6(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-96(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-100(fp)
-	mull2	r0,r4
-	addl3	-96(fp),-100(fp),r0
-	bicl3	#0,r0,-96(fp)
-	cmpl	-96(fp),-100(fp)
-	bgequ	noname.343
-	addl2	#65536,r4
-noname.343:
-	movzwl	-94(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-96(fp),r0
-	ashl	#16,r0,-100(fp)
-	addl2	-100(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-100(fp)
-	bgequ	noname.344
-	incl	r4
-noname.344:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.345
-	incl	r2
-noname.345:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.346
-	incl	r10
-noname.346:
-
-	bicl3	#-65536,8(r6),r3
-	movzwl	10(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,8(r7),r2
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-104(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-108(fp)
-	mull2	r0,r4
-	addl3	-104(fp),-108(fp),r0
-	bicl3	#0,r0,-104(fp)
-	cmpl	-104(fp),-108(fp)
-	bgequ	noname.347
-	addl2	#65536,r4
-noname.347:
-	movzwl	-102(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-104(fp),r0
-	ashl	#16,r0,-108(fp)
-	addl2	-108(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-108(fp)
-	bgequ	noname.348
-	incl	r4
-noname.348:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.349
-	incl	r2
-noname.349:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.350
-	incl	r10
-noname.350:
-
-	bicl3	#-65536,4(r6),r3
-	movzwl	6(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,12(r7),r2
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-112(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-116(fp)
-	mull2	r0,r4
-	addl3	-112(fp),-116(fp),r0
-	bicl3	#0,r0,-112(fp)
-	cmpl	-112(fp),-116(fp)
-	bgequ	noname.351
-	addl2	#65536,r4
-noname.351:
-	movzwl	-110(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-112(fp),r0
-	ashl	#16,r0,-116(fp)
-	addl2	-116(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-116(fp)
-	bgequ	noname.352
-	incl	r4
-noname.352:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.353
-	incl	r2
-noname.353:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.354
-	incl	r10
-noname.354:
-
-	movl	r9,16(r11)
-
-	clrl	r9
-
-	bicl3	#-65536,8(r6),r3
-	movzwl	10(r6),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,12(r7),r2
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-120(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-124(fp)
-	mull2	r0,r4
-	addl3	-120(fp),-124(fp),r0
-	bicl3	#0,r0,-120(fp)
-	cmpl	-120(fp),-124(fp)
-	bgequ	noname.355
-	addl2	#65536,r4
-noname.355:
-	movzwl	-118(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-120(fp),r0
-	ashl	#16,r0,-124(fp)
-	addl2	-124(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-124(fp)
-	bgequ	noname.356
-	incl	r4
-noname.356:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.357
-	incl	r2
-noname.357:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.358
-	incl	r9
-noname.358:
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,8(r7),r3
-	movzwl	10(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-136(fp)
-	bicl3	#-65536,r2,-140(fp)
-	mull3	r0,-136(fp),-128(fp)
-	mull2	r3,-136(fp)
-	mull3	r3,-140(fp),-132(fp)
-	mull2	r0,-140(fp)
-	addl3	-128(fp),-132(fp),r0
-	bicl3	#0,r0,-128(fp)
-	cmpl	-128(fp),-132(fp)
-	bgequ	noname.359
-	addl2	#65536,-140(fp)
-noname.359:
-	movzwl	-126(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-140(fp)
-	bicl3	#-65536,-128(fp),r0
-	ashl	#16,r0,-132(fp)
-	addl3	-132(fp),-136(fp),r0
-	bicl3	#0,r0,-136(fp)
-	cmpl	-136(fp),-132(fp)
-	bgequ	noname.360
-	incl	-140(fp)
-noname.360:
-	movl	-136(fp),r1
-	movl	-140(fp),r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.361
-	incl	r2
-noname.361:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.362
-	incl	r9
-noname.362:
-
-	movl	r8,20(r11)
-
-	clrl	r8
-
-	movzwl	14(r6),r2
-	bicl3	#-65536,12(r7),r3
-	movzwl	14(r7),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r6),-152(fp)
-	bicl3	#-65536,r2,-156(fp)
-	mull3	r0,-152(fp),-144(fp)
-	mull2	r3,-152(fp)
-	mull3	r3,-156(fp),-148(fp)
-	mull2	r0,-156(fp)
-	addl3	-144(fp),-148(fp),r0
-	bicl3	#0,r0,-144(fp)
-	cmpl	-144(fp),-148(fp)
-	bgequ	noname.363
-	addl2	#65536,-156(fp)
-noname.363:
-	movzwl	-142(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-156(fp)
-	bicl3	#-65536,-144(fp),r0
-	ashl	#16,r0,-148(fp)
-	addl3	-148(fp),-152(fp),r0
-	bicl3	#0,r0,-152(fp)
-	cmpl	-152(fp),-148(fp)
-	bgequ	noname.364
-	incl	-156(fp)
-noname.364:
-	movl	-152(fp),r1
-	movl	-156(fp),r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.365
-	incl	r2
-noname.365:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.366
-	incl	r8
-noname.366:
-
-	movl	r10,24(r11)
-
-	movl	r9,28(r11)
-
-	ret	
-
-
-
-;r=4 ;(AP)
-;a=8 ;(AP)
-;b=12 ;(AP)
-;n=16 ;(AP)	n	by value (input)
-
-	.psect	code,nowrt
-
-.entry	BN_SQR_COMBA8,^m<r2,r3,r4,r5,r6,r7,r8,r9>
-	movab	-444(sp),sp
-
-	clrq	r8
-
-	clrl	r7
-
-	movl	8(ap),r4
-	movl	(r4),r3
-	bicl3	#-65536,r3,-4(fp)
-	extzv	#16,#16,r3,r0
-	bicl3	#-65536,r0,r3
-	movl	-4(fp),r0
-	mull3	r0,r3,-8(fp)
-	mull3	r0,r0,-4(fp)
-	mull2	r3,r3
-	bicl3	#32767,-8(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r3
-	bicl3	#-65536,-8(fp),r0
-	ashl	#17,r0,-8(fp)
-	addl3	-4(fp),-8(fp),r0
-	bicl3	#0,r0,-4(fp)
-	cmpl	-4(fp),-8(fp)
-	bgequ	noname.369
-	incl	r3
-noname.369:
-	movl	-4(fp),r1
-	movl	r3,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.370
-	incl	r2
-noname.370:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.371
-	incl	r7
-noname.371:
-
-	movl	r9,@4(ap)
-
-	clrl	r9
-
-	movzwl	6(r4),r2
-	bicl3	#-65536,(r4),r3
-	movzwl	2(r4),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,4(r4),-20(fp)
-	bicl3	#-65536,r2,-24(fp)
-	mull3	r0,-20(fp),-12(fp)
-	mull2	r3,-20(fp)
-	mull3	r3,-24(fp),-16(fp)
-	mull2	r0,-24(fp)
-	addl3	-12(fp),-16(fp),r0
-	bicl3	#0,r0,-12(fp)
-	cmpl	-12(fp),-16(fp)
-	bgequ	noname.372
-	addl2	#65536,-24(fp)
-noname.372:
-	movzwl	-10(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-24(fp)
-	bicl3	#-65536,-12(fp),r0
-	ashl	#16,r0,-16(fp)
-	addl3	-16(fp),-20(fp),r0
-	bicl3	#0,r0,-20(fp)
-	cmpl	-20(fp),-16(fp)
-	bgequ	noname.373
-	incl	-24(fp)
-noname.373:
-	movl	-20(fp),r3
-	movl	-24(fp),r2
-	bbc	#31,r2,noname.374
-	incl	r9
-noname.374:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.375
-	incl	r2
-noname.375:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.376
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.376
-	incl	r9
-noname.376:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.377
-	incl	r9
-noname.377:
-
-	movl	4(ap),r0
-	movl	r8,4(r0)
-
-	clrl	r8
-
-	movl	8(ap),r4
-	movl	4(r4),r3
-	bicl3	#-65536,r3,-28(fp)
-	extzv	#16,#16,r3,r0
-	bicl3	#-65536,r0,r3
-	movl	-28(fp),r0
-	mull3	r0,r3,-32(fp)
-	mull3	r0,r0,-28(fp)
-	mull2	r3,r3
-	bicl3	#32767,-32(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r3
-	bicl3	#-65536,-32(fp),r0
-	ashl	#17,r0,-32(fp)
-	addl3	-28(fp),-32(fp),r0
-	bicl3	#0,r0,-28(fp)
-	cmpl	-28(fp),-32(fp)
-	bgequ	noname.378
-	incl	r3
-noname.378:
-	movl	-28(fp),r1
-	movl	r3,r2
-	addl2	r1,r7
-	bicl2	#0,r7
-	cmpl	r7,r1
-	bgequ	noname.379
-	incl	r2
-noname.379:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.380
-	incl	r8
-noname.380:
-
-	movzwl	10(r4),r2
-	bicl3	#-65536,(r4),r3
-	movzwl	2(r4),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,8(r4),-44(fp)
-	bicl3	#-65536,r2,-48(fp)
-	mull3	r0,-44(fp),-36(fp)
-	mull2	r3,-44(fp)
-	mull3	r3,-48(fp),-40(fp)
-	mull2	r0,-48(fp)
-	addl3	-36(fp),-40(fp),r0
-	bicl3	#0,r0,-36(fp)
-	cmpl	-36(fp),-40(fp)
-	bgequ	noname.381
-	addl2	#65536,-48(fp)
-noname.381:
-	movzwl	-34(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-48(fp)
-	bicl3	#-65536,-36(fp),r0
-	ashl	#16,r0,-40(fp)
-	addl3	-40(fp),-44(fp),r0
-	bicl3	#0,r0,-44(fp)
-	cmpl	-44(fp),-40(fp)
-	bgequ	noname.382
-	incl	-48(fp)
-noname.382:
-	movl	-44(fp),r3
-	movl	-48(fp),r2
-	bbc	#31,r2,noname.383
-	incl	r8
-noname.383:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.384
-	incl	r2
-noname.384:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.385
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.385
-	incl	r8
-noname.385:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.386
-	incl	r8
-noname.386:
-
-	movl	4(ap),r0
-	movl	r7,8(r0)
-
-	clrl	r7
-
-	movl	8(ap),r0
-	movzwl	14(r0),r2
-	bicl3	#-65536,(r0),r3
-	movzwl	2(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,12(r0),-60(fp)
-	bicl3	#-65536,r2,-64(fp)
-	mull3	r1,-60(fp),-52(fp)
-	mull2	r3,-60(fp)
-	mull3	r3,-64(fp),-56(fp)
-	mull2	r1,-64(fp)
-	addl3	-52(fp),-56(fp),r0
-	bicl3	#0,r0,-52(fp)
-	cmpl	-52(fp),-56(fp)
-	bgequ	noname.387
-	addl2	#65536,-64(fp)
-noname.387:
-	movzwl	-50(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-64(fp)
-	bicl3	#-65536,-52(fp),r0
-	ashl	#16,r0,-56(fp)
-	addl3	-56(fp),-60(fp),r0
-	bicl3	#0,r0,-60(fp)
-	cmpl	-60(fp),-56(fp)
-	bgequ	noname.388
-	incl	-64(fp)
-noname.388:
-	movl	-60(fp),r3
-	movl	-64(fp),r2
-	bbc	#31,r2,noname.389
-	incl	r7
-noname.389:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.390
-	incl	r2
-noname.390:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.391
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.391
-	incl	r7
-noname.391:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.392
-	incl	r7
-noname.392:
-
-	movl	8(ap),r0
-	movzwl	10(r0),r2
-	bicl3	#-65536,4(r0),r3
-	movzwl	6(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,8(r0),-76(fp)
-	bicl3	#-65536,r2,-80(fp)
-	mull3	r1,-76(fp),-68(fp)
-	mull2	r3,-76(fp)
-	mull3	r3,-80(fp),-72(fp)
-	mull2	r1,-80(fp)
-	addl3	-68(fp),-72(fp),r0
-	bicl3	#0,r0,-68(fp)
-	cmpl	-68(fp),-72(fp)
-	bgequ	noname.393
-	addl2	#65536,-80(fp)
-noname.393:
-	movzwl	-66(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-80(fp)
-	bicl3	#-65536,-68(fp),r0
-	ashl	#16,r0,-72(fp)
-	addl3	-72(fp),-76(fp),r0
-	bicl3	#0,r0,-76(fp)
-	cmpl	-76(fp),-72(fp)
-	bgequ	noname.394
-	incl	-80(fp)
-noname.394:
-	movl	-76(fp),r3
-	movl	-80(fp),r2
-	bbc	#31,r2,noname.395
-	incl	r7
-noname.395:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.396
-	incl	r2
-noname.396:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.397
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.397
-	incl	r7
-noname.397:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.398
-	incl	r7
-noname.398:
-
-	movl	4(ap),r0
-	movl	r9,12(r0)
-
-	clrl	r9
-
-	movl	8(ap),r2
-	movl	8(r2),r4
-	bicl3	#-65536,r4,-84(fp)
-	extzv	#16,#16,r4,r0
-	bicl3	#-65536,r0,r4
-	movl	-84(fp),r0
-	mull3	r0,r4,-88(fp)
-	mull3	r0,r0,-84(fp)
-	mull2	r4,r4
-	bicl3	#32767,-88(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r4
-	bicl3	#-65536,-88(fp),r0
-	ashl	#17,r0,-88(fp)
-	addl3	-84(fp),-88(fp),r0
-	bicl3	#0,r0,-84(fp)
-	cmpl	-84(fp),-88(fp)
-	bgequ	noname.399
-	incl	r4
-noname.399:
-	movl	-84(fp),r1
-	movl	r4,r3
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.400
-	incl	r3
-noname.400:
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.401
-	incl	r9
-noname.401:
-
-	movzwl	14(r2),r3
-	bicl3	#-65536,4(r2),r1
-	movzwl	6(r2),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,12(r2),-100(fp)
-	bicl3	#-65536,r3,-104(fp)
-	mull3	r0,-100(fp),-92(fp)
-	mull2	r1,-100(fp)
-	mull3	r1,-104(fp),-96(fp)
-	mull2	r0,-104(fp)
-	addl3	-92(fp),-96(fp),r0
-	bicl3	#0,r0,-92(fp)
-	cmpl	-92(fp),-96(fp)
-	bgequ	noname.402
-	addl2	#65536,-104(fp)
-noname.402:
-	movzwl	-90(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-104(fp)
-	bicl3	#-65536,-92(fp),r0
-	ashl	#16,r0,-96(fp)
-	addl3	-96(fp),-100(fp),r0
-	bicl3	#0,r0,-100(fp)
-	cmpl	-100(fp),-96(fp)
-	bgequ	noname.403
-	incl	-104(fp)
-noname.403:
-	movl	-100(fp),r3
-	movl	-104(fp),r2
-	bbc	#31,r2,noname.404
-	incl	r9
-noname.404:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.405
-	incl	r2
-noname.405:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.406
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.406
-	incl	r9
-noname.406:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.407
-	incl	r9
-noname.407:
-
-	movl	8(ap),r0
-	movzwl	18(r0),r2
-	bicl3	#-65536,(r0),r3
-	movzwl	2(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,16(r0),-116(fp)
-	bicl3	#-65536,r2,-120(fp)
-	mull3	r1,-116(fp),-108(fp)
-	mull2	r3,-116(fp)
-	mull3	r3,-120(fp),-112(fp)
-	mull2	r1,-120(fp)
-	addl3	-108(fp),-112(fp),r0
-	bicl3	#0,r0,-108(fp)
-	cmpl	-108(fp),-112(fp)
-	bgequ	noname.408
-	addl2	#65536,-120(fp)
-noname.408:
-	movzwl	-106(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-120(fp)
-	bicl3	#-65536,-108(fp),r0
-	ashl	#16,r0,-112(fp)
-	addl3	-112(fp),-116(fp),r0
-	bicl3	#0,r0,-116(fp)
-	cmpl	-116(fp),-112(fp)
-	bgequ	noname.409
-	incl	-120(fp)
-noname.409:
-	movl	-116(fp),r3
-	movl	-120(fp),r2
-	bbc	#31,r2,noname.410
-	incl	r9
-noname.410:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.411
-	incl	r2
-noname.411:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.412
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.412
-	incl	r9
-noname.412:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.413
-	incl	r9
-noname.413:
-
-	movl	4(ap),r0
-	movl	r8,16(r0)
-
-	clrl	r8
-
-	movl	8(ap),r0
-	movzwl	22(r0),r2
-	bicl3	#-65536,(r0),r3
-	movzwl	2(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,20(r0),-132(fp)
-	bicl3	#-65536,r2,-136(fp)
-	mull3	r1,-132(fp),-124(fp)
-	mull2	r3,-132(fp)
-	mull3	r3,-136(fp),-128(fp)
-	mull2	r1,-136(fp)
-	addl3	-124(fp),-128(fp),r0
-	bicl3	#0,r0,-124(fp)
-	cmpl	-124(fp),-128(fp)
-	bgequ	noname.414
-	addl2	#65536,-136(fp)
-noname.414:
-	movzwl	-122(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-136(fp)
-	bicl3	#-65536,-124(fp),r0
-	ashl	#16,r0,-128(fp)
-	addl3	-128(fp),-132(fp),r0
-	bicl3	#0,r0,-132(fp)
-	cmpl	-132(fp),-128(fp)
-	bgequ	noname.415
-	incl	-136(fp)
-noname.415:
-	movl	-132(fp),r3
-	movl	-136(fp),r2
-	bbc	#31,r2,noname.416
-	incl	r8
-noname.416:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.417
-	incl	r2
-noname.417:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.418
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.418
-	incl	r8
-noname.418:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.419
-	incl	r8
-noname.419:
-
-	movl	8(ap),r0
-	movzwl	18(r0),r2
-	bicl3	#-65536,4(r0),r3
-	movzwl	6(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,16(r0),-148(fp)
-	bicl3	#-65536,r2,-152(fp)
-	mull3	r1,-148(fp),-140(fp)
-	mull2	r3,-148(fp)
-	mull3	r3,-152(fp),-144(fp)
-	mull2	r1,-152(fp)
-	addl3	-140(fp),-144(fp),r0
-	bicl3	#0,r0,-140(fp)
-	cmpl	-140(fp),-144(fp)
-	bgequ	noname.420
-	addl2	#65536,-152(fp)
-noname.420:
-	movzwl	-138(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-152(fp)
-	bicl3	#-65536,-140(fp),r0
-	ashl	#16,r0,-144(fp)
-	addl3	-144(fp),-148(fp),r0
-	bicl3	#0,r0,-148(fp)
-	cmpl	-148(fp),-144(fp)
-	bgequ	noname.421
-	incl	-152(fp)
-noname.421:
-	movl	-148(fp),r3
-	movl	-152(fp),r2
-	bbc	#31,r2,noname.422
-	incl	r8
-noname.422:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.423
-	incl	r2
-noname.423:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.424
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.424
-	incl	r8
-noname.424:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.425
-	incl	r8
-noname.425:
-
-	movl	8(ap),r0
-	movzwl	14(r0),r2
-	bicl3	#-65536,8(r0),r3
-	movzwl	10(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,12(r0),-164(fp)
-	bicl3	#-65536,r2,-168(fp)
-	mull3	r1,-164(fp),-156(fp)
-	mull2	r3,-164(fp)
-	mull3	r3,-168(fp),-160(fp)
-	mull2	r1,-168(fp)
-	addl3	-156(fp),-160(fp),r0
-	bicl3	#0,r0,-156(fp)
-	cmpl	-156(fp),-160(fp)
-	bgequ	noname.426
-	addl2	#65536,-168(fp)
-noname.426:
-	movzwl	-154(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-168(fp)
-	bicl3	#-65536,-156(fp),r0
-	ashl	#16,r0,-160(fp)
-	addl3	-160(fp),-164(fp),r0
-	bicl3	#0,r0,-164(fp)
-	cmpl	-164(fp),-160(fp)
-	bgequ	noname.427
-	incl	-168(fp)
-noname.427:
-	movl	-164(fp),r3
-	movl	-168(fp),r2
-	bbc	#31,r2,noname.428
-	incl	r8
-noname.428:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.429
-	incl	r2
-noname.429:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.430
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.430
-	incl	r8
-noname.430:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.431
-	incl	r8
-noname.431:
-
-	movl	4(ap),r0
-	movl	r7,20(r0)
-
-	clrl	r7
-
-	movl	8(ap),r2
-	movl	12(r2),r4
-	bicl3	#-65536,r4,-172(fp)
-	extzv	#16,#16,r4,r0
-	bicl3	#-65536,r0,r4
-	movl	-172(fp),r0
-	mull3	r0,r4,-176(fp)
-	mull3	r0,r0,-172(fp)
-	mull2	r4,r4
-	bicl3	#32767,-176(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r4
-	bicl3	#-65536,-176(fp),r0
-	ashl	#17,r0,-176(fp)
-	addl3	-172(fp),-176(fp),r0
-	bicl3	#0,r0,-172(fp)
-	cmpl	-172(fp),-176(fp)
-	bgequ	noname.432
-	incl	r4
-noname.432:
-	movl	-172(fp),r1
-	movl	r4,r3
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.433
-	incl	r3
-noname.433:
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.434
-	incl	r7
-noname.434:
-
-	movzwl	18(r2),r3
-	bicl3	#-65536,8(r2),r1
-	movzwl	10(r2),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,16(r2),-188(fp)
-	bicl3	#-65536,r3,-192(fp)
-	mull3	r0,-188(fp),-180(fp)
-	mull2	r1,-188(fp)
-	mull3	r1,-192(fp),-184(fp)
-	mull2	r0,-192(fp)
-	addl3	-180(fp),-184(fp),r0
-	bicl3	#0,r0,-180(fp)
-	cmpl	-180(fp),-184(fp)
-	bgequ	noname.435
-	addl2	#65536,-192(fp)
-noname.435:
-	movzwl	-178(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-192(fp)
-	bicl3	#-65536,-180(fp),r0
-	ashl	#16,r0,-184(fp)
-	addl3	-184(fp),-188(fp),r0
-	bicl3	#0,r0,-188(fp)
-	cmpl	-188(fp),-184(fp)
-	bgequ	noname.436
-	incl	-192(fp)
-noname.436:
-	movl	-188(fp),r3
-	movl	-192(fp),r2
-	bbc	#31,r2,noname.437
-	incl	r7
-noname.437:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.438
-	incl	r2
-noname.438:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.439
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.439
-	incl	r7
-noname.439:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.440
-	incl	r7
-noname.440:
-
-	movl	8(ap),r0
-	movzwl	22(r0),r2
-	bicl3	#-65536,4(r0),r3
-	movzwl	6(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,20(r0),-204(fp)
-	bicl3	#-65536,r2,-208(fp)
-	mull3	r1,-204(fp),-196(fp)
-	mull2	r3,-204(fp)
-	mull3	r3,-208(fp),-200(fp)
-	mull2	r1,-208(fp)
-	addl3	-196(fp),-200(fp),r0
-	bicl3	#0,r0,-196(fp)
-	cmpl	-196(fp),-200(fp)
-	bgequ	noname.441
-	addl2	#65536,-208(fp)
-noname.441:
-	movzwl	-194(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-208(fp)
-	bicl3	#-65536,-196(fp),r0
-	ashl	#16,r0,-200(fp)
-	addl3	-200(fp),-204(fp),r0
-	bicl3	#0,r0,-204(fp)
-	cmpl	-204(fp),-200(fp)
-	bgequ	noname.442
-	incl	-208(fp)
-noname.442:
-	movl	-204(fp),r3
-	movl	-208(fp),r2
-	bbc	#31,r2,noname.443
-	incl	r7
-noname.443:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.444
-	incl	r2
-noname.444:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.445
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.445
-	incl	r7
-noname.445:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.446
-	incl	r7
-noname.446:
-
-	movl	8(ap),r0
-	movzwl	26(r0),r2
-	bicl3	#-65536,(r0),r3
-	movzwl	2(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,24(r0),-220(fp)
-	bicl3	#-65536,r2,-224(fp)
-	mull3	r1,-220(fp),-212(fp)
-	mull2	r3,-220(fp)
-	mull3	r3,-224(fp),-216(fp)
-	mull2	r1,-224(fp)
-	addl3	-212(fp),-216(fp),r0
-	bicl3	#0,r0,-212(fp)
-	cmpl	-212(fp),-216(fp)
-	bgequ	noname.447
-	addl2	#65536,-224(fp)
-noname.447:
-	movzwl	-210(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-224(fp)
-	bicl3	#-65536,-212(fp),r0
-	ashl	#16,r0,-216(fp)
-	addl3	-216(fp),-220(fp),r0
-	bicl3	#0,r0,-220(fp)
-	cmpl	-220(fp),-216(fp)
-	bgequ	noname.448
-	incl	-224(fp)
-noname.448:
-	movl	-220(fp),r3
-	movl	-224(fp),r2
-	bbc	#31,r2,noname.449
-	incl	r7
-noname.449:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.450
-	incl	r2
-noname.450:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.451
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.451
-	incl	r7
-noname.451:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.452
-	incl	r7
-noname.452:
-
-	movl	4(ap),r0
-	movl	r9,24(r0)
-
-	clrl	r9
-
-	movl	8(ap),r0
-	movzwl	30(r0),r2
-	bicl3	#-65536,(r0),r3
-	movzwl	2(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,28(r0),-236(fp)
-	bicl3	#-65536,r2,-240(fp)
-	mull3	r1,-236(fp),-228(fp)
-	mull2	r3,-236(fp)
-	mull3	r3,-240(fp),-232(fp)
-	mull2	r1,-240(fp)
-	addl3	-228(fp),-232(fp),r0
-	bicl3	#0,r0,-228(fp)
-	cmpl	-228(fp),-232(fp)
-	bgequ	noname.453
-	addl2	#65536,-240(fp)
-noname.453:
-	movzwl	-226(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-240(fp)
-	bicl3	#-65536,-228(fp),r0
-	ashl	#16,r0,-232(fp)
-	addl3	-232(fp),-236(fp),r0
-	bicl3	#0,r0,-236(fp)
-	cmpl	-236(fp),-232(fp)
-	bgequ	noname.454
-	incl	-240(fp)
-noname.454:
-	movl	-236(fp),r3
-	movl	-240(fp),r2
-	bbc	#31,r2,noname.455
-	incl	r9
-noname.455:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.456
-	incl	r2
-noname.456:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.457
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.457
-	incl	r9
-noname.457:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.458
-	incl	r9
-noname.458:
-
-	movl	8(ap),r0
-	movzwl	26(r0),r2
-	bicl3	#-65536,4(r0),r3
-	movzwl	6(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,24(r0),-252(fp)
-	bicl3	#-65536,r2,-256(fp)
-	mull3	r1,-252(fp),-244(fp)
-	mull2	r3,-252(fp)
-	mull3	r3,-256(fp),-248(fp)
-	mull2	r1,-256(fp)
-	addl3	-244(fp),-248(fp),r0
-	bicl3	#0,r0,-244(fp)
-	cmpl	-244(fp),-248(fp)
-	bgequ	noname.459
-	addl2	#65536,-256(fp)
-noname.459:
-	movzwl	-242(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-256(fp)
-	bicl3	#-65536,-244(fp),r0
-	ashl	#16,r0,-248(fp)
-	addl3	-248(fp),-252(fp),r0
-	bicl3	#0,r0,-252(fp)
-	cmpl	-252(fp),-248(fp)
-	bgequ	noname.460
-	incl	-256(fp)
-noname.460:
-	movl	-252(fp),r3
-	movl	-256(fp),r2
-	bbc	#31,r2,noname.461
-	incl	r9
-noname.461:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.462
-	incl	r2
-noname.462:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.463
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.463
-	incl	r9
-noname.463:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.464
-	incl	r9
-noname.464:
-
-	movl	8(ap),r0
-	movzwl	22(r0),r2
-	bicl3	#-65536,8(r0),r3
-	movzwl	10(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,20(r0),-268(fp)
-	bicl3	#-65536,r2,-272(fp)
-	mull3	r1,-268(fp),-260(fp)
-	mull2	r3,-268(fp)
-	mull3	r3,-272(fp),-264(fp)
-	mull2	r1,-272(fp)
-	addl3	-260(fp),-264(fp),r0
-	bicl3	#0,r0,-260(fp)
-	cmpl	-260(fp),-264(fp)
-	bgequ	noname.465
-	addl2	#65536,-272(fp)
-noname.465:
-	movzwl	-258(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-272(fp)
-	bicl3	#-65536,-260(fp),r0
-	ashl	#16,r0,-264(fp)
-	addl3	-264(fp),-268(fp),r0
-	bicl3	#0,r0,-268(fp)
-	cmpl	-268(fp),-264(fp)
-	bgequ	noname.466
-	incl	-272(fp)
-noname.466:
-	movl	-268(fp),r3
-	movl	-272(fp),r2
-	bbc	#31,r2,noname.467
-	incl	r9
-noname.467:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.468
-	incl	r2
-noname.468:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.469
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.469
-	incl	r9
-noname.469:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.470
-	incl	r9
-noname.470:
-
-	movl	8(ap),r0
-	movzwl	18(r0),r2
-	bicl3	#-65536,12(r0),r3
-	movzwl	14(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,16(r0),-284(fp)
-	bicl3	#-65536,r2,-288(fp)
-	mull3	r1,-284(fp),-276(fp)
-	mull2	r3,-284(fp)
-	mull3	r3,-288(fp),-280(fp)
-	mull2	r1,-288(fp)
-	addl3	-276(fp),-280(fp),r0
-	bicl3	#0,r0,-276(fp)
-	cmpl	-276(fp),-280(fp)
-	bgequ	noname.471
-	addl2	#65536,-288(fp)
-noname.471:
-	movzwl	-274(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-288(fp)
-	bicl3	#-65536,-276(fp),r0
-	ashl	#16,r0,-280(fp)
-	addl3	-280(fp),-284(fp),r0
-	bicl3	#0,r0,-284(fp)
-	cmpl	-284(fp),-280(fp)
-	bgequ	noname.472
-	incl	-288(fp)
-noname.472:
-	movl	-284(fp),r3
-	movl	-288(fp),r2
-	bbc	#31,r2,noname.473
-	incl	r9
-noname.473:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.474
-	incl	r2
-noname.474:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.475
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.475
-	incl	r9
-noname.475:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.476
-	incl	r9
-noname.476:
-
-	movl	4(ap),r0
-	movl	r8,28(r0)
-
-	clrl	r8
-
-	movl	8(ap),r3
-	movl	16(r3),r4
-	bicl3	#-65536,r4,r5
-	extzv	#16,#16,r4,r0
-	bicl3	#-65536,r0,r4
-	mull3	r5,r4,-292(fp)
-	mull2	r5,r5
-	mull2	r4,r4
-	bicl3	#32767,-292(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r4
-	bicl3	#-65536,-292(fp),r0
-	ashl	#17,r0,-292(fp)
-	addl2	-292(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-292(fp)
-	bgequ	noname.477
-	incl	r4
-noname.477:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r7
-	bicl2	#0,r7
-	cmpl	r7,r1
-	bgequ	noname.478
-	incl	r2
-noname.478:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.479
-	incl	r8
-noname.479:
-
-	bicl3	#-65536,20(r3),r4
-	movzwl	22(r3),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,12(r3),r2
-	movzwl	14(r3),r0
-	bicl2	#-65536,r0
-	movl	r4,r6
-	movl	r1,r5
-	mull3	r0,r6,-296(fp)
-	mull2	r2,r6
-	mull3	r2,r5,-300(fp)
-	mull2	r0,r5
-	addl3	-296(fp),-300(fp),r0
-	bicl3	#0,r0,-296(fp)
-	cmpl	-296(fp),-300(fp)
-	bgequ	noname.480
-	addl2	#65536,r5
-noname.480:
-	movzwl	-294(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r5
-	bicl3	#-65536,-296(fp),r0
-	ashl	#16,r0,-300(fp)
-	addl2	-300(fp),r6
-	bicl2	#0,r6
-	cmpl	r6,-300(fp)
-	bgequ	noname.481
-	incl	r5
-noname.481:
-	movl	r6,r3
-	movl	r5,r2
-	bbc	#31,r2,noname.482
-	incl	r8
-noname.482:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.483
-	incl	r2
-noname.483:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.484
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.484
-	incl	r8
-noname.484:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.485
-	incl	r8
-noname.485:
-
-	movl	8(ap),r0
-	bicl3	#-65536,24(r0),r3
-	movzwl	26(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,8(r0),r2
-	movzwl	10(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-304(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-308(fp)
-	mull2	r0,r4
-	addl3	-304(fp),-308(fp),r0
-	bicl3	#0,r0,-304(fp)
-	cmpl	-304(fp),-308(fp)
-	bgequ	noname.486
-	addl2	#65536,r4
-noname.486:
-	movzwl	-302(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-304(fp),r0
-	ashl	#16,r0,-308(fp)
-	addl2	-308(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-308(fp)
-	bgequ	noname.487
-	incl	r4
-noname.487:
-	movl	r5,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.488
-	incl	r8
-noname.488:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.489
-	incl	r2
-noname.489:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.490
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.490
-	incl	r8
-noname.490:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.491
-	incl	r8
-noname.491:
-
-	movl	8(ap),r0
-	bicl3	#-65536,28(r0),r3
-	movzwl	30(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,4(r0),r2
-	movzwl	6(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-312(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-316(fp)
-	mull2	r0,r4
-	addl3	-312(fp),-316(fp),r0
-	bicl3	#0,r0,-312(fp)
-	cmpl	-312(fp),-316(fp)
-	bgequ	noname.492
-	addl2	#65536,r4
-noname.492:
-	movzwl	-310(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-312(fp),r0
-	ashl	#16,r0,-316(fp)
-	addl2	-316(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-316(fp)
-	bgequ	noname.493
-	incl	r4
-noname.493:
-	movl	r5,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.494
-	incl	r8
-noname.494:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.495
-	incl	r2
-noname.495:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.496
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.496
-	incl	r8
-noname.496:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.497
-	incl	r8
-noname.497:
-
-	movl	4(ap),r0
-	movl	r7,32(r0)
-
-	clrl	r7
-
-	movl	8(ap),r0
-	bicl3	#-65536,28(r0),r3
-	movzwl	30(r0),r2
-	bicl3	#-65536,8(r0),r1
-	movzwl	10(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r4
-	bicl3	#-65536,r2,-328(fp)
-	mull3	r0,r4,-320(fp)
-	mull2	r1,r4
-	mull3	r1,-328(fp),-324(fp)
-	mull2	r0,-328(fp)
-	addl3	-320(fp),-324(fp),r0
-	bicl3	#0,r0,-320(fp)
-	cmpl	-320(fp),-324(fp)
-	bgequ	noname.498
-	addl2	#65536,-328(fp)
-noname.498:
-	movzwl	-318(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-328(fp)
-	bicl3	#-65536,-320(fp),r0
-	ashl	#16,r0,-324(fp)
-	addl2	-324(fp),r4
-	bicl2	#0,r4
-	cmpl	r4,-324(fp)
-	bgequ	noname.499
-	incl	-328(fp)
-noname.499:
-	movl	r4,r3
-	movl	-328(fp),r2
-	bbc	#31,r2,noname.500
-	incl	r7
-noname.500:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.501
-	incl	r2
-noname.501:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.502
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.502
-	incl	r7
-noname.502:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.503
-	incl	r7
-noname.503:
-
-	movl	8(ap),r0
-	movzwl	26(r0),r2
-	bicl3	#-65536,12(r0),r3
-	movzwl	14(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,24(r0),-340(fp)
-	bicl3	#-65536,r2,-344(fp)
-	mull3	r1,-340(fp),-332(fp)
-	mull2	r3,-340(fp)
-	mull3	r3,-344(fp),-336(fp)
-	mull2	r1,-344(fp)
-	addl3	-332(fp),-336(fp),r0
-	bicl3	#0,r0,-332(fp)
-	cmpl	-332(fp),-336(fp)
-	bgequ	noname.504
-	addl2	#65536,-344(fp)
-noname.504:
-	movzwl	-330(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-344(fp)
-	bicl3	#-65536,-332(fp),r0
-	ashl	#16,r0,-336(fp)
-	addl3	-336(fp),-340(fp),r0
-	bicl3	#0,r0,-340(fp)
-	cmpl	-340(fp),-336(fp)
-	bgequ	noname.505
-	incl	-344(fp)
-noname.505:
-	movl	-340(fp),r3
-	movl	-344(fp),r2
-	bbc	#31,r2,noname.506
-	incl	r7
-noname.506:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.507
-	incl	r2
-noname.507:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.508
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.508
-	incl	r7
-noname.508:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.509
-	incl	r7
-noname.509:
-
-	movl	8(ap),r0
-	movzwl	22(r0),r2
-	bicl3	#-65536,16(r0),r3
-	movzwl	18(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,20(r0),-356(fp)
-	bicl3	#-65536,r2,-360(fp)
-	mull3	r1,-356(fp),-348(fp)
-	mull2	r3,-356(fp)
-	mull3	r3,-360(fp),-352(fp)
-	mull2	r1,-360(fp)
-	addl3	-348(fp),-352(fp),r0
-	bicl3	#0,r0,-348(fp)
-	cmpl	-348(fp),-352(fp)
-	bgequ	noname.510
-	addl2	#65536,-360(fp)
-noname.510:
-	movzwl	-346(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-360(fp)
-	bicl3	#-65536,-348(fp),r0
-	ashl	#16,r0,-352(fp)
-	addl3	-352(fp),-356(fp),r0
-	bicl3	#0,r0,-356(fp)
-	cmpl	-356(fp),-352(fp)
-	bgequ	noname.511
-	incl	-360(fp)
-noname.511:
-	movl	-356(fp),r3
-	movl	-360(fp),r2
-	bbc	#31,r2,noname.512
-	incl	r7
-noname.512:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.513
-	incl	r2
-noname.513:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.514
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.514
-	incl	r7
-noname.514:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.515
-	incl	r7
-noname.515:
-
-	movl	4(ap),r0
-	movl	r9,36(r0)
-
-	clrl	r9
-
-	movl	8(ap),r3
-	movl	20(r3),r4
-	bicl3	#-65536,r4,-364(fp)
-	extzv	#16,#16,r4,r0
-	bicl3	#-65536,r0,r4
-	movl	-364(fp),r0
-	mull3	r0,r4,-368(fp)
-	mull3	r0,r0,-364(fp)
-	mull2	r4,r4
-	bicl3	#32767,-368(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r4
-	bicl3	#-65536,-368(fp),r0
-	ashl	#17,r0,-368(fp)
-	addl3	-364(fp),-368(fp),r0
-	bicl3	#0,r0,-364(fp)
-	cmpl	-364(fp),-368(fp)
-	bgequ	noname.516
-	incl	r4
-noname.516:
-	movl	-364(fp),r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.517
-	incl	r2
-noname.517:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.518
-	incl	r9
-noname.518:
-
-	bicl3	#-65536,24(r3),r4
-	movzwl	26(r3),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,16(r3),r2
-	movzwl	18(r3),r0
-	bicl2	#-65536,r0
-	movl	r4,r6
-	movl	r1,r5
-	mull3	r0,r6,-372(fp)
-	mull2	r2,r6
-	mull3	r2,r5,-376(fp)
-	mull2	r0,r5
-	addl3	-372(fp),-376(fp),r0
-	bicl3	#0,r0,-372(fp)
-	cmpl	-372(fp),-376(fp)
-	bgequ	noname.519
-	addl2	#65536,r5
-noname.519:
-	movzwl	-370(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r5
-	bicl3	#-65536,-372(fp),r0
-	ashl	#16,r0,-376(fp)
-	addl2	-376(fp),r6
-	bicl2	#0,r6
-	cmpl	r6,-376(fp)
-	bgequ	noname.520
-	incl	r5
-noname.520:
-	movl	r6,r3
-	movl	r5,r2
-	bbc	#31,r2,noname.521
-	incl	r9
-noname.521:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.522
-	incl	r2
-noname.522:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.523
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.523
-	incl	r9
-noname.523:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.524
-	incl	r9
-noname.524:
-
-	movl	8(ap),r0
-	bicl3	#-65536,28(r0),r3
-	movzwl	30(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,12(r0),r2
-	movzwl	14(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-380(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-384(fp)
-	mull2	r0,r4
-	addl3	-380(fp),-384(fp),r0
-	bicl3	#0,r0,-380(fp)
-	cmpl	-380(fp),-384(fp)
-	bgequ	noname.525
-	addl2	#65536,r4
-noname.525:
-	movzwl	-378(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-380(fp),r0
-	ashl	#16,r0,-384(fp)
-	addl2	-384(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-384(fp)
-	bgequ	noname.526
-	incl	r4
-noname.526:
-	movl	r5,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.527
-	incl	r9
-noname.527:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.528
-	incl	r2
-noname.528:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.529
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.529
-	incl	r9
-noname.529:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.530
-	incl	r9
-noname.530:
-	movl	4(ap),r0
-	movl	r8,40(r0)
-
-	clrl	r8
-
-	movl	8(ap),r0
-	bicl3	#-65536,28(r0),r3
-	movzwl	30(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,16(r0),r2
-	movzwl	18(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-388(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-392(fp)
-	mull2	r0,r4
-	addl3	-388(fp),-392(fp),r0
-	bicl3	#0,r0,-388(fp)
-	cmpl	-388(fp),-392(fp)
-	bgequ	noname.531
-	addl2	#65536,r4
-noname.531:
-	movzwl	-386(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-388(fp),r0
-	ashl	#16,r0,-392(fp)
-	addl2	-392(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-392(fp)
-	bgequ	noname.532
-	incl	r4
-noname.532:
-	movl	r5,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.533
-	incl	r8
-noname.533:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.534
-	incl	r2
-noname.534:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.535
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.535
-	incl	r8
-noname.535:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.536
-	incl	r8
-noname.536:
-
-	movl	8(ap),r0
-	bicl3	#-65536,24(r0),r3
-	movzwl	26(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,20(r0),r2
-	movzwl	22(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-396(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-400(fp)
-	mull2	r0,r4
-	addl3	-396(fp),-400(fp),r0
-	bicl3	#0,r0,-396(fp)
-	cmpl	-396(fp),-400(fp)
-	bgequ	noname.537
-	addl2	#65536,r4
-noname.537:
-	movzwl	-394(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-396(fp),r0
-	ashl	#16,r0,-400(fp)
-	addl2	-400(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-400(fp)
-	bgequ	noname.538
-	incl	r4
-noname.538:
-	movl	r5,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.539
-	incl	r8
-noname.539:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.540
-	incl	r2
-noname.540:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r7
-	bicl2	#0,r7
-	cmpl	r7,r3
-	bgequ	noname.541
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.541
-	incl	r8
-noname.541:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.542
-	incl	r8
-noname.542:
-
-	movl	4(ap),r0
-	movl	r7,44(r0)
-
-	clrl	r7
-
-	movl	8(ap),r3
-	movl	24(r3),r4
-	bicl3	#-65536,r4,r5
-	extzv	#16,#16,r4,r0
-	bicl3	#-65536,r0,r4
-	mull3	r5,r4,-404(fp)
-	mull2	r5,r5
-	mull2	r4,r4
-	bicl3	#32767,-404(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r4
-	bicl3	#-65536,-404(fp),r0
-	ashl	#17,r0,-404(fp)
-	addl2	-404(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-404(fp)
-	bgequ	noname.543
-	incl	r4
-noname.543:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.544
-	incl	r2
-noname.544:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.545
-	incl	r7
-noname.545:
-
-	movzwl	30(r3),r2
-	bicl3	#-65536,20(r3),r1
-	movzwl	22(r3),r0
-	bicl2	#-65536,r0
-	bicl3	#-65536,28(r3),-416(fp)
-	bicl3	#-65536,r2,-420(fp)
-	mull3	r0,-416(fp),-408(fp)
-	mull2	r1,-416(fp)
-	mull3	r1,-420(fp),-412(fp)
-	mull2	r0,-420(fp)
-	addl3	-408(fp),-412(fp),r0
-	bicl3	#0,r0,-408(fp)
-	cmpl	-408(fp),-412(fp)
-	bgequ	noname.546
-	addl2	#65536,-420(fp)
-noname.546:
-	movzwl	-406(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-420(fp)
-	bicl3	#-65536,-408(fp),r0
-	ashl	#16,r0,-412(fp)
-	addl3	-412(fp),-416(fp),r0
-	bicl3	#0,r0,-416(fp)
-	cmpl	-416(fp),-412(fp)
-	bgequ	noname.547
-	incl	-420(fp)
-noname.547:
-	movl	-416(fp),r3
-	movl	-420(fp),r2
-	bbc	#31,r2,noname.548
-	incl	r7
-noname.548:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.549
-	incl	r2
-noname.549:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.550
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.550
-	incl	r7
-noname.550:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.551
-	incl	r7
-noname.551:
-
-	movl	4(ap),r0
-	movl	r9,48(r0)
-
-	clrl	r9
-
-	movl	8(ap),r0
-	movzwl	30(r0),r2
-	bicl3	#-65536,24(r0),r3
-	movzwl	26(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,28(r0),-432(fp)
-	bicl3	#-65536,r2,-436(fp)
-	mull3	r1,-432(fp),-424(fp)
-	mull2	r3,-432(fp)
-	mull3	r3,-436(fp),-428(fp)
-	mull2	r1,-436(fp)
-	addl3	-424(fp),-428(fp),r0
-	bicl3	#0,r0,-424(fp)
-	cmpl	-424(fp),-428(fp)
-	bgequ	noname.552
-	addl2	#65536,-436(fp)
-noname.552:
-	movzwl	-422(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,-436(fp)
-	bicl3	#-65536,-424(fp),r0
-	ashl	#16,r0,-428(fp)
-	addl3	-428(fp),-432(fp),r0
-	bicl3	#0,r0,-432(fp)
-	cmpl	-432(fp),-428(fp)
-	bgequ	noname.553
-	incl	-436(fp)
-noname.553:
-	movl	-432(fp),r3
-	movl	-436(fp),r2
-	bbc	#31,r2,noname.554
-	incl	r9
-noname.554:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.555
-	incl	r2
-noname.555:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.556
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.556
-	incl	r9
-noname.556:
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.557
-	incl	r9
-noname.557:
-
-	movl	4(ap),r4
-	movl	r8,52(r4)
-
-	clrl	r8
-
-	movl	8(ap),r0
-	movl	28(r0),r3
-	bicl3	#-65536,r3,-440(fp)
-	extzv	#16,#16,r3,r0
-	bicl3	#-65536,r0,r3
-	movl	-440(fp),r0
-	mull3	r0,r3,-444(fp)
-	mull3	r0,r0,-440(fp)
-	mull2	r3,r3
-	bicl3	#32767,-444(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r3
-	bicl3	#-65536,-444(fp),r0
-	ashl	#17,r0,-444(fp)
-	addl3	-440(fp),-444(fp),r0
-	bicl3	#0,r0,-440(fp)
-	cmpl	-440(fp),-444(fp)
-	bgequ	noname.558
-	incl	r3
-noname.558:
-	movl	-440(fp),r1
-	movl	r3,r2
-	addl2	r1,r7
-	bicl2	#0,r7
-	cmpl	r7,r1
-	bgequ	noname.559
-	incl	r2
-noname.559:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.560
-	incl	r8
-noname.560:
-
-	movl	r7,56(r4)
-
-	movl	r9,60(r4)
-
-	ret	
-
-
-
-;r=4 ;(AP)
-;a=8 ;(AP)
-;b=12 ;(AP)
-;n=16 ;(AP)	n	by value (input)
-
-	.psect	code,nowrt
-
-.entry	BN_SQR_COMBA4,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10>
-	subl2	#44,sp
-
-	clrq	r8
-
-	clrl	r10
-
-	movl	8(ap),r5
-	movl	(r5),r3
-	bicl3	#-65536,r3,r4
-	extzv	#16,#16,r3,r0
-	bicl3	#-65536,r0,r3
-	mull3	r4,r3,-4(fp)
-	mull2	r4,r4
-	mull2	r3,r3
-	bicl3	#32767,-4(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r3
-	bicl3	#-65536,-4(fp),r0
-	ashl	#17,r0,-4(fp)
-	addl2	-4(fp),r4
-	bicl2	#0,r4
-	cmpl	r4,-4(fp)
-	bgequ	noname.563
-	incl	r3
-noname.563:
-	movl	r4,r1
-	movl	r3,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.564
-	incl	r2
-noname.564:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.565
-	incl	r10
-noname.565:
-
-	movl	r9,@4(ap)
-
-	clrl	r9
-
-	bicl3	#-65536,4(r5),r3
-	movzwl	6(r5),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,(r5),r2
-	movzwl	2(r5),r0
-	bicl2	#-65536,r0
-	movl	r3,r6
-	movl	r1,r4
-	mull3	r0,r6,-8(fp)
-	mull2	r2,r6
-	mull2	r4,r2
-	mull2	r0,r4
-	addl3	-8(fp),r2,r0
-	bicl3	#0,r0,-8(fp)
-	cmpl	-8(fp),r2
-	bgequ	noname.566
-	addl2	#65536,r4
-noname.566:
-	movzwl	-6(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-8(fp),r0
-	ashl	#16,r0,r1
-	addl2	r1,r6
-	bicl2	#0,r6
-	cmpl	r6,r1
-	bgequ	noname.567
-	incl	r4
-noname.567:
-	movl	r6,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.568
-	incl	r9
-noname.568:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.569
-	incl	r2
-noname.569:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.570
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.570
-	incl	r9
-noname.570:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.571
-	incl	r9
-noname.571:
-
-	movl	4(ap),r0
-	movl	r8,4(r0)
-
-	clrl	r8
-
-	movl	8(ap),r4
-	movl	4(r4),r3
-	bicl3	#-65536,r3,r5
-	extzv	#16,#16,r3,r0
-	bicl3	#-65536,r0,r3
-	mull3	r5,r3,r1
-	mull2	r5,r5
-	mull2	r3,r3
-	bicl3	#32767,r1,r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r3
-	bicl2	#-65536,r1
-	ashl	#17,r1,r1
-	addl2	r1,r5
-	bicl2	#0,r5
-	cmpl	r5,r1
-	bgequ	noname.572
-	incl	r3
-noname.572:
-	movl	r5,r1
-	movl	r3,r2
-	addl2	r1,r10
-	bicl2	#0,r10
-	cmpl	r10,r1
-	bgequ	noname.573
-	incl	r2
-noname.573:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.574
-	incl	r8
-noname.574:
-
-	bicl3	#-65536,8(r4),r3
-	movzwl	10(r4),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,(r4),r2
-	movzwl	2(r4),r0
-	bicl2	#-65536,r0
-	movl	r3,r6
-	movl	r1,r5
-	mull3	r0,r6,r7
-	mull2	r2,r6
-	mull2	r5,r2
-	mull2	r0,r5
-	addl2	r2,r7
-	bicl2	#0,r7
-	cmpl	r7,r2
-	bgequ	noname.575
-	addl2	#65536,r5
-noname.575:
-	extzv	#16,#16,r7,r0
-	bicl2	#-65536,r0
-	addl2	r0,r5
-	bicl3	#-65536,r7,r0
-	ashl	#16,r0,r1
-	addl2	r1,r6
-	bicl2	#0,r6
-	cmpl	r6,r1
-	bgequ	noname.576
-	incl	r5
-noname.576:
-	movl	r6,r3
-	movl	r5,r2
-	bbc	#31,r2,noname.577
-	incl	r8
-noname.577:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.578
-	incl	r2
-noname.578:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r10
-	bicl2	#0,r10
-	cmpl	r10,r3
-	bgequ	noname.579
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.579
-	incl	r8
-noname.579:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.580
-	incl	r8
-noname.580:
-
-	movl	4(ap),r0
-	movl	r10,8(r0)
-
-	clrl	r10
-
-	movl	8(ap),r0
-	bicl3	#-65536,12(r0),r3
-	movzwl	14(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,(r0),r2
-	movzwl	2(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,r6
-	mull2	r2,r5
-	mull3	r2,r4,-12(fp)
-	mull2	r0,r4
-	addl2	-12(fp),r6
-	bicl2	#0,r6
-	cmpl	r6,-12(fp)
-	bgequ	noname.581
-	addl2	#65536,r4
-noname.581:
-	extzv	#16,#16,r6,r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,r6,r0
-	ashl	#16,r0,-12(fp)
-	addl2	-12(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-12(fp)
-	bgequ	noname.582
-	incl	r4
-noname.582:
-	movl	r5,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.583
-	incl	r10
-noname.583:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.584
-	incl	r2
-noname.584:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.585
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.585
-	incl	r10
-noname.585:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.586
-	incl	r10
-noname.586:
-
-	movl	8(ap),r0
-	bicl3	#-65536,8(r0),r3
-	movzwl	10(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,4(r0),r2
-	movzwl	6(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-16(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-20(fp)
-	mull2	r0,r4
-	addl3	-16(fp),-20(fp),r0
-	bicl3	#0,r0,-16(fp)
-	cmpl	-16(fp),-20(fp)
-	bgequ	noname.587
-	addl2	#65536,r4
-noname.587:
-	movzwl	-14(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-16(fp),r0
-	ashl	#16,r0,-20(fp)
-	addl2	-20(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-20(fp)
-	bgequ	noname.588
-	incl	r4
-noname.588:
-	movl	r5,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.589
-	incl	r10
-noname.589:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.590
-	incl	r2
-noname.590:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r9
-	bicl2	#0,r9
-	cmpl	r9,r3
-	bgequ	noname.591
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.591
-	incl	r10
-noname.591:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.592
-	incl	r10
-noname.592:
-	movl	4(ap),r0
-	movl	r9,12(r0)
-
-	clrl	r9
-
-	movl	8(ap),r3
-	movl	8(r3),r4
-	bicl3	#-65536,r4,r5
-	extzv	#16,#16,r4,r0
-	bicl3	#-65536,r0,r4
-	mull3	r5,r4,-24(fp)
-	mull2	r5,r5
-	mull2	r4,r4
-	bicl3	#32767,-24(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r4
-	bicl3	#-65536,-24(fp),r0
-	ashl	#17,r0,-24(fp)
-	addl2	-24(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-24(fp)
-	bgequ	noname.593
-	incl	r4
-noname.593:
-	movl	r5,r1
-	movl	r4,r2
-	addl2	r1,r8
-	bicl2	#0,r8
-	cmpl	r8,r1
-	bgequ	noname.594
-	incl	r2
-noname.594:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.595
-	incl	r9
-noname.595:
-
-	bicl3	#-65536,12(r3),r4
-	movzwl	14(r3),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,4(r3),r2
-	movzwl	6(r3),r0
-	bicl2	#-65536,r0
-	movl	r4,r6
-	movl	r1,r5
-	mull3	r0,r6,-28(fp)
-	mull2	r2,r6
-	mull3	r2,r5,-32(fp)
-	mull2	r0,r5
-	addl3	-28(fp),-32(fp),r0
-	bicl3	#0,r0,-28(fp)
-	cmpl	-28(fp),-32(fp)
-	bgequ	noname.596
-	addl2	#65536,r5
-noname.596:
-	movzwl	-26(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r5
-	bicl3	#-65536,-28(fp),r0
-	ashl	#16,r0,-32(fp)
-	addl2	-32(fp),r6
-	bicl2	#0,r6
-	cmpl	r6,-32(fp)
-	bgequ	noname.597
-	incl	r5
-noname.597:
-	movl	r6,r3
-	movl	r5,r2
-	bbc	#31,r2,noname.598
-	incl	r9
-noname.598:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.599
-	incl	r2
-noname.599:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r8
-	bicl2	#0,r8
-	cmpl	r8,r3
-	bgequ	noname.600
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.600
-	incl	r9
-noname.600:
-	addl2	r2,r10
-	bicl2	#0,r10
-	cmpl	r10,r2
-	bgequ	noname.601
-	incl	r9
-noname.601:
-
-	movl	4(ap),r0
-	movl	r8,16(r0)
-
-	clrl	r8
-
-	movl	8(ap),r0
-	bicl3	#-65536,12(r0),r3
-	movzwl	14(r0),r1
-	bicl2	#-65536,r1
-	bicl3	#-65536,8(r0),r2
-	movzwl	10(r0),r0
-	bicl2	#-65536,r0
-	movl	r3,r5
-	movl	r1,r4
-	mull3	r0,r5,-36(fp)
-	mull2	r2,r5
-	mull3	r2,r4,-40(fp)
-	mull2	r0,r4
-	addl3	-36(fp),-40(fp),r0
-	bicl3	#0,r0,-36(fp)
-	cmpl	-36(fp),-40(fp)
-	bgequ	noname.602
-	addl2	#65536,r4
-noname.602:
-	movzwl	-34(fp),r0
-	bicl2	#-65536,r0
-	addl2	r0,r4
-	bicl3	#-65536,-36(fp),r0
-	ashl	#16,r0,-40(fp)
-	addl2	-40(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-40(fp)
-	bgequ	noname.603
-	incl	r4
-noname.603:
-	movl	r5,r3
-	movl	r4,r2
-	bbc	#31,r2,noname.604
-	incl	r8
-noname.604:
-	addl2	r2,r2
-	bicl2	#0,r2
-	bbc	#31,r3,noname.605
-	incl	r2
-noname.605:
-	addl2	r3,r3
-	bicl2	#0,r3
-	addl2	r3,r10
-	bicl2	#0,r10
-	cmpl	r10,r3
-	bgequ	noname.606
-	incl	r2
-	bicl3	#0,r2,r0
-	bneq	noname.606
-	incl	r8
-noname.606:
-	addl2	r2,r9
-	bicl2	#0,r9
-	cmpl	r9,r2
-	bgequ	noname.607
-	incl	r8
-noname.607:
-
-	movl	4(ap),r4
-	movl	r10,20(r4)
-
-	clrl	r10
-
-	movl	8(ap),r0
-	movl	12(r0),r3
-	bicl3	#-65536,r3,r5
-	extzv	#16,#16,r3,r0
-	bicl3	#-65536,r0,r3
-	mull3	r5,r3,-44(fp)
-	mull2	r5,r5
-	mull2	r3,r3
-	bicl3	#32767,-44(fp),r0
-	extzv	#15,#17,r0,r0
-	addl2	r0,r3
-	bicl3	#-65536,-44(fp),r0
-	ashl	#17,r0,-44(fp)
-	addl2	-44(fp),r5
-	bicl2	#0,r5
-	cmpl	r5,-44(fp)
-	bgequ	noname.608
-	incl	r3
-noname.608:
-	movl	r5,r1
-	movl	r3,r2
-	addl2	r1,r9
-	bicl2	#0,r9
-	cmpl	r9,r1
-	bgequ	noname.609
-	incl	r2
-noname.609:
-	addl2	r2,r8
-	bicl2	#0,r8
-	cmpl	r8,r2
-	bgequ	noname.610
-	incl	r10
-noname.610:
-
-	movl	r9,24(r4)
-
-	movl	r8,28(r4)
-
-	ret	
-
-; For now, the code below doesn't work, so I end this prematurely.
-.end
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/x86-gf2m.pl
index b579530272..f464368733 100644
--- a/deps/openssl/openssl/crypto/bn/asm/x86-gf2m.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/x86-gf2m.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -36,6 +43,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output = pop;
+open STDOUT,">$output";
+
 &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
 
 $sse2=0;
@@ -311,3 +321,5 @@ if ($sse2) {
 &asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
 
 &asm_finish();
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86-mont.pl b/deps/openssl/openssl/crypto/bn/asm/x86-mont.pl
index 1c4003efc2..a8b402d59b 100755
--- a/deps/openssl/openssl/crypto/bn/asm/x86-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/x86-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -30,6 +37,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output = pop;
+open STDOUT,">$output";
+
 &asm_init($ARGV[0],$0);
 
 $sse2=0;
@@ -84,7 +94,9 @@ $frame=32;				# size of above frame rounded up to 16n
 
 	&and	("ebp",-64);		# align to cache line
 
-	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# An OS-agnostic version of __chkstk.
+	#
+	# Some OSes (Windows) insist on stack being "wired" to
 	# physical memory in strictly sequential manner, i.e. if stack
 	# allocation spans two pages, then reference to farmost one can
 	# be punishable by SEGV. But page walking can do good even on
@@ -289,7 +301,7 @@ if (0) {
 	&xor	("eax","eax");	# signal "not fast enough [yet]"
 	&jmp	(&label("just_leave"));
 	# While the below code provides competitive performance for
-	# all key lengthes on modern Intel cores, it's still more
+	# all key lengths on modern Intel cores, it's still more
 	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
 	# means compared to the original integer-only assembler.
 	# 512-bit RSA sign is better by ~40%, but that's about all
@@ -613,3 +625,5 @@ $sbit=$num;
 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
 
 &asm_finish();
+
+close STDOUT;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86.pl b/deps/openssl/openssl/crypto/bn/asm/x86.pl
deleted file mode 100644
index 1bc4f1bb27..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86.pl
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/local/bin/perl
-
-push(@INC,"perlasm","../../perlasm");
-require "x86asm.pl";
-
-require("x86/mul_add.pl");
-require("x86/mul.pl");
-require("x86/sqr.pl");
-require("x86/div.pl");
-require("x86/add.pl");
-require("x86/sub.pl");
-require("x86/comba.pl");
-
-&asm_init($ARGV[0],$0);
-
-&bn_mul_add_words("bn_mul_add_words");
-&bn_mul_words("bn_mul_words");
-&bn_sqr_words("bn_sqr_words");
-&bn_div_words("bn_div_words");
-&bn_add_words("bn_add_words");
-&bn_sub_words("bn_sub_words");
-&bn_mul_comba("bn_mul_comba8",8);
-&bn_mul_comba("bn_mul_comba4",4);
-&bn_sqr_comba("bn_sqr_comba8",8);
-&bn_sqr_comba("bn_sqr_comba4",4);
-
-&asm_finish();
-
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/add.pl b/deps/openssl/openssl/crypto/bn/asm/x86/add.pl
deleted file mode 100644
index 0b5cf583e3..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86/add.pl
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/local/bin/perl
-# x86 assember
-
-sub bn_add_words
-	{
-	local($name)=@_;
-
-	&function_begin($name,"");
-
-	&comment("");
-	$a="esi";
-	$b="edi";
-	$c="eax";
-	$r="ebx";
-	$tmp1="ecx";
-	$tmp2="edx";
-	$num="ebp";
-
-	&mov($r,&wparam(0));	# get r
-	 &mov($a,&wparam(1));	# get a
-	&mov($b,&wparam(2));	# get b
-	 &mov($num,&wparam(3));	# get num
-	&xor($c,$c);		# clear carry
-	 &and($num,0xfffffff8);	# num / 8
-
-	&jz(&label("aw_finish"));
-
-	&set_label("aw_loop",0);
-	for ($i=0; $i<8; $i++)
-		{
-		&comment("Round $i");
-
-		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
-		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
-		&add($tmp1,$c);
-		 &mov($c,0);
-		&adc($c,$c);
-		 &add($tmp1,$tmp2);
-		&adc($c,0);
-		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
-		}
-
-	&comment("");
-	&add($a,32);
-	 &add($b,32);
-	&add($r,32);
-	 &sub($num,8);
-	&jnz(&label("aw_loop"));
-
-	&set_label("aw_finish",0);
-	&mov($num,&wparam(3));	# get num
-	&and($num,7);
-	 &jz(&label("aw_end"));
-
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
-		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
-		&add($tmp1,$c);
-		 &mov($c,0);
-		&adc($c,$c);
-		 &add($tmp1,$tmp2);
-		&adc($c,0);
-		 &dec($num) if ($i != 6);
-		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *a
-		 &jz(&label("aw_end")) if ($i != 6);
-		}
-	&set_label("aw_end",0);
-
-#	&mov("eax",$c);		# $c is "eax"
-
-	&function_end($name);
-	}
-
-1;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/comba.pl b/deps/openssl/openssl/crypto/bn/asm/x86/comba.pl
deleted file mode 100644
index 2291253629..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86/comba.pl
+++ /dev/null
@@ -1,277 +0,0 @@
-#!/usr/local/bin/perl
-# x86 assember
-
-sub mul_add_c
-	{
-	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
-
-	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
-	# words, and 1 if load return value
-
-	&comment("mul a[$ai]*b[$bi]");
-
-	# "eax" and "edx" will always be pre-loaded.
-	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
-	# &mov("edx",&DWP($bi*4,$b,"",0));
-
-	&mul("edx");
-	&add($c0,"eax");
-	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# laod next a
-	 &mov("eax",&wparam(0)) if $pos > 0;			# load r[]
-	 ###
-	&adc($c1,"edx");
-	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# laod next b
-	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# laod next b
-	 ###
-	&adc($c2,0);
-	 # is pos > 1, it means it is the last loop 
-	 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;		# save r[];
-	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# laod next a
-	}
-
-sub sqr_add_c
-	{
-	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
-
-	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
-	# words, and 1 if load return value
-
-	&comment("sqr a[$ai]*a[$bi]");
-
-	# "eax" and "edx" will always be pre-loaded.
-	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
-	# &mov("edx",&DWP($bi*4,$b,"",0));
-
-	if ($ai == $bi)
-		{ &mul("eax");}
-	else
-		{ &mul("edx");}
-	&add($c0,"eax");
-	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
-	 ###
-	&adc($c1,"edx");
-	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
-	 ###
-	&adc($c2,0);
-	 # is pos > 1, it means it is the last loop 
-	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
-	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next b
-	}
-
-sub sqr_add_c2
-	{
-	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
-
-	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
-	# words, and 1 if load return value
-
-	&comment("sqr a[$ai]*a[$bi]");
-
-	# "eax" and "edx" will always be pre-loaded.
-	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
-	# &mov("edx",&DWP($bi*4,$a,"",0));
-
-	if ($ai == $bi)
-		{ &mul("eax");}
-	else
-		{ &mul("edx");}
-	&add("eax","eax");
-	 ###
-	&adc("edx","edx");
-	 ###
-	&adc($c2,0);
-	 &add($c0,"eax");
-	&adc($c1,"edx");
-	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
-	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next b
-	&adc($c2,0);
-	&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
-	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
-	 ###
-	}
-
-sub bn_mul_comba
-	{
-	local($name,$num)=@_;
-	local($a,$b,$c0,$c1,$c2);
-	local($i,$as,$ae,$bs,$be,$ai,$bi);
-	local($tot,$end);
-
-	&function_begin_B($name,"");
-
-	$c0="ebx";
-	$c1="ecx";
-	$c2="ebp";
-	$a="esi";
-	$b="edi";
-	
-	$as=0;
-	$ae=0;
-	$bs=0;
-	$be=0;
-	$tot=$num+$num-1;
-
-	&push("esi");
-	 &mov($a,&wparam(1));
-	&push("edi");
-	 &mov($b,&wparam(2));
-	&push("ebp");
-	 &push("ebx");
-
-	&xor($c0,$c0);
-	 &mov("eax",&DWP(0,$a,"",0));	# load the first word 
-	&xor($c1,$c1);
-	 &mov("edx",&DWP(0,$b,"",0));	# load the first second 
-
-	for ($i=0; $i<$tot; $i++)
-		{
-		$ai=$as;
-		$bi=$bs;
-		$end=$be+1;
-
-		&comment("################## Calculate word $i"); 
-
-		for ($j=$bs; $j<$end; $j++)
-			{
-			&xor($c2,$c2) if ($j == $bs);
-			if (($j+1) == $end)
-				{
-				$v=1;
-				$v=2 if (($i+1) == $tot);
-				}
-			else
-				{ $v=0; }
-			if (($j+1) != $end)
-				{
-				$na=($ai-1);
-				$nb=($bi+1);
-				}
-			else
-				{
-				$na=$as+($i < ($num-1));
-				$nb=$bs+($i >= ($num-1));
-				}
-#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
-			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
-			if ($v)
-				{
-				&comment("saved r[$i]");
-				# &mov("eax",&wparam(0));
-				# &mov(&DWP($i*4,"eax","",0),$c0);
-				($c0,$c1,$c2)=($c1,$c2,$c0);
-				}
-			$ai--;
-			$bi++;
-			}
-		$as++ if ($i < ($num-1));
-		$ae++ if ($i >= ($num-1));
-
-		$bs++ if ($i >= ($num-1));
-		$be++ if ($i < ($num-1));
-		}
-	&comment("save r[$i]");
-	# &mov("eax",&wparam(0));
-	&mov(&DWP($i*4,"eax","",0),$c0);
-
-	&pop("ebx");
-	&pop("ebp");
-	&pop("edi");
-	&pop("esi");
-	&ret();
-	&function_end_B($name);
-	}
-
-sub bn_sqr_comba
-	{
-	local($name,$num)=@_;
-	local($r,$a,$c0,$c1,$c2)=@_;
-	local($i,$as,$ae,$bs,$be,$ai,$bi);
-	local($b,$tot,$end,$half);
-
-	&function_begin_B($name,"");
-
-	$c0="ebx";
-	$c1="ecx";
-	$c2="ebp";
-	$a="esi";
-	$r="edi";
-
-	&push("esi");
-	 &push("edi");
-	&push("ebp");
-	 &push("ebx");
-	&mov($r,&wparam(0));
-	 &mov($a,&wparam(1));
-	&xor($c0,$c0);
-	 &xor($c1,$c1);
-	&mov("eax",&DWP(0,$a,"",0)); # load the first word
-
-	$as=0;
-	$ae=0;
-	$bs=0;
-	$be=0;
-	$tot=$num+$num-1;
-
-	for ($i=0; $i<$tot; $i++)
-		{
-		$ai=$as;
-		$bi=$bs;
-		$end=$be+1;
-
-		&comment("############### Calculate word $i");
-		for ($j=$bs; $j<$end; $j++)
-			{
-			&xor($c2,$c2) if ($j == $bs);
-			if (($ai-1) < ($bi+1))
-				{
-				$v=1;
-				$v=2 if ($i+1) == $tot;
-				}
-			else
-				{ $v=0; }
-			if (!$v)
-				{
-				$na=$ai-1;
-				$nb=$bi+1;
-				}
-			else
-				{
-				$na=$as+($i < ($num-1));
-				$nb=$bs+($i >= ($num-1));
-				}
-			if ($ai == $bi)
-				{
-				&sqr_add_c($r,$a,$ai,$bi,
-					$c0,$c1,$c2,$v,$i,$na,$nb);
-				}
-			else
-				{
-				&sqr_add_c2($r,$a,$ai,$bi,
-					$c0,$c1,$c2,$v,$i,$na,$nb);
-				}
-			if ($v)
-				{
-				&comment("saved r[$i]");
-				#&mov(&DWP($i*4,$r,"",0),$c0);
-				($c0,$c1,$c2)=($c1,$c2,$c0);
-				last;
-				}
-			$ai--;
-			$bi++;
-			}
-		$as++ if ($i < ($num-1));
-		$ae++ if ($i >= ($num-1));
-
-		$bs++ if ($i >= ($num-1));
-		$be++ if ($i < ($num-1));
-		}
-	&mov(&DWP($i*4,$r,"",0),$c0);
-	&pop("ebx");
-	&pop("ebp");
-	&pop("edi");
-	&pop("esi");
-	&ret();
-	&function_end_B($name);
-	}
-
-1;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/div.pl b/deps/openssl/openssl/crypto/bn/asm/x86/div.pl
deleted file mode 100644
index 0e90152caa..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86/div.pl
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/local/bin/perl
-# x86 assember
-
-sub bn_div_words
-	{
-	local($name)=@_;
-
-	&function_begin($name,"");
-	&mov("edx",&wparam(0));	#
-	&mov("eax",&wparam(1));	#
-	&mov("ebx",&wparam(2));	#
-	&div("ebx");
-	&function_end($name);
-	}
-1;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/f b/deps/openssl/openssl/crypto/bn/asm/x86/f
deleted file mode 100644
index 22e4112224..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86/f
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/local/bin/perl
-# x86 assember
-
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/mul.pl b/deps/openssl/openssl/crypto/bn/asm/x86/mul.pl
deleted file mode 100644
index 674cb9b055..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86/mul.pl
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/local/bin/perl
-# x86 assember
-
-sub bn_mul_words
-	{
-	local($name)=@_;
-
-	&function_begin($name,"");
-
-	&comment("");
-	$Low="eax";
-	$High="edx";
-	$a="ebx";
-	$w="ecx";
-	$r="edi";
-	$c="esi";
-	$num="ebp";
-
-	&xor($c,$c);		# clear carry
-	&mov($r,&wparam(0));	#
-	&mov($a,&wparam(1));	#
-	&mov($num,&wparam(2));	#
-	&mov($w,&wparam(3));	#
-
-	&and($num,0xfffffff8);	# num / 8
-	&jz(&label("mw_finish"));
-
-	&set_label("mw_loop",0);
-	for ($i=0; $i<32; $i+=4)
-		{
-		&comment("Round $i");
-
-		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);			# L(t)+=c
-		 # XXX
-
-		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
-
-		&mov($c,"edx");			# c=  H(t);
-		}
-
-	&comment("");
-	&add($a,32);
-	&add($r,32);
-	&sub($num,8);
-	&jz(&label("mw_finish"));
-	&jmp(&label("mw_loop"));
-
-	&set_label("mw_finish",0);
-	&mov($num,&wparam(2));	# get num
-	&and($num,7);
-	&jnz(&label("mw_finish2"));
-	&jmp(&label("mw_end"));
-
-	&set_label("mw_finish2",1);
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		 &mov("eax",&DWP($i*4,$a,"",0));# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);			# L(t)+=c
-		 # XXX
-		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
-		&mov($c,"edx");			# c=  H(t);
-		 &dec($num) if ($i != 7-1);
-		&jz(&label("mw_end")) if ($i != 7-1);
-		}
-	&set_label("mw_end",0);
-	&mov("eax",$c);
-
-	&function_end($name);
-	}
-
-1;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/mul_add.pl b/deps/openssl/openssl/crypto/bn/asm/x86/mul_add.pl
deleted file mode 100644
index 61830d3a90..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86/mul_add.pl
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/local/bin/perl
-# x86 assember
-
-sub bn_mul_add_words
-	{
-	local($name)=@_;
-
-	&function_begin($name,"");
-
-	&comment("");
-	$Low="eax";
-	$High="edx";
-	$a="ebx";
-	$w="ebp";
-	$r="edi";
-	$c="esi";
-
-	&xor($c,$c);		# clear carry
-	&mov($r,&wparam(0));	#
-
-	&mov("ecx",&wparam(2));	#
-	&mov($a,&wparam(1));	#
-
-	&and("ecx",0xfffffff8);	# num / 8
-	&mov($w,&wparam(3));	#
-
-	&push("ecx");		# Up the stack for a tmp variable
-
-	&jz(&label("maw_finish"));
-
-	&set_label("maw_loop",0);
-
-	&mov(&swtmp(0),"ecx");	#
-
-	for ($i=0; $i<32; $i+=4)
-		{
-		&comment("Round $i");
-
-		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);		# L(t)+= *r
-		 &mov($c,&DWP($i,$r,"",0));	# L(t)+= *r
-		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",$c);		# L(t)+=c
-		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
-		&mov($c,"edx");			# c=  H(t);
-		}
-
-	&comment("");
-	&mov("ecx",&swtmp(0));	#
-	&add($a,32);
-	&add($r,32);
-	&sub("ecx",8);
-	&jnz(&label("maw_loop"));
-
-	&set_label("maw_finish",0);
-	&mov("ecx",&wparam(2));	# get num
-	&and("ecx",7);
-	&jnz(&label("maw_finish2"));	# helps branch prediction
-	&jmp(&label("maw_end"));
-
-	&set_label("maw_finish2",1);
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		 &mov("eax",&DWP($i*4,$a,"",0));# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);			# L(t)+=c
-		 &mov($c,&DWP($i*4,$r,"",0));	# L(t)+= *r
-		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",$c);
-		&adc("edx",0);			# H(t)+=carry
-		 &dec("ecx") if ($i != 7-1);
-		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
-		 &mov($c,"edx");			# c=  H(t);
-		&jz(&label("maw_end")) if ($i != 7-1);
-		}
-	&set_label("maw_end",0);
-	&mov("eax",$c);
-
-	&pop("ecx");	# clear variable from
-
-	&function_end($name);
-	}
-
-1;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/sqr.pl b/deps/openssl/openssl/crypto/bn/asm/x86/sqr.pl
deleted file mode 100644
index 1f90993cf6..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86/sqr.pl
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/local/bin/perl
-# x86 assember
-
-sub bn_sqr_words
-	{
-	local($name)=@_;
-
-	&function_begin($name,"");
-
-	&comment("");
-	$r="esi";
-	$a="edi";
-	$num="ebx";
-
-	&mov($r,&wparam(0));	#
-	&mov($a,&wparam(1));	#
-	&mov($num,&wparam(2));	#
-
-	&and($num,0xfffffff8);	# num / 8
-	&jz(&label("sw_finish"));
-
-	&set_label("sw_loop",0);
-	for ($i=0; $i<32; $i+=4)
-		{
-		&comment("Round $i");
-		&mov("eax",&DWP($i,$a,"",0)); 	# *a
-		 # XXX
-		&mul("eax");			# *a * *a
-		&mov(&DWP($i*2,$r,"",0),"eax");	#
-		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
-		}
-
-	&comment("");
-	&add($a,32);
-	&add($r,64);
-	&sub($num,8);
-	&jnz(&label("sw_loop"));
-
-	&set_label("sw_finish",0);
-	&mov($num,&wparam(2));	# get num
-	&and($num,7);
-	&jz(&label("sw_end"));
-
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		&mov("eax",&DWP($i*4,$a,"",0));	# *a
-		 # XXX
-		&mul("eax");			# *a * *a
-		&mov(&DWP($i*8,$r,"",0),"eax");	#
-		 &dec($num) if ($i != 7-1);
-		&mov(&DWP($i*8+4,$r,"",0),"edx");
-		 &jz(&label("sw_end")) if ($i != 7-1);
-		}
-	&set_label("sw_end",0);
-
-	&function_end($name);
-	}
-
-1;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86/sub.pl b/deps/openssl/openssl/crypto/bn/asm/x86/sub.pl
deleted file mode 100644
index 837b0e1b07..0000000000
--- a/deps/openssl/openssl/crypto/bn/asm/x86/sub.pl
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/local/bin/perl
-# x86 assember
-
-sub bn_sub_words
-	{
-	local($name)=@_;
-
-	&function_begin($name,"");
-
-	&comment("");
-	$a="esi";
-	$b="edi";
-	$c="eax";
-	$r="ebx";
-	$tmp1="ecx";
-	$tmp2="edx";
-	$num="ebp";
-
-	&mov($r,&wparam(0));	# get r
-	 &mov($a,&wparam(1));	# get a
-	&mov($b,&wparam(2));	# get b
-	 &mov($num,&wparam(3));	# get num
-	&xor($c,$c);		# clear carry
-	 &and($num,0xfffffff8);	# num / 8
-
-	&jz(&label("aw_finish"));
-
-	&set_label("aw_loop",0);
-	for ($i=0; $i<8; $i++)
-		{
-		&comment("Round $i");
-
-		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
-		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
-		&sub($tmp1,$c);
-		 &mov($c,0);
-		&adc($c,$c);
-		 &sub($tmp1,$tmp2);
-		&adc($c,0);
-		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
-		}
-
-	&comment("");
-	&add($a,32);
-	 &add($b,32);
-	&add($r,32);
-	 &sub($num,8);
-	&jnz(&label("aw_loop"));
-
-	&set_label("aw_finish",0);
-	&mov($num,&wparam(3));	# get num
-	&and($num,7);
-	 &jz(&label("aw_end"));
-
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
-		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
-		&sub($tmp1,$c);
-		 &mov($c,0);
-		&adc($c,$c);
-		 &sub($tmp1,$tmp2);
-		&adc($c,0);
-		 &dec($num) if ($i != 6);
-		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *a
-		 &jz(&label("aw_end")) if ($i != 6);
-		}
-	&set_label("aw_end",0);
-
-#	&mov("eax",$c);		# $c is "eax"
-
-	&function_end($name);
-	}
-
-1;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-gcc.c b/deps/openssl/openssl/crypto/bn/asm/x86_64-gcc.c
index 1729b479d4..0ff3805a61 100644
--- a/deps/openssl/openssl/crypto/bn/asm/x86_64-gcc.c
+++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-gcc.c
@@ -1,3 +1,12 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
 #include "../bn_lcl.h"
 #if !(defined(__GNUC__) && __GNUC__>=2)
 # include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
@@ -216,9 +225,10 @@ BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                   "       adcq    (%5,%2,8),%0    \n"
                   "       movq    %0,(%3,%2,8)    \n"
                   "       lea     1(%2),%2        \n"
-                  "       loop    1b              \n"
-                  "       sbbq    %0,%0           \n":"=&r" (ret), "+c"(n),
-                  "+r"(i)
+                  "       dec     %1              \n"
+                  "       jnz     1b              \n"
+                  "       sbbq    %0,%0           \n"
+                  :"=&r" (ret), "+c"(n), "+r"(i)
                   :"r"(rp), "r"(ap), "r"(bp)
                   :"cc", "memory");
 
@@ -242,9 +252,10 @@ BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                   "       sbbq    (%5,%2,8),%0    \n"
                   "       movq    %0,(%3,%2,8)    \n"
                   "       lea     1(%2),%2        \n"
-                  "       loop    1b              \n"
-                  "       sbbq    %0,%0           \n":"=&r" (ret), "+c"(n),
-                  "+r"(i)
+                  "       dec     %1              \n"
+                  "       jnz     1b              \n"
+                  "       sbbq    %0,%0           \n"
+                  :"=&r" (ret), "+c"(n), "+r"(i)
                   :"r"(rp), "r"(ap), "r"(bp)
                   :"cc", "memory");
 
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-gf2m.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-gf2m.pl
index 42bbec2fb7..d962f62033 100644
--- a/deps/openssl/openssl/crypto/bn/asm/x86_64-gf2m.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-gf2m.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -31,7 +38,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 ($lo,$hi)=("%rax","%rdx");	$a=$lo;
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl
index 80492d8e63..df4cca5bfe 100755
--- a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -50,7 +57,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
@@ -129,7 +136,9 @@ $code.=<<___;
 	neg	$num			# restore $num
 	and	\$-1024,%r10		# minimize TLB usage
 
-	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# An OS-agnostic version of __chkstk.
+	#
+	# Some OSes (Windows) insist on stack being "wired" to
 	# physical memory in strictly sequential manner, i.e. if stack
 	# allocation spans two pages, then reference to farmost one can
 	# be punishable by SEGV. But page walking can do good even on
diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl
index 42178e455a..5779059ea2 100755
--- a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl
+++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -35,7 +42,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
@@ -113,7 +120,9 @@ $code.=<<___;
 	neg	$num			# restore $num
 	and	\$-1024,%r10		# minimize TLB usage
 
-	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# An OS-agnostic version of __chkstk.
+	#
+	# Some OSes (Windows) insist on stack being "wired" to
 	# physical memory in strictly sequential manner, i.e. if stack
 	# allocation spans two pages, then reference to farmost one can
 	# be punishable by SEGV. But page walking can do good even on