Diffstat (limited to 'deps/openssl/openssl/crypto/modes/asm')
12 files changed, 946 insertions, 89 deletions
diff --git a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
index 980cfd23ef..5ad62b3979 100644
--- a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -22,10 +29,11 @@
 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
 # pressure with notable relative improvement, achieving 1.0 cycle per
-# byte processed with 128-bit key on Haswell processor, and 0.74 -
-# on Broadwell. [Mentioned results are raw profiled measurements for
-# favourable packet size, one divisible by 96. Applications using the
-# EVP interface will observe a few percent worse performance.]
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -60,7 +68,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
 	$avx = ($2>=3.0) + ($2>3.0);
 }
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 if ($avx>1) {{{
@@ -108,6 +116,23 @@ _aesni_ctr32_ghash_6x:
 	  vpxor		$rndkey,$inout3,$inout3
 	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
 	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
+
+	# At this point, the current block of 96 (0x60) bytes has already been
+	# loaded into registers. Concurrently with processing it, we want to
+	# load the next 96 bytes of input for the next round. Obviously, we can
+	# only do this if there are at least 96 more bytes of input beyond the
+	# input we're currently processing, or else we'd read past the end of
+	# the input buffer. Here, we set |%r12| to 96 if there are at least 96
+	# bytes of input beyond the 96 bytes we're already processing, and we
+	# set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
+	# we'll read in the next block so that it is in registers for the next
+	# loop iteration. In the case where we set |%r12| to 0, we'll re-read
+	# the current block and then ignore what we re-read.
+	#
+	# At this point, |$in0| points to the current (already read into
+	# registers) block, and |$end0| points to 2*96 bytes before the end of
+	# the input. Thus, |$in0| > |$end0| means that we do not have the next
+	# 96-byte block to read in, and |$in0| <= |$end0| means we do.
 	xor		%r12,%r12
 	cmp		$in0,$end0
@@ -400,6 +425,9 @@ $code.=<<___;
 .align	32
 aesni_gcm_decrypt:
 	xor	$ret,$ret
+
+	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
+	# bytes of input.
 	cmp	\$0x60,$len			# minimal accepted length
 	jb	.Lgcm_dec_abort
@@ -454,7 +482,15 @@ $code.=<<___;
 	vmovdqu	0x50($inp),$Z3		# I[5]
 	lea	($inp),$in0
 	vmovdqu	0x40($inp),$Z0
+
+	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+	# bytes before the end of the input. Note, in particular, that this is
+	# correct even if |$len| is not an even multiple of 96 or 16. XXX: This
+	# seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
+	# not be near the very beginning of the address space when |$len| < 2*96
+	# (0xc0).
 	lea	-0xc0($inp,$len),$end0
+
 	vmovdqu	0x30($inp),$Z1
 	shr	\$4,$len
 	xor	$ret,$ret
@@ -610,6 +646,10 @@ _aesni_ctr32_6x:
 .align	32
 aesni_gcm_encrypt:
 	xor	$ret,$ret
+
+	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
+	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
+	# least 96 more bytes of input.
 	cmp	\$0x60*3,$len			# minimal accepted length
 	jb	.Lgcm_enc_abort
@@ -659,7 +699,16 @@ $code.=<<___;
 .Lenc_no_key_aliasing:
 
 	lea	($out),$in0
+
+	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+	# bytes before the end of the input. Note, in particular, that this is
+	# correct even if |$len| is not an even multiple of 96 or 16. Unlike in
+	# the decryption case, there's no caveat that |$out| must not be near
+	# the very beginning of the address space, because we know that
+	# |$len| >= 3*96 from the check above, and so we know
+	# |$out| + |$len| >= 2*96 (0xc0).
 	lea	-0xc0($out,$len),$end0
+
 	shr	\$4,$len
 
 	call	_aesni_ctr32_6x
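The new comments in aesni-gcm-x86_64.pl above describe the block look-ahead in prose. As a rough illustration only (plain Perl; 0x60 and 0xc0 are the patch's real constants, everything else is invented for this sketch):

	use strict; use warnings;

	my ($inp, $len) = (0x1000, 0x200);	# hypothetical input buffer
	my $end0 = $inp + $len - 0xc0;		# 2*96 bytes before the end
	my $in0  = $inp;			# current 96-byte block
	while (1) {
	    my $r12 = ($in0 <= $end0) ? 0x60 : 0;	# next block available?
	    last if $r12 == 0;		# the real code re-reads and discards here
	    $in0 += $r12;		# advance; the next block was prefetched
	}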
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl
index aa36029386..ccf6b2bd6f 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-alpha.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -454,7 +461,7 @@ rem_4bit:
 .align	4
 ___
 
-$output=shift and open STDOUT,">$output";
+$output=pop and open STDOUT,">$output";
 print $code;
 close STDOUT;
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
index 8ccc963ef2..7d880c94a7 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -42,8 +49,8 @@
 # below and combine it with reduction algorithm from x86 module.
 # Performance improvement over previous version varies from 65% on
 # Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
-# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
-# in 9.33.
+# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
+# Snapdragon S4 - in 9.33.
 #
 # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 # Polynomial Multiplication on ARM Processors using the NEON Engine.
@@ -71,8 +78,20 @@
 # *native* byte order on current platform. See gcm128.c for working
 # example...
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $Xi="r0";	# argument block
 $Htbl="r1";
@@ -124,11 +143,18 @@ $code=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
 .code	32
+#endif
 
-#ifdef  __clang__
-#define ldrplb  ldrbpl
-#define ldrneb  ldrbne
+#ifdef __clang__
+#define ldrplb	ldrbpl
+#define ldrneb	ldrbne
 #endif
 
 .type	rem_4bit,%object
@@ -142,19 +168,27 @@ rem_4bit:
 .type	rem_4bit_get,%function
 rem_4bit_get:
-	sub	$rem_4bit,pc,#8
-	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
+#if defined(__thumb2__)
+	adr	$rem_4bit,rem_4bit
+#else
+	sub	$rem_4bit,pc,#8+32	@ &rem_4bit
+#endif
 	b	.Lrem_4bit_got
 	nop
+	nop
 .size	rem_4bit_get,.-rem_4bit_get
 
 .global	gcm_ghash_4bit
 .type	gcm_ghash_4bit,%function
+.align	4
 gcm_ghash_4bit:
-	sub	r12,pc,#8
+#if defined(__thumb2__)
+	adr	r12,rem_4bit
+#else
+	sub	r12,pc,#8+48		@ &rem_4bit
+#endif
 	add	$len,$inp,$len		@ $len to point at the end
 	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
-	sub	r12,r12,#48		@ &rem_4bit
 
 	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
 	stmdb	sp!,{r4-r11}		@ ... to stack
@@ -201,6 +235,9 @@ gcm_ghash_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
 	eor	$Zhl,$Thl,$Zhl,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
 	ldrplb	$nlo,[$inp,$cnt]
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
@@ -211,6 +248,9 @@ gcm_ghash_4bit:
 	add	$nhi,$nhi,$nhi
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	eor	$Zll,$Tll,$Zll,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
 	ldrplb	$Tll,[$Xi,$cnt]
 	eor	$Zll,$Zll,$Zlh,lsl#28
 	eor	$Zlh,$Tlh,$Zlh,lsr#4
@@ -218,8 +258,14 @@ gcm_ghash_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
+#ifdef	__thumb2__
+	it	pl
+#endif
 	eorpl	$nlo,$nlo,$Tll
 	eor	$Zhh,$Thh,$Zhh,lsr#4
+#ifdef	__thumb2__
+	itt	pl
+#endif
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
 	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
@@ -229,7 +275,11 @@ gcm_ghash_4bit:
 	add	$inp,$inp,#16
 	mov	$nhi,$Zll
 ___
-	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+	&Zsmash("cmp\t$inp,$len","\n".
+	 "#ifdef __thumb2__\n".
+	 "	it	ne\n".
+	 "#endif\n".
+	 "	ldrneb	$nlo,[$inp,#15]");
$code.=<<___;
 	bne	.Louter
 
@@ -287,6 +337,9 @@ gcm_gmult_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
 	eor	$Zhl,$Thl,$Zhl,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
 	ldrplb	$nlo,[$Xi,$cnt]
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
@@ -304,6 +357,9 @@ gcm_gmult_4bit:
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
+#ifdef	__thumb2__
+	itt	pl
+#endif
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
@@ -378,9 +434,9 @@ $code.=<<___;
 .type	gcm_init_neon,%function
 .align	4
 gcm_init_neon:
-	vld1.64		$IN#hi,[r1,:64]!	@ load H
+	vld1.64		$IN#hi,[r1]!		@ load H
 	vmov.i8		$t0,#0xe1
-	vld1.64		$IN#lo,[r1,:64]
+	vld1.64		$IN#lo,[r1]
 	vshl.i64	$t0#hi,#57
 	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
 	vdup.8		$t1,$IN#hi[7]
@@ -399,8 +455,8 @@ gcm_init_neon:
 .type	gcm_gmult_neon,%function
 .align	4
 gcm_gmult_neon:
-	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi
-	vld1.64		$IN#lo,[$Xi,:64]!
+	vld1.64		$IN#hi,[$Xi]!		@ load Xi
+	vld1.64		$IN#lo,[$Xi]!
 	vmov.i64	$k48,#0x0000ffffffffffff
 	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
 	vmov.i64	$k32,#0x00000000ffffffff
@@ -417,8 +473,8 @@ gcm_gmult_neon:
 .type	gcm_ghash_neon,%function
 .align	4
 gcm_ghash_neon:
-	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi
-	vld1.64		$Xl#lo,[$Xi,:64]!
+	vld1.64		$Xl#hi,[$Xi]!		@ load Xi
+	vld1.64		$Xl#lo,[$Xi]!
 	vmov.i64	$k48,#0x0000ffffffffffff
 	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
 	vmov.i64	$k32,#0x00000000ffffffff
@@ -473,8 +529,8 @@ $code.=<<___;
 	vrev64.8	$Xl,$Xl
 #endif
 	sub	$Xi,#16
-	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
-	vst1.64		$Xl#lo,[$Xi,:64]
+	vst1.64		$Xl#hi,[$Xi]!		@ write out Xi
+	vst1.64		$Xl#lo,[$Xi]
 
 	ret					@ bx lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-c64xplus.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-c64xplus.pl
new file mode 100644
index 0000000000..3cadda3994
--- /dev/null
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-c64xplus.pl
@@ -0,0 +1,247 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2011
+#
+# The module implements GCM GHASH function and underlying single
+# multiplication operation in GF(2^128). Even though subroutines
+# have _4bit suffix, they are not using any tables, but rely on
+# hardware Galois Field Multiply support. Streamed GHASH processes
+# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
+# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
+# comparing apples vs. oranges, but compiler surely could have done
+# better, because theoretical [though not necessarily achievable]
+# estimate for "4-bit" table-driven implementation is ~12 cycles.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");	# arguments
+
+($Z0,$Z1,$Z2,$Z3,	$H0, $H1, $H2, $H3,
+			$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
+($H01u,$H01y,$H2u,$H3u,	$H0y,$H1y,$H2y,$H3y,
+			$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
+($FF000000,$E10000)=("B30","B31");
+($xip,$x0,$x1,$xib)=map("B$_",(6..9));	# $xip zaps $len
+ $xia="A9";
+($rem,$res)=("B4","B5");		# $rem zaps $Htable
+
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	gcm_gmult_1bit,_gcm_gmult_1bit
+	.asg	gcm_gmult_4bit,_gcm_gmult_4bit
+	.asg	gcm_ghash_4bit,_gcm_ghash_4bit
+	.endif
+
+	.asg	B3,RA
+
+	.if	0
+	.global	_gcm_gmult_1bit
+_gcm_gmult_1bit:
+	ADDAD	$Htable,2,$Htable
+	.endif
+	.global	_gcm_gmult_4bit
+_gcm_gmult_4bit:
+	.asmfunc
+	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
+	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
+||	MV	$Xip,${xip}		; reassign Xi
+||	MVK	15,B1			; SPLOOPD constant
+
+	MVK	0xE1,$E10000
+||	LDBU	*++${xip}[15],$x1	; Xi[15]
+	MVK	0xFF,$FF000000
+||	LDBU	*--${xip},$x0		; Xi[14]
+	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
+	SHL	$FF000000,24,$FF000000	; upper byte mask
+||	BNOP	ghash_loop?
+||	MVK	1,B0			; take a single spin
+
+	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
+	AND	$H2,$FF000000,$H2u	; H2's upper byte
+	AND	$H3,$FF000000,$H3u	; H3's upper byte
+||	SHRU	$H2u,8,$H2u
+	SHRU	$H3u,8,$H3u
+||	ZERO	$Z1:$Z0
+	SHRU2	$xia,8,$H01u
+||	ZERO	$Z3:$Z2
+	.endasmfunc
+
+	.global	_gcm_ghash_4bit
+_gcm_ghash_4bit:
+	.asmfunc
+	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
+||	SHRU	$len,4,B0		; reassign len
+	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
+||	MV	$Xip,${xip}		; reassign Xi
+||	MVK	15,B1			; SPLOOPD constant
+
+	MVK	0xE1,$E10000
+|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
+	MVK	0xFF,$FF000000
+|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
+	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
+||	LDDW	*${xip}[1],$Z1:$Z0
+	SHL	$FF000000,24,$FF000000	; upper byte mask
+||	LDDW	*${xip}[0],$Z3:$Z2
+
+	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
+	AND	$H2,$FF000000,$H2u	; H2's upper byte
+	AND	$H3,$FF000000,$H3u	; H3's upper byte
+||	SHRU	$H2u,8,$H2u
+	SHRU	$H3u,8,$H3u
+	SHRU2	$xia,8,$H01u
+
+|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+	.if	.LITTLE_ENDIAN
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.else
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z0,8,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.endif
+	STDW	$Z3:$Z2,*${xip}[0]
+|| [B0]	ZERO	$Z3:$Z2
+|| [B0]	MV	$xia,$x1
+   [B0]	ADDK	14,${xip}
+
+ghash_loop?:
+	SPLOOPD	6			; 6*16+7
+||	MVC	B1,ILC
+|| [B0]	SUB	B0,1,B0
+||	ZERO	A0
+||	ADD	$x1,$x1,$xib		; SHL $x1,1,$xib
+||	SHL	$x1,1,$xia
+___
+
+########____________________________
+#  0    D2.     M1          M2      |
+#  1             M1                 |
+#  2             M1  M2             |
+#  3    D1.     M1  M2              |
+#  4    S1. L1                      |
+#  5    S2  S1x L1      D2  L2      |____________________________
+#  6/0      L1  S1          L2  S2x |D2.     M1          M2      |
+#  7/1      L1  S1  D1x S2  M2      |         M1                 |
+#  8/2          S1  L1x S2          |         M1  M2             |
+#  9/3          S1  L1x             |D1.     M1  M2              |
+# 10/4              D1x             |S1. L1                      |
+# 11/5                              |S2  S1x L1      D2  L2      |____________
+# 12/6/0                    D1x   __|    L1  S1          L2  S2x |D2.  ....
+#    7/1                                 L1  S1  D1x S2  M2      |     ....
+#    8/2                                     S1  L1x S2          |     ....
+#####...                                      ................|............
+$code.=<<___;
+	XORMPY	$H0,$xia,$H0x		; 0	; H·(Xi[i]<<1)
+||	XORMPY	$H01u,$xib,$H01y
+|| [A0]	LDBU	*--${xip},$x0
+	XORMPY	$H1,$xia,$H1x		; 1
+	XORMPY	$H2,$xia,$H2x		; 2
+||	XORMPY	$H2u,$xib,$H2y
+	XORMPY	$H3,$xia,$H3x		; 3
+||	XORMPY	$H3u,$xib,$H3y
+||[!A0]	MVK.D	15,A0			; *--${xip} counter
+	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·(Xi[i]<<1)
+|| [A0]	SUB.S	A0,1,A0
+	XOR.L	$H1x,$Z1,$Z1		; 5
+||	AND.D	$H01y,$FF000000,$H0z
+||	SWAP2.L	$H01y,$H1y		;	; SHL $H01y,16,$H1y
+||	SHL	$x0,1,$xib
+||	SHL	$x0,1,$xia
+
+	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
+||	SHL	$Z0,1,$rem		;	; rem=Z<<1
+||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
+||	AND.L	$H1y,$FF000000,$H1z
+	XOR.L	$H3x,$Z3,$Z3		; 7/1
+||	SHRMB.S	$Z2,$Z1,$Z1
+||	XOR.D	$H0z,$Z0,$Z0		; merge upper byte products
+||	AND.S	$H2y,$FF000000,$H2z
+||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
+	XOR.L	$H1z,$Z1,$Z1		; 8/2
+||	SHRMB.S	$Z3,$Z2,$Z2
+||	AND.S	$H3y,$FF000000,$H3z
+	XOR.L	$H2z,$Z2,$Z2		; 9/3
+||	SHRU	$Z3,8,$Z3
+	XOR.D	$H3z,$Z3,$Z3		; 10/4
+	NOP				; 11/5
+
+	SPKERNEL 0,2
+||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res
+
+	; input pre-fetch is possible where D1 slot is available...
+   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
+   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
+	NOP				; 10/-
+	.if	.LITTLE_ENDIAN
+	SWAP2	$Z0,$Z1			; 11/-
+||	SWAP4	$Z1,$Z0
+	SWAP4	$Z1,$Z1			; 12/-
+||	SWAP2	$Z0,$Z0
+	SWAP2	$Z2,$Z3
+||	SWAP4	$Z3,$Z2
+||[!B0]	BNOP	RA
+	SWAP4	$Z3,$Z3
+||	SWAP2	$Z2,$Z2
+|| [B0]	BNOP	ghash_loop?
+   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.else
+  [!B0]	BNOP	RA			; 11/-
+   [B0]	BNOP	ghash_loop?		; 12/-
+   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z0,8,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.endif
+	STDW	$Z3:$Z2,*${xip}[0]
+|| [B0]	ZERO	$Z3:$Z2
+|| [B0]	MV	$xia,$x1
+   [B0]	ADDK	14,${xip}
+	.endasmfunc
+
+	.sect	.const
+	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
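The new C64x+ module above leans on the hardware Galois Field multiply (the XORMPY instructions) rather than tables, so the file contains no portable reference. For cross-checking, the same GF(2^128) multiplication can be sketched bit-by-bit in plain Perl per NIST SP 800-38D; this is a slow reference only, a 64-bit perl is assumed, and the function name and test value are chosen for this sketch (the test value is the well-known AES-128 zero-key encryption of the zero block):

	use strict; use warnings;

	sub gf128_mul {			# ($xh,$xl,$yh,$yl) -> ($zh,$zl)
	    my ($xh,$xl,$yh,$yl) = @_;
	    my ($zh,$zl) = (0,0);
	    my ($vh,$vl) = ($yh,$yl);
	    for my $i (0..127) {	# bit 0 is the most significant
	        my $bit = $i < 64 ? ($xh >> (63-$i)) & 1 : ($xl >> (127-$i)) & 1;
	        if ($bit) { $zh ^= $vh; $zl ^= $vl; }
	        my $carry = $vl & 1;			# V >>= 1, then reduce
	        $vl = ($vl >> 1) | (($vh & 1) << 63);	# if a bit fell off
	        $vh >>= 1;
	        $vh ^= 0xe100000000000000 if $carry;	# x^128+x^7+x^2+x+1
	    }
	    return ($zh,$zl);
	}

	my ($h_hi,$h_lo) = (0x66e94bd4ef8a2c3b, 0x884cfa59ca342b2e);
	printf "H^2 = %016x%016x\n", gf128_mul($h_hi,$h_lo,$h_hi,$h_lo);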
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
index 0354c95444..81e75f71a8 100755
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -32,7 +39,7 @@
 # Itanium performance should remain the same as the "256B" version,
 # i.e. ~8.5 cycles.
 
-$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");
 
 if ($^O eq "hpux") {
     $ADDP="addp4";
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
index d5ad96b403..1d6254543b 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
index be7d55f748..6e628d8823 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -47,7 +54,7 @@ if ($flavour =~ /3[12]/) {
 	$g="g";
 }
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $softonly=0;
@@ -81,9 +88,6 @@ gcm_gmult_4bit:
___
$code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
 	larl	%r1,OPENSSL_s390xcap_P
-	lg	%r0,0(%r1)
-	tmhl	%r0,0x4000	# check for message-security-assist
-	jz	.Lsoft_gmult
 	lghi	%r0,0
 	lg	%r1,24(%r1)	# load second word of kimd capabilities vector
 	tmhh	%r1,0x4000	# check for function 65
@@ -119,14 +123,8 @@ gcm_ghash_4bit:
___
$code.=<<___ if(!$softonly);
 	larl	%r1,OPENSSL_s390xcap_P
-	lg	%r0,0(%r1)
-	tmhl	%r0,0x4000	# check for message-security-assist
-	jz	.Lsoft_ghash
-	lghi	%r0,0
-	la	%r1,16($sp)
-	.long	0xb93e0004	# kimd %r0,%r4
-	lg	%r1,24($sp)
-	tmhh	%r1,0x4000	# check for function 65
+	lg	%r0,24(%r1)	# load second word of kimd capabilities vector
+	tmhh	%r0,0x4000	# check for function 65
 	jz	.Lsoft_ghash
 	lghi	%r0,65		# function 65
 	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl
index b129ba706f..c4eb3b1f02 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-sparcv9.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -46,14 +53,12 @@
 # saturates at ~15.5x single-process result on 8-core processor,
 # or ~20.5GBps per 2.85GHz socket.
 
-$bits=32;
-for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64)	{ $bias=2047; $frame=192; }
-else		{ $bias=0;    $frame=112; }
-
-$output=shift;
+$output=pop;
 open STDOUT,">$output";
 
+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
+
 $Zhi="%o0";	# 64-bit values
 $Zlo="%o1";
 $Thi="%o2";
@@ -75,11 +80,14 @@
 $Htbl="%i1";
 $inp="%i2";
 $len="%i3";
 
-$code.=<<___ if ($bits==64);
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef	__arch64__
 .register	%g2,#scratch
 .register	%g3,#scratch
-___
-$code.=<<___;
+#endif
+
 .section	".text",#alloc,#execinstr
 
 .align	64
@@ -183,7 +191,7 @@ gcm_ghash_4bit:
 	add	$inp,16,$inp
 	cmp	$inp,$len
-	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
+	be,pn	SIZE_T_CC,.Ldone
 	and	$Zlo,0xf,$remi
 
 	ldx	[$Htblo+$nhi],$Tlo
@@ -532,7 +540,7 @@ ___
 
 # Purpose of these subroutines is to explicitly encode VIS instructions,
 # so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 # Idea is to reserve for option to produce "universal" binary and let
 # programmer detect if current CPU is VIS capable at run-time.
 sub unvis3 {
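The comment in the last ghash-sparcv9.pl hunk explains why VIS instructions are hand-encoded: the raw instruction word is emitted with .word, so a VIS-oblivious assembler still accepts the module. A rough sketch of the idea in Perl (the field layout follows SPARC format 3 with the IMPDEP1 opcode; treat the opf table entry as illustrative rather than authoritative — the module's real unvis3 is the reference):

	my %visopf = ( "faligndata" => 0x048 );	# opf value, illustrative
	sub unvis_sketch {
	    my ($mnemonic,$rs1,$rs2,$rd) = @_;
	    my $word = 2<<30 | $rd<<25 | 0x36<<19 | $rs1<<14
	             | $visopf{$mnemonic}<<5 | $rs2;
	    sprintf ".word\t0x%08x !%s\t%%f%d,%%f%d,%%f%d",
	            $word, $mnemonic, $rs1, $rs2, $rd;
	}
	print unvis_sketch("faligndata", 0, 2, 4), "\n";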
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
index 0269169fa7..cd8458256e 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -88,7 +95,7 @@
 # where Tproc is time required for Karatsuba pre- and post-processing,
 # is more realistic estimate. In this case it gives ... 1.91 cycles.
 # Or in other words, depending on how well we can interleave reduction
-# and one of the two multiplications the performance should be betwen
+# and one of the two multiplications the performance should be between
 # 1.91 and 2.16. As already mentioned, this implementation processes
 # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
 # - in 2.02. x86_64 performance is better, because larger register
@@ -129,6 +136,9 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output=pop;
+open STDOUT,">$output";
+
 &asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
 
 $sse2=0;
@@ -712,7 +722,7 @@ sub mmx_loop() {
     &pxor	($red[1],$red[1]);
     &pxor	($red[2],$red[2]);
 
-    # Just like in "May" verson modulo-schedule for critical path in
+    # Just like in "May" version modulo-schedule for critical path in
     # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
     # is scheduled so late that rem_8bit[] has to be shifted *right*
     # by 16, which is why last argument to pinsrw is 2, which
@@ -1138,7 +1148,7 @@ my ($Xhi,$Xi) = @_;
     &movdqu		(&QWP(0,$Xip),$Xi);
 &function_end("gcm_ghash_clmul");
 
-} else {		# Algorith 5. Kept for reference purposes.
+} else {		# Algorithm 5. Kept for reference purposes.
 
 sub reduction_alg5 {	# 19/16 times faster than Intel version
 my ($Xhi,$Xi)=@_;
@@ -1369,6 +1379,8 @@ my ($Xhi,$Xi)=@_;
 &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
 &asm_finish();
 
+close STDOUT;
+
 # A question was risen about choice of vanilla MMX. Or rather why wasn't
 # SSE2 chosen instead? In addition to the fact that MMX runs on legacy
 # CPUs such as PIII, "4-bit" MMX version was observed to provide better
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
index f889f20187..387e3f854e 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -64,8 +71,10 @@
 # Ivy Bridge	1.80(+7%)
 # Haswell	0.55(+93%) (if system doesn't support AVX)
 # Broadwell	0.45(+110%)(if system doesn't support AVX)
+# Skylake	0.44(+110%)(if system doesn't support AVX)
 # Bulldozer	1.49(+27%)
 # Silvermont	2.88(+13%)
+# Goldmont	1.08(+24%)
 
 # March 2013
 #
@@ -74,8 +83,8 @@
 # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
 # sub-optimally in comparison to above mentioned version. But thanks
 # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-# it performs in 0.41 cycles per byte on Haswell processor, and in
-# 0.29 on Broadwell.
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 
@@ -109,7 +118,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
 	$avx = ($2>=3.0) + ($2>3.0);
 }
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 $do4xaggr=1;
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
index 71457cf4fc..f0598cb28c 100755
--- a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -18,6 +25,12 @@
 # faster than "4-bit" integer-only compiler-generated 64-bit code.
 # "Initial version" means that there is room for futher improvement.
 
+# May 2016
+#
+# 2x aggregated reduction improves performance by 50% (resulting
+# performance on POWER8 is 1 cycle per processed byte), and 4x
+# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
+
 $flavour=shift;
 $output =shift;
 
@@ -27,14 +40,21 @@ if ($flavour =~ /64/) {
 	$STU="stdu";
 	$POP="ld";
 	$PUSH="std";
+	$UCMP="cmpld";
+	$SHRI="srdi";
 } elsif ($flavour =~ /32/) {
 	$SIZE_T=4;
 	$LRSAVE=$SIZE_T;
 	$STU="stwu";
 	$POP="lwz";
 	$PUSH="stw";
+	$UCMP="cmplw";
+	$SHRI="srwi";
 } else { die "nonsense $flavour"; }
 
+$sp="r1";
+$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -46,6 +66,7 @@ my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
 
 my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
 my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
 my $vrsave="r12";
 
 $code=<<___;
@@ -56,7 +77,7 @@ $code=<<___;
 .globl	.gcm_init_p8
 .align	5
 .gcm_init_p8:
-	lis	r0,0xfff0
+	li	r0,-4096
 	li	r8,0x10
 	mfspr	$vrsave,256
 	li	r9,0x20
@@ -78,17 +99,103 @@ $code=<<___;
 	vsl	$H,$H,$t0		# H<<=1
 	vsrab	$t1,$t1,$t2		# broadcast carry bit
 	vand	$t1,$t1,$xC2
-	vxor	$H,$H,$t1		# twisted H
+	vxor	$IN,$H,$t1		# twisted H
 
-	vsldoi	$H,$H,$H,8		# twist even more ...
+	vsldoi	$H,$IN,$IN,8		# twist even more ...
 	vsldoi	$xC2,$zero,$xC2,8	# 0xc2.0
 	vsldoi	$Hl,$zero,$H,8		# ... and split
 	vsldoi	$Hh,$H,$zero,8
 
 	stvx_u	$xC2,0,r3		# save pre-computed table
 	stvx_u	$Hl,r8,r3
+	li	r8,0x40
 	stvx_u	$H, r9,r3
+	li	r9,0x50
 	stvx_u	$Hh,r10,r3
+	li	r10,0x60
+
+	vpmsumd	$Xl,$IN,$Hl		# H.lo·H.lo
+	vpmsumd	$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
+	vpmsumd	$Xh,$IN,$Hh		# H.hi·H.hi
+
+	vpmsumd	$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi	$t0,$Xm,$zero,8
+	vsldoi	$t1,$zero,$Xm,8
+	vxor	$Xl,$Xl,$t0
+	vxor	$Xh,$Xh,$t1
+
+	vsldoi	$Xl,$Xl,$Xl,8
+	vxor	$Xl,$Xl,$t2
+
+	vsldoi	$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd	$Xl,$Xl,$xC2
+	vxor	$t1,$t1,$Xh
+	vxor	$IN1,$Xl,$t1
+
+	vsldoi	$H2,$IN1,$IN1,8
+	vsldoi	$H2l,$zero,$H2,8
+	vsldoi	$H2h,$H2,$zero,8
+
+	stvx_u	$H2l,r8,r3		# save H^2
+	li	r8,0x70
+	stvx_u	$H2,r9,r3
+	li	r9,0x80
+	stvx_u	$H2h,r10,r3
+	li	r10,0x90
+___
+{
+my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
+$code.=<<___;
+	vpmsumd	$Xl,$IN,$H2l		# H.lo·H^2.lo
+	vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
+	vpmsumd	$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
+	vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
+	vpmsumd	$Xh,$IN,$H2h		# H.hi·H^2.hi
+	vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi
+
+	vpmsumd	$t2,$Xl,$xC2		# 1st reduction phase
+	vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase
+
+	vsldoi	$t0,$Xm,$zero,8
+	vsldoi	$t1,$zero,$Xm,8
+	vsldoi	$t4,$Xm1,$zero,8
+	vsldoi	$t5,$zero,$Xm1,8
+	vxor	$Xl,$Xl,$t0
+	vxor	$Xh,$Xh,$t1
+	vxor	$Xl1,$Xl1,$t4
+	vxor	$Xh1,$Xh1,$t5
+
+	vsldoi	$Xl,$Xl,$Xl,8
+	vsldoi	$Xl1,$Xl1,$Xl1,8
+	vxor	$Xl,$Xl,$t2
+	vxor	$Xl1,$Xl1,$t6
+
+	vsldoi	$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vsldoi	$t5,$Xl1,$Xl1,8		# 2nd reduction phase
+	vpmsumd	$Xl,$Xl,$xC2
+	vpmsumd	$Xl1,$Xl1,$xC2
+	vxor	$t1,$t1,$Xh
+	vxor	$t5,$t5,$Xh1
+	vxor	$Xl,$Xl,$t1
+	vxor	$Xl1,$Xl1,$t5
+
+	vsldoi	$H,$Xl,$Xl,8
+	vsldoi	$H2,$Xl1,$Xl1,8
+	vsldoi	$Hl,$zero,$H,8
+	vsldoi	$Hh,$H,$zero,8
+	vsldoi	$H2l,$zero,$H2,8
+	vsldoi	$H2h,$H2,$zero,8
+
+	stvx_u	$Hl,r8,r3		# save H^3
+	li	r8,0xa0
+	stvx_u	$H,r9,r3
+	li	r9,0xb0
+	stvx_u	$Hh,r10,r3
+	li	r10,0xc0
+	stvx_u	$H2l,r8,r3		# save H^4
+	stvx_u	$H2,r9,r3
+	stvx_u	$H2h,r10,r3
 
 	mtspr	256,$vrsave
 	blr
@@ -96,7 +203,9 @@ $code=<<___;
 	.long	0
 	.byte	0,12,0x14,0,0,0,2,0
 	.long	0
 .size	.gcm_init_p8,.-.gcm_init_p8
-
+___
+}
+$code.=<<___;
 .globl	.gcm_gmult_p8
 .align	5
 .gcm_gmult_p8:
@@ -122,7 +231,7 @@ $code=<<___;
 	vpmsumd	$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
 	vpmsumd	$Xh,$IN,$Hh		# H.hi·Xi.hi
 
-	vpmsumd	$t2,$Xl,$xC2		# 1st phase
+	vpmsumd	$t2,$Xl,$xC2		# 1st reduction phase
 
 	vsldoi	$t0,$Xm,$zero,8
 	vsldoi	$t1,$zero,$Xm,8
@@ -132,7 +241,7 @@ $code=<<___;
 	vsldoi	$Xl,$Xl,$Xl,8
 	vxor	$Xl,$Xl,$t2
 
-	vsldoi	$t1,$Xl,$Xl,8		# 2nd phase
+	vsldoi	$t1,$Xl,$Xl,8		# 2nd reduction phase
 	vpmsumd	$Xl,$Xl,$xC2
 	vxor	$t1,$t1,$Xh
 	vxor	$Xl,$Xl,$t1
@@ -150,7 +259,7 @@ $code=<<___;
 .globl	.gcm_ghash_p8
 .align	5
 .gcm_ghash_p8:
-	lis	r0,0xfff8
+	li	r0,-4096
 	li	r8,0x10
 	mfspr	$vrsave,256
 	li	r9,0x20
@@ -159,52 +268,99 @@ $code=<<___;
 	lvx_u	$Xl,0,$Xip		# load Xi
 
 	lvx_u	$Hl,r8,$Htbl		# load pre-computed table
+	li	r8,0x40
 	le?lvsl	$lemask,r0,r0
 	lvx_u	$H, r9,$Htbl
+	li	r9,0x50
 	le?vspltisb $t0,0x07
 	lvx_u	$Hh,r10,$Htbl
+	li	r10,0x60
 	le?vxor	$lemask,$lemask,$t0
 	lvx_u	$xC2,0,$Htbl
 	le?vperm $Xl,$Xl,$Xl,$lemask
 	vxor	$zero,$zero,$zero
 
+	${UCMP}i $len,64
+	bge	Lgcm_ghash_p8_4x
+
 	lvx_u	$IN,0,$inp
 	addi	$inp,$inp,16
-	subi	$len,$len,16
+	subic.	$len,$len,16
 	le?vperm $IN,$IN,$IN,$lemask
 	vxor	$IN,$IN,$Xl
-	b	Loop
+	beq	Lshort
+
+	lvx_u	$H2l,r8,$Htbl		# load H^2
+	li	r8,16
+	lvx_u	$H2, r9,$Htbl
+	add	r9,$inp,$len		# end of input
+	lvx_u	$H2h,r10,$Htbl
+	be?b	Loop_2x
 
 .align	5
-Loop:
-	subic	$len,$len,16
-	vpmsumd	$Xl,$IN,$Hl		# H.lo·Xi.lo
-	subfe.	r0,r0,r0		# borrow?-1:0
-	vpmsumd	$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+Loop_2x:
+	lvx_u	$IN1,0,$inp
+	le?vperm $IN1,$IN1,$IN1,$lemask
+
+	 subic	$len,$len,32
+	vpmsumd	$Xl,$IN,$H2l		# H^2.lo·Xi.lo
+	 vpmsumd $Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
+	 subfe	r0,r0,r0		# borrow?-1:0
+	vpmsumd	$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
+	 vpmsumd $Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
 	 and	r0,r0,$len
-	vpmsumd	$Xh,$IN,$Hh		# H.hi·Xi.hi
+	vpmsumd	$Xh,$IN,$H2h		# H^2.hi·Xi.hi
+	 vpmsumd $Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
 	 add	$inp,$inp,r0
 
-	vpmsumd	$t2,$Xl,$xC2		# 1st phase
+	vxor	$Xl,$Xl,$Xl1
+	vxor	$Xm,$Xm,$Xm1
+
+	vpmsumd	$t2,$Xl,$xC2		# 1st reduction phase
 
 	vsldoi	$t0,$Xm,$zero,8
 	vsldoi	$t1,$zero,$Xm,8
+	vxor	$Xh,$Xh,$Xh1
 	vxor	$Xl,$Xl,$t0
 	vxor	$Xh,$Xh,$t1
 
 	vsldoi	$Xl,$Xl,$Xl,8
 	vxor	$Xl,$Xl,$t2
 
-	lvx_u	$IN,0,$inp
-	addi	$inp,$inp,16
+	lvx_u	$IN,r8,$inp
+	addi	$inp,$inp,32
 
-	vsldoi	$t1,$Xl,$Xl,8		# 2nd phase
+	vsldoi	$t1,$Xl,$Xl,8		# 2nd reduction phase
 	vpmsumd	$Xl,$Xl,$xC2
 	le?vperm $IN,$IN,$IN,$lemask
 	vxor	$t1,$t1,$Xh
 	vxor	$IN,$IN,$t1
 	vxor	$IN,$IN,$Xl
-	beq	Loop			# did $len-=16 borrow?
+	$UCMP	r9,$inp
+	bgt	Loop_2x			# done yet?
+
+	cmplwi	$len,0
+	bne	Leven
+
+Lshort:
+	vpmsumd	$Xl,$IN,$Hl		# H.lo·Xi.lo
+	vpmsumd	$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	vpmsumd	$Xh,$IN,$Hh		# H.hi·Xi.hi
+
+	vpmsumd	$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi	$t0,$Xm,$zero,8
+	vsldoi	$t1,$zero,$Xm,8
+	vxor	$Xl,$Xl,$t0
+	vxor	$Xh,$Xh,$t1
+
+	vsldoi	$Xl,$Xl,$Xl,8
+	vxor	$Xl,$Xl,$t2
+
+	vsldoi	$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd	$Xl,$Xl,$xC2
+	vxor	$t1,$t1,$Xh
+
+Leven:
 	vxor	$Xl,$Xl,$t1
 	le?vperm $Xl,$Xl,$Xl,$lemask
 	stvx_u	$Xl,0,$Xip		# write out Xi
@@ -214,6 +370,284 @@ Loop:
 	.long	0
 	.byte	0,12,0x14,0,0,0,4,0
 	.long	0
+___
+{
+my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
+    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
+my $IN0=$IN;
+my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
+
+$code.=<<___;
+.align	5
+.gcm_ghash_p8_4x:
+Lgcm_ghash_p8_4x:
+	$STU	$sp,-$FRAME($sp)
+	li	r10,`15+6*$SIZE_T`
+	li	r11,`31+6*$SIZE_T`
+	stvx	v20,r10,$sp
+	addi	r10,r10,32
+	stvx	v21,r11,$sp
+	addi	r11,r11,32
+	stvx	v22,r10,$sp
+	addi	r10,r10,32
+	stvx	v23,r11,$sp
+	addi	r11,r11,32
+	stvx	v24,r10,$sp
+	addi	r10,r10,32
+	stvx	v25,r11,$sp
+	addi	r11,r11,32
+	stvx	v26,r10,$sp
+	addi	r10,r10,32
+	stvx	v27,r11,$sp
+	addi	r11,r11,32
+	stvx	v28,r10,$sp
+	addi	r10,r10,32
+	stvx	v29,r11,$sp
+	addi	r11,r11,32
+	stvx	v30,r10,$sp
+	li	r10,0x60
+	stvx	v31,r11,$sp
+	li	r0,-1
+	stw	$vrsave,`$FRAME-4`($sp)	# save vrsave
+	mtspr	256,r0			# preserve all AltiVec registers
+
+	lvsl	$t0,0,r8		# 0x0001..0e0f
+	#lvx_u	$H2l,r8,$Htbl		# load H^2
+	li	r8,0x70
+	lvx_u	$H2, r9,$Htbl
+	li	r9,0x80
+	vspltisb $t1,8			# 0x0808..0808
+	#lvx_u	$H2h,r10,$Htbl
+	li	r10,0x90
+	lvx_u	$H3l,r8,$Htbl		# load H^3
+	li	r8,0xa0
+	lvx_u	$H3, r9,$Htbl
+	li	r9,0xb0
+	lvx_u	$H3h,r10,$Htbl
+	li	r10,0xc0
+	lvx_u	$H4l,r8,$Htbl		# load H^4
+	li	r8,0x10
+	lvx_u	$H4, r9,$Htbl
+	li	r9,0x20
+	lvx_u	$H4h,r10,$Htbl
+	li	r10,0x30
+
+	vsldoi	$t2,$zero,$t1,8		# 0x0000..0808
+	vaddubm	$hiperm,$t0,$t2		# 0x0001..1617
+	vaddubm	$loperm,$t1,$hiperm	# 0x0809..1e1f
+
+	$SHRI	$len,$len,4		# this allows to use sign bit
+					# as carry
+	lvx_u	$IN0,0,$inp		# load input
+	lvx_u	$IN1,r8,$inp
+	subic.	$len,$len,8
+	lvx_u	$IN2,r9,$inp
+	lvx_u	$IN3,r10,$inp
+	addi	$inp,$inp,0x40
+	le?vperm $IN0,$IN0,$IN0,$lemask
+	le?vperm $IN1,$IN1,$IN1,$lemask
+	le?vperm $IN2,$IN2,$IN2,$lemask
+	le?vperm $IN3,$IN3,$IN3,$lemask
+
+	vxor	$Xh,$IN0,$Xl
+
+	vpmsumd	$Xl1,$IN1,$H3l
+	vpmsumd	$Xm1,$IN1,$H3
+	vpmsumd	$Xh1,$IN1,$H3h
+
+	vperm	$H21l,$H2,$H,$hiperm
+	vperm	$t0,$IN2,$IN3,$loperm
+	vperm	$H21h,$H2,$H,$loperm
+	vperm	$t1,$IN2,$IN3,$hiperm
+	vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
+	vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
+	vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
+	vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
+
+	vxor	$Xm2,$Xm2,$Xm1
+	vxor	$Xl3,$Xl3,$Xl1
+	vxor	$Xm3,$Xm3,$Xm2
+	vxor	$Xh3,$Xh3,$Xh1
+
+	blt	Ltail_4x
+
+Loop_4x:
+	lvx_u	$IN0,0,$inp
+	lvx_u	$IN1,r8,$inp
+	subic.	$len,$len,4
+	lvx_u	$IN2,r9,$inp
+	lvx_u	$IN3,r10,$inp
+	addi	$inp,$inp,0x40
+	le?vperm $IN1,$IN1,$IN1,$lemask
+	le?vperm $IN2,$IN2,$IN2,$lemask
+	le?vperm $IN3,$IN3,$IN3,$lemask
+	le?vperm $IN0,$IN0,$IN0,$lemask
+
+	vpmsumd	$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
+	vpmsumd	$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
+	vpmsumd	$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
+	vpmsumd	$Xl1,$IN1,$H3l
+	vpmsumd	$Xm1,$IN1,$H3
+	vpmsumd	$Xh1,$IN1,$H3h
+
+	vxor	$Xl,$Xl,$Xl3
+	vxor	$Xm,$Xm,$Xm3
+	vxor	$Xh,$Xh,$Xh3
+	vperm	$t0,$IN2,$IN3,$loperm
+	vperm	$t1,$IN2,$IN3,$hiperm
+
+	vpmsumd	$t2,$Xl,$xC2		# 1st reduction phase
+	vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
+	vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
+
+	vsldoi	$t0,$Xm,$zero,8
+	vsldoi	$t1,$zero,$Xm,8
+	vxor	$Xl,$Xl,$t0
+	vxor	$Xh,$Xh,$t1
+
+	vsldoi	$Xl,$Xl,$Xl,8
+	vxor	$Xl,$Xl,$t2
+
+	vsldoi	$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
+	vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
+	vpmsumd	$Xl,$Xl,$xC2
+
+	vxor	$Xl3,$Xl3,$Xl1
+	vxor	$Xh3,$Xh3,$Xh1
+	vxor	$Xh,$Xh,$IN0
+	vxor	$Xm2,$Xm2,$Xm1
+	vxor	$Xh,$Xh,$t1
+	vxor	$Xm3,$Xm3,$Xm2
+	vxor	$Xh,$Xh,$Xl
+	bge	Loop_4x
+
+Ltail_4x:
+	vpmsumd	$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
+	vpmsumd	$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
+	vpmsumd	$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
+
+	vxor	$Xl,$Xl,$Xl3
+	vxor	$Xm,$Xm,$Xm3
+
+	vpmsumd	$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi	$t0,$Xm,$zero,8
+	vsldoi	$t1,$zero,$Xm,8
+	vxor	$Xh,$Xh,$Xh3
+	vxor	$Xl,$Xl,$t0
+	vxor	$Xh,$Xh,$t1
+
+	vsldoi	$Xl,$Xl,$Xl,8
+	vxor	$Xl,$Xl,$t2
+
+	vsldoi	$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd	$Xl,$Xl,$xC2
+	vxor	$t1,$t1,$Xh
+	vxor	$Xl,$Xl,$t1
+
+	addic.	$len,$len,4
+	beq	Ldone_4x
+
+	lvx_u	$IN0,0,$inp
+	${UCMP}i $len,2
+	li	$len,-4
+	blt	Lone
+	lvx_u	$IN1,r8,$inp
+	beq	Ltwo
+
+Lthree:
+	lvx_u	$IN2,r9,$inp
+	le?vperm $IN0,$IN0,$IN0,$lemask
+	le?vperm $IN1,$IN1,$IN1,$lemask
+	le?vperm $IN2,$IN2,$IN2,$lemask
+
+	vxor	$Xh,$IN0,$Xl
+	vmr	$H4l,$H3l
+	vmr	$H4, $H3
+	vmr	$H4h,$H3h
+
+	vperm	$t0,$IN1,$IN2,$loperm
+	vperm	$t1,$IN1,$IN2,$hiperm
+	vpmsumd	$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
+	vpmsumd	$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
+	vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
+	vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
+
+	vxor	$Xm3,$Xm3,$Xm2
+	b	Ltail_4x
+
+.align	4
+Ltwo:
+	le?vperm $IN0,$IN0,$IN0,$lemask
+	le?vperm $IN1,$IN1,$IN1,$lemask
+
+	vxor	$Xh,$IN0,$Xl
+	vperm	$t0,$zero,$IN1,$loperm
+	vperm	$t1,$zero,$IN1,$hiperm
+
+	vsldoi	$H4l,$zero,$H2,8
+	vmr	$H4, $H2
+	vsldoi	$H4h,$H2,$zero,8
+
+	vpmsumd	$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
+	vpmsumd	$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+2.hi
+	vpmsumd	$Xh3,$t1, $H21h		# H.hi·Xi+1.hi
+
+	b	Ltail_4x
+
+.align	4
+Lone:
+	le?vperm $IN0,$IN0,$IN0,$lemask
+
+	vsldoi	$H4l,$zero,$H,8
+	vmr	$H4, $H
+	vsldoi	$H4h,$H,$zero,8
+
+	vxor	$Xh,$IN0,$Xl
+	vxor	$Xl3,$Xl3,$Xl3
+	vxor	$Xm3,$Xm3,$Xm3
+	vxor	$Xh3,$Xh3,$Xh3
+
+	b	Ltail_4x
+
+Ldone_4x:
+	le?vperm $Xl,$Xl,$Xl,$lemask
+	stvx_u	$Xl,0,$Xip		# write out Xi
+
+	li	r10,`15+6*$SIZE_T`
+	li	r11,`31+6*$SIZE_T`
+	mtspr	256,$vrsave
+	lvx	v20,r10,$sp
+	addi	r10,r10,32
+	lvx	v21,r11,$sp
+	addi	r11,r11,32
+	lvx	v22,r10,$sp
+	addi	r10,r10,32
+	lvx	v23,r11,$sp
+	addi	r11,r11,32
+	lvx	v24,r10,$sp
+	addi	r10,r10,32
+	lvx	v25,r11,$sp
+	addi	r11,r11,32
+	lvx	v26,r10,$sp
+	addi	r10,r10,32
+	lvx	v27,r11,$sp
+	addi	r11,r11,32
+	lvx	v28,r10,$sp
+	addi	r10,r10,32
+	lvx	v29,r11,$sp
+	addi	r11,r11,32
+	lvx	v30,r10,$sp
+	lvx	v31,r11,$sp
+	addi	$sp,$sp,$FRAME
+	blr
+	.long	0
+	.byte	0,12,0x04,0,0x80,0,4,0
+	.long	0
+___
+}
+$code.=<<___;
 .size	.gcm_ghash_p8,.-.gcm_ghash_p8
 
 .asciz	"GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
@@ -221,6 +655,8 @@
___

foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
 	if ($flavour =~ /le$/o) {	# little-endian
 		s/le\?//o		or
 		s/be\?/#be#/o;
diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
index 0886d21807..dcd5f595d2 100644
--- a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
+++ b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -27,11 +34,21 @@
 # Apple A7	0.92		5.62
 # Cortex-A53	1.01		8.39
 # Cortex-A57	1.17		7.61
+# Denver	0.71		6.02
+# Mongoose	1.10		8.06
 #
 # (*)	presented for reference/comparison purposes;
 
 $flavour = shift;
-open STDOUT,">".shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
 
 $Xi="x0";	# argument block
 $Htbl="x1";
@@ -50,7 +67,11 @@ $code=<<___;
 .text
 ___
 $code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
-$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
+$code.=<<___				if ($flavour !~ /64/);
+.fpu	neon
+.code	32
+#undef	__thumb2__
+___
 
 ################################################################################
 # void gcm_init_v8(u128 Htable[16],const u64 H[2]);
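A small change shared by aesni-gcm-x86_64.pl and ghash-x86_64.pl above deserves a note: the pipe that feeds the generated assembly through the perlasm translator now quotes $xlate and $output. A minimal illustration of why (the paths below are invented for this sketch):

	my ($xlate, $flavour, $output) = ("/tmp/build dir/x86_64-xlate.pl", "elf", "ghash.s");
	my $bad  = "| \"$^X\" $xlate $flavour $output";		# the space splits the path
	my $good = "| \"$^X\" \"$xlate\" $flavour \"$output\"";	# quoted, so it survives

Without the quoting, a build tree whose path contains a space hands the shell a broken command line; the quoted form is what both hunks now install.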