Diffstat (limited to 'deps/openssl/openssl/crypto/modes/asm')
9 files changed, 494 insertions, 96 deletions
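The bulk of the x86_64 churn below follows one pattern: every push and pop of a callee-saved register in the hand-written assembly gains a matching DWARF CFI annotation, so unwinders and debuggers can walk the stack through these routines. A minimal sketch of that pattern follows (not code from the patch; "example_func" is hypothetical, and .cfi_push/.cfi_restore are perlasm shorthands which, as far as I can tell, x86_64-xlate.pl expands into raw .cfi_* directives):

#!/usr/bin/env perl
# Sketch of the CFI-annotation pattern applied throughout
# aesni-gcm-x86_64.pl and ghash-x86_64.pl in this patch.
my $code = <<'___';
example_func:
.cfi_startproc
	lea	(%rsp),%rax		# save stack pointer
.cfi_def_cfa_register	%rax		# track CFA via %rax while %rsp moves
	push	%rbx
.cfi_push	%rbx			# adjust CFA offset, record %rbx save slot
	# ... function body ...
	mov	-8(%rax),%rbx
.cfi_restore	%rbx			# %rbx is live in its register again
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
	ret
.cfi_endproc
___
print $code;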
diff --git a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl index 5ad62b3979..b42016101e 100644 --- a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -35,6 +35,8 @@ # Applications using the EVP interface will observe a few percent # worse performance.] # +# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). +# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf @@ -116,23 +118,6 @@ _aesni_ctr32_ghash_6x: vpxor $rndkey,$inout3,$inout3 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 - - # At this point, the current block of 96 (0x60) bytes has already been - # loaded into registers. Concurrently with processing it, we want to - # load the next 96 bytes of input for the next round. Obviously, we can - # only do this if there are at least 96 more bytes of input beyond the - # input we're currently processing, or else we'd read past the end of - # the input buffer. Here, we set |%r12| to 96 if there are at least 96 - # bytes of input beyond the 96 bytes we're already processing, and we - # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96, - # we'll read in the next block so that it is in registers for the next - # loop iteration. In the case where we set |%r12| to 0, we'll re-read - # the current block and then ignore what we re-read. - # - # At this point, |$in0| points to the current (already read into - # registers) block, and |$end0| points to 2*96 bytes before the end of - # the input. Thus, |$in0| > |$end0| means that we do not have the next - # 96-byte block to read in, and |$in0| <= |$end0| means we do. xor %r12,%r12 cmp $in0,$end0 @@ -424,20 +409,25 @@ $code.=<<___; .type aesni_gcm_decrypt,\@function,6 .align 32 aesni_gcm_decrypt: +.cfi_startproc xor $ret,$ret - - # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60) - # bytes of input. cmp \$0x60,$len # minimal accepted length jb .Lgcm_dec_abort lea (%rsp),%rax # save stack pointer +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -482,15 +472,7 @@ $code.=<<___; vmovdqu 0x50($inp),$Z3 # I[5] lea ($inp),$in0 vmovdqu 0x40($inp),$Z0 - - # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) - # bytes before the end of the input. Note, in particular, that this is - # correct even if |$len| is not an even multiple of 96 or 16. XXX: This - # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must - # not be near the very beginning of the address space when |$len| < 2*96 - # (0xc0). 
lea -0xc0($inp,$len),$end0 - vmovdqu 0x30($inp),$Z1 shr \$4,$len xor $ret,$ret @@ -537,15 +519,23 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lgcm_dec_abort: mov $ret,%rax # return value ret +.cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt ___ @@ -645,21 +635,25 @@ _aesni_ctr32_6x: .type aesni_gcm_encrypt,\@function,6 .align 32 aesni_gcm_encrypt: +.cfi_startproc xor $ret,$ret - - # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of - # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at - # least 96 more bytes of input. cmp \$0x60*3,$len # minimal accepted length jb .Lgcm_enc_abort lea (%rsp),%rax # save stack pointer +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -699,16 +693,7 @@ $code.=<<___; .Lenc_no_key_aliasing: lea ($out),$in0 - - # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) - # bytes before the end of the input. Note, in particular, that this is - # correct even if |$len| is not an even multiple of 96 or 16. Unlike in - # the decryption case, there's no caveat that |$out| must not be near - # the very beginning of the address space, because we know that - # |$len| >= 3*96 from the check above, and so we know - # |$out| + |$len| >= 2*96 (0xc0). lea -0xc0($out,$len),$end0 - shr \$4,$len call _aesni_ctr32_6x @@ -931,15 +916,23 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lgcm_enc_abort: mov $ret,%rax # return value ret +.cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt ___ diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl index 1cf14a6c9f..dcc23f7d7d 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-armv4.pl @@ -54,7 +54,7 @@ # # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software # Polynomial Multiplication on ARM Processors using the NEON Engine. -# +# # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf # ==================================================================== @@ -525,7 +525,7 @@ $code.=<<___; #ifdef __ARMEL__ vrev64.8 $Xl,$Xl #endif - sub $Xi,#16 + sub $Xi,#16 vst1.64 $Xl#hi,[$Xi]! 
@ write out Xi vst1.64 $Xl#lo,[$Xi] diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl index 81e75f71a8..eb9ded91e5 100755 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-ia64.pl @@ -156,7 +156,7 @@ $code.=<<___; ___ ###################################################################### -# "528B" (well, "512B" actualy) streamed GHASH +# "528B" (well, "512B" actually) streamed GHASH # $Xip="in0"; $Htbl="in1"; diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl index 1d6254543b..a614c99c22 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-parisc.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -705,7 +705,7 @@ my $depd = sub { my ($mod,$args) = @_; my $orig = "depd$mod\t$args"; - # I only have ",z" completer, it's impicitly encoded... + # I only have ",z" completer, it's implicitly encoded... if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16); my $cpos=63-$2; @@ -724,6 +724,11 @@ sub assemble { ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; } +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler/) { + $gnuas = 1; +} + foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; if ($SIZE_T==4) { @@ -731,7 +736,12 @@ foreach (split("\n",$code)) { s/cmpb,\*/comb,/; s/,\*/,/; } - s/\bbv\b/bve/ if ($SIZE_T==8); + + s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8); + s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8); + s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8); + s/\bbv\b/bve/ if ($SIZE_T==8); + print $_,"\n"; } diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl index 6e628d8823..17dc375053 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-s390x.pl @@ -80,6 +80,8 @@ $rem_4bit="%r14"; $sp="%r15"; $code.=<<___; +#include "s390x_arch.h" + .text .globl gcm_gmult_4bit @@ -89,12 +91,13 @@ ___ $code.=<<___ if(!$softonly && 0); # hardware is slow for single block... 
larl %r1,OPENSSL_s390xcap_P lghi %r0,0 - lg %r1,24(%r1) # load second word of kimd capabilities vector + lg %r1,S390X_KIMD+8(%r1) # load second word of kimd capabilities + # vector tmhh %r1,0x4000 # check for function 65 jz .Lsoft_gmult stg %r0,16($sp) # arrange 16 bytes of zero input stg %r0,24($sp) - lghi %r0,65 # function 65 + lghi %r0,S390X_GHASH # function 65 la %r1,0($Xi) # H lies right after Xi in gcm128_context la $inp,16($sp) lghi $len,16 @@ -123,10 +126,11 @@ gcm_ghash_4bit: ___ $code.=<<___ if(!$softonly); larl %r1,OPENSSL_s390xcap_P - lg %r0,24(%r1) # load second word of kimd capabilities vector + lg %r0,S390X_KIMD+8(%r1) # load second word of kimd capabilities + # vector tmhh %r0,0x4000 # check for function 65 jz .Lsoft_ghash - lghi %r0,65 # function 65 + lghi %r0,S390X_GHASH # function 65 la %r1,0($Xi) # H lies right after Xi in gcm128_context .long 0xb93e0004 # kimd %r0,$inp brc 1,.-4 # pay attention to "partial completion" @@ -149,7 +153,7 @@ $code.=<<___; lg $Zhi,0+1($Xi) lghi $tmp,0 .Louter: - xg $Zhi,0($inp) # Xi ^= inp + xg $Zhi,0($inp) # Xi ^= inp xg $Zlo,8($inp) xgr $Zhi,$tmp stg $Zlo,8+1($Xi) diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl index cd8458256e..bcbe6e399d 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86.pl @@ -103,14 +103,13 @@ # # Does it make sense to increase Naggr? To start with it's virtually # impossible in 32-bit mode, because of limited register bank -# capacity. Otherwise improvement has to be weighed agiainst slower +# capacity. Otherwise improvement has to be weighed against slower # setup, as well as code size and complexity increase. As even # optimistic estimate doesn't promise 30% performance improvement, # there are currently no plans to increase Naggr. # -# Special thanks to David Woodhouse <dwmw2@infradead.org> for -# providing access to a Westmere-based system on behalf of Intel -# Open Source Technology Centre. +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. # January 2010 # @@ -139,7 +138,7 @@ require "x86asm.pl"; $output=pop; open STDOUT,">$output"; -&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); +&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); $sse2=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -811,7 +810,7 @@ sub mmx_loop() { &bswap ($dat); &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 &bswap ("ebx"); - + &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? &jne (&label("outer")); } @@ -915,7 +914,7 @@ my ($Xhi,$Xi) = @_; &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T1,8); # + &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # @@ -1085,7 +1084,7 @@ my ($Xhi,$Xi) = @_; &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T1,8); # + &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # &pshufd ($T1,$Xhn,0b01001110); diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl index 387e3f854e..afc30c3e72 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl @@ -44,9 +44,8 @@ # See ghash-x86.pl for background information and details about coding # techniques. 
# -# Special thanks to David Woodhouse <dwmw2@infradead.org> for -# providing access to a Westmere-based system on behalf of Intel -# Open Source Technology Centre. +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # @@ -74,6 +73,7 @@ # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 @@ -86,6 +86,8 @@ # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # +# Knights Landing achieves 1.09 cpb. +# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest $flavour = shift; @@ -236,9 +238,21 @@ $code=<<___; .type gcm_gmult_4bit,\@function,2 .align 16 gcm_gmult_4bit: +.cfi_startproc push %rbx - push %rbp # %rbp and %r12 are pushed exclusively in +.cfi_push %rbx + push %rbp # %rbp and others are pushed exclusively in +.cfi_push %rbp push %r12 # order to reuse Win64 exception handler... +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzb 15($Xi),$Zlo @@ -249,10 +263,15 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - mov 16(%rsp),%rbx - lea 24(%rsp),%rsp + lea 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: ret +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ @@ -266,13 +285,21 @@ $code.=<<___; .type gcm_ghash_4bit,\@function,4 .align 16 gcm_ghash_4bit: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: mov $inp,%r14 # reassign couple of args mov $len,%r15 @@ -400,16 +427,25 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - lea 280(%rsp),%rsi - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + lea 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: ret +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit ___ @@ -469,7 +505,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # @@ -583,7 +619,7 @@ ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); # experimental alternative. special thing about is that there - # no dependency between the two multiplications... + # no dependency between the two multiplications... 
mov \$`0xE1<<1`,%eax mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff mov \$0x07,%r11d @@ -758,7 +794,7 @@ $code.=<<___; movdqa $T2,$T1 # pslldq \$8,$T2 pclmulqdq \$0x00,$Hkey2,$Xln - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # movdqu 0($inp),$T1 @@ -894,7 +930,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # @@ -1648,14 +1684,20 @@ se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue - lea 24(%rax),%rax # adjust "rsp" + lea 48+280(%rax),%rax # adjust "rsp" mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 .Lin_prologue: mov 8(%rax),%rdi diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl index f0598cb28c..6a2ac71295 100755 --- a/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghashp8-ppc.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -23,13 +23,14 @@ # Relative comparison is therefore more informative. This initial # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x # faster than "4-bit" integer-only compiler-generated 64-bit code. -# "Initial version" means that there is room for futher improvement. +# "Initial version" means that there is room for further improvement. # May 2016 # # 2x aggregated reduction improves performance by 50% (resulting # performance on POWER8 is 1 cycle per processed byte), and 4x # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb). +# POWER9 delivers 0.51 cpb. $flavour=shift; $output =shift; diff --git a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl index e13c709019..47e8820080 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghashv8-armx.pl @@ -19,23 +19,29 @@ # June 2014 # # Initial version was developed in tight cooperation with Ard -# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from -# other assembly modules. Just like aesv8-armx.pl this module -# supports both AArch32 and AArch64 execution modes. +# Biesheuvel of Linaro from bits-n-pieces from other assembly modules. +# Just like aesv8-armx.pl this module supports both AArch32 and +# AArch64 execution modes. # # July 2014 # # Implement 2x aggregated reduction [see ghash-x86.pl for background # information]. # +# November 2017 +# +# AArch64 register bank to "accommodate" 4x aggregated reduction and +# improve performance by 20-70% depending on processor. 
+# # Current performance in cycles per processed byte: # -# PMULL[2] 32-bit NEON(*) -# Apple A7 0.92 5.62 -# Cortex-A53 1.01 8.39 -# Cortex-A57 1.17 7.61 -# Denver 0.71 6.02 -# Mongoose 1.10 8.06 +# 64-bit PMULL 32-bit PMULL 32-bit NEON(*) +# Apple A7 0.58 0.92 5.62 +# Cortex-A53 0.85 1.01 8.39 +# Cortex-A57 0.73 1.17 7.61 +# Denver 0.51 0.65 6.02 +# Mongoose 0.65 1.10 8.06 +# Kryo 0.76 1.16 8.00 # # (*) presented for reference/comparison purposes; @@ -131,8 +137,56 @@ gcm_init_v8: vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing veor $t1,$t1,$H2 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed - vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2] + vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2] +___ +if ($flavour =~ /64/) { +my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); +$code.=<<___; + @ calculate H^3 and H^4 + vpmull.p64 $Xl,$H, $H2 + vpmull.p64 $Yl,$H2,$H2 + vpmull2.p64 $Xh,$H, $H2 + vpmull2.p64 $Yh,$H2,$H2 + vpmull.p64 $Xm,$t0,$t1 + vpmull.p64 $Ym,$t1,$t1 + + vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing + vext.8 $t1,$Yl,$Yh,#8 + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t0 + veor $t3,$Yl,$Yh + veor $Ym,$Ym,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + veor $Ym,$Ym,$t3 + vpmull.p64 $t3,$Yl,$xC2 + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Yh#lo,$Ym#hi + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vmov $Ym#hi,$Yl#lo + veor $Xl,$Xm,$t2 + veor $Yl,$Ym,$t3 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vext.8 $t3,$Yl,$Yl,#8 + vpmull.p64 $Xl,$Xl,$xC2 + vpmull.p64 $Yl,$Yl,$xC2 + veor $t2,$t2,$Xh + veor $t3,$t3,$Yh + veor $H, $Xl,$t2 @ H^3 + veor $H2,$Yl,$t3 @ H^4 + + vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing + vext.8 $t1,$H2,$H2,#8 + veor $t0,$t0,$H + veor $t1,$t1,$H2 + vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed + vst1.64 {$H-$H2},[x0] @ store Htable[3..5] +___ +} +$code.=<<___; ret .size gcm_init_v8,.-gcm_init_v8 ___ @@ -201,6 +255,10 @@ $code.=<<___; .align 4 gcm_ghash_v8: ___ +$code.=<<___ if ($flavour =~ /64/); + cmp $len,#64 + b.hs .Lgcm_ghash_v8_4x +___ $code.=<<___ if ($flavour !~ /64/); vstmdb sp!,{d8-d15} @ 32-bit ABI says so ___ @@ -210,13 +268,13 @@ $code.=<<___; @ loaded value would have @ to be rotated in order to @ make it appear as in - @ alorithm specification + @ algorithm specification subs $len,$len,#32 @ see if $len is 32 or larger mov $inc,#16 @ $inc is used as post- @ increment for input pointer; @ as loop is modulo-scheduled @ $inc is zeroed just in time - @ to preclude oversteping + @ to preclude overstepping @ inp[len], which means that @ last block[s] are actually @ loaded twice, but last @@ -348,7 +406,297 @@ $code.=<<___; ret .size gcm_ghash_v8,.-gcm_ghash_v8 ___ + +if ($flavour =~ /64/) { # 4x subroutine +my ($I0,$j1,$j2,$j3, + $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23)); + +$code.=<<___; +.type gcm_ghash_v8_4x,%function +.align 4 +gcm_ghash_v8_4x: +.Lgcm_ghash_v8_4x: + vld1.64 {$Xl},[$Xi] @ load [rotated] Xi + vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2 + vmov.i8 $xC2,#0xe1 + vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4 + vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant + + vld1.64 {$I0-$j3},[$inp],#64 +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl + vrev64.8 $j1,$j1 + vrev64.8 $j2,$j2 + vrev64.8 $j3,$j3 + vrev64.8 $I0,$I0 +#endif + vext.8 $I3,$j3,$j3,#8 + vext.8 $I2,$j2,$j2,#8 + vext.8 $I1,$j1,$j1,#8 + + vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 + veor $j3,$j3,$I3 + vpmull2.p64 $Yh,$H,$I3 + vpmull.p64 $Ym,$Hhl,$j3 + + vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 + veor $j2,$j2,$I2 + vpmull2.p64 $I2,$H2,$I2 + vpmull2.p64 
$j2,$Hhl,$j2 + + veor $Yl,$Yl,$t0 + veor $Yh,$Yh,$I2 + veor $Ym,$Ym,$j2 + + vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 + veor $j1,$j1,$I1 + vpmull2.p64 $I1,$H3,$I1 + vpmull.p64 $j1,$H34,$j1 + + veor $Yl,$Yl,$j3 + veor $Yh,$Yh,$I1 + veor $Ym,$Ym,$j1 + + subs $len,$len,#128 + b.lo .Ltail4x + + b .Loop4x + +.align 4 +.Loop4x: + veor $t0,$I0,$Xl + vld1.64 {$I0-$j3},[$inp],#64 + vext.8 $IN,$t0,$t0,#8 +#ifndef __ARMEB__ + vrev64.8 $j1,$j1 + vrev64.8 $j2,$j2 + vrev64.8 $j3,$j3 + vrev64.8 $I0,$I0 +#endif + + vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H4,$IN + vext.8 $I3,$j3,$j3,#8 + vpmull2.p64 $Xm,$H34,$t0 + + veor $Xl,$Xl,$Yl + veor $Xh,$Xh,$Yh + vext.8 $I2,$j2,$j2,#8 + veor $Xm,$Xm,$Ym + vext.8 $I1,$j1,$j1,#8 + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 + veor $j3,$j3,$I3 + veor $Xm,$Xm,$t1 + vpmull2.p64 $Yh,$H,$I3 + veor $Xm,$Xm,$t2 + vpmull.p64 $Ym,$Hhl,$j3 + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 + veor $j2,$j2,$I2 + vpmull2.p64 $I2,$H2,$I2 + veor $Xl,$Xm,$t2 + vpmull2.p64 $j2,$Hhl,$j2 + + veor $Yl,$Yl,$t0 + veor $Yh,$Yh,$I2 + veor $Ym,$Ym,$j2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 + veor $j1,$j1,$I1 + veor $t2,$t2,$Xh + vpmull2.p64 $I1,$H3,$I1 + vpmull.p64 $j1,$H34,$j1 + + veor $Xl,$Xl,$t2 + veor $Yl,$Yl,$j3 + veor $Yh,$Yh,$I1 + vext.8 $Xl,$Xl,$Xl,#8 + veor $Ym,$Ym,$j1 + + subs $len,$len,#64 + b.hs .Loop4x + +.Ltail4x: + veor $t0,$I0,$Xl + vext.8 $IN,$t0,$t0,#8 + + vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H4,$IN + vpmull2.p64 $Xm,$H34,$t0 + + veor $Xl,$Xl,$Yl + veor $Xh,$Xh,$Yh + veor $Xm,$Xm,$Ym + + adds $len,$len,#64 + b.eq .Ldone4x + + cmp $len,#32 + b.lo .Lone + b.eq .Ltwo +.Lthree: + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$I0-$j2},[$inp] + veor $Xm,$Xm,$t2 +#ifndef __ARMEB__ + vrev64.8 $j1,$j1 + vrev64.8 $j2,$j2 + vrev64.8 $I0,$I0 +#endif + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vext.8 $I2,$j2,$j2,#8 + vext.8 $I1,$j1,$j1,#8 + veor $Xl,$Xm,$t2 + + vpmull.p64 $Yl,$H,$I2 @ H·Ii+2 + veor $j2,$j2,$I2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + vpmull2.p64 $Yh,$H,$I2 + vpmull.p64 $Ym,$Hhl,$j2 + veor $Xl,$Xl,$t2 + vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1 + veor $j1,$j1,$I1 + vext.8 $Xl,$Xl,$Xl,#8 + + vpmull2.p64 $I1,$H2,$I1 + veor $t0,$I0,$Xl + vpmull2.p64 $j1,$Hhl,$j1 + vext.8 $IN,$t0,$t0,#8 + + veor $Yl,$Yl,$j3 + veor $Yh,$Yh,$I1 + veor $Ym,$Ym,$j1 + + vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii) + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H3,$IN + vpmull.p64 $Xm,$H34,$t0 + + veor $Xl,$Xl,$Yl + veor $Xh,$Xh,$Yh + veor $Xm,$Xm,$Ym + b .Ldone4x + +.align 4 +.Ltwo: + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$I0-$j1},[$inp] + veor $Xm,$Xm,$t2 +#ifndef __ARMEB__ + vrev64.8 $j1,$j1 + vrev64.8 $I0,$I0 +#endif + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vext.8 $I1,$j1,$j1,#8 + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + vext.8 $Xl,$Xl,$Xl,#8 + + 
vpmull.p64 $Yl,$H,$I1 @ H·Ii+1 + veor $j1,$j1,$I1 + + veor $t0,$I0,$Xl + vext.8 $IN,$t0,$t0,#8 + + vpmull2.p64 $Yh,$H,$I1 + vpmull.p64 $Ym,$Hhl,$j1 + + vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii) + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H2,$IN + vpmull2.p64 $Xm,$Hhl,$t0 + + veor $Xl,$Xl,$Yl + veor $Xh,$Xh,$Yh + veor $Xm,$Xm,$Ym + b .Ldone4x + +.align 4 +.Lone: + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$I0},[$inp] + veor $Xm,$Xm,$t2 +#ifndef __ARMEB__ + vrev64.8 $I0,$I0 +#endif + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + vext.8 $Xl,$Xl,$Xl,#8 + + veor $t0,$I0,$Xl + vext.8 $IN,$t0,$t0,#8 + + vpmull.p64 $Xl,$H,$IN + veor $t0,$t0,$IN + vpmull2.p64 $Xh,$H,$IN + vpmull.p64 $Xm,$Hhl,$t0 + +.Ldone4x: + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + vext.8 $Xl,$Xl,$Xl,#8 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x +___ + } +} + $code.=<<___; .asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 @@ -360,7 +708,8 @@ if ($flavour =~ /64/) { ######## 64-bit code my $arg=shift; $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && - sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; + sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, + $3<8?$3:$3+8,($4 eq "lo")?0:1; } foreach(split("\n",$code)) { s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or @@ -375,7 +724,7 @@ if ($flavour =~ /64/) { ######## 64-bit code s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers s/@\s/\/\//o; # old->new style commentary - # fix up remainig legacy suffixes + # fix up remaining legacy suffixes s/\.[ui]?8(\s)/$1/o; s/\.[uis]?32//o and s/\.16b/\.4s/go; m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument @@ -415,7 +764,7 @@ if ($flavour =~ /64/) { ######## 64-bit code s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers s/\/\/\s?/@ /o; # new->old style commentary - # fix up remainig new-style suffixes + # fix up remaining new-style suffixes s/\],#[0-9]+/]!/o; s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or |
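For reference, the bound documented by the comments deleted from aesni-gcm-x86_64.pl works like this: "lea -0xc0($inp,$len),$end0" leaves $end0 pointing 2*96 bytes before the end of the input, and _aesni_ctr32_ghash_6x compares the current block pointer $in0 against it to decide whether the next 96-byte block can be prefetched while the current one is processed. A small Perl model of that check, with hypothetical names and plain byte offsets standing in for pointers:

#!/usr/bin/env perl
# Model of the lookahead decision described by the deleted comments:
#   lea -0xc0($inp,$len),$end0   # $end0 = $inp + $len - 2*96
#   cmp $in0,$end0               # next block readable iff $in0 <= $end0
use strict;
use warnings;

sub lookahead_r12 {
    my ($in0, $inp, $len) = @_;
    my $end0 = $inp + $len - 0xc0;   # 2*96 bytes before end of input
    return $in0 <= $end0 ? 96 : 0;   # value the assembly loads into %r12
}

# Three 96-byte blocks starting at offset 0:
print lookahead_r12(  0, 0, 288), "\n";   # 96: two more blocks ahead
print lookahead_r12( 96, 0, 288), "\n";   # 96: exactly one block ahead
print lookahead_r12(192, 0, 288), "\n";   # 0: last block, current block is re-read

The deleted decrypt-side comment flagged the edge case this model makes visible: for 0x60 <= $len < 0xc0 the subtraction wraps below $inp (the integer model simply goes negative), which is only harmless if $inp sits at least 0xc0 into the address space. The encrypt path has no such caveat because it rejects $len < 3*96 up front, so $out + $len >= 2*96 always holds.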