summaryrefslogtreecommitdiff
path: root/deps/openssl/config/archs/linux-aarch64/asm/crypto/sha/sha512-armv8.S
diff options
context:
space:
mode:
Diffstat (limited to 'deps/openssl/config/archs/linux-aarch64/asm/crypto/sha/sha512-armv8.S')
-rw-r--r--deps/openssl/config/archs/linux-aarch64/asm/crypto/sha/sha512-armv8.S629
1 files changed, 610 insertions, 19 deletions
diff --git a/deps/openssl/config/archs/linux-aarch64/asm/crypto/sha/sha512-armv8.S b/deps/openssl/config/archs/linux-aarch64/asm/crypto/sha/sha512-armv8.S
index eaa27cf0ea..99d70c91c1 100644
--- a/deps/openssl/config/archs/linux-aarch64/asm/crypto/sha/sha512-armv8.S
+++ b/deps/openssl/config/archs/linux-aarch64/asm/crypto/sha/sha512-armv8.S
@@ -1,4 +1,60 @@
-#include "arm_arch.h"
+// Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+//
+// October 2016.
+//
+// Originally it was reckoned that it makes no sense to implement a NEON
+// version of SHA256 for 64-bit processors. This is because the performance
+// improvement on the most widespread Cortex-A5x processors was observed
+// to be marginal, the same on Cortex-A53 and ~10% on A57. But then it was
+// observed that 32-bit NEON SHA256 performs significantly better than the
+// 64-bit scalar version on *some* of the more recent processors. As a
+// result, a 64-bit NEON version of SHA256 was added to provide the best
+// all-round performance. For example, it executes ~30% faster on X-Gene
+// and Mongoose. [For reference, a NEON version of SHA512 is bound to
+// deliver much less improvement, likely *negative* on Cortex-A5x,
+// which is why NEON support is limited to SHA256.]
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#endif
.text
@@ -7,6 +63,18 @@
.type sha512_block_data_order,%function
.align 6
sha512_block_data_order:
+#ifndef __KERNEL__
+# ifdef __ILP32__
+ ldrsw x16,.LOPENSSL_armcap_P
+# else
+ ldr x16,.LOPENSSL_armcap_P
+# endif
+ adr x17,.LOPENSSL_armcap_P
+ add x16,x16,x17
+ ldr w16,[x16]
+ tst w16,#ARMV8_SHA512
+ b.ne .Lv8_entry
+#endif
stp x29,x30,[sp,#-128]!
add x29,sp,#0
@@ -30,7 +98,7 @@ sha512_block_data_order:
ldr x19,[x30],#8 // *K++
eor x28,x21,x22 // magic seed
str x1,[x29,#112]
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x3,x3 // 0
#endif
ror x16,x24,#14
@@ -53,7 +121,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x4,x4 // 1
#endif
ldp x5,x6,[x1],#2*8
@@ -78,7 +146,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x5,x5 // 2
#endif
add x26,x26,x17 // h+=Sigma0(a)
@@ -102,7 +170,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x6,x6 // 3
#endif
ldp x7,x8,[x1],#2*8
@@ -127,7 +195,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x7,x7 // 4
#endif
add x24,x24,x17 // h+=Sigma0(a)
@@ -151,7 +219,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x8,x8 // 5
#endif
ldp x9,x10,[x1],#2*8
@@ -176,7 +244,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x9,x9 // 6
#endif
add x22,x22,x17 // h+=Sigma0(a)
@@ -200,7 +268,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x10,x10 // 7
#endif
ldp x11,x12,[x1],#2*8
@@ -225,7 +293,7 @@ sha512_block_data_order:
add x20,x20,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x20,x20,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x11,x11 // 8
#endif
add x20,x20,x17 // h+=Sigma0(a)
@@ -249,7 +317,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x12,x12 // 9
#endif
ldp x13,x14,[x1],#2*8
@@ -274,7 +342,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x13,x13 // 10
#endif
add x26,x26,x17 // h+=Sigma0(a)
@@ -298,7 +366,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x14,x14 // 11
#endif
ldp x15,x0,[x1],#2*8
@@ -324,7 +392,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x15,x15 // 12
#endif
add x24,x24,x17 // h+=Sigma0(a)
@@ -349,7 +417,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x0,x0 // 13
#endif
ldp x1,x2,[x1]
@@ -375,7 +443,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x1,x1 // 14
#endif
ldr x6,[sp,#24]
@@ -401,7 +469,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev x2,x2 // 15
#endif
ldr x7,[sp,#0]
@@ -1014,14 +1082,537 @@ sha512_block_data_order:
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
.size .LK512,.-.LK512
+#ifndef __KERNEL__ // offset word below is emitted only when __KERNEL__ is not defined
.align 3
.LOPENSSL_armcap_P:
-#ifdef __ILP32__
+# ifdef __ILP32__ // ILP32: 32-bit offset (.long) from this word to OPENSSL_armcap_P
.long OPENSSL_armcap_P-.
-#else
+# else // otherwise: 64-bit offset (.quad) from this word to OPENSSL_armcap_P
.quad OPENSSL_armcap_P-.
+# endif // __ILP32__
#endif
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
+#ifndef __KERNEL__
+.type sha512_block_armv8,%function
+.align 6
+sha512_block_armv8:
+.Lv8_entry: // branch target of the ARMV8_SHA512 capability test in sha512_block_data_order
+ stp x29,x30,[sp,#-16]! // push frame pointer and link register (16-byte frame)
+ add x29,sp,#0 // x29 = sp (frame pointer)
+
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input: first 64 bytes of the block, x1 += 64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // second 64 bytes, x1 now points past the block
+
+ ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context: 64 bytes of state from x0 (no writeback)
+ adr x3,.LK512 // x3 = address of the .LK512 constant table
+
+ rev64 v16.16b,v16.16b // byte-reverse each 64-bit lane of the input (v16-v23)
+ rev64 v17.16b,v17.16b
+ rev64 v18.16b,v18.16b
+ rev64 v19.16b,v19.16b
+ rev64 v20.16b,v20.16b
+ rev64 v21.16b,v21.16b
+ rev64 v22.16b,v22.16b
+ rev64 v23.16b,v23.16b
+ b .Loop_hw // jump over the alignment padding in front of the loop
+
+.align 4
+.Loop_hw:
+ ld1 {v24.2d},[x3],#16 // load two 64-bit constants from the table, x3 += 16
+ subs x2,x2,#1 // count down x2 once per 128-byte block; flags feed the csel below
+ sub x4,x1,#128 // x4 = x1 - 128, start of the block currently held in v16-v23
+ orr v26.16b,v0.16b,v0.16b // offload: copy state so it can be added back after the rounds
+ orr v27.16b,v1.16b,v1.16b
+ orr v28.16b,v2.16b,v2.16b
+ orr v29.16b,v3.16b,v3.16b
+ csel x1,x1,x4,ne // conditional rewind: x1 = x4 when x2 reached zero, so the loads below re-read this block
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v16.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v16.16b,v16.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v17.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v17.16b,v17.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v18.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v18.16b,v18.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v19.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ rev64 v19.16b,v19.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v20.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ rev64 v20.16b,v20.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v21.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v21.16b,v21.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v22.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v22.16b,v22.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ sub x3,x3,#80*8 // rewind x3 by 80 eight-byte table entries for the next iteration
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v23.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v23.16b,v23.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v0.2d,v0.2d,v26.2d // accumulate: add back the state copied into v26-v29 at loop entry
+ add v1.2d,v1.2d,v27.2d
+ add v2.2d,v2.2d,v28.2d
+ add v3.2d,v3.2d,v29.2d
+
+ cbnz x2,.Loop_hw // repeat while the x2 counter is non-zero
+
+ st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
+
+ ldr x29,[sp],#16 // restore x29 and release the 16-byte frame; x30 is not written in this routine
+ ret
+.size sha512_block_armv8,.-sha512_block_armv8
+#endif
+#ifndef __KERNEL__
.comm OPENSSL_armcap_P,4,4
+#endif