| author | Sam Roberts <vieuxtech@gmail.com> | 2018-11-22 11:47:07 -0800 |
| --- | --- | --- |
| committer | Sam Roberts <vieuxtech@gmail.com> | 2019-01-22 13:33:54 -0800 |
| commit | 807ed7883a12423270450776f015a7c2348c0913 (patch) | |
| tree | 00ec21dd290b29c782680ffc2f97e6d59fd2ab2f /deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha | |
| parent | 57119fbdb200702d6e2cf23428de4c458ae86bbc (diff) | |
| download | android-node-v8-807ed7883a12423270450776f015a7c2348c0913.tar.gz android-node-v8-807ed7883a12423270450776f015a7c2348c0913.tar.bz2 android-node-v8-807ed7883a12423270450776f015a7c2348c0913.zip | |
deps: update archs files for OpenSSL-1.1.1a
`cd deps/openssl/config; make` updates all arch-dependent files.
PR-URL: https://github.com/nodejs/node/pull/25381
Reviewed-By: Daniel Bevenius <daniel.bevenius@gmail.com>
Reviewed-By: Shigeki Ohtsu <ohtsu@ohtsu.org>
Diffstat (limited to 'deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha')
4 files changed, 8838 insertions, 0 deletions
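The new `keccak1600-armv4.S` below implements the Keccak-f[1600] permutation on 32-bit ARM using a bit-interleaved lane representation: each 64-bit state lane is kept as two 32-bit words, one holding the even-indexed bits and one the odd-indexed bits. That is what the `BitInterleave(inp[0..7])` comments and the `0x55555555`/`0x33333333`/`0x0f0f0f0f`/`0x00ff00ff` mask sequence in `SHA3_absorb` refer to, and `SHA3_squeeze` applies the inverse shuffle before emitting output bytes. A minimal C sketch of the transform, for orientation only (the helper name `bit_interleave` is illustrative, not part of the diff):

```c
#include <stdint.h>

/* Split a 64-bit Keccak lane into its even- and odd-indexed bits, each
 * packed into a 32-bit word.  The assembly computes the same result
 * branchlessly via the 0x55555555/0x33333333/0x0f0f0f0f/0x00ff00ff
 * mask-and-shift sequence visible in SHA3_absorb. */
static void bit_interleave(uint64_t lane, uint32_t *even, uint32_t *odd)
{
    uint32_t e = 0, o = 0;

    for (int i = 0; i < 32; i++) {
        e |= (uint32_t)((lane >> (2 * i)) & 1) << i;     /* bits 0,2,4,... */
        o |= (uint32_t)((lane >> (2 * i + 1)) & 1) << i; /* bits 1,3,5,... */
    }
    *even = e;
    *odd  = o;
}
```

With this layout a 64-bit rotation falls apart into two independent 32-bit rotations (the two halves trade places when the rotation count is odd), which is why the scalar rounds in the file get by with plain 32-bit `ror`. The NEON variant (`KeccakF1600_neon`) does not need the trick: it keeps whole 64-bit lanes in d-registers and rotates them with `vshl.u64`/`vsri.u64` pairs.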
diff --git a/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/keccak1600-armv4.S b/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/keccak1600-armv4.S new file mode 100644 index 0000000000..f4e72da051 --- /dev/null +++ b/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/keccak1600-armv4.S @@ -0,0 +1,2665 @@ +#include "arm_arch.h" + +.text + +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.type iotas32, %object +.align 5 +iotas32: +.long 0x00000001, 0x00000000 +.long 0x00000000, 0x00000089 +.long 0x00000000, 0x8000008b +.long 0x00000000, 0x80008080 +.long 0x00000001, 0x0000008b +.long 0x00000001, 0x00008000 +.long 0x00000001, 0x80008088 +.long 0x00000001, 0x80000082 +.long 0x00000000, 0x0000000b +.long 0x00000000, 0x0000000a +.long 0x00000001, 0x00008082 +.long 0x00000000, 0x00008003 +.long 0x00000001, 0x0000808b +.long 0x00000001, 0x8000000b +.long 0x00000001, 0x8000008a +.long 0x00000001, 0x80000081 +.long 0x00000000, 0x80000081 +.long 0x00000000, 0x80000008 +.long 0x00000000, 0x00000083 +.long 0x00000000, 0x80008003 +.long 0x00000001, 0x80008088 +.long 0x00000000, 0x80000088 +.long 0x00000001, 0x00008000 +.long 0x00000000, 0x80008082 +.size iotas32,.-iotas32 + +.type KeccakF1600_int, %function +.align 5 +KeccakF1600_int: + add r9,sp,#176 + add r12,sp,#0 + add r10,sp,#40 + ldmia r9,{r4,r5,r6,r7,r8,r9} @ A[4][2..4] +KeccakF1600_enter: + str lr,[sp,#440] + eor r11,r11,r11 + str r11,[sp,#444] + b .Lround2x + +.align 4 +.Lround2x: + ldmia r12,{r0,r1,r2,r3} @ A[0][0..1] + ldmia r10,{r10,r11,r12,r14} @ A[1][0..1] +#ifdef __thumb2__ + eor r0,r0,r10 + eor r1,r1,r11 + eor r2,r2,r12 + ldrd r10,r11,[sp,#56] + eor r3,r3,r14 + ldrd r12,r14,[sp,#64] + eor r4,r4,r10 + eor r5,r5,r11 + eor r6,r6,r12 + ldrd r10,r11,[sp,#72] + eor r7,r7,r14 + ldrd r12,r14,[sp,#80] + eor r8,r8,r10 + eor r9,r9,r11 + eor r0,r0,r12 + ldrd r10,r11,[sp,#88] + eor r1,r1,r14 + ldrd r12,r14,[sp,#96] + eor r2,r2,r10 + eor r3,r3,r11 + eor r4,r4,r12 + ldrd r10,r11,[sp,#104] + eor r5,r5,r14 + ldrd r12,r14,[sp,#112] + eor r6,r6,r10 + eor r7,r7,r11 + eor r8,r8,r12 + ldrd r10,r11,[sp,#120] + eor r9,r9,r14 + ldrd r12,r14,[sp,#128] + eor r0,r0,r10 + eor r1,r1,r11 + eor r2,r2,r12 + ldrd r10,r11,[sp,#136] + eor r3,r3,r14 + ldrd r12,r14,[sp,#144] + eor r4,r4,r10 + eor r5,r5,r11 + eor r6,r6,r12 + ldrd r10,r11,[sp,#152] + eor r7,r7,r14 + ldrd r12,r14,[sp,#160] + eor r8,r8,r10 + eor r9,r9,r11 + eor r0,r0,r12 + ldrd r10,r11,[sp,#168] + eor r1,r1,r14 + ldrd r12,r14,[sp,#16] + eor r2,r2,r10 + eor r3,r3,r11 + eor r4,r4,r12 + ldrd r10,r11,[sp,#24] + eor r5,r5,r14 + ldrd r12,r14,[sp,#32] +#else + eor r0,r0,r10 + add r10,sp,#56 + eor r1,r1,r11 + eor r2,r2,r12 + eor r3,r3,r14 + ldmia r10,{r10,r11,r12,r14} @ A[1][2..3] + eor r4,r4,r10 + add r10,sp,#72 + eor r5,r5,r11 + eor r6,r6,r12 + eor r7,r7,r14 + ldmia r10,{r10,r11,r12,r14} @ A[1][4]..A[2][0] + eor r8,r8,r10 + add r10,sp,#88 + eor r9,r9,r11 + eor r0,r0,r12 + eor r1,r1,r14 + ldmia r10,{r10,r11,r12,r14} @ A[2][1..2] + eor r2,r2,r10 + add r10,sp,#104 + eor r3,r3,r11 + eor r4,r4,r12 + eor r5,r5,r14 + ldmia r10,{r10,r11,r12,r14} @ A[2][3..4] + eor r6,r6,r10 + add r10,sp,#120 + eor r7,r7,r11 + eor r8,r8,r12 + eor r9,r9,r14 + ldmia r10,{r10,r11,r12,r14} @ A[3][0..1] + eor r0,r0,r10 + add r10,sp,#136 + eor r1,r1,r11 + eor r2,r2,r12 + eor r3,r3,r14 + ldmia r10,{r10,r11,r12,r14} @ A[3][2..3] + eor r4,r4,r10 + add r10,sp,#152 + eor r5,r5,r11 + eor r6,r6,r12 + eor r7,r7,r14 + ldmia r10,{r10,r11,r12,r14} @ A[3][4]..A[4][0] + eor r8,r8,r10 + ldr r10,[sp,#168] @ 
A[4][1] + eor r9,r9,r11 + ldr r11,[sp,#168+4] + eor r0,r0,r12 + ldr r12,[sp,#16] @ A[0][2] + eor r1,r1,r14 + ldr r14,[sp,#16+4] + eor r2,r2,r10 + add r10,sp,#24 + eor r3,r3,r11 + eor r4,r4,r12 + eor r5,r5,r14 + ldmia r10,{r10,r11,r12,r14} @ A[0][3..4] +#endif + eor r6,r6,r10 + eor r7,r7,r11 + eor r8,r8,r12 + eor r9,r9,r14 + + eor r10,r0,r5,ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0]; +#ifndef __thumb2__ + str r10,[sp,#208] @ D[1] = E[0] +#endif + eor r11,r1,r4 +#ifndef __thumb2__ + str r11,[sp,#208+4] +#else + strd r10,r11,[sp,#208] @ D[1] = E[0] +#endif + eor r12,r6,r1,ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3]; + eor r14,r7,r0 +#ifndef __thumb2__ + str r12,[sp,#232] @ D[4] = E[1] +#endif + eor r0,r8,r3,ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4]; +#ifndef __thumb2__ + str r14,[sp,#232+4] +#else + strd r12,r14,[sp,#232] @ D[4] = E[1] +#endif + eor r1,r9,r2 +#ifndef __thumb2__ + str r0,[sp,#200] @ D[0] = C[0] +#endif + eor r2,r2,r7,ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1]; +#ifndef __thumb2__ + ldr r7,[sp,#144] +#endif + eor r3,r3,r6 +#ifndef __thumb2__ + str r1,[sp,#200+4] +#else + strd r0,r1,[sp,#200] @ D[0] = C[0] +#endif +#ifndef __thumb2__ + ldr r6,[sp,#144+4] +#else + ldrd r7,r6,[sp,#144] +#endif +#ifndef __thumb2__ + str r2,[sp,#216] @ D[2] = C[1] +#endif + eor r4,r4,r9,ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2]; +#ifndef __thumb2__ + str r3,[sp,#216+4] +#else + strd r2,r3,[sp,#216] @ D[2] = C[1] +#endif + eor r5,r5,r8 + +#ifndef __thumb2__ + ldr r8,[sp,#192] +#endif +#ifndef __thumb2__ + ldr r9,[sp,#192+4] +#else + ldrd r8,r9,[sp,#192] +#endif +#ifndef __thumb2__ + str r4,[sp,#224] @ D[3] = C[2] +#endif + eor r7,r7,r4 +#ifndef __thumb2__ + str r5,[sp,#224+4] +#else + strd r4,r5,[sp,#224] @ D[3] = C[2] +#endif + eor r6,r6,r5 +#ifndef __thumb2__ + ldr r4,[sp,#0] +#endif + @ mov r7,r7,ror#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ + @ mov r6,r6,ror#32-11 +#ifndef __thumb2__ + ldr r5,[sp,#0+4] +#else + ldrd r4,r5,[sp,#0] +#endif + eor r8,r8,r12 + eor r9,r9,r14 +#ifndef __thumb2__ + ldr r12,[sp,#96] +#endif + eor r0,r0,r4 +#ifndef __thumb2__ + ldr r14,[sp,#96+4] +#else + ldrd r12,r14,[sp,#96] +#endif + @ mov r8,r8,ror#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ + @ mov r9,r9,ror#32-7 + eor r1,r1,r5 @ C[0] = A[0][0] ^ C[0]; + eor r12,r12,r2 +#ifndef __thumb2__ + ldr r2,[sp,#48] +#endif + eor r14,r14,r3 +#ifndef __thumb2__ + ldr r3,[sp,#48+4] +#else + ldrd r2,r3,[sp,#48] +#endif + mov r5,r12,ror#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); + ldr r12,[sp,#444] @ load counter + eor r2,r2,r10 + adr r10,iotas32 + mov r4,r14,ror#32-22 + add r14,r10,r12 + eor r3,r3,r11 + ldmia r14,{r10,r11} @ iotas[i] + bic r12,r4,r2,ror#32-22 + bic r14,r5,r3,ror#32-22 + mov r2,r2,ror#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); + mov r3,r3,ror#32-22 + eor r12,r12,r0 + eor r14,r14,r1 + eor r10,r10,r12 + eor r11,r11,r14 +#ifndef __thumb2__ + str r10,[sp,#240] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; +#endif + bic r12,r6,r4,ror#11 +#ifndef __thumb2__ + str r11,[sp,#240+4] +#else + strd r10,r11,[sp,#240] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; +#endif + bic r14,r7,r5,ror#10 + bic r10,r8,r6,ror#32-(11-7) + bic r11,r9,r7,ror#32-(10-7) + eor r12,r2,r12,ror#32-11 +#ifndef __thumb2__ + str r12,[sp,#248] @ R[0][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r14,r3,r14,ror#32-10 +#ifndef __thumb2__ + str r14,[sp,#248+4] +#else + strd r12,r14,[sp,#248] @ R[0][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r10,r4,r10,ror#32-7 + eor r11,r5,r11,ror#32-7 +#ifndef __thumb2__ + str 
r10,[sp,#256] @ R[0][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r12,r0,r8,ror#32-7 +#ifndef __thumb2__ + str r11,[sp,#256+4] +#else + strd r10,r11,[sp,#256] @ R[0][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r14,r1,r9,ror#32-7 + eor r12,r12,r6,ror#32-11 +#ifndef __thumb2__ + str r12,[sp,#264] @ R[0][3] = C[3] ^ (~C[4] & C[0]); +#endif + eor r14,r14,r7,ror#32-10 +#ifndef __thumb2__ + str r14,[sp,#264+4] +#else + strd r12,r14,[sp,#264] @ R[0][3] = C[3] ^ (~C[4] & C[0]); +#endif + bic r10,r2,r0 + add r14,sp,#224 +#ifndef __thumb2__ + ldr r0,[sp,#24] @ A[0][3] +#endif + bic r11,r3,r1 +#ifndef __thumb2__ + ldr r1,[sp,#24+4] +#else + ldrd r0,r1,[sp,#24] @ A[0][3] +#endif + eor r10,r10,r8,ror#32-7 + eor r11,r11,r9,ror#32-7 +#ifndef __thumb2__ + str r10,[sp,#272] @ R[0][4] = C[4] ^ (~C[0] & C[1]); +#endif + add r9,sp,#200 +#ifndef __thumb2__ + str r11,[sp,#272+4] +#else + strd r10,r11,[sp,#272] @ R[0][4] = C[4] ^ (~C[0] & C[1]); +#endif + + ldmia r14,{r10,r11,r12,r14} @ D[3..4] + ldmia r9,{r6,r7,r8,r9} @ D[0..1] + +#ifndef __thumb2__ + ldr r2,[sp,#72] @ A[1][4] +#endif + eor r0,r0,r10 +#ifndef __thumb2__ + ldr r3,[sp,#72+4] +#else + ldrd r2,r3,[sp,#72] @ A[1][4] +#endif + eor r1,r1,r11 + @ mov r0,r0,ror#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); +#ifndef __thumb2__ + ldr r10,[sp,#128] @ A[3][1] +#endif + @ mov r1,r1,ror#32-14 +#ifndef __thumb2__ + ldr r11,[sp,#128+4] +#else + ldrd r10,r11,[sp,#128] @ A[3][1] +#endif + + eor r2,r2,r12 +#ifndef __thumb2__ + ldr r4,[sp,#80] @ A[2][0] +#endif + eor r3,r3,r14 +#ifndef __thumb2__ + ldr r5,[sp,#80+4] +#else + ldrd r4,r5,[sp,#80] @ A[2][0] +#endif + @ mov r2,r2,ror#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + @ mov r3,r3,ror#32-10 + + eor r6,r6,r4 +#ifndef __thumb2__ + ldr r12,[sp,#216] @ D[2] +#endif + eor r7,r7,r5 +#ifndef __thumb2__ + ldr r14,[sp,#216+4] +#else + ldrd r12,r14,[sp,#216] @ D[2] +#endif + mov r5,r6,ror#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); + mov r4,r7,ror#32-2 + + eor r10,r10,r8 +#ifndef __thumb2__ + ldr r8,[sp,#176] @ A[4][2] +#endif + eor r11,r11,r9 +#ifndef __thumb2__ + ldr r9,[sp,#176+4] +#else + ldrd r8,r9,[sp,#176] @ A[4][2] +#endif + mov r7,r10,ror#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); + mov r6,r11,ror#32-23 + + bic r10,r4,r2,ror#32-10 + bic r11,r5,r3,ror#32-10 + eor r12,r12,r8 + eor r14,r14,r9 + mov r9,r12,ror#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); + mov r8,r14,ror#32-31 + eor r10,r10,r0,ror#32-14 + eor r11,r11,r1,ror#32-14 +#ifndef __thumb2__ + str r10,[sp,#280] @ R[1][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r12,r6,r4 +#ifndef __thumb2__ + str r11,[sp,#280+4] +#else + strd r10,r11,[sp,#280] @ R[1][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r14,r7,r5 + eor r12,r12,r2,ror#32-10 +#ifndef __thumb2__ + str r12,[sp,#288] @ R[1][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r14,r14,r3,ror#32-10 +#ifndef __thumb2__ + str r14,[sp,#288+4] +#else + strd r12,r14,[sp,#288] @ R[1][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r10,r8,r6 + bic r11,r9,r7 + bic r12,r0,r8,ror#14 + bic r14,r1,r9,ror#14 + eor r10,r10,r4 + eor r11,r11,r5 +#ifndef __thumb2__ + str r10,[sp,#296] @ R[1][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r2,r2,r0,ror#32-(14-10) +#ifndef __thumb2__ + str r11,[sp,#296+4] +#else + strd r10,r11,[sp,#296] @ R[1][2] = C[2] ^ (~C[3] & C[4]); +#endif + eor r12,r6,r12,ror#32-14 + bic r11,r3,r1,ror#32-(14-10) +#ifndef __thumb2__ + str r12,[sp,#304] @ R[1][3] = C[3] ^ (~C[4] & C[0]); +#endif + eor r14,r7,r14,ror#32-14 +#ifndef __thumb2__ + str r14,[sp,#304+4] +#else + strd 
r12,r14,[sp,#304] @ R[1][3] = C[3] ^ (~C[4] & C[0]); +#endif + add r12,sp,#208 +#ifndef __thumb2__ + ldr r1,[sp,#8] @ A[0][1] +#endif + eor r10,r8,r2,ror#32-10 +#ifndef __thumb2__ + ldr r0,[sp,#8+4] +#else + ldrd r1,r0,[sp,#8] @ A[0][1] +#endif + eor r11,r9,r11,ror#32-10 +#ifndef __thumb2__ + str r10,[sp,#312] @ R[1][4] = C[4] ^ (~C[0] & C[1]); +#endif +#ifndef __thumb2__ + str r11,[sp,#312+4] +#else + strd r10,r11,[sp,#312] @ R[1][4] = C[4] ^ (~C[0] & C[1]); +#endif + + add r9,sp,#224 + ldmia r12,{r10,r11,r12,r14} @ D[1..2] +#ifndef __thumb2__ + ldr r2,[sp,#56] @ A[1][2] +#endif +#ifndef __thumb2__ + ldr r3,[sp,#56+4] +#else + ldrd r2,r3,[sp,#56] @ A[1][2] +#endif + ldmia r9,{r6,r7,r8,r9} @ D[3..4] + + eor r1,r1,r10 +#ifndef __thumb2__ + ldr r4,[sp,#104] @ A[2][3] +#endif + eor r0,r0,r11 +#ifndef __thumb2__ + ldr r5,[sp,#104+4] +#else + ldrd r4,r5,[sp,#104] @ A[2][3] +#endif + mov r0,r0,ror#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); + + eor r2,r2,r12 +#ifndef __thumb2__ + ldr r10,[sp,#152] @ A[3][4] +#endif + eor r3,r3,r14 +#ifndef __thumb2__ + ldr r11,[sp,#152+4] +#else + ldrd r10,r11,[sp,#152] @ A[3][4] +#endif + @ mov r2,r2,ror#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); +#ifndef __thumb2__ + ldr r12,[sp,#200] @ D[0] +#endif + @ mov r3,r3,ror#32-3 +#ifndef __thumb2__ + ldr r14,[sp,#200+4] +#else + ldrd r12,r14,[sp,#200] @ D[0] +#endif + + eor r4,r4,r6 + eor r5,r5,r7 + @ mov r5,r6,ror#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + @ mov r4,r7,ror#32-13 @ [track reverse order below] + + eor r10,r10,r8 +#ifndef __thumb2__ + ldr r8,[sp,#160] @ A[4][0] +#endif + eor r11,r11,r9 +#ifndef __thumb2__ + ldr r9,[sp,#160+4] +#else + ldrd r8,r9,[sp,#160] @ A[4][0] +#endif + mov r6,r10,ror#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); + mov r7,r11,ror#32-4 + + eor r12,r12,r8 + eor r14,r14,r9 + mov r8,r12,ror#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); + mov r9,r14,ror#32-9 + + bic r10,r5,r2,ror#13-3 + bic r11,r4,r3,ror#12-3 + bic r12,r6,r5,ror#32-13 + bic r14,r7,r4,ror#32-12 + eor r10,r0,r10,ror#32-13 + eor r11,r1,r11,ror#32-12 +#ifndef __thumb2__ + str r10,[sp,#320] @ R[2][0] = C[0] ^ (~C[1] & C[2]) +#endif + eor r12,r12,r2,ror#32-3 +#ifndef __thumb2__ + str r11,[sp,#320+4] +#else + strd r10,r11,[sp,#320] @ R[2][0] = C[0] ^ (~C[1] & C[2]) +#endif + eor r14,r14,r3,ror#32-3 +#ifndef __thumb2__ + str r12,[sp,#328] @ R[2][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r10,r8,r6 + bic r11,r9,r7 +#ifndef __thumb2__ + str r14,[sp,#328+4] +#else + strd r12,r14,[sp,#328] @ R[2][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r10,r10,r5,ror#32-13 + eor r11,r11,r4,ror#32-12 +#ifndef __thumb2__ + str r10,[sp,#336] @ R[2][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r12,r0,r8 +#ifndef __thumb2__ + str r11,[sp,#336+4] +#else + strd r10,r11,[sp,#336] @ R[2][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r14,r1,r9 + eor r12,r12,r6 + eor r14,r14,r7 +#ifndef __thumb2__ + str r12,[sp,#344] @ R[2][3] = C[3] ^ (~C[4] & C[0]); +#endif + bic r10,r2,r0,ror#3 +#ifndef __thumb2__ + str r14,[sp,#344+4] +#else + strd r12,r14,[sp,#344] @ R[2][3] = C[3] ^ (~C[4] & C[0]); +#endif + bic r11,r3,r1,ror#3 +#ifndef __thumb2__ + ldr r1,[sp,#32] @ A[0][4] [in reverse order] +#endif + eor r10,r8,r10,ror#32-3 +#ifndef __thumb2__ + ldr r0,[sp,#32+4] +#else + ldrd r1,r0,[sp,#32] @ A[0][4] [in reverse order] +#endif + eor r11,r9,r11,ror#32-3 +#ifndef __thumb2__ + str r10,[sp,#352] @ R[2][4] = C[4] ^ (~C[0] & C[1]); +#endif + add r9,sp,#208 +#ifndef __thumb2__ + str r11,[sp,#352+4] +#else + strd r10,r11,[sp,#352] @ 
R[2][4] = C[4] ^ (~C[0] & C[1]); +#endif + +#ifndef __thumb2__ + ldr r10,[sp,#232] @ D[4] +#endif +#ifndef __thumb2__ + ldr r11,[sp,#232+4] +#else + ldrd r10,r11,[sp,#232] @ D[4] +#endif +#ifndef __thumb2__ + ldr r12,[sp,#200] @ D[0] +#endif +#ifndef __thumb2__ + ldr r14,[sp,#200+4] +#else + ldrd r12,r14,[sp,#200] @ D[0] +#endif + + ldmia r9,{r6,r7,r8,r9} @ D[1..2] + + eor r1,r1,r10 +#ifndef __thumb2__ + ldr r2,[sp,#40] @ A[1][0] +#endif + eor r0,r0,r11 +#ifndef __thumb2__ + ldr r3,[sp,#40+4] +#else + ldrd r2,r3,[sp,#40] @ A[1][0] +#endif + @ mov r1,r10,ror#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); +#ifndef __thumb2__ + ldr r4,[sp,#88] @ A[2][1] +#endif + @ mov r0,r11,ror#32-14 @ [was loaded in reverse order] +#ifndef __thumb2__ + ldr r5,[sp,#88+4] +#else + ldrd r4,r5,[sp,#88] @ A[2][1] +#endif + + eor r2,r2,r12 +#ifndef __thumb2__ + ldr r10,[sp,#136] @ A[3][2] +#endif + eor r3,r3,r14 +#ifndef __thumb2__ + ldr r11,[sp,#136+4] +#else + ldrd r10,r11,[sp,#136] @ A[3][2] +#endif + @ mov r2,r2,ror#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); +#ifndef __thumb2__ + ldr r12,[sp,#224] @ D[3] +#endif + @ mov r3,r3,ror#32-18 +#ifndef __thumb2__ + ldr r14,[sp,#224+4] +#else + ldrd r12,r14,[sp,#224] @ D[3] +#endif + + eor r6,r6,r4 + eor r7,r7,r5 + mov r4,r6,ror#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); + mov r5,r7,ror#32-5 + + eor r10,r10,r8 +#ifndef __thumb2__ + ldr r8,[sp,#184] @ A[4][3] +#endif + eor r11,r11,r9 +#ifndef __thumb2__ + ldr r9,[sp,#184+4] +#else + ldrd r8,r9,[sp,#184] @ A[4][3] +#endif + mov r7,r10,ror#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); + mov r6,r11,ror#32-8 + + eor r12,r12,r8 + eor r14,r14,r9 + mov r8,r12,ror#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); + mov r9,r14,ror#32-28 + + bic r10,r4,r2,ror#32-18 + bic r11,r5,r3,ror#32-18 + eor r10,r10,r0,ror#32-14 + eor r11,r11,r1,ror#32-13 +#ifndef __thumb2__ + str r10,[sp,#360] @ R[3][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r12,r6,r4 +#ifndef __thumb2__ + str r11,[sp,#360+4] +#else + strd r10,r11,[sp,#360] @ R[3][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r14,r7,r5 + eor r12,r12,r2,ror#32-18 +#ifndef __thumb2__ + str r12,[sp,#368] @ R[3][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r14,r14,r3,ror#32-18 +#ifndef __thumb2__ + str r14,[sp,#368+4] +#else + strd r12,r14,[sp,#368] @ R[3][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r10,r8,r6 + bic r11,r9,r7 + bic r12,r0,r8,ror#14 + bic r14,r1,r9,ror#13 + eor r10,r10,r4 + eor r11,r11,r5 +#ifndef __thumb2__ + str r10,[sp,#376] @ R[3][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r2,r2,r0,ror#18-14 +#ifndef __thumb2__ + str r11,[sp,#376+4] +#else + strd r10,r11,[sp,#376] @ R[3][2] = C[2] ^ (~C[3] & C[4]); +#endif + eor r12,r6,r12,ror#32-14 + bic r11,r3,r1,ror#18-13 + eor r14,r7,r14,ror#32-13 +#ifndef __thumb2__ + str r12,[sp,#384] @ R[3][3] = C[3] ^ (~C[4] & C[0]); +#endif +#ifndef __thumb2__ + str r14,[sp,#384+4] +#else + strd r12,r14,[sp,#384] @ R[3][3] = C[3] ^ (~C[4] & C[0]); +#endif + add r14,sp,#216 +#ifndef __thumb2__ + ldr r0,[sp,#16] @ A[0][2] +#endif + eor r10,r8,r2,ror#32-18 +#ifndef __thumb2__ + ldr r1,[sp,#16+4] +#else + ldrd r0,r1,[sp,#16] @ A[0][2] +#endif + eor r11,r9,r11,ror#32-18 +#ifndef __thumb2__ + str r10,[sp,#392] @ R[3][4] = C[4] ^ (~C[0] & C[1]); +#endif +#ifndef __thumb2__ + str r11,[sp,#392+4] +#else + strd r10,r11,[sp,#392] @ R[3][4] = C[4] ^ (~C[0] & C[1]); +#endif + + ldmia r14,{r10,r11,r12,r14} @ D[2..3] +#ifndef __thumb2__ + ldr r2,[sp,#64] @ A[1][3] +#endif +#ifndef __thumb2__ + ldr r3,[sp,#64+4] +#else + ldrd 
r2,r3,[sp,#64] @ A[1][3] +#endif +#ifndef __thumb2__ + ldr r6,[sp,#232] @ D[4] +#endif +#ifndef __thumb2__ + ldr r7,[sp,#232+4] +#else + ldrd r6,r7,[sp,#232] @ D[4] +#endif + + eor r0,r0,r10 +#ifndef __thumb2__ + ldr r4,[sp,#112] @ A[2][4] +#endif + eor r1,r1,r11 +#ifndef __thumb2__ + ldr r5,[sp,#112+4] +#else + ldrd r4,r5,[sp,#112] @ A[2][4] +#endif + @ mov r0,r0,ror#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); +#ifndef __thumb2__ + ldr r8,[sp,#200] @ D[0] +#endif + @ mov r1,r1,ror#32-31 +#ifndef __thumb2__ + ldr r9,[sp,#200+4] +#else + ldrd r8,r9,[sp,#200] @ D[0] +#endif + + eor r12,r12,r2 +#ifndef __thumb2__ + ldr r10,[sp,#120] @ A[3][0] +#endif + eor r14,r14,r3 +#ifndef __thumb2__ + ldr r11,[sp,#120+4] +#else + ldrd r10,r11,[sp,#120] @ A[3][0] +#endif + mov r3,r12,ror#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); +#ifndef __thumb2__ + ldr r12,[sp,#208] @ D[1] +#endif + mov r2,r14,ror#32-28 +#ifndef __thumb2__ + ldr r14,[sp,#208+4] +#else + ldrd r12,r14,[sp,#208] @ D[1] +#endif + + eor r6,r6,r4 + eor r7,r7,r5 + mov r5,r6,ror#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); + mov r4,r7,ror#32-20 + + eor r10,r10,r8 +#ifndef __thumb2__ + ldr r8,[sp,#168] @ A[4][1] +#endif + eor r11,r11,r9 +#ifndef __thumb2__ + ldr r9,[sp,#168+4] +#else + ldrd r8,r9,[sp,#168] @ A[4][1] +#endif + mov r7,r10,ror#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); + mov r6,r11,ror#32-21 + + eor r8,r8,r12 + eor r9,r9,r14 + @ mov r8,r2,ror#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); + @ mov r9,r3,ror#32-1 + + bic r10,r4,r2 + bic r11,r5,r3 + eor r10,r10,r0,ror#32-31 +#ifndef __thumb2__ + str r10,[sp,#400] @ R[4][0] = C[0] ^ (~C[1] & C[2]) +#endif + eor r11,r11,r1,ror#32-31 +#ifndef __thumb2__ + str r11,[sp,#400+4] +#else + strd r10,r11,[sp,#400] @ R[4][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r12,r6,r4 + bic r14,r7,r5 + eor r12,r12,r2 + eor r14,r14,r3 +#ifndef __thumb2__ + str r12,[sp,#408] @ R[4][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r10,r8,r6,ror#1 +#ifndef __thumb2__ + str r14,[sp,#408+4] +#else + strd r12,r14,[sp,#408] @ R[4][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r11,r9,r7,ror#1 + bic r12,r0,r8,ror#31-1 + bic r14,r1,r9,ror#31-1 + eor r4,r4,r10,ror#32-1 +#ifndef __thumb2__ + str r4,[sp,#416] @ R[4][2] = C[2] ^= (~C[3] & C[4]); +#endif + eor r5,r5,r11,ror#32-1 +#ifndef __thumb2__ + str r5,[sp,#416+4] +#else + strd r4,r5,[sp,#416] @ R[4][2] = C[2] ^= (~C[3] & C[4]); +#endif + eor r6,r6,r12,ror#32-31 + eor r7,r7,r14,ror#32-31 +#ifndef __thumb2__ + str r6,[sp,#424] @ R[4][3] = C[3] ^= (~C[4] & C[0]); +#endif + bic r10,r2,r0,ror#32-31 +#ifndef __thumb2__ + str r7,[sp,#424+4] +#else + strd r6,r7,[sp,#424] @ R[4][3] = C[3] ^= (~C[4] & C[0]); +#endif + bic r11,r3,r1,ror#32-31 + add r12,sp,#240 + eor r8,r10,r8,ror#32-1 + add r10,sp,#280 + eor r9,r11,r9,ror#32-1 +#ifndef __thumb2__ + str r8,[sp,#432] @ R[4][4] = C[4] ^= (~C[0] & C[1]); +#endif +#ifndef __thumb2__ + str r9,[sp,#432+4] +#else + strd r8,r9,[sp,#432] @ R[4][4] = C[4] ^= (~C[0] & C[1]); +#endif + ldmia r12,{r0,r1,r2,r3} @ A[0][0..1] + ldmia r10,{r10,r11,r12,r14} @ A[1][0..1] +#ifdef __thumb2__ + eor r0,r0,r10 + eor r1,r1,r11 + eor r2,r2,r12 + ldrd r10,r11,[sp,#296] + eor r3,r3,r14 + ldrd r12,r14,[sp,#304] + eor r4,r4,r10 + eor r5,r5,r11 + eor r6,r6,r12 + ldrd r10,r11,[sp,#312] + eor r7,r7,r14 + ldrd r12,r14,[sp,#320] + eor r8,r8,r10 + eor r9,r9,r11 + eor r0,r0,r12 + ldrd r10,r11,[sp,#328] + eor r1,r1,r14 + ldrd r12,r14,[sp,#336] + eor r2,r2,r10 + eor r3,r3,r11 + eor r4,r4,r12 + ldrd r10,r11,[sp,#344] + eor r5,r5,r14 + 
ldrd r12,r14,[sp,#352] + eor r6,r6,r10 + eor r7,r7,r11 + eor r8,r8,r12 + ldrd r10,r11,[sp,#360] + eor r9,r9,r14 + ldrd r12,r14,[sp,#368] + eor r0,r0,r10 + eor r1,r1,r11 + eor r2,r2,r12 + ldrd r10,r11,[sp,#376] + eor r3,r3,r14 + ldrd r12,r14,[sp,#384] + eor r4,r4,r10 + eor r5,r5,r11 + eor r6,r6,r12 + ldrd r10,r11,[sp,#392] + eor r7,r7,r14 + ldrd r12,r14,[sp,#400] + eor r8,r8,r10 + eor r9,r9,r11 + eor r0,r0,r12 + ldrd r10,r11,[sp,#408] + eor r1,r1,r14 + ldrd r12,r14,[sp,#256] + eor r2,r2,r10 + eor r3,r3,r11 + eor r4,r4,r12 + ldrd r10,r11,[sp,#264] + eor r5,r5,r14 + ldrd r12,r14,[sp,#272] +#else + eor r0,r0,r10 + add r10,sp,#296 + eor r1,r1,r11 + eor r2,r2,r12 + eor r3,r3,r14 + ldmia r10,{r10,r11,r12,r14} @ A[1][2..3] + eor r4,r4,r10 + add r10,sp,#312 + eor r5,r5,r11 + eor r6,r6,r12 + eor r7,r7,r14 + ldmia r10,{r10,r11,r12,r14} @ A[1][4]..A[2][0] + eor r8,r8,r10 + add r10,sp,#328 + eor r9,r9,r11 + eor r0,r0,r12 + eor r1,r1,r14 + ldmia r10,{r10,r11,r12,r14} @ A[2][1..2] + eor r2,r2,r10 + add r10,sp,#344 + eor r3,r3,r11 + eor r4,r4,r12 + eor r5,r5,r14 + ldmia r10,{r10,r11,r12,r14} @ A[2][3..4] + eor r6,r6,r10 + add r10,sp,#360 + eor r7,r7,r11 + eor r8,r8,r12 + eor r9,r9,r14 + ldmia r10,{r10,r11,r12,r14} @ A[3][0..1] + eor r0,r0,r10 + add r10,sp,#376 + eor r1,r1,r11 + eor r2,r2,r12 + eor r3,r3,r14 + ldmia r10,{r10,r11,r12,r14} @ A[3][2..3] + eor r4,r4,r10 + add r10,sp,#392 + eor r5,r5,r11 + eor r6,r6,r12 + eor r7,r7,r14 + ldmia r10,{r10,r11,r12,r14} @ A[3][4]..A[4][0] + eor r8,r8,r10 + ldr r10,[sp,#408] @ A[4][1] + eor r9,r9,r11 + ldr r11,[sp,#408+4] + eor r0,r0,r12 + ldr r12,[sp,#256] @ A[0][2] + eor r1,r1,r14 + ldr r14,[sp,#256+4] + eor r2,r2,r10 + add r10,sp,#264 + eor r3,r3,r11 + eor r4,r4,r12 + eor r5,r5,r14 + ldmia r10,{r10,r11,r12,r14} @ A[0][3..4] +#endif + eor r6,r6,r10 + eor r7,r7,r11 + eor r8,r8,r12 + eor r9,r9,r14 + + eor r10,r0,r5,ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0]; +#ifndef __thumb2__ + str r10,[sp,#208] @ D[1] = E[0] +#endif + eor r11,r1,r4 +#ifndef __thumb2__ + str r11,[sp,#208+4] +#else + strd r10,r11,[sp,#208] @ D[1] = E[0] +#endif + eor r12,r6,r1,ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3]; + eor r14,r7,r0 +#ifndef __thumb2__ + str r12,[sp,#232] @ D[4] = E[1] +#endif + eor r0,r8,r3,ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4]; +#ifndef __thumb2__ + str r14,[sp,#232+4] +#else + strd r12,r14,[sp,#232] @ D[4] = E[1] +#endif + eor r1,r9,r2 +#ifndef __thumb2__ + str r0,[sp,#200] @ D[0] = C[0] +#endif + eor r2,r2,r7,ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1]; +#ifndef __thumb2__ + ldr r7,[sp,#384] +#endif + eor r3,r3,r6 +#ifndef __thumb2__ + str r1,[sp,#200+4] +#else + strd r0,r1,[sp,#200] @ D[0] = C[0] +#endif +#ifndef __thumb2__ + ldr r6,[sp,#384+4] +#else + ldrd r7,r6,[sp,#384] +#endif +#ifndef __thumb2__ + str r2,[sp,#216] @ D[2] = C[1] +#endif + eor r4,r4,r9,ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2]; +#ifndef __thumb2__ + str r3,[sp,#216+4] +#else + strd r2,r3,[sp,#216] @ D[2] = C[1] +#endif + eor r5,r5,r8 + +#ifndef __thumb2__ + ldr r8,[sp,#432] +#endif +#ifndef __thumb2__ + ldr r9,[sp,#432+4] +#else + ldrd r8,r9,[sp,#432] +#endif +#ifndef __thumb2__ + str r4,[sp,#224] @ D[3] = C[2] +#endif + eor r7,r7,r4 +#ifndef __thumb2__ + str r5,[sp,#224+4] +#else + strd r4,r5,[sp,#224] @ D[3] = C[2] +#endif + eor r6,r6,r5 +#ifndef __thumb2__ + ldr r4,[sp,#240] +#endif + @ mov r7,r7,ror#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ + @ mov r6,r6,ror#32-11 +#ifndef __thumb2__ + ldr r5,[sp,#240+4] +#else + ldrd r4,r5,[sp,#240] +#endif + eor r8,r8,r12 + eor r9,r9,r14 +#ifndef 
__thumb2__ + ldr r12,[sp,#336] +#endif + eor r0,r0,r4 +#ifndef __thumb2__ + ldr r14,[sp,#336+4] +#else + ldrd r12,r14,[sp,#336] +#endif + @ mov r8,r8,ror#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ + @ mov r9,r9,ror#32-7 + eor r1,r1,r5 @ C[0] = A[0][0] ^ C[0]; + eor r12,r12,r2 +#ifndef __thumb2__ + ldr r2,[sp,#288] +#endif + eor r14,r14,r3 +#ifndef __thumb2__ + ldr r3,[sp,#288+4] +#else + ldrd r2,r3,[sp,#288] +#endif + mov r5,r12,ror#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); + ldr r12,[sp,#444] @ load counter + eor r2,r2,r10 + adr r10,iotas32 + mov r4,r14,ror#32-22 + add r14,r10,r12 + eor r3,r3,r11 +#ifndef __thumb2__ + ldr r10,[r14,#8] @ iotas[i].lo +#endif + add r12,r12,#16 +#ifndef __thumb2__ + ldr r11,[r14,#12] @ iotas[i].hi +#else + ldrd r10,r11,[r14,#8] @ iotas[i].lo +#endif + cmp r12,#192 + str r12,[sp,#444] @ store counter + bic r12,r4,r2,ror#32-22 + bic r14,r5,r3,ror#32-22 + mov r2,r2,ror#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); + mov r3,r3,ror#32-22 + eor r12,r12,r0 + eor r14,r14,r1 + eor r10,r10,r12 + eor r11,r11,r14 +#ifndef __thumb2__ + str r10,[sp,#0] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; +#endif + bic r12,r6,r4,ror#11 +#ifndef __thumb2__ + str r11,[sp,#0+4] +#else + strd r10,r11,[sp,#0] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; +#endif + bic r14,r7,r5,ror#10 + bic r10,r8,r6,ror#32-(11-7) + bic r11,r9,r7,ror#32-(10-7) + eor r12,r2,r12,ror#32-11 +#ifndef __thumb2__ + str r12,[sp,#8] @ R[0][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r14,r3,r14,ror#32-10 +#ifndef __thumb2__ + str r14,[sp,#8+4] +#else + strd r12,r14,[sp,#8] @ R[0][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r10,r4,r10,ror#32-7 + eor r11,r5,r11,ror#32-7 +#ifndef __thumb2__ + str r10,[sp,#16] @ R[0][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r12,r0,r8,ror#32-7 +#ifndef __thumb2__ + str r11,[sp,#16+4] +#else + strd r10,r11,[sp,#16] @ R[0][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r14,r1,r9,ror#32-7 + eor r12,r12,r6,ror#32-11 +#ifndef __thumb2__ + str r12,[sp,#24] @ R[0][3] = C[3] ^ (~C[4] & C[0]); +#endif + eor r14,r14,r7,ror#32-10 +#ifndef __thumb2__ + str r14,[sp,#24+4] +#else + strd r12,r14,[sp,#24] @ R[0][3] = C[3] ^ (~C[4] & C[0]); +#endif + bic r10,r2,r0 + add r14,sp,#224 +#ifndef __thumb2__ + ldr r0,[sp,#264] @ A[0][3] +#endif + bic r11,r3,r1 +#ifndef __thumb2__ + ldr r1,[sp,#264+4] +#else + ldrd r0,r1,[sp,#264] @ A[0][3] +#endif + eor r10,r10,r8,ror#32-7 + eor r11,r11,r9,ror#32-7 +#ifndef __thumb2__ + str r10,[sp,#32] @ R[0][4] = C[4] ^ (~C[0] & C[1]); +#endif + add r9,sp,#200 +#ifndef __thumb2__ + str r11,[sp,#32+4] +#else + strd r10,r11,[sp,#32] @ R[0][4] = C[4] ^ (~C[0] & C[1]); +#endif + + ldmia r14,{r10,r11,r12,r14} @ D[3..4] + ldmia r9,{r6,r7,r8,r9} @ D[0..1] + +#ifndef __thumb2__ + ldr r2,[sp,#312] @ A[1][4] +#endif + eor r0,r0,r10 +#ifndef __thumb2__ + ldr r3,[sp,#312+4] +#else + ldrd r2,r3,[sp,#312] @ A[1][4] +#endif + eor r1,r1,r11 + @ mov r0,r0,ror#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); +#ifndef __thumb2__ + ldr r10,[sp,#368] @ A[3][1] +#endif + @ mov r1,r1,ror#32-14 +#ifndef __thumb2__ + ldr r11,[sp,#368+4] +#else + ldrd r10,r11,[sp,#368] @ A[3][1] +#endif + + eor r2,r2,r12 +#ifndef __thumb2__ + ldr r4,[sp,#320] @ A[2][0] +#endif + eor r3,r3,r14 +#ifndef __thumb2__ + ldr r5,[sp,#320+4] +#else + ldrd r4,r5,[sp,#320] @ A[2][0] +#endif + @ mov r2,r2,ror#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + @ mov r3,r3,ror#32-10 + + eor r6,r6,r4 +#ifndef __thumb2__ + ldr r12,[sp,#216] @ D[2] +#endif + eor r7,r7,r5 +#ifndef 
__thumb2__ + ldr r14,[sp,#216+4] +#else + ldrd r12,r14,[sp,#216] @ D[2] +#endif + mov r5,r6,ror#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); + mov r4,r7,ror#32-2 + + eor r10,r10,r8 +#ifndef __thumb2__ + ldr r8,[sp,#416] @ A[4][2] +#endif + eor r11,r11,r9 +#ifndef __thumb2__ + ldr r9,[sp,#416+4] +#else + ldrd r8,r9,[sp,#416] @ A[4][2] +#endif + mov r7,r10,ror#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); + mov r6,r11,ror#32-23 + + bic r10,r4,r2,ror#32-10 + bic r11,r5,r3,ror#32-10 + eor r12,r12,r8 + eor r14,r14,r9 + mov r9,r12,ror#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); + mov r8,r14,ror#32-31 + eor r10,r10,r0,ror#32-14 + eor r11,r11,r1,ror#32-14 +#ifndef __thumb2__ + str r10,[sp,#40] @ R[1][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r12,r6,r4 +#ifndef __thumb2__ + str r11,[sp,#40+4] +#else + strd r10,r11,[sp,#40] @ R[1][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r14,r7,r5 + eor r12,r12,r2,ror#32-10 +#ifndef __thumb2__ + str r12,[sp,#48] @ R[1][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r14,r14,r3,ror#32-10 +#ifndef __thumb2__ + str r14,[sp,#48+4] +#else + strd r12,r14,[sp,#48] @ R[1][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r10,r8,r6 + bic r11,r9,r7 + bic r12,r0,r8,ror#14 + bic r14,r1,r9,ror#14 + eor r10,r10,r4 + eor r11,r11,r5 +#ifndef __thumb2__ + str r10,[sp,#56] @ R[1][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r2,r2,r0,ror#32-(14-10) +#ifndef __thumb2__ + str r11,[sp,#56+4] +#else + strd r10,r11,[sp,#56] @ R[1][2] = C[2] ^ (~C[3] & C[4]); +#endif + eor r12,r6,r12,ror#32-14 + bic r11,r3,r1,ror#32-(14-10) +#ifndef __thumb2__ + str r12,[sp,#64] @ R[1][3] = C[3] ^ (~C[4] & C[0]); +#endif + eor r14,r7,r14,ror#32-14 +#ifndef __thumb2__ + str r14,[sp,#64+4] +#else + strd r12,r14,[sp,#64] @ R[1][3] = C[3] ^ (~C[4] & C[0]); +#endif + add r12,sp,#208 +#ifndef __thumb2__ + ldr r1,[sp,#248] @ A[0][1] +#endif + eor r10,r8,r2,ror#32-10 +#ifndef __thumb2__ + ldr r0,[sp,#248+4] +#else + ldrd r1,r0,[sp,#248] @ A[0][1] +#endif + eor r11,r9,r11,ror#32-10 +#ifndef __thumb2__ + str r10,[sp,#72] @ R[1][4] = C[4] ^ (~C[0] & C[1]); +#endif +#ifndef __thumb2__ + str r11,[sp,#72+4] +#else + strd r10,r11,[sp,#72] @ R[1][4] = C[4] ^ (~C[0] & C[1]); +#endif + + add r9,sp,#224 + ldmia r12,{r10,r11,r12,r14} @ D[1..2] +#ifndef __thumb2__ + ldr r2,[sp,#296] @ A[1][2] +#endif +#ifndef __thumb2__ + ldr r3,[sp,#296+4] +#else + ldrd r2,r3,[sp,#296] @ A[1][2] +#endif + ldmia r9,{r6,r7,r8,r9} @ D[3..4] + + eor r1,r1,r10 +#ifndef __thumb2__ + ldr r4,[sp,#344] @ A[2][3] +#endif + eor r0,r0,r11 +#ifndef __thumb2__ + ldr r5,[sp,#344+4] +#else + ldrd r4,r5,[sp,#344] @ A[2][3] +#endif + mov r0,r0,ror#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); + + eor r2,r2,r12 +#ifndef __thumb2__ + ldr r10,[sp,#392] @ A[3][4] +#endif + eor r3,r3,r14 +#ifndef __thumb2__ + ldr r11,[sp,#392+4] +#else + ldrd r10,r11,[sp,#392] @ A[3][4] +#endif + @ mov r2,r2,ror#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); +#ifndef __thumb2__ + ldr r12,[sp,#200] @ D[0] +#endif + @ mov r3,r3,ror#32-3 +#ifndef __thumb2__ + ldr r14,[sp,#200+4] +#else + ldrd r12,r14,[sp,#200] @ D[0] +#endif + + eor r4,r4,r6 + eor r5,r5,r7 + @ mov r5,r6,ror#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + @ mov r4,r7,ror#32-13 @ [track reverse order below] + + eor r10,r10,r8 +#ifndef __thumb2__ + ldr r8,[sp,#400] @ A[4][0] +#endif + eor r11,r11,r9 +#ifndef __thumb2__ + ldr r9,[sp,#400+4] +#else + ldrd r8,r9,[sp,#400] @ A[4][0] +#endif + mov r6,r10,ror#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); + mov r7,r11,ror#32-4 + + eor 
r12,r12,r8 + eor r14,r14,r9 + mov r8,r12,ror#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); + mov r9,r14,ror#32-9 + + bic r10,r5,r2,ror#13-3 + bic r11,r4,r3,ror#12-3 + bic r12,r6,r5,ror#32-13 + bic r14,r7,r4,ror#32-12 + eor r10,r0,r10,ror#32-13 + eor r11,r1,r11,ror#32-12 +#ifndef __thumb2__ + str r10,[sp,#80] @ R[2][0] = C[0] ^ (~C[1] & C[2]) +#endif + eor r12,r12,r2,ror#32-3 +#ifndef __thumb2__ + str r11,[sp,#80+4] +#else + strd r10,r11,[sp,#80] @ R[2][0] = C[0] ^ (~C[1] & C[2]) +#endif + eor r14,r14,r3,ror#32-3 +#ifndef __thumb2__ + str r12,[sp,#88] @ R[2][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r10,r8,r6 + bic r11,r9,r7 +#ifndef __thumb2__ + str r14,[sp,#88+4] +#else + strd r12,r14,[sp,#88] @ R[2][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r10,r10,r5,ror#32-13 + eor r11,r11,r4,ror#32-12 +#ifndef __thumb2__ + str r10,[sp,#96] @ R[2][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r12,r0,r8 +#ifndef __thumb2__ + str r11,[sp,#96+4] +#else + strd r10,r11,[sp,#96] @ R[2][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r14,r1,r9 + eor r12,r12,r6 + eor r14,r14,r7 +#ifndef __thumb2__ + str r12,[sp,#104] @ R[2][3] = C[3] ^ (~C[4] & C[0]); +#endif + bic r10,r2,r0,ror#3 +#ifndef __thumb2__ + str r14,[sp,#104+4] +#else + strd r12,r14,[sp,#104] @ R[2][3] = C[3] ^ (~C[4] & C[0]); +#endif + bic r11,r3,r1,ror#3 +#ifndef __thumb2__ + ldr r1,[sp,#272] @ A[0][4] [in reverse order] +#endif + eor r10,r8,r10,ror#32-3 +#ifndef __thumb2__ + ldr r0,[sp,#272+4] +#else + ldrd r1,r0,[sp,#272] @ A[0][4] [in reverse order] +#endif + eor r11,r9,r11,ror#32-3 +#ifndef __thumb2__ + str r10,[sp,#112] @ R[2][4] = C[4] ^ (~C[0] & C[1]); +#endif + add r9,sp,#208 +#ifndef __thumb2__ + str r11,[sp,#112+4] +#else + strd r10,r11,[sp,#112] @ R[2][4] = C[4] ^ (~C[0] & C[1]); +#endif + +#ifndef __thumb2__ + ldr r10,[sp,#232] @ D[4] +#endif +#ifndef __thumb2__ + ldr r11,[sp,#232+4] +#else + ldrd r10,r11,[sp,#232] @ D[4] +#endif +#ifndef __thumb2__ + ldr r12,[sp,#200] @ D[0] +#endif +#ifndef __thumb2__ + ldr r14,[sp,#200+4] +#else + ldrd r12,r14,[sp,#200] @ D[0] +#endif + + ldmia r9,{r6,r7,r8,r9} @ D[1..2] + + eor r1,r1,r10 +#ifndef __thumb2__ + ldr r2,[sp,#280] @ A[1][0] +#endif + eor r0,r0,r11 +#ifndef __thumb2__ + ldr r3,[sp,#280+4] +#else + ldrd r2,r3,[sp,#280] @ A[1][0] +#endif + @ mov r1,r10,ror#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); +#ifndef __thumb2__ + ldr r4,[sp,#328] @ A[2][1] +#endif + @ mov r0,r11,ror#32-14 @ [was loaded in reverse order] +#ifndef __thumb2__ + ldr r5,[sp,#328+4] +#else + ldrd r4,r5,[sp,#328] @ A[2][1] +#endif + + eor r2,r2,r12 +#ifndef __thumb2__ + ldr r10,[sp,#376] @ A[3][2] +#endif + eor r3,r3,r14 +#ifndef __thumb2__ + ldr r11,[sp,#376+4] +#else + ldrd r10,r11,[sp,#376] @ A[3][2] +#endif + @ mov r2,r2,ror#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); +#ifndef __thumb2__ + ldr r12,[sp,#224] @ D[3] +#endif + @ mov r3,r3,ror#32-18 +#ifndef __thumb2__ + ldr r14,[sp,#224+4] +#else + ldrd r12,r14,[sp,#224] @ D[3] +#endif + + eor r6,r6,r4 + eor r7,r7,r5 + mov r4,r6,ror#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); + mov r5,r7,ror#32-5 + + eor r10,r10,r8 +#ifndef __thumb2__ + ldr r8,[sp,#424] @ A[4][3] +#endif + eor r11,r11,r9 +#ifndef __thumb2__ + ldr r9,[sp,#424+4] +#else + ldrd r8,r9,[sp,#424] @ A[4][3] +#endif + mov r7,r10,ror#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); + mov r6,r11,ror#32-8 + + eor r12,r12,r8 + eor r14,r14,r9 + mov r8,r12,ror#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); + mov r9,r14,ror#32-28 + + bic r10,r4,r2,ror#32-18 + bic r11,r5,r3,ror#32-18 
+ eor r10,r10,r0,ror#32-14 + eor r11,r11,r1,ror#32-13 +#ifndef __thumb2__ + str r10,[sp,#120] @ R[3][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r12,r6,r4 +#ifndef __thumb2__ + str r11,[sp,#120+4] +#else + strd r10,r11,[sp,#120] @ R[3][0] = C[0] ^ (~C[1] & C[2]) +#endif + bic r14,r7,r5 + eor r12,r12,r2,ror#32-18 +#ifndef __thumb2__ + str r12,[sp,#128] @ R[3][1] = C[1] ^ (~C[2] & C[3]); +#endif + eor r14,r14,r3,ror#32-18 +#ifndef __thumb2__ + str r14,[sp,#128+4] +#else + strd r12,r14,[sp,#128] @ R[3][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r10,r8,r6 + bic r11,r9,r7 + bic r12,r0,r8,ror#14 + bic r14,r1,r9,ror#13 + eor r10,r10,r4 + eor r11,r11,r5 +#ifndef __thumb2__ + str r10,[sp,#136] @ R[3][2] = C[2] ^ (~C[3] & C[4]); +#endif + bic r2,r2,r0,ror#18-14 +#ifndef __thumb2__ + str r11,[sp,#136+4] +#else + strd r10,r11,[sp,#136] @ R[3][2] = C[2] ^ (~C[3] & C[4]); +#endif + eor r12,r6,r12,ror#32-14 + bic r11,r3,r1,ror#18-13 + eor r14,r7,r14,ror#32-13 +#ifndef __thumb2__ + str r12,[sp,#144] @ R[3][3] = C[3] ^ (~C[4] & C[0]); +#endif +#ifndef __thumb2__ + str r14,[sp,#144+4] +#else + strd r12,r14,[sp,#144] @ R[3][3] = C[3] ^ (~C[4] & C[0]); +#endif + add r14,sp,#216 +#ifndef __thumb2__ + ldr r0,[sp,#256] @ A[0][2] +#endif + eor r10,r8,r2,ror#32-18 +#ifndef __thumb2__ + ldr r1,[sp,#256+4] +#else + ldrd r0,r1,[sp,#256] @ A[0][2] +#endif + eor r11,r9,r11,ror#32-18 +#ifndef __thumb2__ + str r10,[sp,#152] @ R[3][4] = C[4] ^ (~C[0] & C[1]); +#endif +#ifndef __thumb2__ + str r11,[sp,#152+4] +#else + strd r10,r11,[sp,#152] @ R[3][4] = C[4] ^ (~C[0] & C[1]); +#endif + + ldmia r14,{r10,r11,r12,r14} @ D[2..3] +#ifndef __thumb2__ + ldr r2,[sp,#304] @ A[1][3] +#endif +#ifndef __thumb2__ + ldr r3,[sp,#304+4] +#else + ldrd r2,r3,[sp,#304] @ A[1][3] +#endif +#ifndef __thumb2__ + ldr r6,[sp,#232] @ D[4] +#endif +#ifndef __thumb2__ + ldr r7,[sp,#232+4] +#else + ldrd r6,r7,[sp,#232] @ D[4] +#endif + + eor r0,r0,r10 +#ifndef __thumb2__ + ldr r4,[sp,#352] @ A[2][4] +#endif + eor r1,r1,r11 +#ifndef __thumb2__ + ldr r5,[sp,#352+4] +#else + ldrd r4,r5,[sp,#352] @ A[2][4] +#endif + @ mov r0,r0,ror#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); +#ifndef __thumb2__ + ldr r8,[sp,#200] @ D[0] +#endif + @ mov r1,r1,ror#32-31 +#ifndef __thumb2__ + ldr r9,[sp,#200+4] +#else + ldrd r8,r9,[sp,#200] @ D[0] +#endif + + eor r12,r12,r2 +#ifndef __thumb2__ + ldr r10,[sp,#360] @ A[3][0] +#endif + eor r14,r14,r3 +#ifndef __thumb2__ + ldr r11,[sp,#360+4] +#else + ldrd r10,r11,[sp,#360] @ A[3][0] +#endif + mov r3,r12,ror#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); +#ifndef __thumb2__ + ldr r12,[sp,#208] @ D[1] +#endif + mov r2,r14,ror#32-28 +#ifndef __thumb2__ + ldr r14,[sp,#208+4] +#else + ldrd r12,r14,[sp,#208] @ D[1] +#endif + + eor r6,r6,r4 + eor r7,r7,r5 + mov r5,r6,ror#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); + mov r4,r7,ror#32-20 + + eor r10,r10,r8 +#ifndef __thumb2__ + ldr r8,[sp,#408] @ A[4][1] +#endif + eor r11,r11,r9 +#ifndef __thumb2__ + ldr r9,[sp,#408+4] +#else + ldrd r8,r9,[sp,#408] @ A[4][1] +#endif + mov r7,r10,ror#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); + mov r6,r11,ror#32-21 + + eor r8,r8,r12 + eor r9,r9,r14 + @ mov r8,r2,ror#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); + @ mov r9,r3,ror#32-1 + + bic r10,r4,r2 + bic r11,r5,r3 + eor r10,r10,r0,ror#32-31 +#ifndef __thumb2__ + str r10,[sp,#160] @ R[4][0] = C[0] ^ (~C[1] & C[2]) +#endif + eor r11,r11,r1,ror#32-31 +#ifndef __thumb2__ + str r11,[sp,#160+4] +#else + strd r10,r11,[sp,#160] @ R[4][0] = C[0] ^ (~C[1] & C[2]) +#endif 
+ bic r12,r6,r4 + bic r14,r7,r5 + eor r12,r12,r2 + eor r14,r14,r3 +#ifndef __thumb2__ + str r12,[sp,#168] @ R[4][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r10,r8,r6,ror#1 +#ifndef __thumb2__ + str r14,[sp,#168+4] +#else + strd r12,r14,[sp,#168] @ R[4][1] = C[1] ^ (~C[2] & C[3]); +#endif + bic r11,r9,r7,ror#1 + bic r12,r0,r8,ror#31-1 + bic r14,r1,r9,ror#31-1 + eor r4,r4,r10,ror#32-1 +#ifndef __thumb2__ + str r4,[sp,#176] @ R[4][2] = C[2] ^= (~C[3] & C[4]); +#endif + eor r5,r5,r11,ror#32-1 +#ifndef __thumb2__ + str r5,[sp,#176+4] +#else + strd r4,r5,[sp,#176] @ R[4][2] = C[2] ^= (~C[3] & C[4]); +#endif + eor r6,r6,r12,ror#32-31 + eor r7,r7,r14,ror#32-31 +#ifndef __thumb2__ + str r6,[sp,#184] @ R[4][3] = C[3] ^= (~C[4] & C[0]); +#endif + bic r10,r2,r0,ror#32-31 +#ifndef __thumb2__ + str r7,[sp,#184+4] +#else + strd r6,r7,[sp,#184] @ R[4][3] = C[3] ^= (~C[4] & C[0]); +#endif + bic r11,r3,r1,ror#32-31 + add r12,sp,#0 + eor r8,r10,r8,ror#32-1 + add r10,sp,#40 + eor r9,r11,r9,ror#32-1 +#ifndef __thumb2__ + str r8,[sp,#192] @ R[4][4] = C[4] ^= (~C[0] & C[1]); +#endif +#ifndef __thumb2__ + str r9,[sp,#192+4] +#else + strd r8,r9,[sp,#192] @ R[4][4] = C[4] ^= (~C[0] & C[1]); +#endif + blo .Lround2x + + ldr pc,[sp,#440] +.size KeccakF1600_int,.-KeccakF1600_int + +.type KeccakF1600, %function +.align 5 +KeccakF1600: + stmdb sp!,{r0,r4-r11,lr} + sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],... + + add r10,r0,#40 + add r11,sp,#40 + ldmia r0, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ copy A[5][5] to stack + stmia sp, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + add r12,sp,#0 + add r10,sp,#40 + stmia r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + + bl KeccakF1600_enter + + ldr r11, [sp,#440+16] @ restore pointer to A + ldmia sp, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ return A[5][5] + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + + add sp,sp,#440+20 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size KeccakF1600,.-KeccakF1600 +.globl SHA3_absorb +.type SHA3_absorb,%function +.align 5 +SHA3_absorb: + stmdb sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + sub sp,sp,#456+16 + + add r10,r0,#40 + @ mov r11,r1 + mov r12,r2 + mov r14,r3 + cmp r2,r3 + blo .Labsorb_abort + + add r11,sp,#0 + ldmia r0, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ copy A[5][5] to stack + stmia r11!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + + ldr r11,[sp,#476] @ restore r11 +#ifdef __thumb2__ + mov r9,#0x00ff00ff + mov r8,#0x0f0f0f0f + mov r7,#0x33333333 + mov r6,#0x55555555 +#else + mov r6,#0x11 @ compose constants + mov r8,#0x0f + mov r9,#0xff + orr 
r6,r6,r6,lsl#8 + orr r8,r8,r8,lsl#8 + orr r6,r6,r6,lsl#16 @ 0x11111111 + orr r9,r9,r9,lsl#16 @ 0x00ff00ff + orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f + orr r7,r6,r6,lsl#1 @ 0x33333333 + orr r6,r6,r6,lsl#2 @ 0x55555555 +#endif + str r9,[sp,#468] + str r8,[sp,#464] + str r7,[sp,#460] + str r6,[sp,#456] + b .Loop_absorb + +.align 4 +.Loop_absorb: + subs r0,r12,r14 + blo .Labsorbed + add r10,sp,#0 + str r0,[sp,#480] @ save len - bsz + +.align 4 +.Loop_block: + ldrb r0,[r11],#1 + ldrb r1,[r11],#1 + ldrb r2,[r11],#1 + ldrb r3,[r11],#1 + ldrb r4,[r11],#1 + orr r0,r0,r1,lsl#8 + ldrb r1,[r11],#1 + orr r0,r0,r2,lsl#16 + ldrb r2,[r11],#1 + orr r0,r0,r3,lsl#24 @ lo + ldrb r3,[r11],#1 + orr r1,r4,r1,lsl#8 + orr r1,r1,r2,lsl#16 + orr r1,r1,r3,lsl#24 @ hi + + and r2,r0,r6 @ &=0x55555555 + and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa + and r3,r1,r6 @ &=0x55555555 + and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa + orr r2,r2,r2,lsr#1 + orr r0,r0,r0,lsl#1 + orr r3,r3,r3,lsr#1 + orr r1,r1,r1,lsl#1 + and r2,r2,r7 @ &=0x33333333 + and r0,r0,r7,lsl#2 @ &=0xcccccccc + and r3,r3,r7 @ &=0x33333333 + and r1,r1,r7,lsl#2 @ &=0xcccccccc + orr r2,r2,r2,lsr#2 + orr r0,r0,r0,lsl#2 + orr r3,r3,r3,lsr#2 + orr r1,r1,r1,lsl#2 + and r2,r2,r8 @ &=0x0f0f0f0f + and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0 + and r3,r3,r8 @ &=0x0f0f0f0f + and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 + ldmia r10,{r4,r5} @ A_flat[i] + orr r2,r2,r2,lsr#4 + orr r0,r0,r0,lsl#4 + orr r3,r3,r3,lsr#4 + orr r1,r1,r1,lsl#4 + and r2,r2,r9 @ &=0x00ff00ff + and r0,r0,r9,lsl#8 @ &=0xff00ff00 + and r3,r3,r9 @ &=0x00ff00ff + and r1,r1,r9,lsl#8 @ &=0xff00ff00 + orr r2,r2,r2,lsr#8 + orr r0,r0,r0,lsl#8 + orr r3,r3,r3,lsr#8 + orr r1,r1,r1,lsl#8 + + mov r2,r2,lsl#16 + mov r1,r1,lsr#16 + eor r4,r4,r3,lsl#16 + eor r5,r5,r0,lsr#16 + eor r4,r4,r2,lsr#16 + eor r5,r5,r1,lsl#16 + stmia r10!,{r4,r5} @ A_flat[i++] ^= BitInterleave(inp[0..7]) + + subs r14,r14,#8 + bhi .Loop_block + + str r11,[sp,#476] + + bl KeccakF1600_int + + add r14,sp,#456 + ldmia r14,{r6,r7,r8,r9,r10,r11,r12,r14} @ restore constants and variables + b .Loop_absorb + +.align 4 +.Labsorbed: + add r11,sp,#40 + ldmia sp, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ return A[5][5] + ldmia r11!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r11!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r11!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + ldmia r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + stmia r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} + +.Labsorb_abort: + add sp,sp,#456+32 + mov r0,r12 @ return value + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +.size SHA3_absorb,.-SHA3_absorb +.globl SHA3_squeeze +.type SHA3_squeeze,%function +.align 5 +SHA3_squeeze: + stmdb sp!,{r0,r3-r10,lr} + + mov r10,r0 + mov r4,r1 + mov r5,r2 + mov r12,r3 + +#ifdef __thumb2__ + mov r9,#0x00ff00ff + mov r8,#0x0f0f0f0f + mov r7,#0x33333333 + mov r6,#0x55555555 +#else + mov r6,#0x11 @ compose constants + mov r8,#0x0f + mov r9,#0xff + orr r6,r6,r6,lsl#8 + orr r8,r8,r8,lsl#8 + orr r6,r6,r6,lsl#16 @ 0x11111111 + orr r9,r9,r9,lsl#16 @ 0x00ff00ff + orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f + orr r7,r6,r6,lsl#1 @ 0x33333333 + orr r6,r6,r6,lsl#2 @ 0x55555555 +#endif + stmdb sp!,{r6,r7,r8,r9} + + mov r14,r10 + b .Loop_squeeze + +.align 4 +.Loop_squeeze: + ldmia r10!,{r0,r1} @ A_flat[i++] + + mov r2,r0,lsl#16 + mov r3,r1,lsl#16 @ r3 = r1 << 16 + mov r2,r2,lsr#16 @ r2 = r0 & 0x0000ffff + mov r1,r1,lsr#16 + mov r0,r0,lsr#16 @ r0 = r0 >> 16 + mov r1,r1,lsl#16 @ r1 = r1 & 
0xffff0000 + + orr r2,r2,r2,lsl#8 + orr r3,r3,r3,lsr#8 + orr r0,r0,r0,lsl#8 + orr r1,r1,r1,lsr#8 + and r2,r2,r9 @ &=0x00ff00ff + and r3,r3,r9,lsl#8 @ &=0xff00ff00 + and r0,r0,r9 @ &=0x00ff00ff + and r1,r1,r9,lsl#8 @ &=0xff00ff00 + orr r2,r2,r2,lsl#4 + orr r3,r3,r3,lsr#4 + orr r0,r0,r0,lsl#4 + orr r1,r1,r1,lsr#4 + and r2,r2,r8 @ &=0x0f0f0f0f + and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0 + and r0,r0,r8 @ &=0x0f0f0f0f + and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 + orr r2,r2,r2,lsl#2 + orr r3,r3,r3,lsr#2 + orr r0,r0,r0,lsl#2 + orr r1,r1,r1,lsr#2 + and r2,r2,r7 @ &=0x33333333 + and r3,r3,r7,lsl#2 @ &=0xcccccccc + and r0,r0,r7 @ &=0x33333333 + and r1,r1,r7,lsl#2 @ &=0xcccccccc + orr r2,r2,r2,lsl#1 + orr r3,r3,r3,lsr#1 + orr r0,r0,r0,lsl#1 + orr r1,r1,r1,lsr#1 + and r2,r2,r6 @ &=0x55555555 + and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa + and r0,r0,r6 @ &=0x55555555 + and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa + + orr r2,r2,r3 + orr r0,r0,r1 + + cmp r5,#8 + blo .Lsqueeze_tail + mov r1,r2,lsr#8 + strb r2,[r4],#1 + mov r3,r2,lsr#16 + strb r1,[r4],#1 + mov r2,r2,lsr#24 + strb r3,[r4],#1 + strb r2,[r4],#1 + + mov r1,r0,lsr#8 + strb r0,[r4],#1 + mov r3,r0,lsr#16 + strb r1,[r4],#1 + mov r0,r0,lsr#24 + strb r3,[r4],#1 + strb r0,[r4],#1 + subs r5,r5,#8 + beq .Lsqueeze_done + + subs r12,r12,#8 @ bsz -= 8 + bhi .Loop_squeeze + + mov r0,r14 @ original r10 + + bl KeccakF1600 + + ldmia sp,{r6,r7,r8,r9,r10,r12} @ restore constants and variables + mov r14,r10 + b .Loop_squeeze + +.align 4 +.Lsqueeze_tail: + strb r2,[r4],#1 + mov r2,r2,lsr#8 + subs r5,r5,#1 + beq .Lsqueeze_done + strb r2,[r4],#1 + mov r2,r2,lsr#8 + subs r5,r5,#1 + beq .Lsqueeze_done + strb r2,[r4],#1 + mov r2,r2,lsr#8 + subs r5,r5,#1 + beq .Lsqueeze_done + strb r2,[r4],#1 + subs r5,r5,#1 + beq .Lsqueeze_done + + strb r0,[r4],#1 + mov r0,r0,lsr#8 + subs r5,r5,#1 + beq .Lsqueeze_done + strb r0,[r4],#1 + mov r0,r0,lsr#8 + subs r5,r5,#1 + beq .Lsqueeze_done + strb r0,[r4] + b .Lsqueeze_done + +.align 4 +.Lsqueeze_done: + add sp,sp,#24 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} +.size SHA3_squeeze,.-SHA3_squeeze +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type iotas64, %object +.align 5 +iotas64: +.quad 0x0000000000000001 +.quad 0x0000000000008082 +.quad 0x800000000000808a +.quad 0x8000000080008000 +.quad 0x000000000000808b +.quad 0x0000000080000001 +.quad 0x8000000080008081 +.quad 0x8000000000008009 +.quad 0x000000000000008a +.quad 0x0000000000000088 +.quad 0x0000000080008009 +.quad 0x000000008000000a +.quad 0x000000008000808b +.quad 0x800000000000008b +.quad 0x8000000000008089 +.quad 0x8000000000008003 +.quad 0x8000000000008002 +.quad 0x8000000000000080 +.quad 0x000000000000800a +.quad 0x800000008000000a +.quad 0x8000000080008081 +.quad 0x8000000000008080 +.quad 0x0000000080000001 +.quad 0x8000000080008008 +.size iotas64,.-iotas64 + +.type KeccakF1600_neon, %function +.align 5 +KeccakF1600_neon: + add r1, r0, #16 + adr r2, iotas64 + mov r3, #24 @ loop counter + b .Loop_neon + +.align 4 +.Loop_neon: + @ Theta + vst1.64 {q4}, [r0:64] @ offload A[0..1][4] + veor q13, q0, q5 @ A[0..1][0]^A[2..3][0] + vst1.64 {d18}, [r1:64] @ offload A[2][4] + veor q14, q1, q6 @ A[0..1][1]^A[2..3][1] + veor q15, q2, q7 @ A[0..1][2]^A[2..3][2] + veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0] + veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1] + veor q14, q3, q8 @ A[0..1][3]^A[2..3][3] + veor q4, q4, q9 @ A[0..1][4]^A[2..3][4] + veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2] + veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3] + veor d25, d8, d9 @ 
C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4] + veor q13, q13, q10 @ C[0..1]^=A[4][0..1] + veor q14, q15, q11 @ C[2..3]^=A[4][2..3] + veor d25, d25, d24 @ C[4]^=A[4][4] + + vadd.u64 q4, q13, q13 @ C[0..1]<<1 + vadd.u64 q15, q14, q14 @ C[2..3]<<1 + vadd.u64 d18, d25, d25 @ C[4]<<1 + vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1) + vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1) + vsri.u64 d18, d25, #63 @ ROL64(C[4],1) + veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1) + veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1) + veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1) + veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1) + + veor d0, d0, d25 @ A[0][0] ^= C[4] + veor d1, d1, d25 @ A[1][0] ^= C[4] + veor d10, d10, d25 @ A[2][0] ^= C[4] + veor d11, d11, d25 @ A[3][0] ^= C[4] + veor d20, d20, d25 @ A[4][0] ^= C[4] + + veor d2, d2, d26 @ A[0][1] ^= D[1] + veor d3, d3, d26 @ A[1][1] ^= D[1] + veor d12, d12, d26 @ A[2][1] ^= D[1] + veor d13, d13, d26 @ A[3][1] ^= D[1] + veor d21, d21, d26 @ A[4][1] ^= D[1] + vmov d26, d27 + + veor d6, d6, d28 @ A[0][3] ^= C[2] + veor d7, d7, d28 @ A[1][3] ^= C[2] + veor d16, d16, d28 @ A[2][3] ^= C[2] + veor d17, d17, d28 @ A[3][3] ^= C[2] + veor d23, d23, d28 @ A[4][3] ^= C[2] + vld1.64 {q4}, [r0:64] @ restore A[0..1][4] + vmov d28, d29 + + vld1.64 {d18}, [r1:64] @ restore A[2][4] + veor q2, q2, q13 @ A[0..1][2] ^= D[2] + veor q7, q7, q13 @ A[2..3][2] ^= D[2] + veor d22, d22, d27 @ A[4][2] ^= D[2] + + veor q4, q4, q14 @ A[0..1][4] ^= C[3] + veor q9, q9, q14 @ A[2..3][4] ^= C[3] + veor d24, d24, d29 @ A[4][4] ^= C[3] + + @ Rho + Pi + vmov d26, d2 @ C[1] = A[0][1] + vshl.u64 d2, d3, #44 + vmov d27, d4 @ C[2] = A[0][2] + vshl.u64 d4, d14, #43 + vmov d28, d6 @ C[3] = A[0][3] + vshl.u64 d6, d17, #21 + vmov d29, d8 @ C[4] = A[0][4] + vshl.u64 d8, d24, #14 + vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1]) + vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2]) + vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3]) + vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4]) + + vshl.u64 d3, d9, #20 + vshl.u64 d14, d16, #25 + vshl.u64 d17, d15, #15 + vshl.u64 d24, d21, #2 + vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4]) + vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3]) + vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2]) + vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1]) + + vshl.u64 d9, d22, #61 + @ vshl.u64 d16, d19, #8 + vshl.u64 d15, d12, #10 + vshl.u64 d21, d7, #55 + vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2]) + vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4]) + vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1]) + vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3]) + + vshl.u64 d22, d18, #39 + @ vshl.u64 d19, d23, #56 + vshl.u64 d12, d5, #6 + vshl.u64 d7, d13, #45 + vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4]) + vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3]) + vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2]) + vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1]) + + vshl.u64 d18, d20, #18 + vshl.u64 d23, d11, #41 + vshl.u64 d5, d10, #3 + vshl.u64 d13, d1, #36 + vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0]) + vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0]) + vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0]) + vsri.u64 d13, d1, 
#64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0]) + + vshl.u64 d1, d28, #28 + vshl.u64 d10, d26, #1 + vshl.u64 d11, d29, #27 + vshl.u64 d20, d27, #62 + vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3]) + vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1]) + vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4]) + vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2]) + + @ Chi + Iota + vbic q13, q2, q1 + vbic q14, q3, q2 + vbic q15, q4, q3 + veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2]) + veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3]) + veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4]) + vst1.64 {q13}, [r0:64] @ offload A[0..1][0] + vbic q13, q0, q4 + vbic q15, q1, q0 + vmov q1, q14 @ A[0..1][1] + veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0]) + veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1]) + + vbic q13, q7, q6 + vmov q0, q5 @ A[2..3][0] + vbic q14, q8, q7 + vmov q15, q6 @ A[2..3][1] + veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2]) + vbic q13, q9, q8 + veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3]) + vbic q14, q0, q9 + veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4]) + vbic q13, q15, q0 + veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0]) + vmov q14, q10 @ A[4][0..1] + veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1]) + + vld1.64 d25, [r2:64]! @ Iota[i++] + vbic d26, d22, d21 + vbic d27, d23, d22 + vld1.64 {q0}, [r0:64] @ restore A[0..1][0] + veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2]) + vbic d26, d24, d23 + veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3]) + vbic d27, d28, d24 + veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4]) + vbic d26, d29, d28 + veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0]) + veor d0, d0, d25 @ A[0][0] ^= Iota[i] + veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1]) + + subs r3, r3, #1 + bne .Loop_neon + +.word 0xe12fff1e +.size KeccakF1600_neon,.-KeccakF1600_neon + +.globl SHA3_absorb_neon +.type SHA3_absorb_neon, %function +.align 5 +SHA3_absorb_neon: + stmdb sp!, {r4,r5,r6,lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + mov r4, r1 @ inp + mov r5, r2 @ len + mov r6, r3 @ bsz + + vld1.32 {d0}, [r0:64]! @ A[0][0] + vld1.32 {d2}, [r0:64]! @ A[0][1] + vld1.32 {d4}, [r0:64]! @ A[0][2] + vld1.32 {d6}, [r0:64]! @ A[0][3] + vld1.32 {d8}, [r0:64]! @ A[0][4] + + vld1.32 {d1}, [r0:64]! @ A[1][0] + vld1.32 {d3}, [r0:64]! @ A[1][1] + vld1.32 {d5}, [r0:64]! @ A[1][2] + vld1.32 {d7}, [r0:64]! @ A[1][3] + vld1.32 {d9}, [r0:64]! @ A[1][4] + + vld1.32 {d10}, [r0:64]! @ A[2][0] + vld1.32 {d12}, [r0:64]! @ A[2][1] + vld1.32 {d14}, [r0:64]! @ A[2][2] + vld1.32 {d16}, [r0:64]! @ A[2][3] + vld1.32 {d18}, [r0:64]! @ A[2][4] + + vld1.32 {d11}, [r0:64]! @ A[3][0] + vld1.32 {d13}, [r0:64]! @ A[3][1] + vld1.32 {d15}, [r0:64]! @ A[3][2] + vld1.32 {d17}, [r0:64]! @ A[3][3] + vld1.32 {d19}, [r0:64]! @ A[3][4] + + vld1.32 {d20,d21,d22,d23}, [r0:64]! @ A[4][0..3] + vld1.32 {d24}, [r0:64] @ A[4][4] + sub r0, r0, #24*8 @ rewind + b .Loop_absorb_neon + +.align 4 +.Loop_absorb_neon: + subs r12, r5, r6 @ len - bsz + blo .Labsorbed_neon + mov r5, r12 + + vld1.8 {d31}, [r4]! @ endian-neutral loads... + cmp r6, #8*2 + veor d0, d0, d31 @ A[0][0] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d2, d2, d31 @ A[0][1] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*4 + veor d4, d4, d31 @ A[0][2] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! 
+ veor d6, d6, d31 @ A[0][3] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31},[r4]! + cmp r6, #8*6 + veor d8, d8, d31 @ A[0][4] ^= *inp++ + blo .Lprocess_neon + + vld1.8 {d31}, [r4]! + veor d1, d1, d31 @ A[1][0] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*8 + veor d3, d3, d31 @ A[1][1] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d5, d5, d31 @ A[1][2] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*10 + veor d7, d7, d31 @ A[1][3] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d9, d9, d31 @ A[1][4] ^= *inp++ + beq .Lprocess_neon + + vld1.8 {d31}, [r4]! + cmp r6, #8*12 + veor d10, d10, d31 @ A[2][0] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d12, d12, d31 @ A[2][1] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*14 + veor d14, d14, d31 @ A[2][2] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d16, d16, d31 @ A[2][3] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*16 + veor d18, d18, d31 @ A[2][4] ^= *inp++ + blo .Lprocess_neon + + vld1.8 {d31}, [r4]! + veor d11, d11, d31 @ A[3][0] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*18 + veor d13, d13, d31 @ A[3][1] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d15, d15, d31 @ A[3][2] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*20 + veor d17, d17, d31 @ A[3][3] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d19, d19, d31 @ A[3][4] ^= *inp++ + beq .Lprocess_neon + + vld1.8 {d31}, [r4]! + cmp r6, #8*22 + veor d20, d20, d31 @ A[4][0] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d21, d21, d31 @ A[4][1] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*24 + veor d22, d22, d31 @ A[4][2] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d23, d23, d31 @ A[4][3] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d24, d24, d31 @ A[4][4] ^= *inp++ + +.Lprocess_neon: + bl KeccakF1600_neon + b .Loop_absorb_neon + +.align 4 +.Labsorbed_neon: + vst1.32 {d0}, [r0:64]! @ A[0][0..4] + vst1.32 {d2}, [r0:64]! + vst1.32 {d4}, [r0:64]! + vst1.32 {d6}, [r0:64]! + vst1.32 {d8}, [r0:64]! + + vst1.32 {d1}, [r0:64]! @ A[1][0..4] + vst1.32 {d3}, [r0:64]! + vst1.32 {d5}, [r0:64]! + vst1.32 {d7}, [r0:64]! + vst1.32 {d9}, [r0:64]! + + vst1.32 {d10}, [r0:64]! @ A[2][0..4] + vst1.32 {d12}, [r0:64]! + vst1.32 {d14}, [r0:64]! + vst1.32 {d16}, [r0:64]! + vst1.32 {d18}, [r0:64]! + + vst1.32 {d11}, [r0:64]! @ A[3][0..4] + vst1.32 {d13}, [r0:64]! + vst1.32 {d15}, [r0:64]! + vst1.32 {d17}, [r0:64]! + vst1.32 {d19}, [r0:64]! + + vst1.32 {d20,d21,d22,d23}, [r0:64]! @ A[4][0..4] + vst1.32 {d24}, [r0:64] + + mov r0, r5 @ return value + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r4,r5,r6,pc} +.size SHA3_absorb_neon,.-SHA3_absorb_neon + +.globl SHA3_squeeze_neon +.type SHA3_squeeze_neon, %function +.align 5 +SHA3_squeeze_neon: + stmdb sp!, {r4,r5,r6,lr} + + mov r4, r1 @ out + mov r5, r2 @ len + mov r6, r3 @ bsz + mov r12, r0 @ A_flat + mov r14, r3 @ bsz + b .Loop_squeeze_neon + +.align 4 +.Loop_squeeze_neon: + cmp r5, #8 + blo .Lsqueeze_neon_tail + vld1.32 {d0}, [r12]! + vst1.8 {d0}, [r4]! @ endian-neutral store + + subs r5, r5, #8 @ len -= 8 + beq .Lsqueeze_neon_done + + subs r14, r14, #8 @ bsz -= 8 + bhi .Loop_squeeze_neon + + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + vld1.32 {d0}, [r0:64]! @ A[0][0..4] + vld1.32 {d2}, [r0:64]! + vld1.32 {d4}, [r0:64]! + vld1.32 {d6}, [r0:64]! + vld1.32 {d8}, [r0:64]! 
+ + vld1.32 {d1}, [r0:64]! @ A[1][0..4] + vld1.32 {d3}, [r0:64]! + vld1.32 {d5}, [r0:64]! + vld1.32 {d7}, [r0:64]! + vld1.32 {d9}, [r0:64]! + + vld1.32 {d10}, [r0:64]! @ A[2][0..4] + vld1.32 {d12}, [r0:64]! + vld1.32 {d14}, [r0:64]! + vld1.32 {d16}, [r0:64]! + vld1.32 {d18}, [r0:64]! + + vld1.32 {d11}, [r0:64]! @ A[3][0..4] + vld1.32 {d13}, [r0:64]! + vld1.32 {d15}, [r0:64]! + vld1.32 {d17}, [r0:64]! + vld1.32 {d19}, [r0:64]! + + vld1.32 {d20,d21,d22,d23}, [r0:64]! @ A[4][0..4] + vld1.32 {d24}, [r0:64] + sub r0, r0, #24*8 @ rewind + + bl KeccakF1600_neon + + mov r12, r0 @ A_flat + vst1.32 {d0}, [r0:64]! @ A[0][0..4] + vst1.32 {d2}, [r0:64]! + vst1.32 {d4}, [r0:64]! + vst1.32 {d6}, [r0:64]! + vst1.32 {d8}, [r0:64]! + + vst1.32 {d1}, [r0:64]! @ A[1][0..4] + vst1.32 {d3}, [r0:64]! + vst1.32 {d5}, [r0:64]! + vst1.32 {d7}, [r0:64]! + vst1.32 {d9}, [r0:64]! + + vst1.32 {d10}, [r0:64]! @ A[2][0..4] + vst1.32 {d12}, [r0:64]! + vst1.32 {d14}, [r0:64]! + vst1.32 {d16}, [r0:64]! + vst1.32 {d18}, [r0:64]! + + vst1.32 {d11}, [r0:64]! @ A[3][0..4] + vst1.32 {d13}, [r0:64]! + vst1.32 {d15}, [r0:64]! + vst1.32 {d17}, [r0:64]! + vst1.32 {d19}, [r0:64]! + + vst1.32 {d20,d21,d22,d23}, [r0:64]! @ A[4][0..4] + mov r14, r6 @ bsz + vst1.32 {d24}, [r0:64] + mov r0, r12 @ rewind + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + b .Loop_squeeze_neon + +.align 4 +.Lsqueeze_neon_tail: + ldmia r12, {r2,r3} + cmp r5, #2 + strb r2, [r4],#1 @ endian-neutral store + mov r2, r2, lsr#8 + blo .Lsqueeze_neon_done + strb r2, [r4], #1 + mov r2, r2, lsr#8 + beq .Lsqueeze_neon_done + strb r2, [r4], #1 + mov r2, r2, lsr#8 + cmp r5, #4 + blo .Lsqueeze_neon_done + strb r2, [r4], #1 + beq .Lsqueeze_neon_done + + strb r3, [r4], #1 + mov r3, r3, lsr#8 + cmp r5, #6 + blo .Lsqueeze_neon_done + strb r3, [r4], #1 + mov r3, r3, lsr#8 + beq .Lsqueeze_neon_done + strb r3, [r4], #1 + +.Lsqueeze_neon_done: + ldmia sp!, {r4,r5,r6,pc} +.size SHA3_squeeze_neon,.-SHA3_squeeze_neon +#endif +.byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 diff --git a/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha1-armv4-large.S b/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha1-armv4-large.S new file mode 100644 index 0000000000..185e432ec2 --- /dev/null +++ b/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha1-armv4-large.S @@ -0,0 +1,1491 @@ +#include "arm_arch.h" + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl sha1_block_data_order +.type sha1_block_data_order,%function + +.align 5 +sha1_block_data_order: +#if __ARM_MAX_ARCH__>=7 +.Lsha1_block: + adr r3,.Lsha1_block + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA1 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ 
E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp +#endif + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp @ preserve carry +#endif + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp +#endif + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha1_block_data_order,.-sha1_block_data_order + +.align 5 +.LK_00_19:.word 0x5a827999 +.LK_20_39:.word 0x6ed9eba1 +.LK_40_59:.word 0x8f1bbcdc +.LK_60_79:.word 0xca62c1d6 +#if __ARM_MAX_ARCH__>=7 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha1_block +#endif +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 5 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.type sha1_block_data_order_neon,%function +.align 4 +sha1_block_data_order_neon: +.LNEON: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + @ dmb @ errata #451034 on early Cortex A8 + @ vstmdb sp!,{d8-d15} @ ABI specification says so + mov r14,sp + sub r12,sp,#64 + adr r8,.LK_00_19 + bic r12,r12,#15 @ align for 128-bit stores + + ldmia r0,{r3,r4,r5,r6,r7} @ load context + mov sp,r12 @ alloca + + vld1.8 {q0,q1},[r1]! @ handles unaligned + veor q15,q15,q15 + vld1.8 {q2,q3},[r1]! + vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 + vrev32.8 q0,q0 @ yes, even on + vrev32.8 q1,q1 @ big-endian... + vrev32.8 q2,q2 + vadd.i32 q8,q0,q14 + vrev32.8 q3,q3 + vadd.i32 q9,q1,q14 + vst1.32 {q8},[r12,:128]! + vadd.i32 q10,q2,q14 + vst1.32 {q9},[r12,:128]! + vst1.32 {q10},[r12,:128]! + ldr r9,[sp] @ big RAW stall + +.Loop_neon: + vext.8 q8,q0,q1,#8 + bic r10,r6,r4 + add r7,r7,r9 + and r11,r5,r4 + vadd.i32 q13,q3,q14 + ldr r9,[sp,#4] + add r7,r7,r3,ror#27 + vext.8 q12,q3,q15,#4 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q8,q8,q0 + bic r10,r5,r3 + add r6,r6,r9 + veor q12,q12,q2 + and r11,r4,r3 + ldr r9,[sp,#8] + veor q12,q12,q8 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q13,q15,q12,#4 + bic r10,r4,r7 + add r5,r5,r9 + vadd.i32 q8,q12,q12 + and r11,r3,r7 + ldr r9,[sp,#12] + vsri.32 q8,q12,#31 + add r5,r5,r6,ror#27 + eor r11,r11,r10 + mov r7,r7,ror#2 + vshr.u32 q12,q13,#30 + add r5,r5,r11 + bic r10,r3,r6 + vshl.u32 q13,q13,#2 + add r4,r4,r9 + and r11,r7,r6 + veor q8,q8,q12 + ldr r9,[sp,#16] + add r4,r4,r5,ror#27 + veor q8,q8,q13 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q9,q1,q2,#8 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + vadd.i32 q13,q8,q14 + ldr r9,[sp,#20] + vld1.32 {d28[],d29[]},[r8,:32]! 
+ add r3,r3,r4,ror#27 + vext.8 q12,q8,q15,#4 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + veor q9,q9,q1 + bic r10,r6,r4 + add r7,r7,r9 + veor q12,q12,q3 + and r11,r5,r4 + ldr r9,[sp,#24] + veor q12,q12,q9 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q13,q15,q12,#4 + bic r10,r5,r3 + add r6,r6,r9 + vadd.i32 q9,q12,q12 + and r11,r4,r3 + ldr r9,[sp,#28] + vsri.32 q9,q12,#31 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + mov r3,r3,ror#2 + vshr.u32 q12,q13,#30 + add r6,r6,r11 + bic r10,r4,r7 + vshl.u32 q13,q13,#2 + add r5,r5,r9 + and r11,r3,r7 + veor q9,q9,q12 + ldr r9,[sp,#32] + add r5,r5,r6,ror#27 + veor q9,q9,q13 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q10,q2,q3,#8 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + vadd.i32 q13,q9,q14 + ldr r9,[sp,#36] + add r4,r4,r5,ror#27 + vext.8 q12,q9,q15,#4 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q10,q10,q2 + bic r10,r7,r5 + add r3,r3,r9 + veor q12,q12,q8 + and r11,r6,r5 + ldr r9,[sp,#40] + veor q12,q12,q10 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q13,q15,q12,#4 + bic r10,r6,r4 + add r7,r7,r9 + vadd.i32 q10,q12,q12 + and r11,r5,r4 + ldr r9,[sp,#44] + vsri.32 q10,q12,#31 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + mov r4,r4,ror#2 + vshr.u32 q12,q13,#30 + add r7,r7,r11 + bic r10,r5,r3 + vshl.u32 q13,q13,#2 + add r6,r6,r9 + and r11,r4,r3 + veor q10,q10,q12 + ldr r9,[sp,#48] + add r6,r6,r7,ror#27 + veor q10,q10,q13 + eor r11,r11,r10 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q11,q3,q8,#8 + bic r10,r4,r7 + add r5,r5,r9 + and r11,r3,r7 + vadd.i32 q13,q10,q14 + ldr r9,[sp,#52] + add r5,r5,r6,ror#27 + vext.8 q12,q10,q15,#4 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q11,q11,q3 + bic r10,r3,r6 + add r4,r4,r9 + veor q12,q12,q9 + and r11,r7,r6 + ldr r9,[sp,#56] + veor q12,q12,q11 + add r4,r4,r5,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q13,q15,q12,#4 + bic r10,r7,r5 + add r3,r3,r9 + vadd.i32 q11,q12,q12 + and r11,r6,r5 + ldr r9,[sp,#60] + vsri.32 q11,q12,#31 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + vshr.u32 q12,q13,#30 + add r3,r3,r11 + bic r10,r6,r4 + vshl.u32 q13,q13,#2 + add r7,r7,r9 + and r11,r5,r4 + veor q11,q11,q12 + ldr r9,[sp,#0] + add r7,r7,r3,ror#27 + veor q11,q11,q13 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q10,q11,#8 + bic r10,r5,r3 + add r6,r6,r9 + and r11,r4,r3 + veor q0,q0,q8 + ldr r9,[sp,#4] + add r6,r6,r7,ror#27 + veor q0,q0,q1 + eor r11,r11,r10 + mov r3,r3,ror#2 + vadd.i32 q13,q11,q14 + add r6,r6,r11 + bic r10,r4,r7 + veor q12,q12,q0 + add r5,r5,r9 + and r11,r3,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + eor r11,r11,r10 + mov r7,r7,ror#2 + vsli.32 q0,q12,#2 + add r5,r5,r11 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + ldr r9,[sp,#12] + add r4,r4,r5,ror#27 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + ldr r9,[sp,#16] + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q11,q0,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#20] + veor q1,q1,q9 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q1,q1,q2 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q0,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q1 + ldr r9,[sp,#24] + eor r11,r10,r4 + vshr.u32 q1,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q1,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#28] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#32] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q0,q1,#8 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#36] + veor q2,q2,q10 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + veor q2,q2,q3 + mov r5,r5,ror#2 + add r3,r3,r11 + vadd.i32 q13,q1,q14 + eor r10,r4,r6 + vld1.32 {d28[],d29[]},[r8,:32]! + add r7,r7,r9 + veor q12,q12,q2 + ldr r9,[sp,#40] + eor r11,r10,r5 + vshr.u32 q2,q12,#30 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + vst1.32 {q13},[r12,:128]! + add r7,r7,r11 + eor r10,r3,r5 + vsli.32 q2,q12,#2 + add r6,r6,r9 + ldr r9,[sp,#44] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#48] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q1,q2,#8 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r7 + add r4,r4,r5,ror#27 + veor q3,q3,q8 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q2,q14 + eor r10,r5,r7 + add r3,r3,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r6 + vshr.u32 q3,q12,#30 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vst1.32 {q13},[r12,:128]! + add r3,r3,r11 + eor r10,r4,r6 + vsli.32 q3,q12,#2 + add r7,r7,r9 + ldr r9,[sp,#60] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#0] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q2,q3,#8 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#4] + veor q8,q8,q0 + eor r11,r10,r3 + add r5,r5,r6,ror#27 + veor q8,q8,q9 + mov r7,r7,ror#2 + add r5,r5,r11 + vadd.i32 q13,q3,q14 + eor r10,r6,r3 + add r4,r4,r9 + veor q12,q12,q8 + ldr r9,[sp,#8] + eor r11,r10,r7 + vshr.u32 q8,q12,#30 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + add r4,r4,r11 + eor r10,r5,r7 + vsli.32 q8,q12,#2 + add r3,r3,r9 + ldr r9,[sp,#12] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#16] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q3,q8,#8 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#20] + veor q9,q9,q1 + eor r11,r10,r4 + add r6,r6,r7,ror#27 + veor q9,q9,q10 + mov r3,r3,ror#2 + add r6,r6,r11 + vadd.i32 q13,q8,q14 + eor r10,r7,r4 + add r5,r5,r9 + veor q12,q12,q9 + ldr r9,[sp,#24] + eor r11,r10,r3 + vshr.u32 q9,q12,#30 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + vst1.32 {q13},[r12,:128]! 
+ add r5,r5,r11 + eor r10,r6,r3 + vsli.32 q9,q12,#2 + add r4,r4,r9 + ldr r9,[sp,#28] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#32] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q8,q9,#8 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#36] + veor q10,q10,q2 + add r7,r7,r3,ror#27 + eor r11,r5,r6 + veor q10,q10,q11 + add r7,r7,r10 + and r11,r11,r4 + vadd.i32 q13,q9,q14 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q12,q12,q10 + add r6,r6,r9 + and r10,r4,r5 + vshr.u32 q10,q12,#30 + ldr r9,[sp,#40] + add r6,r6,r7,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r4,r5 + add r6,r6,r10 + vsli.32 q10,q12,#2 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#44] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#48] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q9,q10,#8 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#52] + veor q11,q11,q3 + add r3,r3,r4,ror#27 + eor r11,r6,r7 + veor q11,q11,q0 + add r3,r3,r10 + and r11,r11,r5 + vadd.i32 q13,q10,q14 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + veor q12,q12,q11 + add r7,r7,r9 + and r10,r5,r6 + vshr.u32 q11,q12,#30 + ldr r9,[sp,#56] + add r7,r7,r3,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r5,r6 + add r7,r7,r10 + vsli.32 q11,q12,#2 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#60] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#0] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q10,q11,#8 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#4] + veor q0,q0,q8 + add r4,r4,r5,ror#27 + eor r11,r7,r3 + veor q0,q0,q1 + add r4,r4,r10 + and r11,r11,r6 + vadd.i32 q13,q11,q14 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q12,q12,q0 + add r3,r3,r9 + and r10,r6,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r3,r3,r4,ror#27 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + eor r11,r6,r7 + add r3,r3,r10 + vsli.32 q0,q12,#2 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#12] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#16] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q11,q0,#8 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#20] + veor q1,q1,q9 + add r5,r5,r6,ror#27 + eor r11,r3,r4 + veor q1,q1,q2 + add r5,r5,r10 + and r11,r11,r7 + vadd.i32 q13,q0,q14 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q12,q12,q1 + add r4,r4,r9 + and r10,r7,r3 + vshr.u32 q1,q12,#30 + ldr r9,[sp,#24] + add r4,r4,r5,ror#27 + vst1.32 {q13},[r12,:128]! 
+ eor r11,r7,r3 + add r4,r4,r10 + vsli.32 q1,q12,#2 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#28] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#32] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q0,q1,#8 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#36] + veor q2,q2,q10 + add r6,r6,r7,ror#27 + eor r11,r4,r5 + veor q2,q2,q3 + add r6,r6,r10 + and r11,r11,r3 + vadd.i32 q13,q1,q14 + mov r3,r3,ror#2 + add r6,r6,r11 + veor q12,q12,q2 + add r5,r5,r9 + and r10,r3,r4 + vshr.u32 q2,q12,#30 + ldr r9,[sp,#40] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r3,r4 + add r5,r5,r10 + vsli.32 q2,q12,#2 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#44] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#48] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q1,q2,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q3,q3,q8 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q2,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r4 + vshr.u32 q3,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q3,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#60] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#0] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q3,q14 + eor r10,r5,r7 + add r3,r3,r9 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + teq r1,r2 + sub r8,r8,#16 + it eq + subeq r1,r1,#64 + vld1.8 {q0,q1},[r1]! + ldr r9,[sp,#4] + eor r11,r10,r6 + vld1.8 {q2,q3},[r1]! + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + eor r10,r4,r6 + vrev32.8 q0,q0 + add r7,r7,r9 + ldr r9,[sp,#8] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#12] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#16] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vrev32.8 q1,q1 + eor r10,r6,r3 + add r4,r4,r9 + vadd.i32 q8,q0,q14 + ldr r9,[sp,#20] + eor r11,r10,r7 + vst1.32 {q8},[r12,:128]! + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#24] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#28] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#32] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vrev32.8 q2,q2 + eor r10,r7,r4 + add r5,r5,r9 + vadd.i32 q9,q1,q14 + ldr r9,[sp,#36] + eor r11,r10,r3 + vst1.32 {q9},[r12,:128]! 
+ add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#40] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#44] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#48] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vrev32.8 q3,q3 + eor r10,r3,r5 + add r6,r6,r9 + vadd.i32 q10,q2,q14 + ldr r9,[sp,#52] + eor r11,r10,r4 + vst1.32 {q10},[r12,:128]! + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#56] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#60] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + ldmia r0,{r9,r10,r11,r12} @ accumulate context + add r3,r3,r9 + ldr r9,[r0,#16] + add r4,r4,r10 + add r5,r5,r11 + add r6,r6,r12 + it eq + moveq sp,r14 + add r7,r7,r9 + it ne + ldrne r9,[sp] + stmia r0,{r3,r4,r5,r6,r7} + itt ne + addne r12,sp,#3*16 + bne .Loop_neon + + @ vldmia sp!,{d8-d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +.size sha1_block_data_order_neon,.-sha1_block_data_order_neon +#endif +#if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + +.type sha1_block_data_order_armv8,%function +.align 5 +sha1_block_data_order_armv8: +.LARMv8: + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + + veor q1,q1,q1 + adr r3,.LK_00_19 + vld1.32 {q0},[r0]! + vld1.32 {d2[0]},[r0] + sub r0,r0,#16 + vld1.32 {d16[],d17[]},[r3,:32]! + vld1.32 {d18[],d19[]},[r3,:32]! + vld1.32 {d20[],d21[]},[r3,:32]! + vld1.32 {d22[],d23[]},[r3,:32] + +.Loop_v8: + vld1.8 {q4,q5},[r1]! + vld1.8 {q6,q7},[r1]! 
+ vrev32.8 q4,q4 + vrev32.8 q5,q5 + + vadd.i32 q12,q8,q4 + vrev32.8 q6,q6 + vmov q14,q0 @ offload + subs r2,r2,#1 + + vadd.i32 q13,q8,q5 + vrev32.8 q7,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 + INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 + vadd.i32 q12,q8,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 + vadd.i32 q13,q8,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 + vadd.i32 q12,q8,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 + vadd.i32 q13,q9,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 + vadd.i32 q12,q9,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q9,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q10,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q10,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 + vadd.i32 q13,q10,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 + vadd.i32 q13,q11,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q11,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q11,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + 
INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q7 + + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + + vadd.i32 q1,q1,q2 + vadd.i32 q0,q0,q14 + bne .Loop_v8 + + vst1.32 {q0},[r0]! + vst1.32 {d2[0]},[r0] + + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + bx lr @ bx lr +.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 +#endif +#if __ARM_MAX_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha256-armv4.S b/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha256-armv4.S new file mode 100644 index 0000000000..3efcde6b6e --- /dev/null +++ b/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha256-armv4.S @@ -0,0 +1,2813 @@ +@ Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
+ +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha256_block_data_order +#endif +.align 5 + +.globl sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +.Lsha256_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ sha256_block_data_order +#else + adr r3,.Lsha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r14,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 0<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 1<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 3<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 5<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 7<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 9<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 11<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 13<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 15<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +.Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 17<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 19<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 21<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 23<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 25<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 27<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 29<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 31<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#ifdef __thumb2__ + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 5 +.skip 16 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + + sub r11,sp,#16*4+16 + adr r14,K256 + bic r11,r11,#15 @ align for 128-bit stores + mov r12,sp + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! 
+ vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! + + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! 
+ add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + it eq + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + it ne + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! 
+ ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8,r9,r10,r11} + + ittte ne + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {q0,q1},[r0] + sub r3,r3,#256+32 + add r2,r1,r2,lsl#6 @ len to point at the end of inp + b .Loop_v8 + +.align 4 +.Loop_v8: + vld1.8 {q8,q9},[r1]! + vld1.8 {q10,q11},[r1]! + vld1.32 {q12},[r3]! + vrev32.8 q8,q8 + vrev32.8 q9,q9 + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vmov q14,q0 @ offload + vmov q15,q1 + teq r1,r2 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! 
+ vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vld1.32 {q13},[r3] + vadd.i32 q12,q12,q10 + sub r3,r3,#256-16 @ rewind + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vadd.i32 q13,q13,q11 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vadd.i32 q0,q0,q14 + vadd.i32 q1,q1,q15 + it ne + bne .Loop_v8 + + vst1.32 {q0,q1},[r0] + + bx lr @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha512-armv4.S b/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha512-armv4.S new file mode 100644 index 0000000000..1e2fbf6350 --- /dev/null +++ b/deps/openssl/config/archs/linux-armv4/asm_avx2/crypto/sha/sha512-armv4.S @@ -0,0 +1,1869 @@ +@ Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA512 block procedure for ARMv4. September 2007. + +@ This code is ~4.5 (four and a half) times faster than code generated +@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +@ Xscale PXA250 core]. +@ +@ July 2010. 
+@ +@ Rescheduling for dual-issue pipeline resulted in 6% improvement on +@ Cortex A8 core and ~40 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 7% +@ improvement on Coxtex A8 core and ~38 cycles per byte. + +@ March 2011. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process +@ one byte in 23.3 cycles or ~60% faster than integer-only code. + +@ August 2012. +@ +@ Improve NEON performance by 12% on Snapdragon S4. In absolute +@ terms it's 22.6 cycles per byte, which is disappointing result. +@ Technical writers asserted that 3-way S4 pipeline can sustain +@ multiple NEON instructions per cycle, but dual NEON issue could +@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +@ for further details. On side note Cortex-A15 processes one byte in +@ 16 cycles. + +@ Byte order [in]dependence. ========================================= +@ +@ Originally caller was expected to maintain specific *dword* order in +@ h[0-7], namely with most significant dword at *lower* address, which +@ was reflected in below two parameters as 0 and 4. Now caller is +@ expected to maintain native byte order for whole 64-bit values. +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + +.type K512,%object +.align 5 +K512: + WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) + WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) + WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) + WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) + WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) + WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) + WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) + WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) + WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) + WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) + WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) + WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) + WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) + WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) + WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) + WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) + WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) + WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) + WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) + WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) + WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) + WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) + WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) + WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) + WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) + WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) + WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) + WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) + WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) + WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) + WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) + 
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) + WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) + WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) + WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) + WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) + WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) + WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) + WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) + WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif + +.globl sha512_block_data_order +.type sha512_block_data_order,%function +sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ sha512_block_data_order +#else + adr r3,.Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add r2,r1,r2,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + sub r14,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +.Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr 
r12,[sp,#16+0] @ c.lo +#ifdef __thumb2__ + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq .L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 + adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo +#ifdef __thumb2__ + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov 
r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 +#ifdef __thumb2__ + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq .L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr r11, [r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order,.-sha512_block_data_order +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: +.LNEON: + dmb @ errata #451034 on early Cortex A8 + add r2,r1,r2,lsl#7 @ len to point at the end of inp + adr r3,K512 + VFP_ABI_PUSH + vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context +.Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 0>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 1>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 2>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 3>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 4>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 5>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 6>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 7>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 8>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 9>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 10>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 11>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 12>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 13>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 14>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 15>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + mov r12,#4 +.L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 17>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 19>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 21>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 23>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 25>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 27>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 29>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 31>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + bne .L16_79_neon + + vadd.i64 d16,d30 @ h+=Maj from the past + vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + bx lr @ .word 0xe12fff1e +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif |
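For readers tracing the generated sha512_block_data_order code above: it implements the standard SHA-512 round functions that its inline comments name (Sigma0/Sigma1, sigma0/sigma1, Ch and Maj), and on the non-NEON integer path each 64-bit rotation is expressed as shift/OR combinations over 32-bit lo/hi register halves. The C sketch below is a reader's aid only, not part of the generated file or of OpenSSL's API; the helper names (rotr64, Sigma0_lo) are hypothetical and simply mirror the rotation counts quoted in the assembly comments.

#include <stdint.h>

/* Hypothetical helpers mirroring the assembly comments above (reader's aid only). */
static inline uint64_t rotr64(uint64_t x, unsigned n) {
    return (x >> n) | (x << (64 - n));          /* n is 1..63 in all uses below */
}

static inline uint64_t Sigma0(uint64_t x) { return rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39); }
static inline uint64_t Sigma1(uint64_t x) { return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41); }
static inline uint64_t sigma0(uint64_t x) { return rotr64(x,  1) ^ rotr64(x,  8) ^ (x >> 7); }
static inline uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }
static inline uint64_t Ch (uint64_t e, uint64_t f, uint64_t g) { return (e & f) ^ (~e & g); }
static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return (a & b) ^ (a & c) ^ (b & c); }

/* Low 32 bits of Sigma0 computed from 32-bit halves, matching the
 * "LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25" comment in the integer path. */
static inline uint32_t Sigma0_lo(uint32_t lo, uint32_t hi) {
    return ((lo >> 28) | (hi << 4)) ^ ((hi >> 2) | (lo << 30)) ^ ((hi >> 7) | (lo << 25));
}

The NEON path computes the same functions with vshr.u64/vsli.64 pairs standing in for the 64-bit rotations and vbsl standing in for Ch/Maj.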