quickjs-tart

quickjs-based runtime for wallet-core logic

aesce.c (20394B)


      1 /*
       2  *  Armv8-A Cryptographic Extension support functions for AArch64
      3  *
      4  *  Copyright The Mbed TLS Contributors
      5  *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
      6  */
      7 
      8 #if defined(__clang__) &&  (__clang_major__ >= 4)
      9 
     10 /* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
     11  * but that is defined by build_info.h, and we need this block to happen first. */
     12 #if defined(__ARM_ARCH)
     13 #if __ARM_ARCH >= 8
     14 #define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
     15 #endif
     16 #endif
     17 
     18 #if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
      19 /* TODO: Re-consider the above after https://reviews.llvm.org/D131064 is merged.
      20  *
      21  * The intrinsic declarations are guarded by predefined ACLE macros in clang:
      22  * these are normally only enabled by the -march option on the command line.
      23  * By defining the macros ourselves we gain access to those declarations without
      24  * requiring -march on the command line.
     25  *
     26  * `arm_neon.h` is included by common.h, so we put these defines
     27  * at the top of this file, before any includes.
     28  */
     29 #define __ARM_FEATURE_CRYPTO 1
     30 /* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
     31  *
     32  * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
     33  * for older compilers.
     34  */
     35 #define __ARM_FEATURE_AES    1
     36 #define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
     37 #endif
     38 
     39 #endif /* defined(__clang__) &&  (__clang_major__ >= 4) */
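
/* A build-line sketch (assuming a GNU-style compiler driver; the exact flags
 * depend on the toolchain): the same intrinsic declarations become visible
 * when the target is selected on the command line instead, e.g.
 *
 *     cc -march=armv8-a+crypto -c aesce.c
 *
 * in which case the compiler typically predefines __ARM_FEATURE_CRYPTO (and,
 * on newer compilers, __ARM_FEATURE_AES) itself, so the macro block above is
 * skipped.
 */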
     40 
     41 #include <string.h>
     42 #include "common.h"
     43 
     44 #if defined(MBEDTLS_AESCE_C)
     45 
     46 #include "aesce.h"
     47 
     48 #if defined(MBEDTLS_AESCE_HAVE_CODE)
     49 
     50 /* Compiler version checks. */
     51 #if defined(__clang__)
     52 #   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
     53 #       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
     54 #   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
     55 #       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
     56 #   endif
     57 #elif defined(__GNUC__)
     58 #   if __GNUC__ < 6
     59 #       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
     60 #   endif
     61 #elif defined(_MSC_VER)
      62 /* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies that,
      63  *       please update this and the documentation of `MBEDTLS_AESCE_C` in
      64  *       `mbedtls_config.h`. */
     65 #   if _MSC_VER < 1929
     66 #       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
     67 #   endif
     68 #elif defined(__ARMCC_VERSION)
     69 #    if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
     70 /* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
      71  * If someone verifies that, please update this and the documentation of
     72  * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
     73 #         error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
     74 #    elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
     75 #         error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
     76 #    endif
     77 #endif
     78 
     79 #if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
     80     defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
     81 #   if defined(__ARMCOMPILER_VERSION)
     82 #       if __ARMCOMPILER_VERSION <= 6090000
     83 #           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
     84 #       else
     85 #           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
     86 #           define MBEDTLS_POP_TARGET_PRAGMA
     87 #       endif
     88 #   elif defined(__clang__)
     89 #       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
     90 #       define MBEDTLS_POP_TARGET_PRAGMA
     91 #   elif defined(__GNUC__)
     92 #       pragma GCC push_options
     93 #       pragma GCC target ("+crypto")
     94 #       define MBEDTLS_POP_TARGET_PRAGMA
     95 #   elif defined(_MSC_VER)
     96 #       error "Required feature(__ARM_FEATURE_AES) is not enabled."
     97 #   endif
     98 #endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
     99           MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */
    100 
    101 #if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
    102 
    103 #include <sys/auxv.h>
    104 #if !defined(HWCAP_NEON)
    105 #define HWCAP_NEON  (1 << 12)
    106 #endif
    107 #if !defined(HWCAP2_AES)
    108 #define HWCAP2_AES  (1 << 0)
    109 #endif
    110 #if !defined(HWCAP_AES)
    111 #define HWCAP_AES   (1 << 3)
    112 #endif
    113 #if !defined(HWCAP_ASIMD)
    114 #define HWCAP_ASIMD (1 << 1)
    115 #endif
    116 
    117 signed char mbedtls_aesce_has_support_result = -1;
    118 
    119 #if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
    120 /*
    121  * AES instruction support detection routine
    122  */
    123 int mbedtls_aesce_has_support_impl(void)
    124 {
    125     /* To avoid many calls to getauxval, cache the result. This is
     126      * thread-safe, because we store the result in a char, so it cannot
     127      * be vulnerable to non-atomic updates.
    128      * It is possible that we could end up setting result more than
    129      * once, but that is harmless.
    130      */
    131     if (mbedtls_aesce_has_support_result == -1) {
    132 #if defined(MBEDTLS_ARCH_IS_ARM32)
    133         unsigned long auxval  = getauxval(AT_HWCAP);
    134         unsigned long auxval2 = getauxval(AT_HWCAP2);
    135         if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
    136             ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
    137             mbedtls_aesce_has_support_result = 1;
    138         } else {
    139             mbedtls_aesce_has_support_result = 0;
    140         }
    141 #else
    142         unsigned long auxval = getauxval(AT_HWCAP);
    143         if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
    144             (HWCAP_ASIMD | HWCAP_AES)) {
    145             mbedtls_aesce_has_support_result = 1;
    146         } else {
    147             mbedtls_aesce_has_support_result = 0;
    148         }
    149 #endif
    150     }
    151     return mbedtls_aesce_has_support_result;
    152 }
    153 #endif
    154 
    155 #endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
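
/* A minimal usage sketch (hypothetical caller shown for illustration; the
 * real dispatch lives in the AES module, not in this file):
 *
 *     if (mbedtls_aesce_has_support_impl()) {
 *         // AES and NEON instructions are available: take the AESCE path.
 *     } else {
 *         // Fall back to the portable software implementation.
 *     }
 */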
    156 
    157 /* Single round of AESCE encryption */
    158 #define AESCE_ENCRYPT_ROUND                   \
    159     block = vaeseq_u8(block, vld1q_u8(keys)); \
    160     block = vaesmcq_u8(block);                \
    161     keys += 16
    162 /* Two rounds of AESCE encryption */
    163 #define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
    164 
    165 MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
    166 static uint8x16_t aesce_encrypt_block(uint8x16_t block,
    167                                       unsigned char *keys,
    168                                       int rounds)
    169 {
    170     /* 10, 12 or 14 rounds. Unroll loop. */
    171     if (rounds == 10) {
    172         goto rounds_10;
    173     }
    174     if (rounds == 12) {
    175         goto rounds_12;
    176     }
    177     AESCE_ENCRYPT_ROUND_X2;
    178 rounds_12:
    179     AESCE_ENCRYPT_ROUND_X2;
    180 rounds_10:
    181     AESCE_ENCRYPT_ROUND_X2;
    182     AESCE_ENCRYPT_ROUND_X2;
    183     AESCE_ENCRYPT_ROUND_X2;
    184     AESCE_ENCRYPT_ROUND_X2;
    185     AESCE_ENCRYPT_ROUND;
    186 
    187     /* AES AddRoundKey for the previous round.
    188      * SubBytes, ShiftRows for the final round.  */
    189     block = vaeseq_u8(block, vld1q_u8(keys));
    190     keys += 16;
    191 
    192     /* Final round: no MixColumns */
    193 
    194     /* Final AddRoundKey */
    195     block = veorq_u8(block, vld1q_u8(keys));
    196 
    197     return block;
    198 }
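
/* A worked example of the unrolling above, assuming AES-128 (rounds == 10):
 * execution jumps to rounds_10 and performs 4 x AESCE_ENCRYPT_ROUND_X2 plus
 * one AESCE_ENCRYPT_ROUND (9 full rounds, consuming round keys 0..8), then
 * the final vaeseq_u8 consumes round key 9 and the closing veorq_u8 consumes
 * round key 10: 11 round keys of 16 bytes each, i.e. Nb*(Nr+1) = 44 words. */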
    199 
    200 /* Single round of AESCE decryption
    201  *
     202  * AES AddRoundKey, inverse SubBytes, inverse ShiftRows
    203  *
    204  *      block = vaesdq_u8(block, vld1q_u8(keys));
    205  *
    206  * AES inverse MixColumns for the next round.
    207  *
    208  * This means that we switch the order of the inverse AddRoundKey and
    209  * inverse MixColumns operations. We have to do this as AddRoundKey is
    210  * done in an atomic instruction together with the inverses of SubBytes
    211  * and ShiftRows.
    212  *
    213  * It works because MixColumns is a linear operation over GF(2^8) and
    214  * AddRoundKey is an exclusive or, which is equivalent to addition over
    215  * GF(2^8). (The inverse of MixColumns needs to be applied to the
    216  * affected round keys separately which has been done when the
    217  * decryption round keys were calculated.)
    218  *
    219  *      block = vaesimcq_u8(block);
    220  */
    221 #define AESCE_DECRYPT_ROUND                   \
    222     block = vaesdq_u8(block, vld1q_u8(keys)); \
    223     block = vaesimcq_u8(block);               \
    224     keys += 16
    225 /* Two rounds of AESCE decryption */
    226 #define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
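
/* In equation form (a small note restating the comment above): for a state s
 * and round key k,
 *
 *     InvMixColumns(s ^ k) == InvMixColumns(s) ^ InvMixColumns(k)
 *
 * so applying InvMixColumns to the round keys once, when the decryption key
 * schedule is derived (see mbedtls_aesce_inverse_key below), lets each round
 * use the combined vaesdq_u8 + vaesimcq_u8 instruction pair. */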
    227 
    228 #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    229 static uint8x16_t aesce_decrypt_block(uint8x16_t block,
    230                                       unsigned char *keys,
    231                                       int rounds)
    232 {
    233     /* 10, 12 or 14 rounds. Unroll loop. */
    234     if (rounds == 10) {
    235         goto rounds_10;
    236     }
    237     if (rounds == 12) {
    238         goto rounds_12;
    239     }
    240     AESCE_DECRYPT_ROUND_X2;
    241 rounds_12:
    242     AESCE_DECRYPT_ROUND_X2;
    243 rounds_10:
    244     AESCE_DECRYPT_ROUND_X2;
    245     AESCE_DECRYPT_ROUND_X2;
    246     AESCE_DECRYPT_ROUND_X2;
    247     AESCE_DECRYPT_ROUND_X2;
    248     AESCE_DECRYPT_ROUND;
    249 
    250     /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
    251      * last full round. */
    252     block = vaesdq_u8(block, vld1q_u8(keys));
    253     keys += 16;
    254 
    255     /* Inverse AddRoundKey for inverting the initial round key addition. */
    256     block = veorq_u8(block, vld1q_u8(keys));
    257 
    258     return block;
    259 }
    260 #endif
    261 
    262 /*
    263  * AES-ECB block en(de)cryption
    264  */
    265 int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
    266                             int mode,
    267                             const unsigned char input[16],
    268                             unsigned char output[16])
    269 {
    270     uint8x16_t block = vld1q_u8(&input[0]);
    271     unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);
    272 
    273 #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    274     if (mode == MBEDTLS_AES_DECRYPT) {
    275         block = aesce_decrypt_block(block, keys, ctx->nr);
    276     } else
    277 #else
    278     (void) mode;
    279 #endif
    280     {
    281         block = aesce_encrypt_block(block, keys, ctx->nr);
    282     }
    283     vst1q_u8(&output[0], block);
    284 
    285     return 0;
    286 }
    287 
    288 /*
    289  * Compute decryption round keys from encryption round keys
    290  */
    291 #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    292 void mbedtls_aesce_inverse_key(unsigned char *invkey,
    293                                const unsigned char *fwdkey,
    294                                int nr)
    295 {
    296     int i, j;
    297     j = nr;
    298     vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    299     for (i = 1, j--; j > 0; i++, j--) {
    300         vst1q_u8(invkey + i * 16,
    301                  vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    302     }
    303     vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
    304 
    305 }
    306 #endif
    307 
    308 static inline uint32_t aes_rot_word(uint32_t word)
    309 {
    310     return (word << (32 - 8)) | (word >> 8);
    311 }
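
/* For example (illustrative values only): aes_rot_word(0xAABBCCDD) yields
 * 0xDDAABBCC, i.e. the 32-bit word is rotated right by 8 bits. */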
    312 
    313 static inline uint32_t aes_sub_word(uint32_t in)
    314 {
    315     uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    316     uint8x16_t zero = vdupq_n_u8(0);
    317 
    318     /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
    319      * the correct result as ShiftRows doesn't change the first row. */
    320     v = vaeseq_u8(zero, v);
    321     return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
    322 }
    323 
    324 /*
    325  * Key expansion function
    326  */
    327 static void aesce_setkey_enc(unsigned char *rk,
    328                              const unsigned char *key,
    329                              const size_t key_bit_length)
    330 {
    331     static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
    332                                     0x20, 0x40, 0x80, 0x1b, 0x36 };
    333     /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
    334      *   - Section 5, Nr = Nk + 6
    335      *   - Section 5.2, the length of round keys is Nb*(Nr+1)
    336      */
    337     const size_t key_len_in_words = key_bit_length / 32;    /* Nk */
    338     const size_t round_key_len_in_words = 4;                /* Nb */
    339     const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    340     const size_t round_keys_len_in_words =
    341         round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    342     const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;
    343 
    344     memcpy(rk, key, key_len_in_words * 4);
    345 
    346     for (uint32_t *rki = (uint32_t *) rk;
    347          rki + key_len_in_words < rko_end;
    348          rki += key_len_in_words) {
    349 
    350         size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
    351         uint32_t *rko;
    352         rko = rki + key_len_in_words;
    353         rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
    354         rko[0] ^= rcon[iteration] ^ rki[0];
    355         rko[1] = rko[0] ^ rki[1];
    356         rko[2] = rko[1] ^ rki[2];
    357         rko[3] = rko[2] ^ rki[3];
    358         if (rko + key_len_in_words > rko_end) {
     359             /* Do not write overflow words. */
    360             continue;
    361         }
    362 #if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
    363         switch (key_bit_length) {
    364             case 128:
    365                 break;
    366             case 192:
    367                 rko[4] = rko[3] ^ rki[4];
    368                 rko[5] = rko[4] ^ rki[5];
    369                 break;
    370             case 256:
    371                 rko[4] = aes_sub_word(rko[3]) ^ rki[4];
    372                 rko[5] = rko[4] ^ rki[5];
    373                 rko[6] = rko[5] ^ rki[6];
    374                 rko[7] = rko[6] ^ rki[7];
    375                 break;
    376         }
    377 #endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    378     }
    379 }
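
/* For reference, the FIPS-197 parameters the loop above instantiates
 * (each value follows from Nk = key bits / 32, Nr = Nk + 6, Nb = 4):
 *
 *     key bits | Nk | Nr | round-key words Nb*(Nr+1)
 *     ---------+----+----+--------------------------
 *          128 |  4 | 10 |  44
 *          192 |  6 | 12 |  52
 *          256 |  8 | 14 |  60
 */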
    380 
    381 /*
    382  * Key expansion, wrapper
    383  */
    384 int mbedtls_aesce_setkey_enc(unsigned char *rk,
    385                              const unsigned char *key,
    386                              size_t bits)
    387 {
    388     switch (bits) {
    389         case 128:
    390         case 192:
    391         case 256:
    392             aesce_setkey_enc(rk, key, bits);
    393             break;
    394         default:
    395             return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    396     }
    397 
    398     return 0;
    399 }
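
/* A minimal calling sketch (hypothetical; within the library these entry
 * points are normally reached through the AES module's dispatch rather than
 * called directly):
 *
 *     unsigned char rk[(14 + 1) * 16];      // room for any supported key size
 *     const unsigned char key[32] = { 0 };  // 256-bit key (placeholder value)
 *     if (mbedtls_aesce_setkey_enc(rk, key, 256) == 0) {
 *         // rk now holds the 15 expanded round keys for AES-256.
 *     }
 */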
    400 
    401 #if defined(MBEDTLS_GCM_C)
    402 
    403 #if defined(MBEDTLS_ARCH_IS_ARM32)
    404 
    405 #if defined(__clang__)
    406 /* On clang for A32/T32, work around some missing intrinsics and types which are listed in
    407  * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
    408  * These are only required for GCM.
    409  */
    410 #define vreinterpretq_u64_p64(a) ((uint64x2_t) a)
    411 
    412 typedef uint8x16_t poly128_t;
    413 
    414 static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
    415 {
    416     poly128_t r;
    417     asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    418     return r;
    419 }
    420 
    421 /* This is set to cause some more missing intrinsics to be defined below */
    422 #define COMMON_MISSING_INTRINSICS
    423 
    424 static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
    425 {
    426     return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
    427                      (poly64_t) (vget_high_u64((uint64x2_t) b)));
    428 }
    429 
    430 #endif /* defined(__clang__) */
    431 
    432 static inline uint8x16_t vrbitq_u8(uint8x16_t x)
    433 {
     434     /* There is no vrbitq_u8 instruction in A32/T32, so provide an equivalent
     435      * non-Neon implementation: for each of the four words, rbit reverses all
     436      * 32 bits and rev restores the byte order, reversing the bits in each byte. */
    437     asm ("ldm  %[p], { r2-r5 } \n\t"
    438          "rbit r2, r2          \n\t"
    439          "rev  r2, r2          \n\t"
    440          "rbit r3, r3          \n\t"
    441          "rev  r3, r3          \n\t"
    442          "rbit r4, r4          \n\t"
    443          "rev  r4, r4          \n\t"
    444          "rbit r5, r5          \n\t"
    445          "rev  r5, r5          \n\t"
    446          "stm  %[p], { r2-r5 } \n\t"
    447          :
    448          /* Output: 16 bytes of memory pointed to by &x */
    449          "+m" (*(uint8_t(*)[16]) &x)
    450          :
    451          [p] "r" (&x)
    452          :
    453          "r2", "r3", "r4", "r5"
    454          );
    455     return x;
    456 }
    457 
    458 #endif /* defined(MBEDTLS_ARCH_IS_ARM32) */
    459 
    460 #if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
    461 /* Some intrinsics are not available for GCC 5.X. */
    462 #define COMMON_MISSING_INTRINSICS
    463 #endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */
    464 
    465 
    466 #if defined(COMMON_MISSING_INTRINSICS)
    467 
     468 /* Missing intrinsics common to both GCC 5 and Clang on 32-bit Arm */
    469 
    470 #define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
    471 #define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
    472 
    473 static inline poly64x1_t vget_low_p64(poly64x2_t a)
    474 {
    475     uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    476     return (poly64x1_t) r;
    477 
    478 }
    479 
    480 #endif /* COMMON_MISSING_INTRINSICS */
    481 
    482 /* vmull_p64/vmull_high_p64 wrappers.
    483  *
    484  * Older compilers miss some intrinsic functions for `poly*_t`. We use
    485  * uint8x16_t and uint8x16x3_t as input/output parameters.
    486  */
    487 #if defined(MBEDTLS_COMPILER_IS_GCC)
     488 /* GCC reports an incompatible-type error without the cast: GCC treats
     489  * poly64_t and poly64x1_t as distinct types, unlike MSVC and Clang. */
    490 #define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
    491 #else
     492 /* MSVC reports `error C2440: 'type cast'` when the cast is present, and Clang
     493  * accepts the code with or without it (it treats poly64_t and poly64x1_t as
     494  * the same type), so no cast is used here. */
    495 #define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
    496 #endif /* MBEDTLS_COMPILER_IS_GCC */
    497 
    498 static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
    499 {
    500 
    501     return vreinterpretq_u8_p128(
    502         MBEDTLS_VMULL_P64(
    503             (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
    504             (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
    505             ));
    506 }
    507 
    508 static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
    509 {
    510     return vreinterpretq_u8_p128(
    511         vmull_high_p64(vreinterpretq_p64_u8(a),
    512                        vreinterpretq_p64_u8(b)));
    513 }
    514 
     515 /* GHASH performs 128-bit polynomial multiplication of blocks in GF(2^128),
     516  * defined by `x^128 + x^7 + x^2 + x + 1`.
     517  *
     518  * Arm64 only has 64b->128b polynomial multipliers, so we need four 64-bit
     519  * multiplies to generate one 128b x 128b product.
     520  *
     521  * `poly_mult_128` executes the polynomial multiplication and outputs a 256-bit
     522  * result represented by three 128-bit values (a layout chosen for code size).
    523  *
    524  * Output layout:
    525  * |            |             |             |
    526  * |------------|-------------|-------------|
    527  * | ret.val[0] | h3:h2:00:00 | high   128b |
    528  * | ret.val[1] |   :m2:m1:00 | middle 128b |
    529  * | ret.val[2] |   :  :l1:l0 | low    128b |
    530  */
    531 static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
    532 {
    533     uint8x16x3_t ret;
    534     uint8x16_t h, m, l; /* retval high/middle/low */
    535     uint8x16_t c, d, e;
    536 
    537     h = pmull_high(a, b);                       /* h3:h2:00:00 = a1*b1 */
    538     l = pmull_low(a, b);                        /*   :  :l1:l0 = a0*b0 */
    539     c = vextq_u8(b, b, 8);                      /*      :c1:c0 = b0:b1 */
    540     d = pmull_high(a, c);                       /*   :d2:d1:00 = a1*b0 */
    541     e = pmull_low(a, c);                        /*   :e2:e1:00 = a0*b1 */
    542     m = veorq_u8(d, e);                         /*   :m2:m1:00 = d + e */
    543 
    544     ret.val[0] = h;
    545     ret.val[1] = m;
    546     ret.val[2] = l;
    547     return ret;
    548 }
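
/* In polynomial terms (a short recap of the steps above): writing
 * a = a1*x^64 + a0 and b = b1*x^64 + b0, the code computes
 *
 *     a*b = (a1*b1)*x^128 + (a1*b0 + a0*b1)*x^64 + (a0*b0)
 *         =  h (high)     +  m (middle)          +  l (low)
 *
 * where the halves of b are swapped with vextq_u8 so that the two cross
 * products can be formed with the same pmull_high/pmull_low helpers. */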
    549 
    550 /*
    551  * Modulo reduction.
    552  *
    553  * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
    554  *
    555  * Section 4.3
    556  *
     557  * Modular reduction is slightly more complex. Write the GCM modulus as
     558  * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach
     559  * is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the
     560  * 256-bit operand to be reduced as a(z) = h(z)*z^128 + l(z) ≡ h(z)*r(z) + l(z).
     561  * That is, we simply multiply the higher part of the operand by r(z) and add it
     562  * to l(z). If the result is still larger than 128 bits, we reduce again.
    563  */
    564 static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
    565 {
    566     uint8x16_t const ZERO = vdupq_n_u8(0);
    567 
    568     uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
    569 #if defined(__GNUC__)
     570     /* Use 'asm' as an optimisation barrier to prevent the compiler from loading
     571      * MODULO from memory. This applies to GNUC-compatible compilers.
     572      */
    573     asm volatile ("" : "+w" (r));
    574 #endif
    575     uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    576     uint8x16_t h, m, l; /* input high/middle/low 128b */
    577     uint8x16_t c, d, e, f, g, n, o;
    578     h = input.val[0];            /* h3:h2:00:00                          */
    579     m = input.val[1];            /*   :m2:m1:00                          */
    580     l = input.val[2];            /*   :  :l1:l0                          */
    581     c = pmull_high(h, MODULO);   /*   :c2:c1:00 = reduction of h3        */
    582     d = pmull_low(h, MODULO);    /*   :  :d1:d0 = reduction of h2        */
    583     e = veorq_u8(c, m);          /*   :e2:e1:00 = m2:m1:00 + c2:c1:00    */
    584     f = pmull_high(e, MODULO);   /*   :  :f1:f0 = reduction of e2        */
    585     g = vextq_u8(ZERO, e, 8);    /*   :  :g1:00 = e1:00                  */
    586     n = veorq_u8(d, l);          /*   :  :n1:n0 = d1:d0 + l1:l0          */
    587     o = veorq_u8(n, f);          /*       o1:o0 = f1:f0 + n1:n0          */
    588     return veorq_u8(o, g);       /*             = o1:o0 + g1:00          */
    589 }
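
/* A note on the constant above (restating the cited reference): r(z) =
 * z^7 + z^2 + z + 1 has the bit pattern 0x87, so MODULO holds r(z) in each
 * 64-bit lane. Two reduction passes suffice: the high 128 bits h are reduced
 * first (c, d), and the single remaining overflow word e2 is reduced again
 * (f), bringing the 256-bit product back into 128 bits. */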
    590 
    591 /*
    592  * GCM multiplication: c = a times b in GF(2^128)
    593  */
    594 void mbedtls_aesce_gcm_mult(unsigned char c[16],
    595                             const unsigned char a[16],
    596                             const unsigned char b[16])
    597 {
    598     uint8x16_t va, vb, vc;
    599     va = vrbitq_u8(vld1q_u8(&a[0]));
    600     vb = vrbitq_u8(vld1q_u8(&b[0]));
    601     vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    602     vst1q_u8(&c[0], vc);
    603 }
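
/* A note on the vrbitq_u8 calls above: GHASH represents field elements with
 * the bits of each byte in reflected order relative to the usual polynomial
 * convention, so the inputs are bit-reversed within each byte before the
 * carry-less multiplication and the result is reversed back before storing. */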
    604 
    605 #endif /* MBEDTLS_GCM_C */
    606 
    607 #if defined(MBEDTLS_POP_TARGET_PRAGMA)
    608 #if defined(__clang__)
    609 #pragma clang attribute pop
    610 #elif defined(__GNUC__)
    611 #pragma GCC pop_options
    612 #endif
    613 #undef MBEDTLS_POP_TARGET_PRAGMA
    614 #endif
    615 
    616 #endif /* MBEDTLS_AESCE_HAVE_CODE */
    617 
    618 #endif /* MBEDTLS_AESCE_C */