quickjs-tart

quickjs-based runtime for wallet-core logic
Log | Files | Refs | README | LICENSE

libunicode.c (56291B)


      1 /*
      2  * Unicode utilities
      3  *
      4  * Copyright (c) 2017-2018 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 #include <stdlib.h>
     25 #include <stdio.h>
     26 #include <stdarg.h>
     27 #include <string.h>
     28 #include <assert.h>
     29 
     30 #include "cutils.h"
     31 #include "libunicode.h"
     32 #include "libunicode-table.h"
     33 
     34 enum {
     35     RUN_TYPE_U,
     36     RUN_TYPE_L,
     37     RUN_TYPE_UF,
     38     RUN_TYPE_LF,
     39     RUN_TYPE_UL,
     40     RUN_TYPE_LSU,
     41     RUN_TYPE_U2L_399_EXT2,
     42     RUN_TYPE_UF_D20,
     43     RUN_TYPE_UF_D1_EXT,
     44     RUN_TYPE_U_EXT,
     45     RUN_TYPE_LF_EXT,
     46     RUN_TYPE_UF_EXT2,
     47     RUN_TYPE_LF_EXT2,
     48     RUN_TYPE_UF_EXT3,
     49 };
     50 
     51 static int lre_case_conv1(uint32_t c, int conv_type)
     52 {
     53     uint32_t res[LRE_CC_RES_LEN_MAX];
     54     lre_case_conv(res, c, conv_type);
     55     return res[0];
     56 }
     57 
     58 /* case conversion using the table entry 'idx' with value 'v' */
     59 static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
     60 {
     61     uint32_t code, data, type, a, is_lower;
     62     is_lower = (conv_type != 0);
     63     type = (v >> (32 - 17 - 7 - 4)) & 0xf;
     64     data = ((v & 0xf) << 8) | case_conv_table2[idx];
     65     code = v >> (32 - 17);
     66     switch(type) {
     67     case RUN_TYPE_U:
     68     case RUN_TYPE_L:
     69     case RUN_TYPE_UF:
     70     case RUN_TYPE_LF:
     71         if (conv_type == (type & 1) ||
     72             (type >= RUN_TYPE_UF && conv_type == 2)) {
     73             c = c - code + (case_conv_table1[data] >> (32 - 17));
     74         }
     75         break;
     76     case RUN_TYPE_UL:
     77         a = c - code;
     78         if ((a & 1) != (1 - is_lower))
     79             break;
     80         c = (a ^ 1) + code;
     81         break;
     82     case RUN_TYPE_LSU:
     83         a = c - code;
     84         if (a == 1) {
     85             c += 2 * is_lower - 1;
     86         } else if (a == (1 - is_lower) * 2) {
     87             c += (2 * is_lower - 1) * 2;
     88         }
     89         break;
     90     case RUN_TYPE_U2L_399_EXT2:
     91         if (!is_lower) {
     92             res[0] = c - code + case_conv_ext[data >> 6];
     93             res[1] = 0x399;
     94             return 2;
     95         } else {
     96             c = c - code + case_conv_ext[data & 0x3f];
     97         }
     98         break;
     99     case RUN_TYPE_UF_D20:
    100         if (conv_type == 1)
    101             break;
    102         c = data + (conv_type == 2) * 0x20;
    103         break;
    104     case RUN_TYPE_UF_D1_EXT:
    105         if (conv_type == 1)
    106             break;
    107         c = case_conv_ext[data] + (conv_type == 2);
    108         break;
    109     case RUN_TYPE_U_EXT:
    110     case RUN_TYPE_LF_EXT:
    111         if (is_lower != (type - RUN_TYPE_U_EXT))
    112             break;
    113         c = case_conv_ext[data];
    114         break;
    115     case RUN_TYPE_LF_EXT2:
    116         if (!is_lower)
    117             break;
    118         res[0] = c - code + case_conv_ext[data >> 6];
    119         res[1] = case_conv_ext[data & 0x3f];
    120         return 2;
    121     case RUN_TYPE_UF_EXT2:
    122         if (conv_type == 1)
    123             break;
    124         res[0] = c - code + case_conv_ext[data >> 6];
    125         res[1] = case_conv_ext[data & 0x3f];
    126         if (conv_type == 2) {
    127             /* convert to lower */
    128             res[0] = lre_case_conv1(res[0], 1);
    129             res[1] = lre_case_conv1(res[1], 1);
    130         }
    131         return 2;
    132     default:
    133     case RUN_TYPE_UF_EXT3:
    134         if (conv_type == 1)
    135             break;
    136         res[0] = case_conv_ext[data >> 8];
    137         res[1] = case_conv_ext[(data >> 4) & 0xf];
    138         res[2] = case_conv_ext[data & 0xf];
    139         if (conv_type == 2) {
    140             /* convert to lower */
    141             res[0] = lre_case_conv1(res[0], 1);
    142             res[1] = lre_case_conv1(res[1], 1);
    143             res[2] = lre_case_conv1(res[2], 1);
    144         }
    145         return 3;
    146     }
    147     res[0] = c;
    148     return 1;
    149 }
    150 
    151 /* conv_type:
    152    0 = to upper
    153    1 = to lower
    154    2 = case folding (= to lower with modifications)
    155 */
    156 int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
    157 {
    158     if (c < 128) {
    159         if (conv_type) {
    160             if (c >= 'A' && c <= 'Z') {
    161                 c = c - 'A' + 'a';
    162             }
    163         } else {
    164             if (c >= 'a' && c <= 'z') {
    165                 c = c - 'a' + 'A';
    166             }
    167         }
    168     } else {
    169         uint32_t v, code, len;
    170         int idx, idx_min, idx_max;
    171 
    172         idx_min = 0;
    173         idx_max = countof(case_conv_table1) - 1;
    174         while (idx_min <= idx_max) {
    175             idx = (unsigned)(idx_max + idx_min) / 2;
    176             v = case_conv_table1[idx];
    177             code = v >> (32 - 17);
    178             len = (v >> (32 - 17 - 7)) & 0x7f;
    179             if (c < code) {
    180                 idx_max = idx - 1;
    181             } else if (c >= code + len) {
    182                 idx_min = idx + 1;
    183             } else {
    184                 return lre_case_conv_entry(res, c, conv_type, idx, v);
    185             }
    186         }
    187     }
    188     res[0] = c;
    189     return 1;
    190 }
    191 
    192 static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
    193 {
    194     uint32_t res[LRE_CC_RES_LEN_MAX];
    195     int len;
    196 
    197     if (is_unicode) {
    198         len = lre_case_conv_entry(res, c, 2, idx, v);
    199         if (len == 1) {
    200             c = res[0];
    201         } else {
    202             /* handle the few specific multi-character cases (see
    203                unicode_gen.c:dump_case_folding_special_cases()) */
    204             if (c == 0xfb06) {
    205                 c = 0xfb05;
    206             } else if (c == 0x01fd3) {
    207                 c = 0x390;
    208             } else if (c == 0x01fe3) {
    209                 c = 0x3b0;
    210             }
    211         }
    212     } else {
    213         if (likely(c < 128)) {
    214             if (c >= 'a' && c <= 'z')
    215                 c = c - 'a' + 'A';
    216         } else {
    217             /* legacy regexp: to upper case if single char >= 128 */
    218             len = lre_case_conv_entry(res, c, FALSE, idx, v);
    219             if (len == 1 && res[0] >= 128)
    220                 c = res[0];
    221         }
    222     }
    223     return c;
    224 }
    225 
    226 /* JS regexp specific rules for case folding */
    227 int lre_canonicalize(uint32_t c, BOOL is_unicode)
    228 {
    229     if (c < 128) {
    230         /* fast case */
    231         if (is_unicode) {
    232             if (c >= 'A' && c <= 'Z') {
    233                 c = c - 'A' + 'a';
    234             }
    235         } else {
    236             if (c >= 'a' && c <= 'z') {
    237                 c = c - 'a' + 'A';
    238             }
    239         }
    240     } else {
    241         uint32_t v, code, len;
    242         int idx, idx_min, idx_max;
    243 
    244         idx_min = 0;
    245         idx_max = countof(case_conv_table1) - 1;
    246         while (idx_min <= idx_max) {
    247             idx = (unsigned)(idx_max + idx_min) / 2;
    248             v = case_conv_table1[idx];
    249             code = v >> (32 - 17);
    250             len = (v >> (32 - 17 - 7)) & 0x7f;
    251             if (c < code) {
    252                 idx_max = idx - 1;
    253             } else if (c >= code + len) {
    254                 idx_min = idx + 1;
    255             } else {
    256                 return lre_case_folding_entry(c, idx, v, is_unicode);
    257             }
    258         }
    259     }
    260     return c;
    261 }
    262 
    263 static uint32_t get_le24(const uint8_t *ptr)
    264 {
    265     return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
    266 }
    267 
    268 #define UNICODE_INDEX_BLOCK_LEN 32
    269 
    270 /* return -1 if not in table, otherwise the offset in the block */
    271 static int get_index_pos(uint32_t *pcode, uint32_t c,
    272                          const uint8_t *index_table, int index_table_len)
    273 {
    274     uint32_t code, v;
    275     int idx_min, idx_max, idx;
    276 
    277     idx_min = 0;
    278     v = get_le24(index_table);
    279     code = v & ((1 << 21) - 1);
    280     if (c < code) {
    281         *pcode = 0;
    282         return 0;
    283     }
    284     idx_max = index_table_len - 1;
    285     code = get_le24(index_table + idx_max * 3);
    286     if (c >= code)
    287         return -1;
    288     /* invariant: tab[idx_min] <= c < tab2[idx_max] */
    289     while ((idx_max - idx_min) > 1) {
    290         idx = (idx_max + idx_min) / 2;
    291         v = get_le24(index_table + idx * 3);
    292         code = v & ((1 << 21) - 1);
    293         if (c < code) {
    294             idx_max = idx;
    295         } else {
    296             idx_min = idx;
    297         }
    298     }
    299     v = get_le24(index_table + idx_min * 3);
    300     *pcode = v & ((1 << 21) - 1);
    301     return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
    302 }
    303 
    304 static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
    305                             const uint8_t *index_table, int index_table_len)
    306 {
    307     uint32_t code, b, bit;
    308     int pos;
    309     const uint8_t *p;
    310 
    311     pos = get_index_pos(&code, c, index_table, index_table_len);
    312     if (pos < 0)
    313         return FALSE; /* outside the table */
    314     p = table + pos;
    315     bit = 0;
    316     /* Compressed run length encoding:
    317        00..3F: 2 packed lengths: 3-bit + 3-bit
    318        40..5F: 5-bits plus extra byte for length
    319        60..7F: 5-bits plus 2 extra bytes for length
    320        80..FF: 7-bit length
    321        lengths must be incremented to get character count
    322        Ranges alternate between false and true return value.
    323      */
    324     for(;;) {
    325         b = *p++;
    326         if (b < 64) {
    327             code += (b >> 3) + 1;
    328             if (c < code)
    329                 return bit;
    330             bit ^= 1;
    331             code += (b & 7) + 1;
    332         } else if (b >= 0x80) {
    333             code += b - 0x80 + 1;
    334         } else if (b < 0x60) {
    335             code += (((b - 0x40) << 8) | p[0]) + 1;
    336             p++;
    337         } else {
    338             code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
    339             p += 2;
    340         }
    341         if (c < code)
    342             return bit;
    343         bit ^= 1;
    344     }
    345 }
    346 
    347 BOOL lre_is_cased(uint32_t c)
    348 {
    349     uint32_t v, code, len;
    350     int idx, idx_min, idx_max;
    351 
    352     idx_min = 0;
    353     idx_max = countof(case_conv_table1) - 1;
    354     while (idx_min <= idx_max) {
    355         idx = (unsigned)(idx_max + idx_min) / 2;
    356         v = case_conv_table1[idx];
    357         code = v >> (32 - 17);
    358         len = (v >> (32 - 17 - 7)) & 0x7f;
    359         if (c < code) {
    360             idx_max = idx - 1;
    361         } else if (c >= code + len) {
    362             idx_min = idx + 1;
    363         } else {
    364             return TRUE;
    365         }
    366     }
    367     return lre_is_in_table(c, unicode_prop_Cased1_table,
    368                            unicode_prop_Cased1_index,
    369                            sizeof(unicode_prop_Cased1_index) / 3);
    370 }
    371 
    372 BOOL lre_is_case_ignorable(uint32_t c)
    373 {
    374     return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
    375                            unicode_prop_Case_Ignorable_index,
    376                            sizeof(unicode_prop_Case_Ignorable_index) / 3);
    377 }
    378 
    379 /* character range */
    380 
    381 static __maybe_unused void cr_dump(CharRange *cr)
    382 {
    383     int i;
    384     for(i = 0; i < cr->len; i++)
    385         printf("%d: 0x%04x\n", i, cr->points[i]);
    386 }
    387 
    388 static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
    389 {
    390     return realloc(ptr, size);
    391 }
    392 
    393 void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
    394 {
    395     cr->len = cr->size = 0;
    396     cr->points = NULL;
    397     cr->mem_opaque = mem_opaque;
    398     cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
    399 }
    400 
    401 void cr_free(CharRange *cr)
    402 {
    403     cr->realloc_func(cr->mem_opaque, cr->points, 0);
    404 }
    405 
    406 int cr_realloc(CharRange *cr, int size)
    407 {
    408     int new_size;
    409     uint32_t *new_buf;
    410 
    411     if (size > cr->size) {
    412         new_size = max_int(size, cr->size * 3 / 2);
    413         new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
    414                                    new_size * sizeof(cr->points[0]));
    415         if (!new_buf)
    416             return -1;
    417         cr->points = new_buf;
    418         cr->size = new_size;
    419     }
    420     return 0;
    421 }
    422 
    423 int cr_copy(CharRange *cr, const CharRange *cr1)
    424 {
    425     if (cr_realloc(cr, cr1->len))
    426         return -1;
    427     memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
    428     cr->len = cr1->len;
    429     return 0;
    430 }
    431 
    432 /* merge consecutive intervals and remove empty intervals */
    433 static void cr_compress(CharRange *cr)
    434 {
    435     int i, j, k, len;
    436     uint32_t *pt;
    437 
    438     pt = cr->points;
    439     len = cr->len;
    440     i = 0;
    441     j = 0;
    442     k = 0;
    443     while ((i + 1) < len) {
    444         if (pt[i] == pt[i + 1]) {
    445             /* empty interval */
    446             i += 2;
    447         } else {
    448             j = i;
    449             while ((j + 3) < len && pt[j + 1] == pt[j + 2])
    450                 j += 2;
    451             /* just copy */
    452             pt[k] = pt[i];
    453             pt[k + 1] = pt[j + 1];
    454             k += 2;
    455             i = j + 2;
    456         }
    457     }
    458     cr->len = k;
    459 }
    460 
    461 /* union or intersection */
    462 int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
    463           const uint32_t *b_pt, int b_len, int op)
    464 {
    465     int a_idx, b_idx, is_in;
    466     uint32_t v;
    467 
    468     a_idx = 0;
    469     b_idx = 0;
    470     for(;;) {
    471         /* get one more point from a or b in increasing order */
    472         if (a_idx < a_len && b_idx < b_len) {
    473             if (a_pt[a_idx] < b_pt[b_idx]) {
    474                 goto a_add;
    475             } else if (a_pt[a_idx] == b_pt[b_idx]) {
    476                 v = a_pt[a_idx];
    477                 a_idx++;
    478                 b_idx++;
    479             } else {
    480                 goto b_add;
    481             }
    482         } else if (a_idx < a_len) {
    483         a_add:
    484             v = a_pt[a_idx++];
    485         } else if (b_idx < b_len) {
    486         b_add:
    487             v = b_pt[b_idx++];
    488         } else {
    489             break;
    490         }
    491         /* add the point if the in/out status changes */
    492         switch(op) {
    493         case CR_OP_UNION:
    494             is_in = (a_idx & 1) | (b_idx & 1);
    495             break;
    496         case CR_OP_INTER:
    497             is_in = (a_idx & 1) & (b_idx & 1);
    498             break;
    499         case CR_OP_XOR:
    500             is_in = (a_idx & 1) ^ (b_idx & 1);
    501             break;
    502         default:
    503             abort();
    504         }
    505         if (is_in != (cr->len & 1)) {
    506             if (cr_add_point(cr, v))
    507                 return -1;
    508         }
    509     }
    510     cr_compress(cr);
    511     return 0;
    512 }
    513 
    514 int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
    515 {
    516     CharRange a = *cr;
    517     int ret;
    518     cr->len = 0;
    519     cr->size = 0;
    520     cr->points = NULL;
    521     ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
    522     cr_free(&a);
    523     return ret;
    524 }
    525 
    526 int cr_invert(CharRange *cr)
    527 {
    528     int len;
    529     len = cr->len;
    530     if (cr_realloc(cr, len + 2))
    531         return -1;
    532     memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
    533     cr->points[0] = 0;
    534     cr->points[len + 1] = UINT32_MAX;
    535     cr->len = len + 2;
    536     cr_compress(cr);
    537     return 0;
    538 }
    539 
    540 #ifdef CONFIG_ALL_UNICODE
    541 
    542 BOOL lre_is_id_start(uint32_t c)
    543 {
    544     return lre_is_in_table(c, unicode_prop_ID_Start_table,
    545                            unicode_prop_ID_Start_index,
    546                            sizeof(unicode_prop_ID_Start_index) / 3);
    547 }
    548 
    549 BOOL lre_is_id_continue(uint32_t c)
    550 {
    551     return lre_is_id_start(c) ||
    552         lre_is_in_table(c, unicode_prop_ID_Continue1_table,
    553                         unicode_prop_ID_Continue1_index,
    554                         sizeof(unicode_prop_ID_Continue1_index) / 3);
    555 }
    556 
    557 #define UNICODE_DECOMP_LEN_MAX 18
    558 
    559 typedef enum {
    560     DECOMP_TYPE_C1, /* 16 bit char */
    561     DECOMP_TYPE_L1, /* 16 bit char table */
    562     DECOMP_TYPE_L2,
    563     DECOMP_TYPE_L3,
    564     DECOMP_TYPE_L4,
    565     DECOMP_TYPE_L5, /* XXX: not used */
    566     DECOMP_TYPE_L6, /* XXX: could remove */
    567     DECOMP_TYPE_L7, /* XXX: could remove */
    568     DECOMP_TYPE_LL1, /* 18 bit char table */
    569     DECOMP_TYPE_LL2,
    570     DECOMP_TYPE_S1, /* 8 bit char table */
    571     DECOMP_TYPE_S2,
    572     DECOMP_TYPE_S3,
    573     DECOMP_TYPE_S4,
    574     DECOMP_TYPE_S5,
    575     DECOMP_TYPE_I1, /* increment 16 bit char value */
    576     DECOMP_TYPE_I2_0,
    577     DECOMP_TYPE_I2_1,
    578     DECOMP_TYPE_I3_1,
    579     DECOMP_TYPE_I3_2,
    580     DECOMP_TYPE_I4_1,
    581     DECOMP_TYPE_I4_2,
    582     DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
    583     DECOMP_TYPE_B2,
    584     DECOMP_TYPE_B3,
    585     DECOMP_TYPE_B4,
    586     DECOMP_TYPE_B5,
    587     DECOMP_TYPE_B6,
    588     DECOMP_TYPE_B7,
    589     DECOMP_TYPE_B8,
    590     DECOMP_TYPE_B18,
    591     DECOMP_TYPE_LS2,
    592     DECOMP_TYPE_PAT3,
    593     DECOMP_TYPE_S2_UL,
    594     DECOMP_TYPE_LS2_UL,
    595 } DecompTypeEnum;
    596 
    597 static uint32_t unicode_get_short_code(uint32_t c)
    598 {
    599     static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
    600 
    601     if (c < 0x80)
    602         return c;
    603     else if (c < 0x80 + 0x50)
    604         return c - 0x80 + 0x300;
    605     else
    606         return unicode_short_table[c - 0x80 - 0x50];
    607 }
    608 
    609 static uint32_t unicode_get_lower_simple(uint32_t c)
    610 {
    611     if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
    612         c += 0x20;
    613     else
    614         c++;
    615     return c;
    616 }
    617 
    618 static uint16_t unicode_get16(const uint8_t *p)
    619 {
    620     return p[0] | (p[1] << 8);
    621 }
    622 
    623 static int unicode_decomp_entry(uint32_t *res, uint32_t c,
    624                                 int idx, uint32_t code, uint32_t len,
    625                                 uint32_t type)
    626 {
    627     uint32_t c1;
    628     int l, i, p;
    629     const uint8_t *d;
    630 
    631     if (type == DECOMP_TYPE_C1) {
    632         res[0] = unicode_decomp_table2[idx];
    633         return 1;
    634     } else {
    635         d = unicode_decomp_data + unicode_decomp_table2[idx];
    636         switch(type) {
    637         case DECOMP_TYPE_L1:
    638         case DECOMP_TYPE_L2:
    639         case DECOMP_TYPE_L3:
    640         case DECOMP_TYPE_L4:
    641         case DECOMP_TYPE_L5:
    642         case DECOMP_TYPE_L6:
    643         case DECOMP_TYPE_L7:
    644             l = type - DECOMP_TYPE_L1 + 1;
    645             d += (c - code) * l * 2;
    646             for(i = 0; i < l; i++) {
    647                 if ((res[i] = unicode_get16(d + 2 * i)) == 0)
    648                     return 0;
    649             }
    650             return l;
    651         case DECOMP_TYPE_LL1:
    652         case DECOMP_TYPE_LL2:
    653             {
    654                 uint32_t k, p;
    655                 l = type - DECOMP_TYPE_LL1 + 1;
    656                 k = (c - code) * l;
    657                 p = len * l * 2;
    658                 for(i = 0; i < l; i++) {
    659                     c1 = unicode_get16(d + 2 * k) |
    660                         (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
    661                     if (!c1)
    662                         return 0;
    663                     res[i] = c1;
    664                     k++;
    665                 }
    666             }
    667             return l;
    668         case DECOMP_TYPE_S1:
    669         case DECOMP_TYPE_S2:
    670         case DECOMP_TYPE_S3:
    671         case DECOMP_TYPE_S4:
    672         case DECOMP_TYPE_S5:
    673             l = type - DECOMP_TYPE_S1 + 1;
    674             d += (c - code) * l;
    675             for(i = 0; i < l; i++) {
    676                 if ((res[i] = unicode_get_short_code(d[i])) == 0)
    677                     return 0;
    678             }
    679             return l;
    680         case DECOMP_TYPE_I1:
    681             l = 1;
    682             p = 0;
    683             goto decomp_type_i;
    684         case DECOMP_TYPE_I2_0:
    685         case DECOMP_TYPE_I2_1:
    686         case DECOMP_TYPE_I3_1:
    687         case DECOMP_TYPE_I3_2:
    688         case DECOMP_TYPE_I4_1:
    689         case DECOMP_TYPE_I4_2:
    690             l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
    691             p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
    692         decomp_type_i:
    693             for(i = 0; i < l; i++) {
    694                 c1 = unicode_get16(d + 2 * i);
    695                 if (i == p)
    696                     c1 += c - code;
    697                 res[i] = c1;
    698             }
    699             return l;
    700         case DECOMP_TYPE_B18:
    701             l = 18;
    702             goto decomp_type_b;
    703         case DECOMP_TYPE_B1:
    704         case DECOMP_TYPE_B2:
    705         case DECOMP_TYPE_B3:
    706         case DECOMP_TYPE_B4:
    707         case DECOMP_TYPE_B5:
    708         case DECOMP_TYPE_B6:
    709         case DECOMP_TYPE_B7:
    710         case DECOMP_TYPE_B8:
    711             l = type - DECOMP_TYPE_B1 + 1;
    712         decomp_type_b:
    713             {
    714                 uint32_t c_min;
    715                 c_min = unicode_get16(d);
    716                 d += 2 + (c - code) * l;
    717                 for(i = 0; i < l; i++) {
    718                     c1 = d[i];
    719                     if (c1 == 0xff)
    720                         c1 = 0x20;
    721                     else
    722                         c1 += c_min;
    723                     res[i] = c1;
    724                 }
    725             }
    726             return l;
    727         case DECOMP_TYPE_LS2:
    728             d += (c - code) * 3;
    729             if (!(res[0] = unicode_get16(d)))
    730                 return 0;
    731             res[1] = unicode_get_short_code(d[2]);
    732             return 2;
    733         case DECOMP_TYPE_PAT3:
    734             res[0] = unicode_get16(d);
    735             res[2] = unicode_get16(d + 2);
    736             d += 4 + (c - code) * 2;
    737             res[1] = unicode_get16(d);
    738             return 3;
    739         case DECOMP_TYPE_S2_UL:
    740         case DECOMP_TYPE_LS2_UL:
    741             c1 = c - code;
    742             if (type == DECOMP_TYPE_S2_UL) {
    743                 d += c1 & ~1;
    744                 c = unicode_get_short_code(*d);
    745                 d++;
    746             } else {
    747                 d += (c1 >> 1) * 3;
    748                 c = unicode_get16(d);
    749                 d += 2;
    750             }
    751             if (c1 & 1)
    752                 c = unicode_get_lower_simple(c);
    753             res[0] = c;
    754             res[1] = unicode_get_short_code(*d);
    755             return 2;
    756         }
    757     }
    758     return 0;
    759 }
    760 
    761 
    762 /* return the length of the decomposition (length <=
    763    UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
    764 static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
    765 {
    766     uint32_t v, type, is_compat, code, len;
    767     int idx_min, idx_max, idx;
    768 
    769     idx_min = 0;
    770     idx_max = countof(unicode_decomp_table1) - 1;
    771     while (idx_min <= idx_max) {
    772         idx = (idx_max + idx_min) / 2;
    773         v = unicode_decomp_table1[idx];
    774         code = v >> (32 - 18);
    775         len = (v >> (32 - 18 - 7)) & 0x7f;
    776         //        printf("idx=%d code=%05x len=%d\n", idx, code, len);
    777         if (c < code) {
    778             idx_max = idx - 1;
    779         } else if (c >= code + len) {
    780             idx_min = idx + 1;
    781         } else {
    782             is_compat = v & 1;
    783             if (is_compat1 < is_compat)
    784                 break;
    785             type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
    786             return unicode_decomp_entry(res, c, idx, code, len, type);
    787         }
    788     }
    789     return 0;
    790 }
    791 
    792 /* return 0 if no pair found */
    793 static int unicode_compose_pair(uint32_t c0, uint32_t c1)
    794 {
    795     uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
    796     int idx_min, idx_max, idx, d;
    797     uint32_t pair[2];
    798 
    799     idx_min = 0;
    800     idx_max = countof(unicode_comp_table) - 1;
    801     while (idx_min <= idx_max) {
    802         idx = (idx_max + idx_min) / 2;
    803         idx1 = unicode_comp_table[idx];
    804 
    805         /* idx1 represent an entry of the decomposition table */
    806         d_idx = idx1 >> 6;
    807         d_offset = idx1 & 0x3f;
    808         v = unicode_decomp_table1[d_idx];
    809         code = v >> (32 - 18);
    810         len = (v >> (32 - 18 - 7)) & 0x7f;
    811         type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
    812         ch = code + d_offset;
    813         unicode_decomp_entry(pair, ch, d_idx, code, len, type);
    814         d = c0 - pair[0];
    815         if (d == 0)
    816             d = c1 - pair[1];
    817         if (d < 0) {
    818             idx_max = idx - 1;
    819         } else if (d > 0) {
    820             idx_min = idx + 1;
    821         } else {
    822             return ch;
    823         }
    824     }
    825     return 0;
    826 }
    827 
    828 /* return the combining class of character c (between 0 and 255) */
    829 static int unicode_get_cc(uint32_t c)
    830 {
    831     uint32_t code, n, type, cc, c1, b;
    832     int pos;
    833     const uint8_t *p;
    834 
    835     pos = get_index_pos(&code, c,
    836                         unicode_cc_index, sizeof(unicode_cc_index) / 3);
    837     if (pos < 0)
    838         return 0;
    839     p = unicode_cc_table + pos;
    840     /* Compressed run length encoding:
    841        - 2 high order bits are combining class type
    842        -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
    843        - 00..2F: range length (add 1)
    844        - 30..37: 3-bit range-length + 1 extra byte
    845        - 38..3F: 3-bit range-length + 2 extra byte
    846      */
    847     for(;;) {
    848         b = *p++;
    849         type = b >> 6;
    850         n = b & 0x3f;
    851         if (n < 48) {
    852         } else if (n < 56) {
    853             n = (n - 48) << 8;
    854             n |= *p++;
    855             n += 48;
    856         } else {
    857             n = (n - 56) << 8;
    858             n |= *p++ << 8;
    859             n |= *p++;
    860             n += 48 + (1 << 11);
    861         }
    862         if (type <= 1)
    863             p++;
    864         c1 = code + n + 1;
    865         if (c < c1) {
    866             switch(type) {
    867             case 0:
    868                 cc = p[-1];
    869                 break;
    870             case 1:
    871                 cc = p[-1] + c - code;
    872                 break;
    873             case 2:
    874                 cc = 0;
    875                 break;
    876             default:
    877             case 3:
    878                 cc = 230;
    879                 break;
    880             }
    881             return cc;
    882         }
    883         code = c1;
    884     }
    885 }
    886 
    887 static void sort_cc(int *buf, int len)
    888 {
    889     int i, j, k, cc, cc1, start, ch1;
    890 
    891     for(i = 0; i < len; i++) {
    892         cc = unicode_get_cc(buf[i]);
    893         if (cc != 0) {
    894             start = i;
    895             j = i + 1;
    896             while (j < len) {
    897                 ch1 = buf[j];
    898                 cc1 = unicode_get_cc(ch1);
    899                 if (cc1 == 0)
    900                     break;
    901                 k = j - 1;
    902                 while (k >= start) {
    903                     if (unicode_get_cc(buf[k]) <= cc1)
    904                         break;
    905                     buf[k + 1] = buf[k];
    906                     k--;
    907                 }
    908                 buf[k + 1] = ch1;
    909                 j++;
    910             }
    911 #if 0
    912             printf("cc:");
    913             for(k = start; k < j; k++) {
    914                 printf(" %3d", unicode_get_cc(buf[k]));
    915             }
    916             printf("\n");
    917 #endif
    918             i = j;
    919         }
    920     }
    921 }
    922 
    923 static void to_nfd_rec(DynBuf *dbuf,
    924                        const int *src, int src_len, int is_compat)
    925 {
    926     uint32_t c, v;
    927     int i, l;
    928     uint32_t res[UNICODE_DECOMP_LEN_MAX];
    929 
    930     for(i = 0; i < src_len; i++) {
    931         c = src[i];
    932         if (c >= 0xac00 && c < 0xd7a4) {
    933             /* Hangul decomposition */
    934             c -= 0xac00;
    935             dbuf_put_u32(dbuf, 0x1100 + c / 588);
    936             dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
    937             v = c % 28;
    938             if (v != 0)
    939                 dbuf_put_u32(dbuf, 0x11a7 + v);
    940         } else {
    941             l = unicode_decomp_char(res, c, is_compat);
    942             if (l) {
    943                 to_nfd_rec(dbuf, (int *)res, l, is_compat);
    944             } else {
    945                 dbuf_put_u32(dbuf, c);
    946             }
    947         }
    948     }
    949 }
    950 
    951 /* return 0 if not found */
    952 static int compose_pair(uint32_t c0, uint32_t c1)
    953 {
    954     /* Hangul composition */
    955     if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
    956         c1 >= 0x1161 && c1 < 0x1161 + 21) {
    957         return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
    958     } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
    959                (c0 - 0xac00) % 28 == 0 &&
    960                c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
    961         return c0 + c1 - 0x11a7;
    962     } else {
    963         return unicode_compose_pair(c0, c1);
    964     }
    965 }
    966 
    967 int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
    968                       UnicodeNormalizationEnum n_type,
    969                       void *opaque, DynBufReallocFunc *realloc_func)
    970 {
    971     int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
    972     BOOL is_compat;
    973     DynBuf dbuf_s, *dbuf = &dbuf_s;
    974 
    975     is_compat = n_type >> 1;
    976 
    977     dbuf_init2(dbuf, opaque, realloc_func);
    978     if (dbuf_realloc(dbuf, sizeof(int) * src_len))
    979         goto fail;
    980 
    981     /* common case: latin1 is unaffected by NFC */
    982     if (n_type == UNICODE_NFC) {
    983         for(i = 0; i < src_len; i++) {
    984             if (src[i] >= 0x100)
    985                 goto not_latin1;
    986         }
    987         buf = (int *)dbuf->buf;
    988         memcpy(buf, src, src_len * sizeof(int));
    989         *pdst = (uint32_t *)buf;
    990         return src_len;
    991     not_latin1: ;
    992     }
    993 
    994     to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
    995     if (dbuf_error(dbuf)) {
    996     fail:
    997         *pdst = NULL;
    998         return -1;
    999     }
   1000     buf = (int *)dbuf->buf;
   1001     buf_len = dbuf->size / sizeof(int);
   1002 
   1003     sort_cc(buf, buf_len);
   1004 
   1005     if (buf_len <= 1 || (n_type & 1) != 0) {
   1006         /* NFD / NFKD */
   1007         *pdst = (uint32_t *)buf;
   1008         return buf_len;
   1009     }
   1010 
   1011     i = 1;
   1012     out_len = 1;
   1013     while (i < buf_len) {
   1014         /* find the starter character and test if it is blocked from
   1015            the character at 'i' */
   1016         last_cc = unicode_get_cc(buf[i]);
   1017         starter_pos = out_len - 1;
   1018         while (starter_pos >= 0) {
   1019             cc = unicode_get_cc(buf[starter_pos]);
   1020             if (cc == 0)
   1021                 break;
   1022             if (cc >= last_cc)
   1023                 goto next;
   1024             last_cc = 256;
   1025             starter_pos--;
   1026         }
   1027         if (starter_pos >= 0 &&
   1028             (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
   1029             buf[starter_pos] = p;
   1030             i++;
   1031         } else {
   1032         next:
   1033             buf[out_len++] = buf[i++];
   1034         }
   1035     }
   1036     *pdst = (uint32_t *)buf;
   1037     return out_len;
   1038 }
   1039 
   1040 /* char ranges for various unicode properties */
   1041 
   1042 static int unicode_find_name(const char *name_table, const char *name)
   1043 {
   1044     const char *p, *r;
   1045     int pos;
   1046     size_t name_len, len;
   1047 
   1048     p = name_table;
   1049     pos = 0;
   1050     name_len = strlen(name);
   1051     while (*p) {
   1052         for(;;) {
   1053             r = strchr(p, ',');
   1054             if (!r)
   1055                 len = strlen(p);
   1056             else
   1057                 len = r - p;
   1058             if (len == name_len && !memcmp(p, name, name_len))
   1059                 return pos;
   1060             p += len + 1;
   1061             if (!r)
   1062                 break;
   1063         }
   1064         pos++;
   1065     }
   1066     return -1;
   1067 }
   1068 
   1069 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
   1070    if not found */
   1071 int unicode_script(CharRange *cr,
   1072                    const char *script_name, BOOL is_ext)
   1073 {
   1074     int script_idx;
   1075     const uint8_t *p, *p_end;
   1076     uint32_t c, c1, b, n, v, v_len, i, type;
   1077     CharRange cr1_s, *cr1;
   1078     CharRange cr2_s, *cr2 = &cr2_s;
   1079     BOOL is_common;
   1080 
   1081     script_idx = unicode_find_name(unicode_script_name_table, script_name);
   1082     if (script_idx < 0)
   1083         return -2;
   1084     /* Note: we remove the "Unknown" Script */
   1085     script_idx += UNICODE_SCRIPT_Unknown + 1;
   1086 
   1087     is_common = (script_idx == UNICODE_SCRIPT_Common ||
   1088                  script_idx == UNICODE_SCRIPT_Inherited);
   1089     if (is_ext) {
   1090         cr1 = &cr1_s;
   1091         cr_init(cr1, cr->mem_opaque, cr->realloc_func);
   1092         cr_init(cr2, cr->mem_opaque, cr->realloc_func);
   1093     } else {
   1094         cr1 = cr;
   1095     }
   1096 
   1097     p = unicode_script_table;
   1098     p_end = unicode_script_table + countof(unicode_script_table);
   1099     c = 0;
   1100     while (p < p_end) {
   1101         b = *p++;
   1102         type = b >> 7;
   1103         n = b & 0x7f;
   1104         if (n < 96) {
   1105         } else if (n < 112) {
   1106             n = (n - 96) << 8;
   1107             n |= *p++;
   1108             n += 96;
   1109         } else {
   1110             n = (n - 112) << 16;
   1111             n |= *p++ << 8;
   1112             n |= *p++;
   1113             n += 96 + (1 << 12);
   1114         }
   1115         if (type == 0)
   1116             v = 0;
   1117         else
   1118             v = *p++;
   1119         c1 = c + n + 1;
   1120         if (v == script_idx) {
   1121             if (cr_add_interval(cr1, c, c1))
   1122                 goto fail;
   1123         }
   1124         c = c1;
   1125     }
   1126 
   1127     if (is_ext) {
   1128         /* add the script extensions */
   1129         p = unicode_script_ext_table;
   1130         p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
   1131         c = 0;
   1132         while (p < p_end) {
   1133             b = *p++;
   1134             if (b < 128) {
   1135                 n = b;
   1136             } else if (b < 128 + 64) {
   1137                 n = (b - 128) << 8;
   1138                 n |= *p++;
   1139                 n += 128;
   1140             } else {
   1141                 n = (b - 128 - 64) << 16;
   1142                 n |= *p++ << 8;
   1143                 n |= *p++;
   1144                 n += 128 + (1 << 14);
   1145             }
   1146             c1 = c + n + 1;
   1147             v_len = *p++;
   1148             if (is_common) {
   1149                 if (v_len != 0) {
   1150                     if (cr_add_interval(cr2, c, c1))
   1151                         goto fail;
   1152                 }
   1153             } else {
   1154                 for(i = 0; i < v_len; i++) {
   1155                     if (p[i] == script_idx) {
   1156                         if (cr_add_interval(cr2, c, c1))
   1157                             goto fail;
   1158                         break;
   1159                     }
   1160                 }
   1161             }
   1162             p += v_len;
   1163             c = c1;
   1164         }
   1165         if (is_common) {
   1166             /* remove all the characters with script extensions */
   1167             if (cr_invert(cr2))
   1168                 goto fail;
   1169             if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
   1170                       CR_OP_INTER))
   1171                 goto fail;
   1172         } else {
   1173             if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
   1174                       CR_OP_UNION))
   1175                 goto fail;
   1176         }
   1177         cr_free(cr1);
   1178         cr_free(cr2);
   1179     }
   1180     return 0;
   1181  fail:
   1182     if (is_ext) {
   1183         cr_free(cr1);
   1184         cr_free(cr2);
   1185     }
   1186     goto fail;
   1187 }
   1188 
   1189 #define M(id) (1U << UNICODE_GC_ ## id)
   1190 
   1191 static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
   1192 {
   1193     const uint8_t *p, *p_end;
   1194     uint32_t c, c0, b, n, v;
   1195 
   1196     p = unicode_gc_table;
   1197     p_end = unicode_gc_table + countof(unicode_gc_table);
   1198     c = 0;
   1199     /* Compressed range encoding:
   1200        initial byte:
   1201        bits 0..4: category number (special case 31)
   1202        bits 5..7: range length (add 1)
   1203        special case bits 5..7 == 7: read an extra byte
   1204        - 00..7F: range length (add 7 + 1)
   1205        - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
   1206        - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
   1207      */
   1208     while (p < p_end) {
   1209         b = *p++;
   1210         n = b >> 5;
   1211         v = b & 0x1f;
   1212         if (n == 7) {
   1213             n = *p++;
   1214             if (n < 128) {
   1215                 n += 7;
   1216             } else if (n < 128 + 64) {
   1217                 n = (n - 128) << 8;
   1218                 n |= *p++;
   1219                 n += 7 + 128;
   1220             } else {
   1221                 n = (n - 128 - 64) << 16;
   1222                 n |= *p++ << 8;
   1223                 n |= *p++;
   1224                 n += 7 + 128 + (1 << 14);
   1225             }
   1226         }
   1227         c0 = c;
   1228         c += n + 1;
   1229         if (v == 31) {
   1230             /* run of Lu / Ll */
   1231             b = gc_mask & (M(Lu) | M(Ll));
   1232             if (b != 0) {
   1233                 if (b == (M(Lu) | M(Ll))) {
   1234                     goto add_range;
   1235                 } else {
   1236                     c0 += ((gc_mask & M(Ll)) != 0);
   1237                     for(; c0 < c; c0 += 2) {
   1238                         if (cr_add_interval(cr, c0, c0 + 1))
   1239                             return -1;
   1240                     }
   1241                 }
   1242             }
   1243         } else if ((gc_mask >> v) & 1) {
   1244         add_range:
   1245             if (cr_add_interval(cr, c0, c))
   1246                 return -1;
   1247         }
   1248     }
   1249     return 0;
   1250 }
   1251 
   1252 static int unicode_prop1(CharRange *cr, int prop_idx)
   1253 {
   1254     const uint8_t *p, *p_end;
   1255     uint32_t c, c0, b, bit;
   1256 
   1257     p = unicode_prop_table[prop_idx];
   1258     p_end = p + unicode_prop_len_table[prop_idx];
   1259     c = 0;
   1260     bit = 0;
   1261     /* Compressed range encoding:
   1262        00..3F: 2 packed lengths: 3-bit + 3-bit
   1263        40..5F: 5-bits plus extra byte for length
   1264        60..7F: 5-bits plus 2 extra bytes for length
   1265        80..FF: 7-bit length
   1266        lengths must be incremented to get character count
   1267        Ranges alternate between false and true return value.
   1268      */
   1269     while (p < p_end) {
   1270         c0 = c;
   1271         b = *p++;
   1272         if (b < 64) {
   1273             c += (b >> 3) + 1;
   1274             if (bit)  {
   1275                 if (cr_add_interval(cr, c0, c))
   1276                     return -1;
   1277             }
   1278             bit ^= 1;
   1279             c0 = c;
   1280             c += (b & 7) + 1;
   1281         } else if (b >= 0x80) {
   1282             c += b - 0x80 + 1;
   1283         } else if (b < 0x60) {
   1284             c += (((b - 0x40) << 8) | p[0]) + 1;
   1285             p++;
   1286         } else {
   1287             c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
   1288             p += 2;
   1289         }
   1290         if (bit)  {
   1291             if (cr_add_interval(cr, c0, c))
   1292                 return -1;
   1293         }
   1294         bit ^= 1;
   1295     }
   1296     return 0;
   1297 }
   1298 
   1299 #define CASE_U (1 << 0)
   1300 #define CASE_L (1 << 1)
   1301 #define CASE_F (1 << 2)
   1302 
   1303 /* use the case conversion table to generate range of characters.
   1304    CASE_U: set char if modified by uppercasing,
   1305    CASE_L: set char if modified by lowercasing,
   1306    CASE_F: set char if modified by case folding,
   1307  */
   1308 static int unicode_case1(CharRange *cr, int case_mask)
   1309 {
   1310 #define MR(x) (1 << RUN_TYPE_ ## x)
   1311     const uint32_t tab_run_mask[3] = {
   1312         MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
   1313         MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
   1314 
   1315         MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
   1316 
   1317         MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
   1318     };
   1319 #undef MR
   1320     uint32_t mask, v, code, type, len, i, idx;
   1321 
   1322     if (case_mask == 0)
   1323         return 0;
   1324     mask = 0;
   1325     for(i = 0; i < 3; i++) {
   1326         if ((case_mask >> i) & 1)
   1327             mask |= tab_run_mask[i];
   1328     }
   1329     for(idx = 0; idx < countof(case_conv_table1); idx++) {
   1330         v = case_conv_table1[idx];
   1331         type = (v >> (32 - 17 - 7 - 4)) & 0xf;
   1332         code = v >> (32 - 17);
   1333         len = (v >> (32 - 17 - 7)) & 0x7f;
   1334         if ((mask >> type) & 1) {
   1335             //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
   1336             switch(type) {
   1337             case RUN_TYPE_UL:
   1338                 if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
   1339                     goto def_case;
   1340                 code += ((case_mask & CASE_U) != 0);
   1341                 for(i = 0; i < len; i += 2) {
   1342                     if (cr_add_interval(cr, code + i, code + i + 1))
   1343                         return -1;
   1344                 }
   1345                 break;
   1346             case RUN_TYPE_LSU:
   1347                 if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
   1348                     goto def_case;
   1349                 if (!(case_mask & CASE_U)) {
   1350                     if (cr_add_interval(cr, code, code + 1))
   1351                         return -1;
   1352                 }
   1353                 if (cr_add_interval(cr, code + 1, code + 2))
   1354                     return -1;
   1355                 if (case_mask & CASE_U) {
   1356                     if (cr_add_interval(cr, code + 2, code + 3))
   1357                         return -1;
   1358                 }
   1359                 break;
   1360             default:
   1361             def_case:
   1362                 if (cr_add_interval(cr, code, code + len))
   1363                     return -1;
   1364                 break;
   1365             }
   1366         }
   1367     }
   1368     return 0;
   1369 }
   1370 
   1371 static int point_cmp(const void *p1, const void *p2, void *arg)
   1372 {
   1373     uint32_t v1 = *(uint32_t *)p1;
   1374     uint32_t v2 = *(uint32_t *)p2;
   1375     return (v1 > v2) - (v1 < v2);
   1376 }
   1377 
   1378 static void cr_sort_and_remove_overlap(CharRange *cr)
   1379 {
   1380     uint32_t start, end, start1, end1, i, j;
   1381 
   1382     /* the resulting ranges are not necessarily sorted and may overlap */
   1383     rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
   1384     j = 0;
   1385     for(i = 0; i < cr->len; ) {
   1386         start = cr->points[i];
   1387         end = cr->points[i + 1];
   1388         i += 2;
   1389         while (i < cr->len) {
   1390             start1 = cr->points[i];
   1391             end1 = cr->points[i + 1];
   1392             if (start1 > end) {
   1393                 /* |------|
   1394                  *           |-------| */
   1395                 break;
   1396             } else if (end1 <= end) {
   1397                 /* |------|
   1398                  *    |--| */
   1399                 i += 2;
   1400             } else {
   1401                 /* |------|
   1402                  *     |-------| */
   1403                 end = end1;
   1404                 i += 2;
   1405             }
   1406         }
   1407         cr->points[j] = start;
   1408         cr->points[j + 1] = end;
   1409         j += 2;
   1410     }
   1411     cr->len = j;
   1412 }
   1413 
   1414 /* canonicalize a character set using the JS regex case folding rules
   1415    (see lre_canonicalize()) */
   1416 int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
   1417 {
   1418     CharRange cr_inter, cr_mask, cr_result, cr_sub;
   1419     uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
   1420 
   1421     cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
   1422     cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
   1423     cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
   1424     cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
   1425 
   1426     if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
   1427         goto fail;
   1428     if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
   1429         goto fail;
   1430 
   1431     if (cr_invert(&cr_mask))
   1432         goto fail;
   1433     if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
   1434         goto fail;
   1435 
   1436     /* cr_inter = cr & cr_mask */
   1437     /* cr_sub = cr & ~cr_mask */
   1438 
   1439     /* use the case conversion table to compute the result */
   1440     d_start = -1;
   1441     d_end = -1;
   1442     idx = 0;
   1443     v = case_conv_table1[idx];
   1444     code = v >> (32 - 17);
   1445     len = (v >> (32 - 17 - 7)) & 0x7f;
   1446     for(i = 0; i < cr_inter.len; i += 2) {
   1447         start = cr_inter.points[i];
   1448         end = cr_inter.points[i + 1];
   1449 
   1450         for(c = start; c < end; c++) {
   1451             for(;;) {
   1452                 if (c >= code && c < code + len)
   1453                     break;
   1454                 idx++;
   1455                 assert(idx < countof(case_conv_table1));
   1456                 v = case_conv_table1[idx];
   1457                 code = v >> (32 - 17);
   1458                 len = (v >> (32 - 17 - 7)) & 0x7f;
   1459             }
   1460             d = lre_case_folding_entry(c, idx, v, is_unicode);
   1461             /* try to merge with the current interval */
   1462             if (d_start == -1) {
   1463                 d_start = d;
   1464                 d_end = d + 1;
   1465             } else if (d_end == d) {
   1466                 d_end++;
   1467             } else {
   1468                 cr_add_interval(&cr_result, d_start, d_end);
   1469                 d_start = d;
   1470                 d_end = d + 1;
   1471             }
   1472         }
   1473     }
   1474     if (d_start != -1) {
   1475         if (cr_add_interval(&cr_result, d_start, d_end))
   1476             goto fail;
   1477     }
   1478 
   1479     /* the resulting ranges are not necessarily sorted and may overlap */
   1480     cr_sort_and_remove_overlap(&cr_result);
   1481 
   1482     /* or with the character not affected by the case folding */
   1483     cr->len = 0;
   1484     if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
   1485         goto fail;
   1486 
   1487     cr_free(&cr_inter);
   1488     cr_free(&cr_mask);
   1489     cr_free(&cr_result);
   1490     cr_free(&cr_sub);
   1491     return 0;
   1492  fail:
   1493     cr_free(&cr_inter);
   1494     cr_free(&cr_mask);
   1495     cr_free(&cr_result);
   1496     cr_free(&cr_sub);
   1497     return -1;
   1498 }
   1499 
   1500 typedef enum {
   1501     POP_GC,
   1502     POP_PROP,
   1503     POP_CASE,
   1504     POP_UNION,
   1505     POP_INTER,
   1506     POP_XOR,
   1507     POP_INVERT,
   1508     POP_END,
   1509 } PropOPEnum;
   1510 
   1511 #define POP_STACK_LEN_MAX 4
   1512 
   1513 static int unicode_prop_ops(CharRange *cr, ...)
   1514 {
   1515     va_list ap;
   1516     CharRange stack[POP_STACK_LEN_MAX];
   1517     int stack_len, op, ret, i;
   1518     uint32_t a;
   1519 
   1520     va_start(ap, cr);
   1521     stack_len = 0;
   1522     for(;;) {
   1523         op = va_arg(ap, int);
   1524         switch(op) {
   1525         case POP_GC:
   1526             assert(stack_len < POP_STACK_LEN_MAX);
   1527             a = va_arg(ap, int);
   1528             cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
   1529             if (unicode_general_category1(&stack[stack_len - 1], a))
   1530                 goto fail;
   1531             break;
   1532         case POP_PROP:
   1533             assert(stack_len < POP_STACK_LEN_MAX);
   1534             a = va_arg(ap, int);
   1535             cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
   1536             if (unicode_prop1(&stack[stack_len - 1], a))
   1537                 goto fail;
   1538             break;
   1539         case POP_CASE:
   1540             assert(stack_len < POP_STACK_LEN_MAX);
   1541             a = va_arg(ap, int);
   1542             cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
   1543             if (unicode_case1(&stack[stack_len - 1], a))
   1544                 goto fail;
   1545             break;
   1546         case POP_UNION:
   1547         case POP_INTER:
   1548         case POP_XOR:
   1549             {
   1550                 CharRange *cr1, *cr2, *cr3;
   1551                 assert(stack_len >= 2);
   1552                 assert(stack_len < POP_STACK_LEN_MAX);
   1553                 cr1 = &stack[stack_len - 2];
   1554                 cr2 = &stack[stack_len - 1];
   1555                 cr3 = &stack[stack_len++];
   1556                 cr_init(cr3, cr->mem_opaque, cr->realloc_func);
   1557                 if (cr_op(cr3, cr1->points, cr1->len,
   1558                           cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
   1559                     goto fail;
   1560                 cr_free(cr1);
   1561                 cr_free(cr2);
   1562                 *cr1 = *cr3;
   1563                 stack_len -= 2;
   1564             }
   1565             break;
   1566         case POP_INVERT:
   1567             assert(stack_len >= 1);
   1568             if (cr_invert(&stack[stack_len - 1]))
   1569                 goto fail;
   1570             break;
   1571         case POP_END:
   1572             goto done;
   1573         default:
   1574             abort();
   1575         }
   1576     }
   1577  done:
   1578     assert(stack_len == 1);
   1579     ret = cr_copy(cr, &stack[0]);
   1580     cr_free(&stack[0]);
   1581     return ret;
   1582  fail:
   1583     for(i = 0; i < stack_len; i++)
   1584         cr_free(&stack[i]);
   1585     return -1;
   1586 }
   1587 
   1588 static const uint32_t unicode_gc_mask_table[] = {
   1589     M(Lu) | M(Ll) | M(Lt), /* LC */
   1590     M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
   1591     M(Mn) | M(Mc) | M(Me), /* M */
   1592     M(Nd) | M(Nl) | M(No), /* N */
   1593     M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
   1594     M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
   1595     M(Zs) | M(Zl) | M(Zp), /* Z */
   1596     M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
   1597 };
   1598 
   1599 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
   1600    if not found */
   1601 int unicode_general_category(CharRange *cr, const char *gc_name)
   1602 {
   1603     int gc_idx;
   1604     uint32_t gc_mask;
   1605 
   1606     gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
   1607     if (gc_idx < 0)
   1608         return -2;
   1609     if (gc_idx <= UNICODE_GC_Co) {
   1610         gc_mask = (uint64_t)1 << gc_idx;
   1611     } else {
   1612         gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
   1613     }
   1614     return unicode_general_category1(cr, gc_mask);
   1615 }
   1616 
   1617 
   1618 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
   1619    if not found */
   1620 int unicode_prop(CharRange *cr, const char *prop_name)
   1621 {
   1622     int prop_idx, ret;
   1623 
   1624     prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
   1625     if (prop_idx < 0)
   1626         return -2;
   1627     prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
   1628 
   1629     ret = 0;
   1630     switch(prop_idx) {
   1631     case UNICODE_PROP_ASCII:
   1632         if (cr_add_interval(cr, 0x00, 0x7f + 1))
   1633             return -1;
   1634         break;
   1635     case UNICODE_PROP_Any:
   1636         if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
   1637             return -1;
   1638         break;
   1639     case UNICODE_PROP_Assigned:
   1640         ret = unicode_prop_ops(cr,
   1641                                POP_GC, M(Cn),
   1642                                POP_INVERT,
   1643                                POP_END);
   1644         break;
   1645     case UNICODE_PROP_Math:
   1646         ret = unicode_prop_ops(cr,
   1647                                POP_GC, M(Sm),
   1648                                POP_PROP, UNICODE_PROP_Other_Math,
   1649                                POP_UNION,
   1650                                POP_END);
   1651         break;
   1652     case UNICODE_PROP_Lowercase:
   1653         ret = unicode_prop_ops(cr,
   1654                                POP_GC, M(Ll),
   1655                                POP_PROP, UNICODE_PROP_Other_Lowercase,
   1656                                POP_UNION,
   1657                                POP_END);
   1658         break;
   1659     case UNICODE_PROP_Uppercase:
   1660         ret = unicode_prop_ops(cr,
   1661                                POP_GC, M(Lu),
   1662                                POP_PROP, UNICODE_PROP_Other_Uppercase,
   1663                                POP_UNION,
   1664                                POP_END);
   1665         break;
   1666     case UNICODE_PROP_Cased:
   1667         ret = unicode_prop_ops(cr,
   1668                                POP_GC, M(Lu) | M(Ll) | M(Lt),
   1669                                POP_PROP, UNICODE_PROP_Other_Uppercase,
   1670                                POP_UNION,
   1671                                POP_PROP, UNICODE_PROP_Other_Lowercase,
   1672                                POP_UNION,
   1673                                POP_END);
   1674         break;
   1675     case UNICODE_PROP_Alphabetic:
   1676         ret = unicode_prop_ops(cr,
   1677                                POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
   1678                                POP_PROP, UNICODE_PROP_Other_Uppercase,
   1679                                POP_UNION,
   1680                                POP_PROP, UNICODE_PROP_Other_Lowercase,
   1681                                POP_UNION,
   1682                                POP_PROP, UNICODE_PROP_Other_Alphabetic,
   1683                                POP_UNION,
   1684                                POP_END);
   1685         break;
   1686     case UNICODE_PROP_Grapheme_Base:
   1687         ret = unicode_prop_ops(cr,
   1688                                POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
   1689                                POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
   1690                                POP_UNION,
   1691                                POP_INVERT,
   1692                                POP_END);
   1693         break;
   1694     case UNICODE_PROP_Grapheme_Extend:
   1695         ret = unicode_prop_ops(cr,
   1696                                POP_GC, M(Me) | M(Mn),
   1697                                POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
   1698                                POP_UNION,
   1699                                POP_END);
   1700         break;
   1701     case UNICODE_PROP_XID_Start:
   1702         ret = unicode_prop_ops(cr,
   1703                                POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
   1704                                POP_PROP, UNICODE_PROP_Other_ID_Start,
   1705                                POP_UNION,
   1706                                POP_PROP, UNICODE_PROP_Pattern_Syntax,
   1707                                POP_PROP, UNICODE_PROP_Pattern_White_Space,
   1708                                POP_UNION,
   1709                                POP_PROP, UNICODE_PROP_XID_Start1,
   1710                                POP_UNION,
   1711                                POP_INVERT,
   1712                                POP_INTER,
   1713                                POP_END);
   1714         break;
   1715     case UNICODE_PROP_XID_Continue:
   1716         ret = unicode_prop_ops(cr,
   1717                                POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
   1718                                M(Mn) | M(Mc) | M(Nd) | M(Pc),
   1719                                POP_PROP, UNICODE_PROP_Other_ID_Start,
   1720                                POP_UNION,
   1721                                POP_PROP, UNICODE_PROP_Other_ID_Continue,
   1722                                POP_UNION,
   1723                                POP_PROP, UNICODE_PROP_Pattern_Syntax,
   1724                                POP_PROP, UNICODE_PROP_Pattern_White_Space,
   1725                                POP_UNION,
   1726                                POP_PROP, UNICODE_PROP_XID_Continue1,
   1727                                POP_UNION,
   1728                                POP_INVERT,
   1729                                POP_INTER,
   1730                                POP_END);
   1731         break;
   1732     case UNICODE_PROP_Changes_When_Uppercased:
   1733         ret = unicode_case1(cr, CASE_U);
   1734         break;
   1735     case UNICODE_PROP_Changes_When_Lowercased:
   1736         ret = unicode_case1(cr, CASE_L);
   1737         break;
   1738     case UNICODE_PROP_Changes_When_Casemapped:
   1739         ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
   1740         break;
   1741     case UNICODE_PROP_Changes_When_Titlecased:
   1742         ret = unicode_prop_ops(cr,
   1743                                POP_CASE, CASE_U,
   1744                                POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
   1745                                POP_XOR,
   1746                                POP_END);
   1747         break;
   1748     case UNICODE_PROP_Changes_When_Casefolded:
   1749         ret = unicode_prop_ops(cr,
   1750                                POP_CASE, CASE_F,
   1751                                POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
   1752                                POP_XOR,
   1753                                POP_END);
   1754         break;
   1755     case UNICODE_PROP_Changes_When_NFKC_Casefolded:
   1756         ret = unicode_prop_ops(cr,
   1757                                POP_CASE, CASE_F,
   1758                                POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
   1759                                POP_XOR,
   1760                                POP_END);
   1761         break;
   1762 #if 0
   1763     case UNICODE_PROP_ID_Start:
   1764         ret = unicode_prop_ops(cr,
   1765                                POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
   1766                                POP_PROP, UNICODE_PROP_Other_ID_Start,
   1767                                POP_UNION,
   1768                                POP_PROP, UNICODE_PROP_Pattern_Syntax,
   1769                                POP_PROP, UNICODE_PROP_Pattern_White_Space,
   1770                                POP_UNION,
   1771                                POP_INVERT,
   1772                                POP_INTER,
   1773                                POP_END);
   1774         break;
   1775     case UNICODE_PROP_ID_Continue:
   1776         ret = unicode_prop_ops(cr,
   1777                                POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
   1778                                M(Mn) | M(Mc) | M(Nd) | M(Pc),
   1779                                POP_PROP, UNICODE_PROP_Other_ID_Start,
   1780                                POP_UNION,
   1781                                POP_PROP, UNICODE_PROP_Other_ID_Continue,
   1782                                POP_UNION,
   1783                                POP_PROP, UNICODE_PROP_Pattern_Syntax,
   1784                                POP_PROP, UNICODE_PROP_Pattern_White_Space,
   1785                                POP_UNION,
   1786                                POP_INVERT,
   1787                                POP_INTER,
   1788                                POP_END);
   1789         break;
   1790     case UNICODE_PROP_Case_Ignorable:
   1791         ret = unicode_prop_ops(cr,
   1792                                POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
   1793                                POP_PROP, UNICODE_PROP_Case_Ignorable1,
   1794                                POP_XOR,
   1795                                POP_END);
   1796         break;
   1797 #else
   1798         /* we use the existing tables */
   1799     case UNICODE_PROP_ID_Continue:
   1800         ret = unicode_prop_ops(cr,
   1801                                POP_PROP, UNICODE_PROP_ID_Start,
   1802                                POP_PROP, UNICODE_PROP_ID_Continue1,
   1803                                POP_XOR,
   1804                                POP_END);
   1805         break;
   1806 #endif
   1807     default:
   1808         if (prop_idx >= countof(unicode_prop_table))
   1809             return -2;
   1810         ret = unicode_prop1(cr, prop_idx);
   1811         break;
   1812     }
   1813     return ret;
   1814 }
   1815 
   1816 #endif /* CONFIG_ALL_UNICODE */
   1817 
   1818 /*---- lre codepoint categorizing functions ----*/
   1819 
   1820 #define S  UNICODE_C_SPACE
   1821 #define D  UNICODE_C_DIGIT
   1822 #define X  UNICODE_C_XDIGIT
   1823 #define U  UNICODE_C_UPPER
   1824 #define L  UNICODE_C_LOWER
   1825 #define _  UNICODE_C_UNDER
   1826 #define d  UNICODE_C_DOLLAR
   1827 
   1828 uint8_t const lre_ctype_bits[256] = {
   1829     0, 0, 0, 0, 0, 0, 0, 0,
   1830     0, S, S, S, S, S, 0, 0,
   1831     0, 0, 0, 0, 0, 0, 0, 0,
   1832     0, 0, 0, 0, 0, 0, 0, 0,
   1833 
   1834     S, 0, 0, 0, d, 0, 0, 0,
   1835     0, 0, 0, 0, 0, 0, 0, 0,
   1836     X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
   1837     X|D, X|D, 0, 0, 0, 0, 0, 0,
   1838 
   1839     0, X|U, X|U, X|U, X|U, X|U, X|U, U,
   1840     U, U, U, U, U, U, U, U,
   1841     U, U, U, U, U, U, U, U,
   1842     U, U, U, 0, 0, 0, 0, _,
   1843 
   1844     0, X|L, X|L, X|L, X|L, X|L, X|L, L,
   1845     L, L, L, L, L, L, L, L,
   1846     L, L, L, L, L, L, L, L,
   1847     L, L, L, 0, 0, 0, 0, 0,
   1848 
   1849     0, 0, 0, 0, 0, 0, 0, 0,
   1850     0, 0, 0, 0, 0, 0, 0, 0,
   1851     0, 0, 0, 0, 0, 0, 0, 0,
   1852     0, 0, 0, 0, 0, 0, 0, 0,
   1853 
   1854     S, 0, 0, 0, 0, 0, 0, 0,
   1855     0, 0, 0, 0, 0, 0, 0, 0,
   1856     0, 0, 0, 0, 0, 0, 0, 0,
   1857     0, 0, 0, 0, 0, 0, 0, 0,
   1858 
   1859     0, 0, 0, 0, 0, 0, 0, 0,
   1860     0, 0, 0, 0, 0, 0, 0, 0,
   1861     0, 0, 0, 0, 0, 0, 0, 0,
   1862     0, 0, 0, 0, 0, 0, 0, 0,
   1863 
   1864     0, 0, 0, 0, 0, 0, 0, 0,
   1865     0, 0, 0, 0, 0, 0, 0, 0,
   1866     0, 0, 0, 0, 0, 0, 0, 0,
   1867     0, 0, 0, 0, 0, 0, 0, 0,
   1868 };
   1869 
   1870 #undef S
   1871 #undef D
   1872 #undef X
   1873 #undef U
   1874 #undef L
   1875 #undef _
   1876 #undef d
   1877 
   1878 /* code point ranges for Zs,Zl or Zp property */
   1879 static const uint16_t char_range_s[] = {
   1880     10,
   1881     0x0009, 0x000D + 1,
   1882     0x0020, 0x0020 + 1,
   1883     0x00A0, 0x00A0 + 1,
   1884     0x1680, 0x1680 + 1,
   1885     0x2000, 0x200A + 1,
   1886     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
   1887     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
   1888     0x2028, 0x2029 + 1,
   1889     0x202F, 0x202F + 1,
   1890     0x205F, 0x205F + 1,
   1891     0x3000, 0x3000 + 1,
   1892     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
   1893     0xFEFF, 0xFEFF + 1,
   1894 };
   1895 
   1896 BOOL lre_is_space_non_ascii(uint32_t c)
   1897 {
   1898     size_t i, n;
   1899 
   1900     n = countof(char_range_s);
   1901     for(i = 5; i < n; i += 2) {
   1902         uint32_t low = char_range_s[i];
   1903         uint32_t high = char_range_s[i + 1];
   1904         if (c < low)
   1905             return FALSE;
   1906         if (c < high)
   1907             return TRUE;
   1908     }
   1909     return FALSE;
   1910 }