quickjs-tart

quickjs-based runtime for wallet-core logic
Log | Files | Refs | README | LICENSE

unicode_gen.c (88975B)


      1 /*
      2  * Generation of Unicode tables
      3  *
      4  * Copyright (c) 2017-2018 Fabrice Bellard
      5  * Copyright (c) 2017-2018 Charlie Gordon
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a copy
      8  * of this software and associated documentation files (the "Software"), to deal
      9  * in the Software without restriction, including without limitation the rights
     10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     11  * copies of the Software, and to permit persons to whom the Software is
     12  * furnished to do so, subject to the following conditions:
     13  *
     14  * The above copyright notice and this permission notice shall be included in
     15  * all copies or substantial portions of the Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     23  * THE SOFTWARE.
     24  */
     25 #include <stdlib.h>
     26 #include <stdio.h>
     27 #include <stdarg.h>
     28 #include <inttypes.h>
     29 #include <string.h>
     30 #include <assert.h>
     31 #include <ctype.h>
     32 #include <time.h>
     33 
     34 #include "cutils.h"
     35 
     36 uint32_t total_tables;
     37 uint32_t total_table_bytes;
     38 uint32_t total_index;
     39 uint32_t total_index_bytes;
     40 
     41 /* define it to be able to test unicode.c */
     42 //#define USE_TEST
     43 /* profile tests */
     44 //#define PROFILE
     45 
     46 //#define DUMP_CASE_CONV_TABLE
     47 //#define DUMP_TABLE_SIZE
     48 //#define DUMP_CC_TABLE
     49 //#define DUMP_DECOMP_TABLE
     50 //#define DUMP_CASE_FOLDING_SPECIAL_CASES
     51 
     52 /* Ideas:
     53    - Generalize run length encoding + index for all tables
     54    - remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased
     55 
     56    Case conversion:
     57    - use a single entry for consecutive U/LF runs
     58    - allow EXT runs of length > 1
     59 
     60    Decomposition:
     61    - Greek lower case (+1f10/1f10) ?
     62    - allow holes in B runs
     63    - suppress more upper / lower case redundancy
     64 */
     65 
     66 #ifdef USE_TEST
     67 #include "libunicode.c"
     68 #endif
     69 
     70 #define CHARCODE_MAX 0x10ffff
     71 #define CC_LEN_MAX 3
     72 
     73 void *mallocz(size_t size)
     74 {
     75     void *ptr;
     76     ptr = malloc(size);
     77     memset(ptr, 0, size);
     78     return ptr;
     79 }
     80 
     81 const char *get_field(const char *p, int n)
     82 {
     83     int i;
     84     for(i = 0; i < n; i++) {
     85         while (*p != ';' && *p != '\0')
     86             p++;
     87         if (*p == '\0')
     88             return NULL;
     89         p++;
     90     }
     91     return p;
     92 }
     93 
     94 const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
     95 {
     96     char *q;
     97     p = get_field(p, n);
     98     q = buf;
     99     while (*p != ';' && *p != '\0') {
    100         if ((q - buf) < buf_size - 1)
    101             *q++ = *p;
    102         p++;
    103     }
    104     *q = '\0';
    105     return buf;
    106 }
    107 
    108 void add_char(int **pbuf, int *psize, int *plen, int c)
    109 {
    110     int len, size, *buf;
    111     buf = *pbuf;
    112     size = *psize;
    113     len = *plen;
    114     if (len >= size) {
    115         size = *psize;
    116         size = max_int(len + 1, size * 3 / 2);
    117         buf = realloc(buf, sizeof(buf[0]) * size);
    118         *pbuf = buf;
    119         *psize = size;
    120     }
    121     buf[len++] = c;
    122     *plen = len;
    123 }
    124 
    125 int *get_field_str(int *plen, const char *str, int n)
    126 {
    127     const char *p;
    128     int *buf, len, size;
    129     p = get_field(str, n);
    130     if (!p) {
    131         *plen = 0;
    132         return NULL;
    133     }
    134     len = 0;
    135     size = 0;
    136     buf = NULL;
    137     for(;;) {
    138         while (isspace(*p))
    139             p++;
    140         if (!isxdigit(*p))
    141             break;
    142         add_char(&buf, &size, &len, strtoul(p, (char **)&p, 16));
    143     }
    144     *plen = len;
    145     return buf;
    146 }
    147 
    148 char *get_line(char *buf, int buf_size, FILE *f)
    149 {
    150     int len;
    151     if (!fgets(buf, buf_size, f))
    152         return NULL;
    153     len = strlen(buf);
    154     if (len > 0 && buf[len - 1] == '\n')
    155         buf[len - 1] = '\0';
    156     return buf;
    157 }
    158 
    159 #define UNICODE_GENERAL_CATEGORY
    160 
    161 typedef enum {
    162 #define DEF(id, str) GCAT_ ## id,
    163 #include "unicode_gen_def.h"
    164 #undef DEF
    165     GCAT_COUNT,
    166 } UnicodeGCEnum1;
    167 
    168 static const char *unicode_gc_name[] = {
    169 #define DEF(id, str) #id,
    170 #include "unicode_gen_def.h"
    171 #undef DEF
    172 };
    173 
    174 static const char *unicode_gc_short_name[] = {
    175 #define DEF(id, str) str,
    176 #include "unicode_gen_def.h"
    177 #undef DEF
    178 };
    179 
    180 #undef UNICODE_GENERAL_CATEGORY
    181 
    182 #define UNICODE_SCRIPT
    183 
    184 typedef enum {
    185 #define DEF(id, str) SCRIPT_ ## id,
    186 #include "unicode_gen_def.h"
    187 #undef DEF
    188     SCRIPT_COUNT,
    189 } UnicodeScriptEnum1;
    190 
    191 static const char *unicode_script_name[] = {
    192 #define DEF(id, str) #id,
    193 #include "unicode_gen_def.h"
    194 #undef DEF
    195 };
    196 
    197 const char *unicode_script_short_name[] = {
    198 #define DEF(id, str) str,
    199 #include "unicode_gen_def.h"
    200 #undef DEF
    201 };
    202 
    203 #undef UNICODE_SCRIPT
    204 
    205 #define UNICODE_PROP_LIST
    206 
    207 typedef enum {
    208 #define DEF(id, str) PROP_ ## id,
    209 #include "unicode_gen_def.h"
    210 #undef DEF
    211     PROP_COUNT,
    212 } UnicodePropEnum1;
    213 
    214 static const char *unicode_prop_name[] = {
    215 #define DEF(id, str) #id,
    216 #include "unicode_gen_def.h"
    217 #undef DEF
    218 };
    219 
    220 static const char *unicode_prop_short_name[] = {
    221 #define DEF(id, str) str,
    222 #include "unicode_gen_def.h"
    223 #undef DEF
    224 };
    225 
    226 #undef UNICODE_PROP_LIST
    227 
    228 typedef struct {
    229     /* case conv */
    230     uint8_t u_len;
    231     uint8_t l_len;
    232     uint8_t f_len;
    233     int u_data[CC_LEN_MAX]; /* to upper case */
    234     int l_data[CC_LEN_MAX]; /* to lower case */
    235     int f_data[CC_LEN_MAX]; /* to case folding */
    236 
    237     uint8_t combining_class;
    238     uint8_t is_compat:1;
    239     uint8_t is_excluded:1;
    240     uint8_t general_category;
    241     uint8_t script;
    242     uint8_t script_ext_len;
    243     uint8_t *script_ext;
    244     uint32_t prop_bitmap_tab[3];
    245     /* decomposition */
    246     int decomp_len;
    247     int *decomp_data;
    248 } CCInfo;
    249 
    250 CCInfo *unicode_db;
    251 
    252 int find_name(const char **tab, int tab_len, const char *name)
    253 {
    254     int i, len, name_len;
    255     const char *p, *r;
    256 
    257     name_len = strlen(name);
    258     for(i = 0; i < tab_len; i++) {
    259         p = tab[i];
    260         for(;;) {
    261             r = strchr(p, ',');
    262             if (!r)
    263                 len = strlen(p);
    264             else
    265                 len = r - p;
    266             if (len == name_len && memcmp(p, name, len) == 0)
    267                 return i;
    268             if (!r)
    269                 break;
    270             p = r + 1;
    271         }
    272     }
    273     return -1;
    274 }
    275 
    276 static BOOL get_prop(uint32_t c, int prop_idx)
    277 {
    278     return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1;
    279 }
    280 
    281 static void set_prop(uint32_t c, int prop_idx, int val)
    282 {
    283     uint32_t mask;
    284     mask = 1U << (prop_idx & 0x1f);
    285     if (val)
    286         unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask;
    287     else
    288         unicode_db[c].prop_bitmap_tab[prop_idx >> 5]  &= ~mask;
    289 }
    290 
    291 void parse_unicode_data(const char *filename)
    292 {
    293     FILE *f;
    294     char line[1024];
    295     char buf1[256];
    296     const char *p;
    297     int code, lc, uc, last_code;
    298     CCInfo *ci, *tab = unicode_db;
    299 
    300     f = fopen(filename, "rb");
    301     if (!f) {
    302         perror(filename);
    303         exit(1);
    304     }
    305 
    306     last_code = 0;
    307     for(;;) {
    308         if (!get_line(line, sizeof(line), f))
    309             break;
    310         p = line;
    311         while (isspace(*p))
    312             p++;
    313         if (*p == '#')
    314             continue;
    315 
    316         p = get_field(line, 0);
    317         if (!p)
    318             continue;
    319         code = strtoul(p, NULL, 16);
    320         lc = 0;
    321         uc = 0;
    322 
    323         p = get_field(line, 12);
    324         if (p && *p != ';') {
    325             uc = strtoul(p, NULL, 16);
    326         }
    327 
    328         p = get_field(line, 13);
    329         if (p && *p != ';') {
    330             lc = strtoul(p, NULL, 16);
    331         }
    332         ci = &tab[code];
    333         if (uc > 0 || lc > 0) {
    334             assert(code <= CHARCODE_MAX);
    335             if (uc > 0) {
    336                 assert(ci->u_len == 0);
    337                 ci->u_len = 1;
    338                 ci->u_data[0] = uc;
    339             }
    340             if (lc > 0) {
    341                 assert(ci->l_len == 0);
    342                 ci->l_len = 1;
    343                 ci->l_data[0] = lc;
    344             }
    345         }
    346 
    347         {
    348             int i;
    349             get_field_buf(buf1, sizeof(buf1), line, 2);
    350             i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
    351             if (i < 0) {
    352                 fprintf(stderr, "General category '%s' not found\n",
    353                         buf1);
    354                 exit(1);
    355             }
    356             ci->general_category = i;
    357         }
    358 
    359         p = get_field(line, 3);
    360         if (p && *p != ';' && *p != '\0') {
    361             int cc;
    362             cc = strtoul(p, NULL, 0);
    363             if (cc != 0) {
    364                 assert(code <= CHARCODE_MAX);
    365                 ci->combining_class = cc;
    366                 //                printf("%05x: %d\n", code, ci->combining_class);
    367             }
    368         }
    369 
    370         p = get_field(line, 5);
    371         if (p && *p != ';' && *p != '\0') {
    372             int size;
    373             assert(code <= CHARCODE_MAX);
    374             ci->is_compat = 0;
    375             if (*p == '<') {
    376                 while (*p != '\0' && *p != '>')
    377                     p++;
    378                 if (*p == '>')
    379                     p++;
    380                 ci->is_compat = 1;
    381             }
    382             size = 0;
    383             for(;;) {
    384                 while (isspace(*p))
    385                     p++;
    386                 if (!isxdigit(*p))
    387                     break;
    388                 add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16));
    389             }
    390 #if 0
    391             {
    392                 int i;
    393                 static int count, d_count;
    394 
    395                 printf("%05x: %c", code, ci->is_compat ? 'C': ' ');
    396                 for(i = 0; i < ci->decomp_len; i++)
    397                     printf(" %05x", ci->decomp_data[i]);
    398                 printf("\n");
    399                 count++;
    400                 d_count += ci->decomp_len;
    401                 //                printf("%d %d\n", count, d_count);
    402             }
    403 #endif
    404         }
    405 
    406         p = get_field(line, 9);
    407         if (p && *p == 'Y') {
    408             set_prop(code, PROP_Bidi_Mirrored, 1);
    409         }
    410 
    411         /* handle ranges */
    412         get_field_buf(buf1, sizeof(buf1), line, 1);
    413         if (strstr(buf1, " Last>")) {
    414             int i;
    415             //            printf("range: 0x%x-%0x\n", last_code, code);
    416             assert(ci->decomp_len == 0);
    417             assert(ci->script_ext_len == 0);
    418             for(i = last_code + 1; i < code; i++) {
    419                 unicode_db[i] = *ci;
    420             }
    421         }
    422         last_code = code;
    423     }
    424 
    425     fclose(f);
    426 }
    427 
    428 void parse_special_casing(CCInfo *tab, const char *filename)
    429 {
    430     FILE *f;
    431     char line[1024];
    432     const char *p;
    433     int code;
    434     CCInfo *ci;
    435 
    436     f = fopen(filename, "rb");
    437     if (!f) {
    438         perror(filename);
    439         exit(1);
    440     }
    441 
    442     for(;;) {
    443         if (!get_line(line, sizeof(line), f))
    444             break;
    445         p = line;
    446         while (isspace(*p))
    447             p++;
    448         if (*p == '#')
    449             continue;
    450 
    451         p = get_field(line, 0);
    452         if (!p)
    453             continue;
    454         code = strtoul(p, NULL, 16);
    455         assert(code <= CHARCODE_MAX);
    456         ci = &tab[code];
    457 
    458         p = get_field(line, 4);
    459         if (p) {
    460             /* locale dependent casing */
    461             while (isspace(*p))
    462                 p++;
    463             if (*p != '#' && *p != '\0')
    464                 continue;
    465         }
    466 
    467 
    468         p = get_field(line, 1);
    469         if (p && *p != ';') {
    470             ci->l_len = 0;
    471             for(;;) {
    472                 while (isspace(*p))
    473                     p++;
    474                 if (*p == ';')
    475                     break;
    476                 assert(ci->l_len < CC_LEN_MAX);
    477                 ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16);
    478             }
    479 
    480             if (ci->l_len == 1 && ci->l_data[0] == code)
    481                 ci->l_len = 0;
    482         }
    483 
    484         p = get_field(line, 3);
    485         if (p && *p != ';') {
    486             ci->u_len = 0;
    487             for(;;) {
    488                 while (isspace(*p))
    489                     p++;
    490                 if (*p == ';')
    491                     break;
    492                 assert(ci->u_len < CC_LEN_MAX);
    493                 ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16);
    494             }
    495 
    496             if (ci->u_len == 1 && ci->u_data[0] == code)
    497                 ci->u_len = 0;
    498         }
    499     }
    500 
    501     fclose(f);
    502 }
    503 
    504 void parse_case_folding(CCInfo *tab, const char *filename)
    505 {
    506     FILE *f;
    507     char line[1024];
    508     const char *p;
    509     int code, status;
    510     CCInfo *ci;
    511 
    512     f = fopen(filename, "rb");
    513     if (!f) {
    514         perror(filename);
    515         exit(1);
    516     }
    517 
    518     for(;;) {
    519         if (!get_line(line, sizeof(line), f))
    520             break;
    521         p = line;
    522         while (isspace(*p))
    523             p++;
    524         if (*p == '#')
    525             continue;
    526 
    527         p = get_field(line, 0);
    528         if (!p)
    529             continue;
    530         code = strtoul(p, NULL, 16);
    531         assert(code <= CHARCODE_MAX);
    532         ci = &tab[code];
    533 
    534         p = get_field(line, 1);
    535         if (!p)
    536             continue;
    537         /* locale dependent casing */
    538         while (isspace(*p))
    539             p++;
    540         status = *p;
    541         if (status != 'C' && status != 'S' && status != 'F')
    542             continue;
    543 
    544         p = get_field(line, 2);
    545         assert(p != NULL);
    546         if (status == 'S') {
    547             /* we always select the simple case folding and assume it
    548              * comes after the full case folding case */
    549             assert(ci->f_len >= 2);
    550             ci->f_len = 0;
    551         } else {
    552             assert(ci->f_len == 0);
    553         }
    554         for(;;) {
    555             while (isspace(*p))
    556                 p++;
    557             if (*p == ';')
    558                 break;
    559             assert(ci->l_len < CC_LEN_MAX);
    560             ci->f_data[ci->f_len++] = strtoul(p, (char **)&p, 16);
    561         }
    562     }
    563 
    564     fclose(f);
    565 }
    566 
    567 void parse_composition_exclusions(const char *filename)
    568 {
    569     FILE *f;
    570     char line[4096], *p;
    571     uint32_t c0;
    572 
    573     f = fopen(filename, "rb");
    574     if (!f) {
    575         perror(filename);
    576         exit(1);
    577     }
    578 
    579     for(;;) {
    580         if (!get_line(line, sizeof(line), f))
    581             break;
    582         p = line;
    583         while (isspace(*p))
    584             p++;
    585         if (*p == '#' || *p == '@' || *p == '\0')
    586             continue;
    587         c0 = strtoul(p, (char **)&p, 16);
    588         assert(c0 > 0 && c0 <= CHARCODE_MAX);
    589         unicode_db[c0].is_excluded = TRUE;
    590     }
    591     fclose(f);
    592 }
    593 
    594 void parse_derived_core_properties(const char *filename)
    595 {
    596     FILE *f;
    597     char line[4096], *p, buf[256], *q;
    598     uint32_t c0, c1, c;
    599     int i;
    600 
    601     f = fopen(filename, "rb");
    602     if (!f) {
    603         perror(filename);
    604         exit(1);
    605     }
    606 
    607     for(;;) {
    608         if (!get_line(line, sizeof(line), f))
    609             break;
    610         p = line;
    611         while (isspace(*p))
    612             p++;
    613         if (*p == '#' || *p == '@' || *p == '\0')
    614             continue;
    615         c0 = strtoul(p, (char **)&p, 16);
    616         if (*p == '.' && p[1] == '.') {
    617             p += 2;
    618             c1 = strtoul(p, (char **)&p, 16);
    619         } else {
    620             c1 = c0;
    621         }
    622         assert(c1 <= CHARCODE_MAX);
    623         p += strspn(p, " \t");
    624         if (*p == ';') {
    625             p++;
    626             p += strspn(p, " \t");
    627             q = buf;
    628             while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
    629                 if ((q - buf) < sizeof(buf) - 1)
    630                     *q++ = *p;
    631                 p++;
    632             }
    633             *q = '\0';
    634             i = find_name(unicode_prop_name,
    635                           countof(unicode_prop_name), buf);
    636             if (i < 0) {
    637                 if (!strcmp(buf, "Grapheme_Link"))
    638                     goto next;
    639                 fprintf(stderr, "Property not found: %s\n", buf);
    640                 exit(1);
    641             }
    642             for(c = c0; c <= c1; c++) {
    643                 set_prop(c, i, 1);
    644             }
    645 next: ;
    646         }
    647     }
    648     fclose(f);
    649 }
    650 
    651 void parse_derived_norm_properties(const char *filename)
    652 {
    653     FILE *f;
    654     char line[4096], *p, buf[256], *q;
    655     uint32_t c0, c1, c;
    656 
    657     f = fopen(filename, "rb");
    658     if (!f) {
    659         perror(filename);
    660         exit(1);
    661     }
    662 
    663     for(;;) {
    664         if (!get_line(line, sizeof(line), f))
    665             break;
    666         p = line;
    667         while (isspace(*p))
    668             p++;
    669         if (*p == '#' || *p == '@' || *p == '\0')
    670             continue;
    671         c0 = strtoul(p, (char **)&p, 16);
    672         if (*p == '.' && p[1] == '.') {
    673             p += 2;
    674             c1 = strtoul(p, (char **)&p, 16);
    675         } else {
    676             c1 = c0;
    677         }
    678         assert(c1 <= CHARCODE_MAX);
    679         p += strspn(p, " \t");
    680         if (*p == ';') {
    681             p++;
    682             p += strspn(p, " \t");
    683             q = buf;
    684             while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
    685                 if ((q - buf) < sizeof(buf) - 1)
    686                     *q++ = *p;
    687                 p++;
    688             }
    689             *q = '\0';
    690             if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) {
    691                 for(c = c0; c <= c1; c++) {
    692                     set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
    693                 }
    694             }
    695         }
    696     }
    697     fclose(f);
    698 }
    699 
    700 void parse_prop_list(const char *filename)
    701 {
    702     FILE *f;
    703     char line[4096], *p, buf[256], *q;
    704     uint32_t c0, c1, c;
    705     int i;
    706 
    707     f = fopen(filename, "rb");
    708     if (!f) {
    709         perror(filename);
    710         exit(1);
    711     }
    712 
    713     for(;;) {
    714         if (!get_line(line, sizeof(line), f))
    715             break;
    716         p = line;
    717         while (isspace(*p))
    718             p++;
    719         if (*p == '#' || *p == '@' || *p == '\0')
    720             continue;
    721         c0 = strtoul(p, (char **)&p, 16);
    722         if (*p == '.' && p[1] == '.') {
    723             p += 2;
    724             c1 = strtoul(p, (char **)&p, 16);
    725         } else {
    726             c1 = c0;
    727         }
    728         assert(c1 <= CHARCODE_MAX);
    729         p += strspn(p, " \t");
    730         if (*p == ';') {
    731             p++;
    732             p += strspn(p, " \t");
    733             q = buf;
    734             while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
    735                 if ((q - buf) < sizeof(buf) - 1)
    736                     *q++ = *p;
    737                 p++;
    738             }
    739             *q = '\0';
    740             i = find_name(unicode_prop_name,
    741                           countof(unicode_prop_name), buf);
    742             if (i < 0) {
    743                 fprintf(stderr, "Property not found: %s\n", buf);
    744                 exit(1);
    745             }
    746             for(c = c0; c <= c1; c++) {
    747                 set_prop(c, i, 1);
    748             }
    749         }
    750     }
    751     fclose(f);
    752 }
    753 
    754 void parse_scripts(const char *filename)
    755 {
    756     FILE *f;
    757     char line[4096], *p, buf[256], *q;
    758     uint32_t c0, c1, c;
    759     int i;
    760 
    761     f = fopen(filename, "rb");
    762     if (!f) {
    763         perror(filename);
    764         exit(1);
    765     }
    766 
    767     for(;;) {
    768         if (!get_line(line, sizeof(line), f))
    769             break;
    770         p = line;
    771         while (isspace(*p))
    772             p++;
    773         if (*p == '#' || *p == '@' || *p == '\0')
    774             continue;
    775         c0 = strtoul(p, (char **)&p, 16);
    776         if (*p == '.' && p[1] == '.') {
    777             p += 2;
    778             c1 = strtoul(p, (char **)&p, 16);
    779         } else {
    780             c1 = c0;
    781         }
    782         assert(c1 <= CHARCODE_MAX);
    783         p += strspn(p, " \t");
    784         if (*p == ';') {
    785             p++;
    786             p += strspn(p, " \t");
    787             q = buf;
    788             while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
    789                 if ((q - buf) < sizeof(buf) - 1)
    790                     *q++ = *p;
    791                 p++;
    792             }
    793             *q = '\0';
    794             i = find_name(unicode_script_name,
    795                           countof(unicode_script_name), buf);
    796             if (i < 0) {
    797                 fprintf(stderr, "Unknown script: '%s'\n", buf);
    798                 exit(1);
    799             }
    800             for(c = c0; c <= c1; c++)
    801                 unicode_db[c].script = i;
    802         }
    803     }
    804     fclose(f);
    805 }
    806 
    807 void parse_script_extensions(const char *filename)
    808 {
    809     FILE *f;
    810     char line[4096], *p, buf[256], *q;
    811     uint32_t c0, c1, c;
    812     int i;
    813     uint8_t script_ext[255];
    814     int script_ext_len;
    815 
    816     f = fopen(filename, "rb");
    817     if (!f) {
    818         perror(filename);
    819         exit(1);
    820     }
    821 
    822     for(;;) {
    823         if (!get_line(line, sizeof(line), f))
    824             break;
    825         p = line;
    826         while (isspace(*p))
    827             p++;
    828         if (*p == '#' || *p == '@' || *p == '\0')
    829             continue;
    830         c0 = strtoul(p, (char **)&p, 16);
    831         if (*p == '.' && p[1] == '.') {
    832             p += 2;
    833             c1 = strtoul(p, (char **)&p, 16);
    834         } else {
    835             c1 = c0;
    836         }
    837         assert(c1 <= CHARCODE_MAX);
    838         p += strspn(p, " \t");
    839         script_ext_len = 0;
    840         if (*p == ';') {
    841             p++;
    842             for(;;) {
    843                 p += strspn(p, " \t");
    844                 q = buf;
    845                 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
    846                     if ((q - buf) < sizeof(buf) - 1)
    847                         *q++ = *p;
    848                     p++;
    849                 }
    850                 *q = '\0';
    851                 if (buf[0] == '\0')
    852                     break;
    853                 i = find_name(unicode_script_short_name,
    854                               countof(unicode_script_short_name), buf);
    855                 if (i < 0) {
    856                     fprintf(stderr, "Script not found: %s\n", buf);
    857                     exit(1);
    858                 }
    859                 assert(script_ext_len < sizeof(script_ext));
    860                 script_ext[script_ext_len++] = i;
    861             }
    862             for(c = c0; c <= c1; c++) {
    863                 CCInfo *ci = &unicode_db[c];
    864                 ci->script_ext_len = script_ext_len;
    865                 ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
    866                 for(i = 0; i < script_ext_len; i++)
    867                     ci->script_ext[i] = script_ext[i];
    868             }
    869         }
    870     }
    871     fclose(f);
    872 }
    873 
    874 void dump_cc_info(CCInfo *ci, int i)
    875 {
    876     int j;
    877     printf("%05x:", i);
    878     if (ci->u_len != 0) {
    879         printf(" U:");
    880         for(j = 0; j < ci->u_len; j++)
    881             printf(" %05x", ci->u_data[j]);
    882     }
    883     if (ci->l_len != 0) {
    884         printf(" L:");
    885         for(j = 0; j < ci->l_len; j++)
    886             printf(" %05x", ci->l_data[j]);
    887     }
    888     if (ci->f_len != 0) {
    889         printf(" F:");
    890         for(j = 0; j < ci->f_len; j++)
    891             printf(" %05x", ci->f_data[j]);
    892     }
    893     printf("\n");
    894 }
    895 
    896 void dump_unicode_data(CCInfo *tab)
    897 {
    898     int i;
    899     CCInfo *ci;
    900     for(i = 0; i <= CHARCODE_MAX; i++) {
    901         ci = &tab[i];
    902         if (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0) {
    903             dump_cc_info(ci, i);
    904         }
    905     }
    906 }
    907 
    908 BOOL is_complicated_case(const CCInfo *ci)
    909 {
    910     return (ci->u_len > 1 || ci->l_len > 1 ||
    911             (ci->u_len > 0 && ci->l_len > 0) ||
    912             (ci->f_len != ci->l_len) ||
    913             (memcmp(ci->f_data, ci->l_data, ci->f_len * sizeof(ci->f_data[0])) != 0));
    914 }
    915 
    916 #ifndef USE_TEST
    917 enum {
    918     RUN_TYPE_U,
    919     RUN_TYPE_L,
    920     RUN_TYPE_UF,
    921     RUN_TYPE_LF,
    922     RUN_TYPE_UL,
    923     RUN_TYPE_LSU,
    924     RUN_TYPE_U2L_399_EXT2,
    925     RUN_TYPE_UF_D20,
    926     RUN_TYPE_UF_D1_EXT,
    927     RUN_TYPE_U_EXT,
    928     RUN_TYPE_LF_EXT,
    929     RUN_TYPE_UF_EXT2,
    930     RUN_TYPE_LF_EXT2,
    931     RUN_TYPE_UF_EXT3,
    932 };
    933 #endif
    934 
    935 const char *run_type_str[] = {
    936     "U",
    937     "L",
    938     "UF",
    939     "LF",
    940     "UL",
    941     "LSU",
    942     "U2L_399_EXT2",
    943     "UF_D20",
    944     "UF_D1_EXT",
    945     "U_EXT",
    946     "LF_EXT",
    947     "UF_EXT2",
    948     "LF_EXT2",
    949     "UF_EXT3",
    950 };
    951 
    952 typedef struct {
    953     int code;
    954     int len;
    955     int type;
    956     int data;
    957     int ext_len;
    958     int ext_data[3];
    959     int data_index; /* 'data' coming from the table */
    960 } TableEntry;
    961 
    962 static int simple_to_lower(CCInfo *tab, int c)
    963 {
    964     if (tab[c].l_len != 1)
    965         return c;
    966     return tab[c].l_data[0];
    967 }
    968 
    969 /* code (17), len (7), type (4) */
    970 
    971 void find_run_type(TableEntry *te, CCInfo *tab, int code)
    972 {
    973     int is_lower, len;
    974     CCInfo *ci, *ci1, *ci2;
    975 
    976     ci = &tab[code];
    977     ci1 = &tab[code + 1];
    978     ci2 = &tab[code + 2];
    979     te->code = code;
    980 
    981     if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
    982         ci->f_len == 1 && ci->f_data[0] == ci->l_data[0] &&
    983         ci->u_len == 0 &&
    984 
    985         ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
    986         ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0] &&
    987         ci1->u_len == 1 && ci1->u_data[0] == code &&
    988 
    989         ci2->l_len == 0 &&
    990         ci2->f_len == 0 &&
    991         ci2->u_len == 1 && ci2->u_data[0] == code) {
    992         te->len = 3;
    993         te->data = 0;
    994         te->type = RUN_TYPE_LSU;
    995         return;
    996     }
    997 
    998     if (is_complicated_case(ci)) {
    999         len = 1;
   1000         while (code + len <= CHARCODE_MAX) {
   1001             ci1 = &tab[code + len];
   1002             if (ci1->u_len != 1 ||
   1003                 ci1->u_data[0] != ci->u_data[0] + len ||
   1004                 ci1->l_len != 0 ||
   1005                 ci1->f_len != 1 || ci1->f_data[0] != ci1->u_data[0])
   1006                 break;
   1007             len++;
   1008         }
   1009         if (len > 1) {
   1010             te->len = len;
   1011             te->type = RUN_TYPE_UF;
   1012             te->data = ci->u_data[0];
   1013             return;
   1014         }
   1015 
   1016         if (ci->l_len == 0 &&
   1017             ci->u_len == 2 && ci->u_data[1] == 0x399 &&
   1018             ci->f_len == 2 && ci->f_data[1] == 0x3B9 &&
   1019             ci->f_data[0] == simple_to_lower(tab, ci->u_data[0])) {
   1020             len = 1;
   1021             while (code + len <= CHARCODE_MAX) {
   1022                 ci1 = &tab[code + len];
   1023                 if (!(ci1->u_len == 2 &&
   1024                       ci1->u_data[1] == ci->u_data[1] &&
   1025                       ci1->u_data[0] == ci->u_data[0] + len &&
   1026                       ci1->f_len == 2 &&
   1027                       ci1->f_data[1] == ci->f_data[1] &&
   1028                       ci1->f_data[0] == ci->f_data[0] + len &&
   1029                       ci1->l_len == 0))
   1030                     break;
   1031                 len++;
   1032             }
   1033             te->len = len;
   1034             te->type = RUN_TYPE_UF_EXT2;
   1035             te->ext_data[0] = ci->u_data[0];
   1036             te->ext_data[1] = ci->u_data[1];
   1037             te->ext_len = 2;
   1038             return;
   1039         }
   1040 
   1041         if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
   1042             ci->l_len == 1 &&
   1043             ci->f_len == 1 && ci->f_data[0] == ci->l_data[0]) {
   1044             len = 1;
   1045             while (code + len <= CHARCODE_MAX) {
   1046                 ci1 = &tab[code + len];
   1047                 if (!(ci1->u_len == 2 &&
   1048                       ci1->u_data[1] == 0x399 &&
   1049                       ci1->u_data[0] == ci->u_data[0] + len &&
   1050                       ci1->l_len == 1 &&
   1051                       ci1->l_data[0] == ci->l_data[0] + len &&
   1052                       ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0]))
   1053                     break;
   1054                 len++;
   1055             }
   1056             te->len = len;
   1057             te->type = RUN_TYPE_U2L_399_EXT2;
   1058             te->ext_data[0] = ci->u_data[0];
   1059             te->ext_data[1] = ci->l_data[0];
   1060             te->ext_len = 2;
   1061             return;
   1062         }
   1063 
   1064         if (ci->l_len == 1 && ci->u_len == 0 && ci->f_len == 0) {
   1065             len = 1;
   1066             while (code + len <= CHARCODE_MAX) {
   1067                 ci1 = &tab[code + len];
   1068                 if (!(ci1->l_len == 1 &&
   1069                       ci1->l_data[0] == ci->l_data[0] + len &&
   1070                       ci1->u_len == 0 && ci1->f_len == 0))
   1071                     break;
   1072                 len++;
   1073             }
   1074             te->len = len;
   1075             te->type = RUN_TYPE_L;
   1076             te->data = ci->l_data[0];
   1077             return;
   1078         }
   1079 
   1080         if (ci->l_len == 0 &&
   1081             ci->u_len == 1 &&
   1082             ci->u_data[0] < 0x1000 &&
   1083             ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 0x20) {
   1084             te->len = 1;
   1085             te->type = RUN_TYPE_UF_D20;
   1086             te->data = ci->u_data[0];
   1087         } else if (ci->l_len == 0 &&
   1088                    ci->u_len == 1 &&
   1089                    ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 1) {
   1090             te->len = 1;
   1091             te->type = RUN_TYPE_UF_D1_EXT;
   1092             te->ext_data[0] = ci->u_data[0];
   1093             te->ext_len = 1;
   1094         } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_len == 2 &&
   1095                    ci->l_data[0] == ci->f_data[0] &&
   1096                    ci->l_data[1] == ci->f_data[1]) {
   1097             te->len = 1;
   1098             te->type = RUN_TYPE_LF_EXT2;
   1099             te->ext_data[0] = ci->l_data[0];
   1100             te->ext_data[1] = ci->l_data[1];
   1101             te->ext_len = 2;
   1102         } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 2 &&
   1103                    ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
   1104                    ci->f_data[1] == simple_to_lower(tab, ci->u_data[1])) {
   1105             te->len = 1;
   1106             te->type = RUN_TYPE_UF_EXT2;
   1107             te->ext_data[0] = ci->u_data[0];
   1108             te->ext_data[1] = ci->u_data[1];
   1109             te->ext_len = 2;
   1110         } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 3 &&
   1111                    ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
   1112                    ci->f_data[1] == simple_to_lower(tab, ci->u_data[1]) &&
   1113                    ci->f_data[2] == simple_to_lower(tab, ci->u_data[2])) {
   1114             te->len = 1;
   1115             te->type = RUN_TYPE_UF_EXT3;
   1116             te->ext_data[0] = ci->u_data[0];
   1117             te->ext_data[1] = ci->u_data[1];
   1118             te->ext_data[2] = ci->u_data[2];
   1119             te->ext_len = 3;
   1120         } else {
   1121             printf("unsupported encoding case:\n");
   1122             dump_cc_info(ci, code);
   1123             abort();
   1124         }
   1125     } else {
   1126         /* look for a run of identical conversions */
   1127         len = 0;
   1128         for(;;) {
   1129             if (code >= CHARCODE_MAX || len >= 126)
   1130                 break;
   1131             ci = &tab[code + len];
   1132             ci1 = &tab[code + len + 1];
   1133             if (is_complicated_case(ci) || is_complicated_case(ci1)) {
   1134                 break;
   1135             }
   1136             if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
   1137                 break;
   1138             if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
   1139                 break;
   1140             len += 2;
   1141         }
   1142         if (len > 0) {
   1143             te->len = len;
   1144             te->type = RUN_TYPE_UL;
   1145             te->data = 0;
   1146             return;
   1147         }
   1148 
   1149         ci = &tab[code];
   1150         is_lower = ci->l_len > 0;
   1151         len = 1;
   1152         while (code + len <= CHARCODE_MAX) {
   1153             ci1 = &tab[code + len];
   1154             if (is_complicated_case(ci1))
   1155                 break;
   1156             if (is_lower) {
   1157                 if (ci1->l_len != 1 ||
   1158                     ci1->l_data[0] != ci->l_data[0] + len)
   1159                     break;
   1160             } else {
   1161                 if (ci1->u_len != 1 ||
   1162                     ci1->u_data[0] != ci->u_data[0] + len)
   1163                     break;
   1164             }
   1165             len++;
   1166         }
   1167         te->len = len;
   1168         if (is_lower) {
   1169             te->type = RUN_TYPE_LF;
   1170             te->data = ci->l_data[0];
   1171         } else {
   1172             te->type = RUN_TYPE_U;
   1173             te->data = ci->u_data[0];
   1174         }
   1175     }
   1176 }
   1177 
   1178 TableEntry conv_table[1000];
   1179 int conv_table_len;
   1180 int ext_data[1000];
   1181 int ext_data_len;
   1182 
   1183 void dump_case_conv_table1(void)
   1184 {
   1185     int i, j;
   1186     const TableEntry *te;
   1187 
   1188     for(i = 0; i < conv_table_len; i++) {
   1189         te = &conv_table[i];
   1190         printf("%05x %02x %-10s %05x",
   1191                te->code, te->len, run_type_str[te->type], te->data);
   1192         for(j = 0; j < te->ext_len; j++) {
   1193             printf(" %05x", te->ext_data[j]);
   1194         }
   1195         printf("\n");
   1196     }
   1197     printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len);
   1198 }
   1199 
   1200 int find_data_index(const TableEntry *conv_table, int len, int data)
   1201 {
   1202     int i;
   1203     const TableEntry *te;
   1204     for(i = 0; i < len; i++) {
   1205         te = &conv_table[i];
   1206         if (te->code == data)
   1207             return i;
   1208     }
   1209     return -1;
   1210 }
   1211 
   1212 int find_ext_data_index(int data)
   1213 {
   1214     int i;
   1215     for(i = 0; i < ext_data_len; i++) {
   1216         if (ext_data[i] == data)
   1217             return i;
   1218     }
   1219     assert(ext_data_len < countof(ext_data));
   1220     ext_data[ext_data_len++] = data;
   1221     return ext_data_len - 1;
   1222 }
   1223 
   1224 void build_conv_table(CCInfo *tab)
   1225 {
   1226     int code, i, j;
   1227     CCInfo *ci;
   1228     TableEntry *te;
   1229 
   1230     te = conv_table;
   1231     for(code = 0; code <= CHARCODE_MAX; code++) {
   1232         ci = &tab[code];
   1233         if (ci->u_len == 0 && ci->l_len == 0 && ci->f_len == 0)
   1234             continue;
   1235         assert(te - conv_table < countof(conv_table));
   1236         find_run_type(te, tab, code);
   1237 #if 0
   1238         if (te->type == RUN_TYPE_TODO) {
   1239             printf("TODO: ");
   1240             dump_cc_info(ci, code);
   1241         }
   1242 #endif
   1243         assert(te->len <= 127);
   1244         code += te->len - 1;
   1245         te++;
   1246     }
   1247     conv_table_len = te - conv_table;
   1248 
   1249     /* find the data index */
   1250     for(i = 0; i < conv_table_len; i++) {
   1251         int data_index;
   1252         te = &conv_table[i];
   1253 
   1254         switch(te->type) {
   1255         case RUN_TYPE_U:
   1256         case RUN_TYPE_L:
   1257         case RUN_TYPE_UF:
   1258         case RUN_TYPE_LF:
   1259             data_index = find_data_index(conv_table, conv_table_len, te->data);
   1260             if (data_index < 0) {
   1261                 switch(te->type) {
   1262                 case RUN_TYPE_U:
   1263                     te->type = RUN_TYPE_U_EXT;
   1264                     te->ext_len = 1;
   1265                     te->ext_data[0] = te->data;
   1266                     break;
   1267                 case RUN_TYPE_LF:
   1268                     te->type = RUN_TYPE_LF_EXT;
   1269                     te->ext_len = 1;
   1270                     te->ext_data[0] = te->data;
   1271                     break;
   1272                 default:
   1273                     printf("%05x: index not found\n", te->code);
   1274                     exit(1);
   1275                 }
   1276             } else {
   1277                 te->data_index = data_index;
   1278             }
   1279             break;
   1280         case RUN_TYPE_UF_D20:
   1281             te->data_index = te->data;
   1282             break;
   1283         }
   1284     }
   1285 
   1286     /* find the data index for ext_data */
   1287     for(i = 0; i < conv_table_len; i++) {
   1288         te = &conv_table[i];
   1289         if (te->type == RUN_TYPE_UF_EXT3) {
   1290             int p, v;
   1291             v = 0;
   1292             for(j = 0; j < 3; j++) {
   1293                 p = find_ext_data_index(te->ext_data[j]);
   1294                 assert(p < 16);
   1295                 v = (v << 4) | p;
   1296             }
   1297             te->data_index = v;
   1298         }
   1299     }
   1300 
   1301     for(i = 0; i < conv_table_len; i++) {
   1302         te = &conv_table[i];
   1303         if (te->type == RUN_TYPE_LF_EXT2 ||
   1304             te->type == RUN_TYPE_UF_EXT2 ||
   1305             te->type == RUN_TYPE_U2L_399_EXT2) {
   1306             int p, v;
   1307             v = 0;
   1308             for(j = 0; j < 2; j++) {
   1309                 p = find_ext_data_index(te->ext_data[j]);
   1310                 assert(p < 64);
   1311                 v = (v << 6) | p;
   1312             }
   1313             te->data_index = v;
   1314         }
   1315     }
   1316 
   1317     for(i = 0; i < conv_table_len; i++) {
   1318         te = &conv_table[i];
   1319         if (te->type == RUN_TYPE_UF_D1_EXT ||
   1320             te->type == RUN_TYPE_U_EXT ||
   1321             te->type == RUN_TYPE_LF_EXT) {
   1322             te->data_index = find_ext_data_index(te->ext_data[0]);
   1323         }
   1324     }
   1325 #ifdef DUMP_CASE_CONV_TABLE
   1326     dump_case_conv_table1();
   1327 #endif
   1328 }
   1329 
   1330 void dump_case_conv_table(FILE *f)
   1331 {
   1332     int i;
   1333     uint32_t v;
   1334     const TableEntry *te;
   1335 
   1336     total_tables++;
   1337     total_table_bytes += conv_table_len * sizeof(uint32_t);
   1338     fprintf(f, "static const uint32_t case_conv_table1[%d] = {", conv_table_len);
   1339     for(i = 0; i < conv_table_len; i++) {
   1340         if (i % 4 == 0)
   1341             fprintf(f, "\n   ");
   1342         te = &conv_table[i];
   1343         v = te->code << (32 - 17);
   1344         v |= te->len << (32 - 17 - 7);
   1345         v |= te->type << (32 - 17 - 7 - 4);
   1346         v |= te->data_index >> 8;
   1347         fprintf(f, " 0x%08x,", v);
   1348     }
   1349     fprintf(f, "\n};\n\n");
   1350 
   1351     total_tables++;
   1352     total_table_bytes += conv_table_len;
   1353     fprintf(f, "static const uint8_t case_conv_table2[%d] = {", conv_table_len);
   1354     for(i = 0; i < conv_table_len; i++) {
   1355         if (i % 8 == 0)
   1356             fprintf(f, "\n   ");
   1357         te = &conv_table[i];
   1358         fprintf(f, " 0x%02x,", te->data_index & 0xff);
   1359     }
   1360     fprintf(f, "\n};\n\n");
   1361 
   1362     total_tables++;
   1363     total_table_bytes += ext_data_len * sizeof(uint16_t);
   1364     fprintf(f, "static const uint16_t case_conv_ext[%d] = {", ext_data_len);
   1365     for(i = 0; i < ext_data_len; i++) {
   1366         if (i % 8 == 0)
   1367             fprintf(f, "\n   ");
   1368         fprintf(f, " 0x%04x,", ext_data[i]);
   1369     }
   1370     fprintf(f, "\n};\n\n");
   1371 }
   1372 
   1373 
   1374 static CCInfo *global_tab;
   1375 
   1376 static int sp_cc_cmp(const void *p1, const void *p2)
   1377 {
   1378     CCInfo *c1 = &global_tab[*(const int *)p1];
   1379     CCInfo *c2 = &global_tab[*(const int *)p2];
   1380     if (c1->f_len < c2->f_len) {
   1381         return -1;
   1382     } else if (c2->f_len < c1->f_len) {
   1383         return 1;
   1384     } else {
   1385         return memcmp(c1->f_data, c2->f_data, sizeof(c1->f_data[0]) * c1->f_len);
   1386     }
   1387 }
   1388 
   1389 /* dump the case special cases (multi character results which are
   1390    identical and need specific handling in lre_canonicalize() */
   1391 void dump_case_folding_special_cases(CCInfo *tab)
   1392 {
   1393     int i, len, j;
   1394     int *perm;
   1395 
   1396     perm = malloc(sizeof(perm[0]) * (CHARCODE_MAX + 1));
   1397     for(i = 0; i <= CHARCODE_MAX; i++)
   1398         perm[i] = i;
   1399     global_tab = tab;
   1400     qsort(perm, CHARCODE_MAX + 1, sizeof(perm[0]), sp_cc_cmp);
   1401     for(i = 0; i <= CHARCODE_MAX;) {
   1402         if (tab[perm[i]].f_len <= 1) {
   1403             i++;
   1404         } else {
   1405             len = 1;
   1406             while ((i + len) <= CHARCODE_MAX && !sp_cc_cmp(&perm[i], &perm[i + len]))
   1407                 len++;
   1408 
   1409             if (len > 1) {
   1410                 for(j = i; j < i + len; j++)
   1411                     dump_cc_info(&tab[perm[j]], perm[j]);
   1412             }
   1413             i += len;
   1414         }
   1415     }
   1416     free(perm);
   1417     global_tab = NULL;
   1418 }
   1419 
   1420 
   1421 int tabcmp(const int *tab1, const int *tab2, int n)
   1422 {
   1423     int i;
   1424     for(i = 0; i < n; i++) {
   1425         if (tab1[i] != tab2[i])
   1426             return -1;
   1427     }
   1428     return 0;
   1429 }
   1430 
   1431 void dump_str(const char *str, const int *buf, int len)
   1432 {
   1433     int i;
   1434     printf("%s=", str);
   1435     for(i = 0; i < len; i++)
   1436         printf(" %05x", buf[i]);
   1437     printf("\n");
   1438 }
   1439 
   1440 void compute_internal_props(void)
   1441 {
   1442     int i;
   1443     BOOL has_ul;
   1444 
   1445     for(i = 0; i <= CHARCODE_MAX; i++) {
   1446         CCInfo *ci = &unicode_db[i];
   1447         has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0);
   1448         if (has_ul) {
   1449             assert(get_prop(i, PROP_Cased));
   1450         } else {
   1451             set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased));
   1452         }
   1453         set_prop(i, PROP_ID_Continue1,
   1454                  get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1));
   1455         set_prop(i, PROP_XID_Start1,
   1456                  get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start));
   1457         set_prop(i, PROP_XID_Continue1,
   1458                  get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue));
   1459         set_prop(i, PROP_Changes_When_Titlecased1,
   1460                  get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0));
   1461         set_prop(i, PROP_Changes_When_Casefolded1,
   1462                  get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_len != 0));
   1463         /* XXX: reduce table size (438 bytes) */
   1464         set_prop(i, PROP_Changes_When_NFKC_Casefolded1,
   1465                  get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_len != 0));
   1466 #if 0
   1467         /* TEST */
   1468 #define M(x) (1U << GCAT_ ## x)
   1469         {
   1470             int b;
   1471             b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >>
   1472                  unicode_db[i].general_category) & 1;
   1473             set_prop(i, PROP_Cased1,
   1474                      get_prop(i, PROP_Case_Ignorable) ^ b);
   1475         }
   1476 #undef M
   1477 #endif
   1478     }
   1479 }
   1480 
   1481 void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
   1482 {
   1483     int i;
   1484 
   1485     total_tables++;
   1486     total_table_bytes += len;
   1487     fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
   1488     for(i = 0; i < len; i++) {
   1489         if (i % 8 == 0)
   1490             fprintf(f, "\n   ");
   1491         fprintf(f, " 0x%02x,", tab[i]);
   1492     }
   1493     fprintf(f, "\n};\n\n");
   1494 }
   1495 
   1496 void dump_index_table(FILE *f, const char *cname, const uint8_t *tab, int len)
   1497 {
   1498     int i, code, offset;
   1499 
   1500     total_index++;
   1501     total_index_bytes += len;
   1502     fprintf(f, "static const uint8_t %s[%d] = {\n", cname, len);
   1503     for(i = 0; i < len; i += 3) {
   1504         code = tab[i] + (tab[i+1] << 8) + ((tab[i+2] & 0x1f) << 16);
   1505         offset = ((i / 3) + 1) * 32 + (tab[i+2] >> 5);
   1506         fprintf(f, "    0x%02x, 0x%02x, 0x%02x,", tab[i], tab[i+1], tab[i+2]);
   1507         fprintf(f, "  // %6.5X at %d%s\n", code, offset,
   1508                 i == len - 3 ? " (upper bound)" : "");
   1509     }
   1510     fprintf(f, "};\n\n");
   1511 }
   1512 
   1513 #define PROP_BLOCK_LEN 32
   1514 
   1515 void build_prop_table(FILE *f, const char *name, int prop_index, BOOL add_index)
   1516 {
   1517     int i, j, n, v, offset, code;
   1518     DynBuf dbuf_s, *dbuf = &dbuf_s;
   1519     DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
   1520     DynBuf dbuf2_s, *dbuf2 = &dbuf2_s;
   1521     const uint32_t *buf;
   1522     int buf_len, block_end_pos, bit;
   1523     char cname[128];
   1524 
   1525     dbuf_init(dbuf1);
   1526 
   1527     for(i = 0; i <= CHARCODE_MAX;) {
   1528         v = get_prop(i, prop_index);
   1529         j = i + 1;
   1530         while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) {
   1531             j++;
   1532         }
   1533         n = j - i;
   1534         if (j == (CHARCODE_MAX + 1) && v == 0)
   1535             break; /* no need to encode last zero run */
   1536         //printf("%05x: %d %d\n", i, n, v);
   1537         dbuf_put_u32(dbuf1, n - 1);
   1538         i += n;
   1539     }
   1540 
   1541     dbuf_init(dbuf);
   1542     dbuf_init(dbuf2);
   1543     buf = (uint32_t *)dbuf1->buf;
   1544     buf_len = dbuf1->size / sizeof(buf[0]);
   1545 
   1546     /* the first value is assumed to be 0 */
   1547     assert(get_prop(0, prop_index) == 0);
   1548 
   1549     block_end_pos = PROP_BLOCK_LEN;
   1550     i = 0;
   1551     code = 0;
   1552     bit = 0;
   1553     while (i < buf_len) {
   1554         if (add_index && dbuf->size >= block_end_pos && bit == 0) {
   1555             offset = (dbuf->size - block_end_pos);
   1556             /* XXX: offset could be larger in case of runs of small
   1557                lengths. Could add code to change the encoding to
   1558                prevent it at the expense of one byte loss */
   1559             assert(offset <= 7);
   1560             v = code | (offset << 21);
   1561             dbuf_putc(dbuf2, v);
   1562             dbuf_putc(dbuf2, v >> 8);
   1563             dbuf_putc(dbuf2, v >> 16);
   1564             block_end_pos += PROP_BLOCK_LEN;
   1565         }
   1566 
   1567         /* Compressed byte encoding:
   1568            00..3F: 2 packed lengths: 3-bit + 3-bit
   1569            40..5F: 5-bits plus extra byte for length
   1570            60..7F: 5-bits plus 2 extra bytes for length
   1571            80..FF: 7-bit length
   1572            lengths must be incremented to get character count
   1573            Ranges alternate between false and true return value.
   1574          */
   1575         v = buf[i];
   1576         code += v + 1;
   1577         bit ^= 1;
   1578         if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) {
   1579             code += buf[i + 1] + 1;
   1580             bit ^= 1;
   1581             dbuf_putc(dbuf, (v << 3) | buf[i + 1]);
   1582             i += 2;
   1583         } else if (v < 128) {
   1584             dbuf_putc(dbuf, 0x80 + v);
   1585             i++;
   1586         } else if (v < (1 << 13)) {
   1587             dbuf_putc(dbuf, 0x40 + (v >> 8));
   1588             dbuf_putc(dbuf, v);
   1589             i++;
   1590         } else {
   1591             assert(v < (1 << 21));
   1592             dbuf_putc(dbuf, 0x60 + (v >> 16));
   1593             dbuf_putc(dbuf, v >> 8);
   1594             dbuf_putc(dbuf, v);
   1595             i++;
   1596         }
   1597     }
   1598 
   1599     if (add_index) {
   1600         /* last index entry */
   1601         v = code;
   1602         dbuf_putc(dbuf2, v);
   1603         dbuf_putc(dbuf2, v >> 8);
   1604         dbuf_putc(dbuf2, v >> 16);
   1605     }
   1606 
   1607 #ifdef DUMP_TABLE_SIZE
   1608     printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index],
   1609            (int)(dbuf->size + dbuf2->size));
   1610 #endif
   1611     snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]);
   1612     dump_byte_table(f, cname, dbuf->buf, dbuf->size);
   1613     if (add_index) {
   1614         snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
   1615         dump_index_table(f, cname, dbuf2->buf, dbuf2->size);
   1616     }
   1617 
   1618     dbuf_free(dbuf);
   1619     dbuf_free(dbuf1);
   1620     dbuf_free(dbuf2);
   1621 }
   1622 
   1623 void build_flags_tables(FILE *f)
   1624 {
   1625     build_prop_table(f, "Cased1", PROP_Cased1, TRUE);
   1626     build_prop_table(f, "Case_Ignorable", PROP_Case_Ignorable, TRUE);
   1627     build_prop_table(f, "ID_Start", PROP_ID_Start, TRUE);
   1628     build_prop_table(f, "ID_Continue1", PROP_ID_Continue1, TRUE);
   1629 }
   1630 
   1631 void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
   1632                      const char **tab_short_name)
   1633 {
   1634     int i, w, maxw;
   1635 
   1636     maxw = 0;
   1637     for(i = 0; i < len; i++) {
   1638         w = strlen(tab_name[i]);
   1639         if (tab_short_name[i][0] != '\0') {
   1640             w += 1 + strlen(tab_short_name[i]);
   1641         }
   1642         if (maxw < w)
   1643             maxw = w;
   1644     }
   1645 
   1646     /* generate a sequence of strings terminated by an empty string */
   1647     fprintf(f, "static const char %s[] =\n", cname);
   1648     for(i = 0; i < len; i++) {
   1649         fprintf(f, "    \"");
   1650         w = fprintf(f, "%s", tab_name[i]);
   1651         if (tab_short_name[i][0] != '\0') {
   1652             w += fprintf(f, ",%s", tab_short_name[i]);
   1653         }
   1654         fprintf(f, "\"%*s\"\\0\"\n", 1 + maxw - w, "");
   1655     }
   1656     fprintf(f, ";\n\n");
   1657 }
   1658 
   1659 void build_general_category_table(FILE *f)
   1660 {
   1661     int i, v, j, n, n1;
   1662     DynBuf dbuf_s, *dbuf = &dbuf_s;
   1663 #ifdef DUMP_TABLE_SIZE
   1664     int cw_count, cw_len_count[4], cw_start;
   1665 #endif
   1666 
   1667     fprintf(f, "typedef enum {\n");
   1668     for(i = 0; i < GCAT_COUNT; i++)
   1669         fprintf(f, "    UNICODE_GC_%s,\n", unicode_gc_name[i]);
   1670     fprintf(f, "    UNICODE_GC_COUNT,\n");
   1671     fprintf(f, "} UnicodeGCEnum;\n\n");
   1672 
   1673     dump_name_table(f, "unicode_gc_name_table",
   1674                     unicode_gc_name, GCAT_COUNT,
   1675                     unicode_gc_short_name);
   1676 
   1677 
   1678     dbuf_init(dbuf);
   1679 #ifdef DUMP_TABLE_SIZE
   1680     cw_count = 0;
   1681     for(i = 0; i < 4; i++)
   1682         cw_len_count[i] = 0;
   1683 #endif
   1684     for(i = 0; i <= CHARCODE_MAX;) {
   1685         v = unicode_db[i].general_category;
   1686         j = i + 1;
   1687         while (j <= CHARCODE_MAX && unicode_db[j].general_category == v)
   1688             j++;
   1689         n = j - i;
   1690         /* compress Lu/Ll runs */
   1691         if (v == GCAT_Lu) {
   1692             n1 = 1;
   1693             while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) {
   1694                 n1++;
   1695             }
   1696             if (n1 > n) {
   1697                 v = 31;
   1698                 n = n1;
   1699             }
   1700         }
   1701         //        printf("%05x %05x %d\n", i, n, v);
   1702         n--;
   1703 #ifdef DUMP_TABLE_SIZE
   1704         cw_count++;
   1705         cw_start = dbuf->size;
   1706 #endif
   1707         if (n < 7) {
   1708             dbuf_putc(dbuf, (n << 5) | v);
   1709         } else if (n < 7 + 128) {
   1710             n1 = n - 7;
   1711             assert(n1 < 128);
   1712             dbuf_putc(dbuf, (0xf << 5) | v);
   1713             dbuf_putc(dbuf, n1);
   1714         } else if (n < 7 + 128 + (1 << 14)) {
   1715             n1 = n - (7 + 128);
   1716             assert(n1 < (1 << 14));
   1717             dbuf_putc(dbuf, (0xf << 5) | v);
   1718             dbuf_putc(dbuf, (n1 >> 8) + 128);
   1719             dbuf_putc(dbuf, n1);
   1720         } else {
   1721             n1 = n - (7 + 128 + (1 << 14));
   1722             assert(n1 < (1 << 22));
   1723             dbuf_putc(dbuf, (0xf << 5) | v);
   1724             dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
   1725             dbuf_putc(dbuf, n1 >> 8);
   1726             dbuf_putc(dbuf, n1);
   1727         }
   1728 #ifdef DUMP_TABLE_SIZE
   1729         cw_len_count[dbuf->size - cw_start - 1]++;
   1730 #endif
   1731         i += n + 1;
   1732     }
   1733 #ifdef DUMP_TABLE_SIZE
   1734     printf("general category: %d entries [", cw_count);
   1735     for(i = 0; i < 4; i++)
   1736         printf(" %d", cw_len_count[i]);
   1737     printf(" ], length=%d bytes\n", (int)dbuf->size);
   1738 #endif
   1739 
   1740     dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size);
   1741 
   1742     dbuf_free(dbuf);
   1743 }
   1744 
   1745 void build_script_table(FILE *f)
   1746 {
   1747     int i, v, j, n, n1, type;
   1748     DynBuf dbuf_s, *dbuf = &dbuf_s;
   1749 #ifdef DUMP_TABLE_SIZE
   1750     int cw_count, cw_len_count[4], cw_start;
   1751 #endif
   1752 
   1753     fprintf(f, "typedef enum {\n");
   1754     for(i = 0; i < SCRIPT_COUNT; i++)
   1755         fprintf(f, "    UNICODE_SCRIPT_%s,\n", unicode_script_name[i]);
   1756     fprintf(f, "    UNICODE_SCRIPT_COUNT,\n");
   1757     fprintf(f, "} UnicodeScriptEnum;\n\n");
   1758 
   1759     i = 1;
   1760     dump_name_table(f, "unicode_script_name_table",
   1761                     unicode_script_name + i, SCRIPT_COUNT - i,
   1762                     unicode_script_short_name + i);
   1763 
   1764     dbuf_init(dbuf);
   1765 #ifdef DUMP_TABLE_SIZE
   1766     cw_count = 0;
   1767     for(i = 0; i < 4; i++)
   1768         cw_len_count[i] = 0;
   1769 #endif
   1770     for(i = 0; i <= CHARCODE_MAX;) {
   1771         v = unicode_db[i].script;
   1772         j = i + 1;
   1773         while (j <= CHARCODE_MAX && unicode_db[j].script == v)
   1774             j++;
   1775         n = j - i;
   1776         if (v == 0 && j == (CHARCODE_MAX + 1))
   1777             break;
   1778         //        printf("%05x %05x %d\n", i, n, v);
   1779         n--;
   1780 #ifdef DUMP_TABLE_SIZE
   1781         cw_count++;
   1782         cw_start = dbuf->size;
   1783 #endif
   1784         if (v == 0)
   1785             type = 0;
   1786         else
   1787             type = 1;
   1788         if (n < 96) {
   1789             dbuf_putc(dbuf, n | (type << 7));
   1790         } else if (n < 96 + (1 << 12)) {
   1791             n1 = n - 96;
   1792             assert(n1 < (1 << 12));
   1793             dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7));
   1794             dbuf_putc(dbuf, n1);
   1795         } else {
   1796             n1 = n - (96 + (1 << 12));
   1797             assert(n1 < (1 << 20));
   1798             dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7));
   1799             dbuf_putc(dbuf, n1 >> 8);
   1800             dbuf_putc(dbuf, n1);
   1801         }
   1802         if (type != 0)
   1803             dbuf_putc(dbuf, v);
   1804 
   1805 #ifdef DUMP_TABLE_SIZE
   1806         cw_len_count[dbuf->size - cw_start - 1]++;
   1807 #endif
   1808         i += n + 1;
   1809     }
   1810 #ifdef DUMP_TABLE_SIZE
   1811     printf("script: %d entries [", cw_count);
   1812     for(i = 0; i < 4; i++)
   1813         printf(" %d", cw_len_count[i]);
   1814     printf(" ], length=%d bytes\n", (int)dbuf->size);
   1815 #endif
   1816 
   1817     dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size);
   1818 
   1819     dbuf_free(dbuf);
   1820 }
   1821 
   1822 void build_script_ext_table(FILE *f)
   1823 {
   1824     int i, j, n, n1, script_ext_len;
   1825     DynBuf dbuf_s, *dbuf = &dbuf_s;
   1826 #if defined(DUMP_TABLE_SIZE)
   1827     int cw_count = 0;
   1828 #endif
   1829 
   1830     dbuf_init(dbuf);
   1831     for(i = 0; i <= CHARCODE_MAX;) {
   1832         script_ext_len = unicode_db[i].script_ext_len;
   1833         j = i + 1;
   1834         while (j <= CHARCODE_MAX &&
   1835                unicode_db[j].script_ext_len == script_ext_len &&
   1836                !memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext,
   1837                        script_ext_len)) {
   1838             j++;
   1839         }
   1840         n = j - i;
   1841 #if defined(DUMP_TABLE_SIZE)
   1842         cw_count++;
   1843 #endif
   1844         n--;
   1845         if (n < 128) {
   1846             dbuf_putc(dbuf, n);
   1847         } else if (n < 128 + (1 << 14)) {
   1848             n1 = n - 128;
   1849             assert(n1 < (1 << 14));
   1850             dbuf_putc(dbuf, (n1 >> 8) + 128);
   1851             dbuf_putc(dbuf, n1);
   1852         } else {
   1853             n1 = n - (128 + (1 << 14));
   1854             assert(n1 < (1 << 22));
   1855             dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
   1856             dbuf_putc(dbuf, n1 >> 8);
   1857             dbuf_putc(dbuf, n1);
   1858         }
   1859         dbuf_putc(dbuf, script_ext_len);
   1860         for(j = 0; j < script_ext_len; j++)
   1861             dbuf_putc(dbuf, unicode_db[i].script_ext[j]);
   1862         i += n + 1;
   1863     }
   1864 #ifdef DUMP_TABLE_SIZE
   1865     printf("script_ext: %d entries", cw_count);
   1866     printf(", length=%d bytes\n", (int)dbuf->size);
   1867 #endif
   1868 
   1869     dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size);
   1870 
   1871     dbuf_free(dbuf);
   1872 }
   1873 
   1874 /* the following properties are synthetized so no table is necessary */
   1875 #define PROP_TABLE_COUNT PROP_ASCII
   1876 
   1877 void build_prop_list_table(FILE *f)
   1878 {
   1879     int i;
   1880 
   1881     for(i = 0; i < PROP_TABLE_COUNT; i++) {
   1882         if (i == PROP_ID_Start ||
   1883             i == PROP_Case_Ignorable ||
   1884             i == PROP_ID_Continue1) {
   1885             /* already generated */
   1886         } else {
   1887             build_prop_table(f, unicode_prop_name[i], i, FALSE);
   1888         }
   1889     }
   1890 
   1891     fprintf(f, "typedef enum {\n");
   1892     for(i = 0; i < PROP_COUNT; i++)
   1893         fprintf(f, "    UNICODE_PROP_%s,\n", unicode_prop_name[i]);
   1894     fprintf(f, "    UNICODE_PROP_COUNT,\n");
   1895     fprintf(f, "} UnicodePropertyEnum;\n\n");
   1896 
   1897     i = PROP_ASCII_Hex_Digit;
   1898     dump_name_table(f, "unicode_prop_name_table",
   1899                     unicode_prop_name + i, PROP_XID_Start - i + 1,
   1900                     unicode_prop_short_name + i);
   1901 
   1902     fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n");
   1903     for(i = 0; i < PROP_TABLE_COUNT; i++) {
   1904         fprintf(f, "    unicode_prop_%s_table,\n", unicode_prop_name[i]);
   1905     }
   1906     fprintf(f, "};\n\n");
   1907 
   1908     fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n");
   1909     for(i = 0; i < PROP_TABLE_COUNT; i++) {
   1910         fprintf(f, "    countof(unicode_prop_%s_table),\n", unicode_prop_name[i]);
   1911     }
   1912     fprintf(f, "};\n\n");
   1913 }
   1914 
   1915 #ifdef USE_TEST
   1916 int check_conv(uint32_t *res, uint32_t c, int conv_type)
   1917 {
   1918     return lre_case_conv(res, c, conv_type);
   1919 }
   1920 
   1921 void check_case_conv(void)
   1922 {
   1923     CCInfo *tab = unicode_db;
   1924     uint32_t res[3];
   1925     int l, error;
   1926     CCInfo ci_s, *ci1, *ci = &ci_s;
   1927     int code;
   1928 
   1929     for(code = 0; code <= CHARCODE_MAX; code++) {
   1930         ci1 = &tab[code];
   1931         *ci = *ci1;
   1932         if (ci->l_len == 0) {
   1933             ci->l_len = 1;
   1934             ci->l_data[0] = code;
   1935         }
   1936         if (ci->u_len == 0) {
   1937             ci->u_len = 1;
   1938             ci->u_data[0] = code;
   1939         }
   1940         if (ci->f_len == 0) {
   1941             ci->f_len = 1;
   1942             ci->f_data[0] = code;
   1943         }
   1944 
   1945         error = 0;
   1946         l = check_conv(res, code, 0);
   1947         if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) {
   1948             printf("ERROR: L\n");
   1949             error++;
   1950         }
   1951         l = check_conv(res, code, 1);
   1952         if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) {
   1953             printf("ERROR: U\n");
   1954             error++;
   1955         }
   1956         l = check_conv(res, code, 2);
   1957         if (l != ci->f_len || tabcmp((int *)res, ci->f_data, l)) {
   1958             printf("ERROR: F\n");
   1959             error++;
   1960         }
   1961         if (error) {
   1962             dump_cc_info(ci, code);
   1963             exit(1);
   1964         }
   1965     }
   1966 }
   1967 
   1968 #ifdef PROFILE
   1969 static int64_t get_time_ns(void)
   1970 {
   1971     struct timespec ts;
   1972     clock_gettime(CLOCK_MONOTONIC, &ts);
   1973     return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
   1974 }
   1975 #endif
   1976 
   1977 
   1978 void check_flags(void)
   1979 {
   1980     int c;
   1981     BOOL flag_ref, flag;
   1982     for(c = 0; c <= CHARCODE_MAX; c++) {
   1983         flag_ref = get_prop(c, PROP_Cased);
   1984         flag = !!lre_is_cased(c);
   1985         if (flag != flag_ref) {
   1986             printf("ERROR: c=%05x cased=%d ref=%d\n",
   1987                    c, flag, flag_ref);
   1988             exit(1);
   1989         }
   1990 
   1991         flag_ref = get_prop(c, PROP_Case_Ignorable);
   1992         flag = !!lre_is_case_ignorable(c);
   1993         if (flag != flag_ref) {
   1994             printf("ERROR: c=%05x case_ignorable=%d ref=%d\n",
   1995                    c, flag, flag_ref);
   1996             exit(1);
   1997         }
   1998 
   1999         flag_ref = get_prop(c, PROP_ID_Start);
   2000         flag = !!lre_is_id_start(c);
   2001         if (flag != flag_ref) {
   2002             printf("ERROR: c=%05x id_start=%d ref=%d\n",
   2003                    c, flag, flag_ref);
   2004             exit(1);
   2005         }
   2006 
   2007         flag_ref = get_prop(c, PROP_ID_Continue);
   2008         flag = !!lre_is_id_continue(c);
   2009         if (flag != flag_ref) {
   2010             printf("ERROR: c=%05x id_cont=%d ref=%d\n",
   2011                    c, flag, flag_ref);
   2012             exit(1);
   2013         }
   2014     }
   2015 #ifdef PROFILE
   2016     {
   2017         int64_t ti, count;
   2018         ti = get_time_ns();
   2019         count = 0;
   2020         for(c = 0x20; c <= 0xffff; c++) {
   2021             flag_ref = get_prop(c, PROP_ID_Start);
   2022             flag = !!lre_is_id_start(c);
   2023             assert(flag == flag_ref);
   2024             count++;
   2025         }
   2026         ti = get_time_ns() - ti;
   2027         printf("flags time=%0.1f ns/char\n",
   2028                (double)ti / count);
   2029     }
   2030 #endif
   2031 }
   2032 
   2033 #endif
   2034 
   2035 #define CC_BLOCK_LEN 32
   2036 
   2037 void build_cc_table(FILE *f)
   2038 {
   2039     // Compress combining class table
   2040     // see: https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
   2041     int i, cc, n, type, n1, block_end_pos;
   2042     DynBuf dbuf_s, *dbuf = &dbuf_s;
   2043     DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
   2044 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
   2045     int cw_len_tab[3], cw_start, cc_table_len;
   2046 #endif
   2047     uint32_t v;
   2048 
   2049     dbuf_init(dbuf);
   2050     dbuf_init(dbuf1);
   2051 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
   2052     cc_table_len = 0;
   2053     for(i = 0; i < countof(cw_len_tab); i++)
   2054         cw_len_tab[i] = 0;
   2055 #endif
   2056     block_end_pos = CC_BLOCK_LEN;
   2057     for(i = 0; i <= CHARCODE_MAX;) {
   2058         cc = unicode_db[i].combining_class;
   2059         assert(cc <= 255);
   2060         /* check increasing values */
   2061         n = 1;
   2062         while ((i + n) <= CHARCODE_MAX &&
   2063                unicode_db[i + n].combining_class == (cc + n))
   2064             n++;
   2065         if (n >= 2) {
   2066             type = 1;
   2067         } else {
   2068             type = 0;
   2069             n = 1;
   2070             while ((i + n) <= CHARCODE_MAX &&
   2071                    unicode_db[i + n].combining_class == cc)
   2072                 n++;
   2073         }
   2074         /* no need to encode the last run */
   2075         if (cc == 0 && (i + n - 1) == CHARCODE_MAX)
   2076             break;
   2077 #ifdef DUMP_CC_TABLE
   2078         printf("%05x %6d %d %d\n", i, n, type, cc);
   2079 #endif
   2080         if (type == 0) {
   2081             if (cc == 0)
   2082                 type = 2;
   2083             else if (cc == 230)
   2084                 type = 3;
   2085         }
   2086         n1 = n - 1;
   2087 
   2088         /* add an entry to the index if necessary */
   2089         if (dbuf->size >= block_end_pos) {
   2090             v = i | ((dbuf->size - block_end_pos) << 21);
   2091             dbuf_putc(dbuf1, v);
   2092             dbuf_putc(dbuf1, v >> 8);
   2093             dbuf_putc(dbuf1, v >> 16);
   2094             block_end_pos += CC_BLOCK_LEN;
   2095         }
   2096 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
   2097         cw_start = dbuf->size;
   2098 #endif
   2099         /* Compressed run length encoding:
   2100            - 2 high order bits are combining class type
   2101            -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
   2102            - 00..2F: range length (add 1)
   2103            - 30..37: 3-bit range-length + 1 extra byte
   2104            - 38..3F: 3-bit range-length + 2 extra byte
   2105          */
   2106         if (n1 < 48) {
   2107             dbuf_putc(dbuf, n1 | (type << 6));
   2108         } else if (n1 < 48 + (1 << 11)) {
   2109             n1 -= 48;
   2110             dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6));
   2111             dbuf_putc(dbuf, n1);
   2112         } else {
   2113             n1 -= 48 + (1 << 11);
   2114             assert(n1 < (1 << 20));
   2115             dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6));
   2116             dbuf_putc(dbuf, n1 >> 8);
   2117             dbuf_putc(dbuf, n1);
   2118         }
   2119 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
   2120         cw_len_tab[dbuf->size - cw_start - 1]++;
   2121         cc_table_len++;
   2122 #endif
   2123         if (type == 0 || type == 1)
   2124             dbuf_putc(dbuf, cc);
   2125         i += n;
   2126     }
   2127 
   2128     /* last index entry */
   2129     v = i;
   2130     dbuf_putc(dbuf1, v);
   2131     dbuf_putc(dbuf1, v >> 8);
   2132     dbuf_putc(dbuf1, v >> 16);
   2133 
   2134     dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size);
   2135     dump_index_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);
   2136 
   2137 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
   2138     printf("CC table: size=%d (%d entries) [",
   2139            (int)(dbuf->size + dbuf1->size),
   2140            cc_table_len);
   2141     for(i = 0; i < countof(cw_len_tab); i++)
   2142         printf(" %d", cw_len_tab[i]);
   2143     printf(" ]\n");
   2144 #endif
   2145     dbuf_free(dbuf);
   2146     dbuf_free(dbuf1);
   2147 }
   2148 
   2149 /* maximum length of decomposition: 18 chars (1), then 8 */
   2150 #ifndef USE_TEST
   2151 typedef enum {
   2152     DECOMP_TYPE_C1, /* 16 bit char */
   2153     DECOMP_TYPE_L1, /* 16 bit char table */
   2154     DECOMP_TYPE_L2,
   2155     DECOMP_TYPE_L3,
   2156     DECOMP_TYPE_L4,
   2157     DECOMP_TYPE_L5, /* XXX: not used */
   2158     DECOMP_TYPE_L6, /* XXX: could remove */
   2159     DECOMP_TYPE_L7, /* XXX: could remove */
   2160     DECOMP_TYPE_LL1, /* 18 bit char table */
   2161     DECOMP_TYPE_LL2,
   2162     DECOMP_TYPE_S1, /* 8 bit char table */
   2163     DECOMP_TYPE_S2,
   2164     DECOMP_TYPE_S3,
   2165     DECOMP_TYPE_S4,
   2166     DECOMP_TYPE_S5,
   2167     DECOMP_TYPE_I1, /* increment 16 bit char value */
   2168     DECOMP_TYPE_I2_0,
   2169     DECOMP_TYPE_I2_1,
   2170     DECOMP_TYPE_I3_1,
   2171     DECOMP_TYPE_I3_2,
   2172     DECOMP_TYPE_I4_1,
   2173     DECOMP_TYPE_I4_2,
   2174     DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
   2175     DECOMP_TYPE_B2,
   2176     DECOMP_TYPE_B3,
   2177     DECOMP_TYPE_B4,
   2178     DECOMP_TYPE_B5,
   2179     DECOMP_TYPE_B6,
   2180     DECOMP_TYPE_B7,
   2181     DECOMP_TYPE_B8,
   2182     DECOMP_TYPE_B18,
   2183     DECOMP_TYPE_LS2,
   2184     DECOMP_TYPE_PAT3,
   2185     DECOMP_TYPE_S2_UL,
   2186     DECOMP_TYPE_LS2_UL,
   2187 } DecompTypeEnum;
   2188 #endif
   2189 
   2190 const char *decomp_type_str[] = {
   2191     "C1",
   2192     "L1",
   2193     "L2",
   2194     "L3",
   2195     "L4",
   2196     "L5",
   2197     "L6",
   2198     "L7",
   2199     "LL1",
   2200     "LL2",
   2201     "S1",
   2202     "S2",
   2203     "S3",
   2204     "S4",
   2205     "S5",
   2206     "I1",
   2207     "I2_0",
   2208     "I2_1",
   2209     "I3_1",
   2210     "I3_2",
   2211     "I4_1",
   2212     "I4_2",
   2213     "B1",
   2214     "B2",
   2215     "B3",
   2216     "B4",
   2217     "B5",
   2218     "B6",
   2219     "B7",
   2220     "B8",
   2221     "B18",
   2222     "LS2",
   2223     "PAT3",
   2224     "S2_UL",
   2225     "LS2_UL",
   2226 };
   2227 
   2228 const int decomp_incr_tab[4][4] = {
   2229     { DECOMP_TYPE_I1, 0, -1 },
   2230     { DECOMP_TYPE_I2_0, 0, 1, -1 },
   2231     { DECOMP_TYPE_I3_1, 1, 2, -1 },
   2232     { DECOMP_TYPE_I4_1, 1, 2, -1 },
   2233 };
   2234 
   2235 /*
   2236   entry size:
   2237   type   bits
   2238   code   18
   2239   len    7
   2240   compat 1
   2241   type   5
   2242   index  16
   2243   total  47
   2244 */
   2245 
   2246 typedef struct {
   2247     int code;
   2248     uint8_t len;
   2249     uint8_t type;
   2250     uint8_t c_len;
   2251     uint16_t c_min;
   2252     uint16_t data_index;
   2253     int cost; /* size in bytes from this entry to the end */
   2254 } DecompEntry;
   2255 
   2256 int get_decomp_run_size(const DecompEntry *de)
   2257 {
   2258     int s;
   2259     s = 6;
   2260     if (de->type <= DECOMP_TYPE_C1) {
   2261         /* nothing more */
   2262     } else if (de->type <= DECOMP_TYPE_L7) {
   2263         s += de->len * de->c_len * 2;
   2264     } else if (de->type <= DECOMP_TYPE_LL2) {
   2265         /* 18 bits per char */
   2266         s += (de->len * de->c_len * 18 + 7) / 8;
   2267     } else if (de->type <= DECOMP_TYPE_S5) {
   2268         s += de->len * de->c_len;
   2269     } else if (de->type <= DECOMP_TYPE_I4_2) {
   2270         s += de->c_len * 2;
   2271     } else if (de->type <= DECOMP_TYPE_B18) {
   2272         s += 2 + de->len * de->c_len;
   2273     } else if (de->type <= DECOMP_TYPE_LS2) {
   2274         s += de->len * 3;
   2275     } else if (de->type <= DECOMP_TYPE_PAT3) {
   2276         s += 4 + de->len * 2;
   2277     } else if (de->type <= DECOMP_TYPE_S2_UL) {
   2278         s += de->len;
   2279     } else if (de->type <= DECOMP_TYPE_LS2_UL) {
   2280         s += (de->len / 2) * 3;
   2281     } else {
   2282         abort();
   2283     }
   2284     return s;
   2285 }
   2286 
   2287 static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
   2288 
   2289 /* return -1 if not found */
   2290 int get_short_code(int c)
   2291 {
   2292     int i;
   2293     if (c < 0x80) {
   2294         return c;
   2295     } else if (c >= 0x300 && c < 0x350) {
   2296         return c - 0x300 + 0x80;
   2297     } else {
   2298         for(i = 0; i < countof(unicode_short_table); i++) {
   2299             if (c == unicode_short_table[i])
   2300                 return i + 0x80 + 0x50;
   2301         }
   2302         return -1;
   2303     }
   2304 }
   2305 
   2306 static BOOL is_short(int code)
   2307 {
   2308     return get_short_code(code) >= 0;
   2309 }
   2310 
   2311 static BOOL is_short_tab(const int *tab, int len)
   2312 {
   2313     int i;
   2314     for(i = 0; i < len; i++) {
   2315         if (!is_short(tab[i]))
   2316             return FALSE;
   2317     }
   2318     return TRUE;
   2319 }
   2320 
   2321 static BOOL is_16bit(const int *tab, int len)
   2322 {
   2323     int i;
   2324     for(i = 0; i < len; i++) {
   2325         if (tab[i] > 0xffff)
   2326             return FALSE;
   2327     }
   2328     return TRUE;
   2329 }
   2330 
   2331 static uint32_t to_lower_simple(uint32_t c)
   2332 {
   2333     /* Latin1 and Cyrillic */
   2334     if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
   2335         c += 0x20;
   2336     else
   2337         c++;
   2338     return c;
   2339 }
   2340 
   2341 /* select best encoding with dynamic programming */
   2342 void find_decomp_run(DecompEntry *tab_de, int i)
   2343 {
   2344     DecompEntry de_s, *de = &de_s;
   2345     CCInfo *ci, *ci1, *ci2;
   2346     int l, j, n, len_max;
   2347 
   2348     ci = &unicode_db[i];
   2349     l = ci->decomp_len;
   2350     if (l == 0) {
   2351         tab_de[i].cost = tab_de[i + 1].cost;
   2352         return;
   2353     }
   2354 
   2355     /* the offset for the compose table has only 6 bits, so we must
   2356        limit if it can be used by the compose table */
   2357     if (!ci->is_compat && !ci->is_excluded && l == 2)
   2358         len_max = 64;
   2359     else
   2360         len_max = 127;
   2361 
   2362     tab_de[i].cost = 0x7fffffff;
   2363 
   2364     if (!is_16bit(ci->decomp_data, l)) {
   2365         assert(l <= 2);
   2366 
   2367         n = 1;
   2368         for(;;) {
   2369             de->code = i;
   2370             de->len = n;
   2371             de->type = DECOMP_TYPE_LL1 + l - 1;
   2372             de->c_len = l;
   2373             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
   2374             if (de->cost < tab_de[i].cost) {
   2375                 tab_de[i] = *de;
   2376             }
   2377             if (!((i + n) <= CHARCODE_MAX && n < len_max))
   2378                 break;
   2379             ci1 = &unicode_db[i + n];
   2380             /* Note: we accept a hole */
   2381             if (!(ci1->decomp_len == 0 ||
   2382                   (ci1->decomp_len == l &&
   2383                    ci1->is_compat == ci->is_compat)))
   2384                 break;
   2385             n++;
   2386         }
   2387         return;
   2388     }
   2389 
   2390     if (l <= 7) {
   2391         n = 1;
   2392         for(;;) {
   2393             de->code = i;
   2394             de->len = n;
   2395             if (l == 1 && n == 1) {
   2396                 de->type = DECOMP_TYPE_C1;
   2397             } else {
   2398                 assert(l <= 8);
   2399                 de->type = DECOMP_TYPE_L1 + l - 1;
   2400             }
   2401             de->c_len = l;
   2402             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
   2403             if (de->cost < tab_de[i].cost) {
   2404                 tab_de[i] = *de;
   2405             }
   2406 
   2407             if (!((i + n) <= CHARCODE_MAX && n < len_max))
   2408                 break;
   2409             ci1 = &unicode_db[i + n];
   2410             /* Note: we accept a hole */
   2411             if (!(ci1->decomp_len == 0 ||
   2412                   (ci1->decomp_len == l &&
   2413                    ci1->is_compat == ci->is_compat &&
   2414                    is_16bit(ci1->decomp_data, l))))
   2415                 break;
   2416             n++;
   2417         }
   2418     }
   2419 
   2420     if (l <= 8 || l == 18) {
   2421         int c_min, c_max, c;
   2422         c_min = c_max = -1;
   2423         n = 1;
   2424         for(;;) {
   2425             ci1 = &unicode_db[i + n - 1];
   2426             for(j = 0; j < l; j++) {
   2427                 c = ci1->decomp_data[j];
   2428                 if (c == 0x20) {
   2429                     /* we accept space for Arabic */
   2430                 } else if (c_min == -1) {
   2431                     c_min = c_max = c;
   2432                 } else {
   2433                     c_min = min_int(c_min, c);
   2434                     c_max = max_int(c_max, c);
   2435                 }
   2436             }
   2437             if ((c_max - c_min) > 254)
   2438                 break;
   2439             de->code = i;
   2440             de->len = n;
   2441             if (l == 18)
   2442                 de->type = DECOMP_TYPE_B18;
   2443             else
   2444                 de->type = DECOMP_TYPE_B1 + l - 1;
   2445             de->c_len = l;
   2446             de->c_min = c_min;
   2447             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
   2448             if (de->cost < tab_de[i].cost) {
   2449                 tab_de[i] = *de;
   2450             }
   2451             if (!((i + n) <= CHARCODE_MAX && n < len_max))
   2452                 break;
   2453             ci1 = &unicode_db[i + n];
   2454             if (!(ci1->decomp_len == l &&
   2455                   ci1->is_compat == ci->is_compat))
   2456                 break;
   2457             n++;
   2458         }
   2459     }
   2460 
   2461     /* find an ascii run */
   2462     if (l <= 5 && is_short_tab(ci->decomp_data, l)) {
   2463         n = 1;
   2464         for(;;) {
   2465             de->code = i;
   2466             de->len = n;
   2467             de->type = DECOMP_TYPE_S1 + l - 1;
   2468             de->c_len = l;
   2469             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
   2470             if (de->cost < tab_de[i].cost) {
   2471                 tab_de[i] = *de;
   2472             }
   2473 
   2474             if (!((i + n) <= CHARCODE_MAX && n < len_max))
   2475                 break;
   2476             ci1 = &unicode_db[i + n];
   2477             /* Note: we accept a hole */
   2478             if (!(ci1->decomp_len == 0 ||
   2479                   (ci1->decomp_len == l &&
   2480                    ci1->is_compat == ci->is_compat &&
   2481                    is_short_tab(ci1->decomp_data, l))))
   2482                 break;
   2483             n++;
   2484         }
   2485     }
   2486 
   2487     /* check if a single char is increasing */
   2488     if (l <= 4) {
   2489         int idx1, idx;
   2490 
   2491         for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) {
   2492             n = 1;
   2493             for(;;) {
   2494                 de->code = i;
   2495                 de->len = n;
   2496                 de->type = decomp_incr_tab[l - 1][0] + idx1 - 1;
   2497                 de->c_len = l;
   2498                 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
   2499                 if (de->cost < tab_de[i].cost) {
   2500                     tab_de[i] = *de;
   2501                 }
   2502 
   2503                 if (!((i + n) <= CHARCODE_MAX && n < len_max))
   2504                     break;
   2505                 ci1 = &unicode_db[i + n];
   2506                 if (!(ci1->decomp_len == l &&
   2507                       ci1->is_compat == ci->is_compat))
   2508                     goto next1;
   2509                 for(j = 0; j < l; j++) {
   2510                     if (j == idx) {
   2511                         if (ci1->decomp_data[j] != ci->decomp_data[j] + n)
   2512                             goto next1;
   2513                     } else {
   2514                         if (ci1->decomp_data[j] != ci->decomp_data[j])
   2515                             goto next1;
   2516                     }
   2517                 }
   2518                 n++;
   2519             }
   2520         next1: ;
   2521         }
   2522     }
   2523 
   2524     if (l == 3) {
   2525         n = 1;
   2526         for(;;) {
   2527             de->code = i;
   2528             de->len = n;
   2529             de->type = DECOMP_TYPE_PAT3;
   2530             de->c_len = l;
   2531             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
   2532             if (de->cost < tab_de[i].cost) {
   2533                 tab_de[i] = *de;
   2534             }
   2535             if (!((i + n) <= CHARCODE_MAX && n < len_max))
   2536                 break;
   2537             ci1 = &unicode_db[i + n];
   2538             if (!(ci1->decomp_len == l &&
   2539                   ci1->is_compat == ci->is_compat &&
   2540                   ci1->decomp_data[1] <= 0xffff &&
   2541                   ci1->decomp_data[0] == ci->decomp_data[0] &&
   2542                   ci1->decomp_data[l - 1] == ci->decomp_data[l - 1]))
   2543                 break;
   2544             n++;
   2545         }
   2546     }
   2547 
   2548     if (l == 2 && is_short(ci->decomp_data[1])) {
   2549         n = 1;
   2550         for(;;) {
   2551             de->code = i;
   2552             de->len = n;
   2553             de->type = DECOMP_TYPE_LS2;
   2554             de->c_len = l;
   2555             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
   2556             if (de->cost < tab_de[i].cost) {
   2557                 tab_de[i] = *de;
   2558             }
   2559             if (!((i + n) <= CHARCODE_MAX && n < len_max))
   2560                 break;
   2561             ci1 = &unicode_db[i + n];
   2562             if (!(ci1->decomp_len == 0 ||
   2563                   (ci1->decomp_len == l &&
   2564                    ci1->is_compat == ci->is_compat &&
   2565                    ci1->decomp_data[0] <= 0xffff &&
   2566                    is_short(ci1->decomp_data[1]))))
   2567                 break;
   2568             n++;
   2569         }
   2570     }
   2571 
   2572     if (l == 2) {
   2573         BOOL is_16bit;
   2574 
   2575         n = 0;
   2576         is_16bit = FALSE;
   2577         for(;;) {
   2578             if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max))
   2579                 break;
   2580             ci1 = &unicode_db[i + n];
   2581             if (!(ci1->decomp_len == l &&
   2582                   ci1->is_compat == ci->is_compat &&
   2583                   is_short(ci1->decomp_data[1])))
   2584                 break;
   2585             if (!is_16bit && !is_short(ci1->decomp_data[0]))
   2586                 is_16bit = TRUE;
   2587             ci2 = &unicode_db[i + n + 1];
   2588             if (!(ci2->decomp_len == l &&
   2589                   ci2->is_compat == ci->is_compat &&
   2590                   ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0])  &&
   2591                   ci2->decomp_data[1] == ci1->decomp_data[1]))
   2592                 break;
   2593             n += 2;
   2594             de->code = i;
   2595             de->len = n;
   2596             de->type = DECOMP_TYPE_S2_UL + is_16bit;
   2597             de->c_len = l;
   2598             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
   2599             if (de->cost < tab_de[i].cost) {
   2600                 tab_de[i] = *de;
   2601             }
   2602         }
   2603     }
   2604 }
   2605 
   2606 void put16(uint8_t *data_buf, int *pidx, uint16_t c)
   2607 {
   2608     int idx;
   2609     idx = *pidx;
   2610     data_buf[idx++] = c;
   2611     data_buf[idx++] = c >> 8;
   2612     *pidx = idx;
   2613 }
   2614 
   2615 void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de)
   2616 {
   2617     int i, j, idx, c;
   2618     CCInfo *ci;
   2619 
   2620     idx = *pidx;
   2621     de->data_index = idx;
   2622     if (de->type <= DECOMP_TYPE_C1) {
   2623         ci = &unicode_db[de->code];
   2624         assert(ci->decomp_len == 1);
   2625         de->data_index = ci->decomp_data[0];
   2626     } else if (de->type <= DECOMP_TYPE_L7) {
   2627         for(i = 0; i < de->len; i++) {
   2628             ci = &unicode_db[de->code + i];
   2629             for(j = 0; j < de->c_len; j++) {
   2630                 if (ci->decomp_len == 0)
   2631                     c = 0;
   2632                 else
   2633                     c = ci->decomp_data[j];
   2634                 put16(data_buf, &idx,  c);
   2635             }
   2636         }
   2637     } else if (de->type <= DECOMP_TYPE_LL2) {
   2638         int n, p, k;
   2639         n = (de->len * de->c_len * 18 + 7) / 8;
   2640         p = de->len * de->c_len * 2;
   2641         memset(data_buf + idx, 0, n);
   2642         k = 0;
   2643         for(i = 0; i < de->len; i++) {
   2644             ci = &unicode_db[de->code + i];
   2645             for(j = 0; j < de->c_len; j++) {
   2646                 if (ci->decomp_len == 0)
   2647                     c = 0;
   2648                 else
   2649                     c = ci->decomp_data[j];
   2650                 data_buf[idx + k * 2] = c;
   2651                 data_buf[idx + k * 2 + 1] = c >> 8;
   2652                 data_buf[idx + p + (k / 4)] |= (c >> 16) << ((k % 4) * 2);
   2653                 k++;
   2654             }
   2655         }
   2656         idx += n;
   2657     } else if (de->type <= DECOMP_TYPE_S5) {
   2658         for(i = 0; i < de->len; i++) {
   2659             ci = &unicode_db[de->code + i];
   2660             for(j = 0; j < de->c_len; j++) {
   2661                 if (ci->decomp_len == 0)
   2662                     c = 0;
   2663                 else
   2664                     c = ci->decomp_data[j];
   2665                 c = get_short_code(c);
   2666                 assert(c >= 0);
   2667                 data_buf[idx++] = c;
   2668             }
   2669         }
   2670     } else if (de->type <= DECOMP_TYPE_I4_2) {
   2671         ci = &unicode_db[de->code];
   2672         assert(ci->decomp_len == de->c_len);
   2673         for(j = 0; j < de->c_len; j++)
   2674             put16(data_buf, &idx, ci->decomp_data[j]);
   2675     } else if (de->type <= DECOMP_TYPE_B18) {
   2676         c = de->c_min;
   2677         data_buf[idx++] = c;
   2678         data_buf[idx++] = c >> 8;
   2679         for(i = 0; i < de->len; i++) {
   2680             ci = &unicode_db[de->code + i];
   2681             for(j = 0; j < de->c_len; j++) {
   2682                 assert(ci->decomp_len == de->c_len);
   2683                 c = ci->decomp_data[j];
   2684                 if (c == 0x20) {
   2685                     c = 0xff;
   2686                 } else {
   2687                     c -= de->c_min;
   2688                     assert((uint32_t)c <= 254);
   2689                 }
   2690                 data_buf[idx++] = c;
   2691             }
   2692         }
   2693     } else if (de->type <= DECOMP_TYPE_LS2) {
   2694         assert(de->c_len == 2);
   2695         for(i = 0; i < de->len; i++) {
   2696             ci = &unicode_db[de->code + i];
   2697             if (ci->decomp_len == 0)
   2698                 c = 0;
   2699             else
   2700                 c = ci->decomp_data[0];
   2701             put16(data_buf, &idx,  c);
   2702 
   2703             if (ci->decomp_len == 0)
   2704                 c = 0;
   2705             else
   2706                 c = ci->decomp_data[1];
   2707             c = get_short_code(c);
   2708             assert(c >= 0);
   2709             data_buf[idx++] = c;
   2710         }
   2711     } else if (de->type <= DECOMP_TYPE_PAT3) {
   2712         ci = &unicode_db[de->code];
   2713         assert(ci->decomp_len == 3);
   2714         put16(data_buf, &idx,  ci->decomp_data[0]);
   2715         put16(data_buf, &idx,  ci->decomp_data[2]);
   2716         for(i = 0; i < de->len; i++) {
   2717             ci = &unicode_db[de->code + i];
   2718             assert(ci->decomp_len == 3);
   2719             put16(data_buf, &idx,  ci->decomp_data[1]);
   2720         }
   2721     } else if (de->type <= DECOMP_TYPE_S2_UL) {
   2722         for(i = 0; i < de->len; i += 2) {
   2723             ci = &unicode_db[de->code + i];
   2724             c = ci->decomp_data[0];
   2725             c = get_short_code(c);
   2726             assert(c >= 0);
   2727             data_buf[idx++] = c;
   2728             c = ci->decomp_data[1];
   2729             c = get_short_code(c);
   2730             assert(c >= 0);
   2731             data_buf[idx++] = c;
   2732         }
   2733     } else if (de->type <= DECOMP_TYPE_LS2_UL) {
   2734         for(i = 0; i < de->len; i += 2) {
   2735             ci = &unicode_db[de->code + i];
   2736             c = ci->decomp_data[0];
   2737             put16(data_buf, &idx,  c);
   2738             c = ci->decomp_data[1];
   2739             c = get_short_code(c);
   2740             assert(c >= 0);
   2741             data_buf[idx++] = c;
   2742         }
   2743     } else {
   2744         abort();
   2745     }
   2746     *pidx = idx;
   2747 }
   2748 
   2749 #if 0
   2750 void dump_large_char(void)
   2751 {
   2752     int i, j;
   2753     for(i = 0; i <= CHARCODE_MAX; i++) {
   2754         CCInfo *ci = &unicode_db[i];
   2755         for(j = 0; j < ci->decomp_len; j++) {
   2756             if (ci->decomp_data[j] > 0xffff)
   2757                 printf("%05x\n", ci->decomp_data[j]);
   2758         }
   2759     }
   2760 }
   2761 #endif
   2762 
   2763 void build_compose_table(FILE *f, const DecompEntry *tab_de);
   2764 
   2765 void build_decompose_table(FILE *f)
   2766 {
   2767     int i, array_len, code_max, data_len, count;
   2768     DecompEntry *tab_de, de_s, *de = &de_s;
   2769     uint8_t *data_buf;
   2770 
   2771     code_max = CHARCODE_MAX;
   2772 
   2773     tab_de = mallocz((code_max + 2) * sizeof(*tab_de));
   2774 
   2775     for(i = code_max; i >= 0; i--) {
   2776         find_decomp_run(tab_de, i);
   2777     }
   2778 
   2779     /* build the data buffer */
   2780     data_buf = malloc(100000);
   2781     data_len = 0;
   2782     array_len = 0;
   2783     for(i = 0; i <= code_max; i++) {
   2784         de = &tab_de[i];
   2785         if (de->len != 0) {
   2786             add_decomp_data(data_buf, &data_len, de);
   2787             i += de->len - 1;
   2788             array_len++;
   2789         }
   2790     }
   2791 
   2792 #ifdef DUMP_DECOMP_TABLE
   2793     /* dump */
   2794     {
   2795         int size, size1;
   2796 
   2797         printf("START LEN   TYPE  L C SIZE\n");
   2798         size = 0;
   2799         for(i = 0; i <= code_max; i++) {
   2800             de = &tab_de[i];
   2801             if (de->len != 0) {
   2802                 size1 = get_decomp_run_size(de);
   2803                 printf("%05x %3d %6s %2d %1d %4d\n", i, de->len,
   2804                        decomp_type_str[de->type], de->c_len,
   2805                        unicode_db[i].is_compat, size1);
   2806                 i += de->len - 1;
   2807                 size += size1;
   2808             }
   2809         }
   2810 
   2811         printf("array_len=%d estimated size=%d bytes actual=%d bytes\n",
   2812                array_len, size, array_len * 6 + data_len);
   2813     }
   2814 #endif
   2815 
   2816     total_tables++;
   2817     total_table_bytes += array_len * sizeof(uint32_t);
   2818     fprintf(f, "static const uint32_t unicode_decomp_table1[%d] = {", array_len);
   2819     count = 0;
   2820     for(i = 0; i <= code_max; i++) {
   2821         de = &tab_de[i];
   2822         if (de->len != 0) {
   2823             uint32_t v;
   2824             if (count++ % 4 == 0)
   2825                 fprintf(f, "\n   ");
   2826             v = (de->code << (32 - 18)) |
   2827                 (de->len << (32 - 18 - 7)) |
   2828                 (de->type << (32 - 18 - 7 - 6)) |
   2829                 unicode_db[de->code].is_compat;
   2830             fprintf(f, " 0x%08x,", v);
   2831             i += de->len - 1;
   2832         }
   2833     }
   2834     fprintf(f, "\n};\n\n");
   2835 
   2836     total_tables++;
   2837     total_table_bytes += array_len * sizeof(uint16_t);
   2838     fprintf(f, "static const uint16_t unicode_decomp_table2[%d] = {", array_len);
   2839     count = 0;
   2840     for(i = 0; i <= code_max; i++) {
   2841         de = &tab_de[i];
   2842         if (de->len != 0) {
   2843             if (count++ % 8 == 0)
   2844                 fprintf(f, "\n   ");
   2845             fprintf(f, " 0x%04x,", de->data_index);
   2846             i += de->len - 1;
   2847         }
   2848     }
   2849     fprintf(f, "\n};\n\n");
   2850 
   2851     total_tables++;
   2852     total_table_bytes += data_len;
   2853     fprintf(f, "static const uint8_t unicode_decomp_data[%d] = {", data_len);
   2854     for(i = 0; i < data_len; i++) {
   2855         if (i % 8 == 0)
   2856             fprintf(f, "\n   ");
   2857         fprintf(f, " 0x%02x,", data_buf[i]);
   2858     }
   2859     fprintf(f, "\n};\n\n");
   2860 
   2861     build_compose_table(f, tab_de);
   2862 
   2863     free(data_buf);
   2864 
   2865     free(tab_de);
   2866 }
   2867 
   2868 typedef struct {
   2869     uint32_t c[2];
   2870     uint32_t p;
   2871 } ComposeEntry;
   2872 
   2873 #define COMPOSE_LEN_MAX 10000
   2874 
   2875 static int ce_cmp(const void *p1, const void *p2)
   2876 {
   2877     const ComposeEntry *ce1 = p1;
   2878     const ComposeEntry *ce2 = p2;
   2879     int i;
   2880 
   2881     for(i = 0; i < 2; i++) {
   2882         if (ce1->c[i] < ce2->c[i])
   2883             return -1;
   2884         else if (ce1->c[i] > ce2->c[i])
   2885             return 1;
   2886     }
   2887     return 0;
   2888 }
   2889 
   2890 
   2891 static int get_decomp_pos(const DecompEntry *tab_de, int c)
   2892 {
   2893     int i, v, k;
   2894     const DecompEntry *de;
   2895 
   2896     k = 0;
   2897     for(i = 0; i <= CHARCODE_MAX; i++) {
   2898         de = &tab_de[i];
   2899         if (de->len != 0) {
   2900             if (c >= de->code && c < de->code + de->len) {
   2901                 v = c - de->code;
   2902                 assert(v < 64);
   2903                 v |= k << 6;
   2904                 assert(v < 65536);
   2905                 return v;
   2906             }
   2907             i += de->len - 1;
   2908             k++;
   2909         }
   2910     }
   2911     return -1;
   2912 }
   2913 
   2914 void build_compose_table(FILE *f, const DecompEntry *tab_de)
   2915 {
   2916     int i, v, tab_ce_len;
   2917     ComposeEntry *ce, *tab_ce;
   2918 
   2919     tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX);
   2920     tab_ce_len = 0;
   2921     for(i = 0; i <= CHARCODE_MAX; i++) {
   2922         CCInfo *ci = &unicode_db[i];
   2923         if (ci->decomp_len == 2 && !ci->is_compat &&
   2924             !ci->is_excluded) {
   2925             assert(tab_ce_len < COMPOSE_LEN_MAX);
   2926             ce = &tab_ce[tab_ce_len++];
   2927             ce->c[0] = ci->decomp_data[0];
   2928             ce->c[1] = ci->decomp_data[1];
   2929             ce->p = i;
   2930         }
   2931     }
   2932     qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp);
   2933 
   2934 #if 0
   2935     {
   2936         printf("tab_ce_len=%d\n", tab_ce_len);
   2937         for(i = 0; i < tab_ce_len; i++) {
   2938             ce = &tab_ce[i];
   2939             printf("%05x %05x %05x\n", ce->c[0], ce->c[1], ce->p);
   2940         }
   2941     }
   2942 #endif
   2943 
   2944     total_tables++;
   2945     total_table_bytes += tab_ce_len * sizeof(uint16_t);
   2946     fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", tab_ce_len);
   2947     for(i = 0; i < tab_ce_len; i++) {
   2948         if (i % 8 == 0)
   2949             fprintf(f, "\n   ");
   2950         v = get_decomp_pos(tab_de, tab_ce[i].p);
   2951         if (v < 0) {
   2952             printf("ERROR: entry for c=%04x not found\n",
   2953                    tab_ce[i].p);
   2954             exit(1);
   2955         }
   2956         fprintf(f, " 0x%04x,", v);
   2957     }
   2958     fprintf(f, "\n};\n\n");
   2959 
   2960     free(tab_ce);
   2961 }
   2962 
   2963 #ifdef USE_TEST
   2964 void check_decompose_table(void)
   2965 {
   2966     int c;
   2967     CCInfo *ci;
   2968     int res[UNICODE_DECOMP_LEN_MAX], *ref;
   2969     int len, ref_len, is_compat;
   2970 
   2971     for(is_compat = 0; is_compat <= 1; is_compat++) {
   2972         for(c = 0; c < CHARCODE_MAX; c++) {
   2973             ci = &unicode_db[c];
   2974             ref_len = ci->decomp_len;
   2975             ref = ci->decomp_data;
   2976             if (!is_compat && ci->is_compat) {
   2977                 ref_len = 0;
   2978             }
   2979             len = unicode_decomp_char((uint32_t *)res, c, is_compat);
   2980             if (len != ref_len ||
   2981                 tabcmp(res, ref, ref_len) != 0) {
   2982                 printf("ERROR c=%05x compat=%d\n", c, is_compat);
   2983                 dump_str("res", res, len);
   2984                 dump_str("ref", ref, ref_len);
   2985                 exit(1);
   2986             }
   2987         }
   2988     }
   2989 }
   2990 
   2991 void check_compose_table(void)
   2992 {
   2993     int i, p;
   2994     /* XXX: we don't test all the cases */
   2995 
   2996     for(i = 0; i <= CHARCODE_MAX; i++) {
   2997         CCInfo *ci = &unicode_db[i];
   2998         if (ci->decomp_len == 2 && !ci->is_compat &&
   2999             !ci->is_excluded) {
   3000             p = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]);
   3001             if (p != i) {
   3002                 printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n",
   3003                        ci->decomp_data[0], ci->decomp_data[1], p, i);
   3004                 exit(1);
   3005             }
   3006         }
   3007     }
   3008 
   3009 
   3010 
   3011 }
   3012 
   3013 #endif
   3014 
   3015 
   3016 
   3017 #ifdef USE_TEST
   3018 
   3019 void check_str(const char *msg, int num, const int *in_buf, int in_len,
   3020                const int *buf1, int len1,
   3021                const int *buf2, int len2)
   3022 {
   3023     if (len1 != len2 || tabcmp(buf1, buf2, len1) != 0) {
   3024         printf("%d: ERROR %s:\n", num, msg);
   3025         dump_str(" in", in_buf, in_len);
   3026         dump_str("res", buf1, len1);
   3027         dump_str("ref", buf2, len2);
   3028         exit(1);
   3029     }
   3030 }
   3031 
   3032 void check_cc_table(void)
   3033 {
   3034     int cc, cc_ref, c;
   3035 
   3036     for(c = 0; c <= CHARCODE_MAX; c++) {
   3037         cc_ref = unicode_db[c].combining_class;
   3038         cc = unicode_get_cc(c);
   3039         if (cc != cc_ref) {
   3040             printf("ERROR: c=%04x cc=%d cc_ref=%d\n",
   3041                    c, cc, cc_ref);
   3042             exit(1);
   3043         }
   3044     }
   3045 #ifdef PROFILE
   3046     {
   3047         int64_t ti, count;
   3048 
   3049         ti = get_time_ns();
   3050         count = 0;
   3051         /* only do it on meaningful chars */
   3052         for(c = 0x20; c <= 0xffff; c++) {
   3053             cc_ref = unicode_db[c].combining_class;
   3054             cc = unicode_get_cc(c);
   3055             count++;
   3056         }
   3057         ti = get_time_ns() - ti;
   3058         printf("cc time=%0.1f ns/char\n",
   3059                (double)ti / count);
   3060     }
   3061 #endif
   3062 }
   3063 
   3064 void normalization_test(const char *filename)
   3065 {
   3066     FILE *f;
   3067     char line[4096], *p;
   3068     int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str;
   3069     int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len;
   3070     int *buf, buf_len, pos;
   3071 
   3072     f = fopen(filename, "rb");
   3073     if (!f) {
   3074         perror(filename);
   3075         exit(1);
   3076     }
   3077     pos = 0;
   3078     for(;;) {
   3079         if (!get_line(line, sizeof(line), f))
   3080             break;
   3081         pos++;
   3082         p = line;
   3083         while (isspace(*p))
   3084             p++;
   3085         if (*p == '#' || *p == '@')
   3086             continue;
   3087         in_str = get_field_str(&in_len, p, 0);
   3088         nfc_str = get_field_str(&nfc_len, p, 1);
   3089         nfd_str = get_field_str(&nfd_len, p, 2);
   3090         nfkc_str = get_field_str(&nfkc_len, p, 3);
   3091         nfkd_str = get_field_str(&nfkd_len, p, 4);
   3092 
   3093         //        dump_str("in", in_str, in_len);
   3094 
   3095         buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL);
   3096         check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len);
   3097         free(buf);
   3098 
   3099         buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL);
   3100         check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len);
   3101         free(buf);
   3102 
   3103         buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL);
   3104         check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len);
   3105         free(buf);
   3106 
   3107         buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL);
   3108         check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len);
   3109         free(buf);
   3110 
   3111         free(in_str);
   3112         free(nfc_str);
   3113         free(nfd_str);
   3114         free(nfkc_str);
   3115         free(nfkd_str);
   3116     }
   3117     fclose(f);
   3118 }
   3119 #endif
   3120 
   3121 int main(int argc, char *argv[])
   3122 {
   3123     const char *unicode_db_path, *outfilename;
   3124     char filename[1024];
   3125     int arg = 1;
   3126 
   3127     if (arg >= argc || (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help"))) {
   3128         printf("usage: %s PATH [OUTPUT]\n"
   3129                "  PATH    path to the Unicode database directory\n"
   3130                "  OUTPUT  name of the output file.  If omitted, a self test is performed\n"
   3131                "          using the files from the Unicode library\n"
   3132                , argv[0]);
   3133         return 1;
   3134     }
   3135     unicode_db_path = argv[arg++];
   3136     outfilename = NULL;
   3137     if (arg < argc)
   3138         outfilename = argv[arg++];
   3139 
   3140     unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
   3141 
   3142     snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
   3143 
   3144     parse_unicode_data(filename);
   3145 
   3146     snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path);
   3147     parse_special_casing(unicode_db, filename);
   3148 
   3149     snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path);
   3150     parse_case_folding(unicode_db, filename);
   3151 
   3152     snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path);
   3153     parse_composition_exclusions(filename);
   3154 
   3155     snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path);
   3156     parse_derived_core_properties(filename);
   3157 
   3158     snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path);
   3159     parse_derived_norm_properties(filename);
   3160 
   3161     snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path);
   3162     parse_prop_list(filename);
   3163 
   3164     snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path);
   3165     parse_scripts(filename);
   3166 
   3167     snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt",
   3168              unicode_db_path);
   3169     parse_script_extensions(filename);
   3170 
   3171     snprintf(filename, sizeof(filename), "%s/emoji-data.txt",
   3172              unicode_db_path);
   3173     parse_prop_list(filename);
   3174 
   3175     //    dump_unicode_data(unicode_db);
   3176     build_conv_table(unicode_db);
   3177 
   3178 #ifdef DUMP_CASE_FOLDING_SPECIAL_CASES
   3179     dump_case_folding_special_cases(unicode_db);
   3180 #endif
   3181 
   3182     if (!outfilename) {
   3183 #ifdef USE_TEST
   3184         check_case_conv();
   3185         check_flags();
   3186         check_decompose_table();
   3187         check_compose_table();
   3188         check_cc_table();
   3189         snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path);
   3190         normalization_test(filename);
   3191 #else
   3192         fprintf(stderr, "Tests are not compiled\n");
   3193         exit(1);
   3194 #endif
   3195     } else
   3196     {
   3197         FILE *fo = fopen(outfilename, "wb");
   3198 
   3199         if (!fo) {
   3200             perror(outfilename);
   3201             exit(1);
   3202         }
   3203         fprintf(fo,
   3204                 "/* Compressed unicode tables */\n"
   3205                 "/* Automatically generated file - do not edit */\n"
   3206                 "\n"
   3207                 "#include <stdint.h>\n"
   3208                 "\n");
   3209         dump_case_conv_table(fo);
   3210         compute_internal_props();
   3211         build_flags_tables(fo);
   3212         fprintf(fo, "#ifdef CONFIG_ALL_UNICODE\n\n");
   3213         build_cc_table(fo);
   3214         build_decompose_table(fo);
   3215         build_general_category_table(fo);
   3216         build_script_table(fo);
   3217         build_script_ext_table(fo);
   3218         build_prop_list_table(fo);
   3219         fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
   3220         fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n",
   3221                 total_tables, total_table_bytes, total_index, total_index_bytes);
   3222         fclose(fo);
   3223     }
   3224     return 0;
   3225 }