unicode_gen.c (88975B)
1 /* 2 * Generation of Unicode tables 3 * 4 * Copyright (c) 2017-2018 Fabrice Bellard 5 * Copyright (c) 2017-2018 Charlie Gordon 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * of this software and associated documentation files (the "Software"), to deal 9 * in the Software without restriction, including without limitation the rights 10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 * copies of the Software, and to permit persons to whom the Software is 12 * furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 * THE SOFTWARE. 24 */ 25 #include <stdlib.h> 26 #include <stdio.h> 27 #include <stdarg.h> 28 #include <inttypes.h> 29 #include <string.h> 30 #include <assert.h> 31 #include <ctype.h> 32 #include <time.h> 33 34 #include "cutils.h" 35 36 uint32_t total_tables; 37 uint32_t total_table_bytes; 38 uint32_t total_index; 39 uint32_t total_index_bytes; 40 41 /* define it to be able to test unicode.c */ 42 //#define USE_TEST 43 /* profile tests */ 44 //#define PROFILE 45 46 //#define DUMP_CASE_CONV_TABLE 47 //#define DUMP_TABLE_SIZE 48 //#define DUMP_CC_TABLE 49 //#define DUMP_DECOMP_TABLE 50 //#define DUMP_CASE_FOLDING_SPECIAL_CASES 51 52 /* Ideas: 53 - Generalize run length encoding + index for all tables 54 - remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased 55 56 Case conversion: 57 - use a single entry for consecutive U/LF runs 58 - allow EXT runs of length > 1 59 60 Decomposition: 61 - Greek lower case (+1f10/1f10) ? 62 - allow holes in B runs 63 - suppress more upper / lower case redundancy 64 */ 65 66 #ifdef USE_TEST 67 #include "libunicode.c" 68 #endif 69 70 #define CHARCODE_MAX 0x10ffff 71 #define CC_LEN_MAX 3 72 73 void *mallocz(size_t size) 74 { 75 void *ptr; 76 ptr = malloc(size); 77 memset(ptr, 0, size); 78 return ptr; 79 } 80 81 const char *get_field(const char *p, int n) 82 { 83 int i; 84 for(i = 0; i < n; i++) { 85 while (*p != ';' && *p != '\0') 86 p++; 87 if (*p == '\0') 88 return NULL; 89 p++; 90 } 91 return p; 92 } 93 94 const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n) 95 { 96 char *q; 97 p = get_field(p, n); 98 q = buf; 99 while (*p != ';' && *p != '\0') { 100 if ((q - buf) < buf_size - 1) 101 *q++ = *p; 102 p++; 103 } 104 *q = '\0'; 105 return buf; 106 } 107 108 void add_char(int **pbuf, int *psize, int *plen, int c) 109 { 110 int len, size, *buf; 111 buf = *pbuf; 112 size = *psize; 113 len = *plen; 114 if (len >= size) { 115 size = *psize; 116 size = max_int(len + 1, size * 3 / 2); 117 buf = realloc(buf, sizeof(buf[0]) * size); 118 *pbuf = buf; 119 *psize = size; 120 } 121 buf[len++] = c; 122 *plen = len; 123 } 124 125 int *get_field_str(int *plen, const char *str, int n) 126 { 127 const char *p; 128 int *buf, len, size; 129 p = get_field(str, n); 130 if (!p) { 131 *plen = 0; 132 return NULL; 133 } 134 len = 0; 135 size = 0; 136 buf = NULL; 137 for(;;) { 138 while (isspace(*p)) 139 p++; 140 if (!isxdigit(*p)) 141 break; 142 add_char(&buf, &size, &len, strtoul(p, (char **)&p, 16)); 143 } 144 *plen = len; 145 return buf; 146 } 147 148 char *get_line(char *buf, int buf_size, FILE *f) 149 { 150 int len; 151 if (!fgets(buf, buf_size, f)) 152 return NULL; 153 len = strlen(buf); 154 if (len > 0 && buf[len - 1] == '\n') 155 buf[len - 1] = '\0'; 156 return buf; 157 } 158 159 #define UNICODE_GENERAL_CATEGORY 160 161 typedef enum { 162 #define DEF(id, str) GCAT_ ## id, 163 #include "unicode_gen_def.h" 164 #undef DEF 165 GCAT_COUNT, 166 } UnicodeGCEnum1; 167 168 static const char *unicode_gc_name[] = { 169 #define DEF(id, str) #id, 170 #include "unicode_gen_def.h" 171 #undef DEF 172 }; 173 174 static const char *unicode_gc_short_name[] = { 175 #define DEF(id, str) str, 176 #include "unicode_gen_def.h" 177 #undef DEF 178 }; 179 180 #undef UNICODE_GENERAL_CATEGORY 181 182 #define UNICODE_SCRIPT 183 184 typedef enum { 185 #define DEF(id, str) SCRIPT_ ## id, 186 #include "unicode_gen_def.h" 187 #undef DEF 188 SCRIPT_COUNT, 189 } UnicodeScriptEnum1; 190 191 static const char *unicode_script_name[] = { 192 #define DEF(id, str) #id, 193 #include "unicode_gen_def.h" 194 #undef DEF 195 }; 196 197 const char *unicode_script_short_name[] = { 198 #define DEF(id, str) str, 199 #include "unicode_gen_def.h" 200 #undef DEF 201 }; 202 203 #undef UNICODE_SCRIPT 204 205 #define UNICODE_PROP_LIST 206 207 typedef enum { 208 #define DEF(id, str) PROP_ ## id, 209 #include "unicode_gen_def.h" 210 #undef DEF 211 PROP_COUNT, 212 } UnicodePropEnum1; 213 214 static const char *unicode_prop_name[] = { 215 #define DEF(id, str) #id, 216 #include "unicode_gen_def.h" 217 #undef DEF 218 }; 219 220 static const char *unicode_prop_short_name[] = { 221 #define DEF(id, str) str, 222 #include "unicode_gen_def.h" 223 #undef DEF 224 }; 225 226 #undef UNICODE_PROP_LIST 227 228 typedef struct { 229 /* case conv */ 230 uint8_t u_len; 231 uint8_t l_len; 232 uint8_t f_len; 233 int u_data[CC_LEN_MAX]; /* to upper case */ 234 int l_data[CC_LEN_MAX]; /* to lower case */ 235 int f_data[CC_LEN_MAX]; /* to case folding */ 236 237 uint8_t combining_class; 238 uint8_t is_compat:1; 239 uint8_t is_excluded:1; 240 uint8_t general_category; 241 uint8_t script; 242 uint8_t script_ext_len; 243 uint8_t *script_ext; 244 uint32_t prop_bitmap_tab[3]; 245 /* decomposition */ 246 int decomp_len; 247 int *decomp_data; 248 } CCInfo; 249 250 CCInfo *unicode_db; 251 252 int find_name(const char **tab, int tab_len, const char *name) 253 { 254 int i, len, name_len; 255 const char *p, *r; 256 257 name_len = strlen(name); 258 for(i = 0; i < tab_len; i++) { 259 p = tab[i]; 260 for(;;) { 261 r = strchr(p, ','); 262 if (!r) 263 len = strlen(p); 264 else 265 len = r - p; 266 if (len == name_len && memcmp(p, name, len) == 0) 267 return i; 268 if (!r) 269 break; 270 p = r + 1; 271 } 272 } 273 return -1; 274 } 275 276 static BOOL get_prop(uint32_t c, int prop_idx) 277 { 278 return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1; 279 } 280 281 static void set_prop(uint32_t c, int prop_idx, int val) 282 { 283 uint32_t mask; 284 mask = 1U << (prop_idx & 0x1f); 285 if (val) 286 unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask; 287 else 288 unicode_db[c].prop_bitmap_tab[prop_idx >> 5] &= ~mask; 289 } 290 291 void parse_unicode_data(const char *filename) 292 { 293 FILE *f; 294 char line[1024]; 295 char buf1[256]; 296 const char *p; 297 int code, lc, uc, last_code; 298 CCInfo *ci, *tab = unicode_db; 299 300 f = fopen(filename, "rb"); 301 if (!f) { 302 perror(filename); 303 exit(1); 304 } 305 306 last_code = 0; 307 for(;;) { 308 if (!get_line(line, sizeof(line), f)) 309 break; 310 p = line; 311 while (isspace(*p)) 312 p++; 313 if (*p == '#') 314 continue; 315 316 p = get_field(line, 0); 317 if (!p) 318 continue; 319 code = strtoul(p, NULL, 16); 320 lc = 0; 321 uc = 0; 322 323 p = get_field(line, 12); 324 if (p && *p != ';') { 325 uc = strtoul(p, NULL, 16); 326 } 327 328 p = get_field(line, 13); 329 if (p && *p != ';') { 330 lc = strtoul(p, NULL, 16); 331 } 332 ci = &tab[code]; 333 if (uc > 0 || lc > 0) { 334 assert(code <= CHARCODE_MAX); 335 if (uc > 0) { 336 assert(ci->u_len == 0); 337 ci->u_len = 1; 338 ci->u_data[0] = uc; 339 } 340 if (lc > 0) { 341 assert(ci->l_len == 0); 342 ci->l_len = 1; 343 ci->l_data[0] = lc; 344 } 345 } 346 347 { 348 int i; 349 get_field_buf(buf1, sizeof(buf1), line, 2); 350 i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1); 351 if (i < 0) { 352 fprintf(stderr, "General category '%s' not found\n", 353 buf1); 354 exit(1); 355 } 356 ci->general_category = i; 357 } 358 359 p = get_field(line, 3); 360 if (p && *p != ';' && *p != '\0') { 361 int cc; 362 cc = strtoul(p, NULL, 0); 363 if (cc != 0) { 364 assert(code <= CHARCODE_MAX); 365 ci->combining_class = cc; 366 // printf("%05x: %d\n", code, ci->combining_class); 367 } 368 } 369 370 p = get_field(line, 5); 371 if (p && *p != ';' && *p != '\0') { 372 int size; 373 assert(code <= CHARCODE_MAX); 374 ci->is_compat = 0; 375 if (*p == '<') { 376 while (*p != '\0' && *p != '>') 377 p++; 378 if (*p == '>') 379 p++; 380 ci->is_compat = 1; 381 } 382 size = 0; 383 for(;;) { 384 while (isspace(*p)) 385 p++; 386 if (!isxdigit(*p)) 387 break; 388 add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16)); 389 } 390 #if 0 391 { 392 int i; 393 static int count, d_count; 394 395 printf("%05x: %c", code, ci->is_compat ? 'C': ' '); 396 for(i = 0; i < ci->decomp_len; i++) 397 printf(" %05x", ci->decomp_data[i]); 398 printf("\n"); 399 count++; 400 d_count += ci->decomp_len; 401 // printf("%d %d\n", count, d_count); 402 } 403 #endif 404 } 405 406 p = get_field(line, 9); 407 if (p && *p == 'Y') { 408 set_prop(code, PROP_Bidi_Mirrored, 1); 409 } 410 411 /* handle ranges */ 412 get_field_buf(buf1, sizeof(buf1), line, 1); 413 if (strstr(buf1, " Last>")) { 414 int i; 415 // printf("range: 0x%x-%0x\n", last_code, code); 416 assert(ci->decomp_len == 0); 417 assert(ci->script_ext_len == 0); 418 for(i = last_code + 1; i < code; i++) { 419 unicode_db[i] = *ci; 420 } 421 } 422 last_code = code; 423 } 424 425 fclose(f); 426 } 427 428 void parse_special_casing(CCInfo *tab, const char *filename) 429 { 430 FILE *f; 431 char line[1024]; 432 const char *p; 433 int code; 434 CCInfo *ci; 435 436 f = fopen(filename, "rb"); 437 if (!f) { 438 perror(filename); 439 exit(1); 440 } 441 442 for(;;) { 443 if (!get_line(line, sizeof(line), f)) 444 break; 445 p = line; 446 while (isspace(*p)) 447 p++; 448 if (*p == '#') 449 continue; 450 451 p = get_field(line, 0); 452 if (!p) 453 continue; 454 code = strtoul(p, NULL, 16); 455 assert(code <= CHARCODE_MAX); 456 ci = &tab[code]; 457 458 p = get_field(line, 4); 459 if (p) { 460 /* locale dependent casing */ 461 while (isspace(*p)) 462 p++; 463 if (*p != '#' && *p != '\0') 464 continue; 465 } 466 467 468 p = get_field(line, 1); 469 if (p && *p != ';') { 470 ci->l_len = 0; 471 for(;;) { 472 while (isspace(*p)) 473 p++; 474 if (*p == ';') 475 break; 476 assert(ci->l_len < CC_LEN_MAX); 477 ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16); 478 } 479 480 if (ci->l_len == 1 && ci->l_data[0] == code) 481 ci->l_len = 0; 482 } 483 484 p = get_field(line, 3); 485 if (p && *p != ';') { 486 ci->u_len = 0; 487 for(;;) { 488 while (isspace(*p)) 489 p++; 490 if (*p == ';') 491 break; 492 assert(ci->u_len < CC_LEN_MAX); 493 ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16); 494 } 495 496 if (ci->u_len == 1 && ci->u_data[0] == code) 497 ci->u_len = 0; 498 } 499 } 500 501 fclose(f); 502 } 503 504 void parse_case_folding(CCInfo *tab, const char *filename) 505 { 506 FILE *f; 507 char line[1024]; 508 const char *p; 509 int code, status; 510 CCInfo *ci; 511 512 f = fopen(filename, "rb"); 513 if (!f) { 514 perror(filename); 515 exit(1); 516 } 517 518 for(;;) { 519 if (!get_line(line, sizeof(line), f)) 520 break; 521 p = line; 522 while (isspace(*p)) 523 p++; 524 if (*p == '#') 525 continue; 526 527 p = get_field(line, 0); 528 if (!p) 529 continue; 530 code = strtoul(p, NULL, 16); 531 assert(code <= CHARCODE_MAX); 532 ci = &tab[code]; 533 534 p = get_field(line, 1); 535 if (!p) 536 continue; 537 /* locale dependent casing */ 538 while (isspace(*p)) 539 p++; 540 status = *p; 541 if (status != 'C' && status != 'S' && status != 'F') 542 continue; 543 544 p = get_field(line, 2); 545 assert(p != NULL); 546 if (status == 'S') { 547 /* we always select the simple case folding and assume it 548 * comes after the full case folding case */ 549 assert(ci->f_len >= 2); 550 ci->f_len = 0; 551 } else { 552 assert(ci->f_len == 0); 553 } 554 for(;;) { 555 while (isspace(*p)) 556 p++; 557 if (*p == ';') 558 break; 559 assert(ci->l_len < CC_LEN_MAX); 560 ci->f_data[ci->f_len++] = strtoul(p, (char **)&p, 16); 561 } 562 } 563 564 fclose(f); 565 } 566 567 void parse_composition_exclusions(const char *filename) 568 { 569 FILE *f; 570 char line[4096], *p; 571 uint32_t c0; 572 573 f = fopen(filename, "rb"); 574 if (!f) { 575 perror(filename); 576 exit(1); 577 } 578 579 for(;;) { 580 if (!get_line(line, sizeof(line), f)) 581 break; 582 p = line; 583 while (isspace(*p)) 584 p++; 585 if (*p == '#' || *p == '@' || *p == '\0') 586 continue; 587 c0 = strtoul(p, (char **)&p, 16); 588 assert(c0 > 0 && c0 <= CHARCODE_MAX); 589 unicode_db[c0].is_excluded = TRUE; 590 } 591 fclose(f); 592 } 593 594 void parse_derived_core_properties(const char *filename) 595 { 596 FILE *f; 597 char line[4096], *p, buf[256], *q; 598 uint32_t c0, c1, c; 599 int i; 600 601 f = fopen(filename, "rb"); 602 if (!f) { 603 perror(filename); 604 exit(1); 605 } 606 607 for(;;) { 608 if (!get_line(line, sizeof(line), f)) 609 break; 610 p = line; 611 while (isspace(*p)) 612 p++; 613 if (*p == '#' || *p == '@' || *p == '\0') 614 continue; 615 c0 = strtoul(p, (char **)&p, 16); 616 if (*p == '.' && p[1] == '.') { 617 p += 2; 618 c1 = strtoul(p, (char **)&p, 16); 619 } else { 620 c1 = c0; 621 } 622 assert(c1 <= CHARCODE_MAX); 623 p += strspn(p, " \t"); 624 if (*p == ';') { 625 p++; 626 p += strspn(p, " \t"); 627 q = buf; 628 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { 629 if ((q - buf) < sizeof(buf) - 1) 630 *q++ = *p; 631 p++; 632 } 633 *q = '\0'; 634 i = find_name(unicode_prop_name, 635 countof(unicode_prop_name), buf); 636 if (i < 0) { 637 if (!strcmp(buf, "Grapheme_Link")) 638 goto next; 639 fprintf(stderr, "Property not found: %s\n", buf); 640 exit(1); 641 } 642 for(c = c0; c <= c1; c++) { 643 set_prop(c, i, 1); 644 } 645 next: ; 646 } 647 } 648 fclose(f); 649 } 650 651 void parse_derived_norm_properties(const char *filename) 652 { 653 FILE *f; 654 char line[4096], *p, buf[256], *q; 655 uint32_t c0, c1, c; 656 657 f = fopen(filename, "rb"); 658 if (!f) { 659 perror(filename); 660 exit(1); 661 } 662 663 for(;;) { 664 if (!get_line(line, sizeof(line), f)) 665 break; 666 p = line; 667 while (isspace(*p)) 668 p++; 669 if (*p == '#' || *p == '@' || *p == '\0') 670 continue; 671 c0 = strtoul(p, (char **)&p, 16); 672 if (*p == '.' && p[1] == '.') { 673 p += 2; 674 c1 = strtoul(p, (char **)&p, 16); 675 } else { 676 c1 = c0; 677 } 678 assert(c1 <= CHARCODE_MAX); 679 p += strspn(p, " \t"); 680 if (*p == ';') { 681 p++; 682 p += strspn(p, " \t"); 683 q = buf; 684 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { 685 if ((q - buf) < sizeof(buf) - 1) 686 *q++ = *p; 687 p++; 688 } 689 *q = '\0'; 690 if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) { 691 for(c = c0; c <= c1; c++) { 692 set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1); 693 } 694 } 695 } 696 } 697 fclose(f); 698 } 699 700 void parse_prop_list(const char *filename) 701 { 702 FILE *f; 703 char line[4096], *p, buf[256], *q; 704 uint32_t c0, c1, c; 705 int i; 706 707 f = fopen(filename, "rb"); 708 if (!f) { 709 perror(filename); 710 exit(1); 711 } 712 713 for(;;) { 714 if (!get_line(line, sizeof(line), f)) 715 break; 716 p = line; 717 while (isspace(*p)) 718 p++; 719 if (*p == '#' || *p == '@' || *p == '\0') 720 continue; 721 c0 = strtoul(p, (char **)&p, 16); 722 if (*p == '.' && p[1] == '.') { 723 p += 2; 724 c1 = strtoul(p, (char **)&p, 16); 725 } else { 726 c1 = c0; 727 } 728 assert(c1 <= CHARCODE_MAX); 729 p += strspn(p, " \t"); 730 if (*p == ';') { 731 p++; 732 p += strspn(p, " \t"); 733 q = buf; 734 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { 735 if ((q - buf) < sizeof(buf) - 1) 736 *q++ = *p; 737 p++; 738 } 739 *q = '\0'; 740 i = find_name(unicode_prop_name, 741 countof(unicode_prop_name), buf); 742 if (i < 0) { 743 fprintf(stderr, "Property not found: %s\n", buf); 744 exit(1); 745 } 746 for(c = c0; c <= c1; c++) { 747 set_prop(c, i, 1); 748 } 749 } 750 } 751 fclose(f); 752 } 753 754 void parse_scripts(const char *filename) 755 { 756 FILE *f; 757 char line[4096], *p, buf[256], *q; 758 uint32_t c0, c1, c; 759 int i; 760 761 f = fopen(filename, "rb"); 762 if (!f) { 763 perror(filename); 764 exit(1); 765 } 766 767 for(;;) { 768 if (!get_line(line, sizeof(line), f)) 769 break; 770 p = line; 771 while (isspace(*p)) 772 p++; 773 if (*p == '#' || *p == '@' || *p == '\0') 774 continue; 775 c0 = strtoul(p, (char **)&p, 16); 776 if (*p == '.' && p[1] == '.') { 777 p += 2; 778 c1 = strtoul(p, (char **)&p, 16); 779 } else { 780 c1 = c0; 781 } 782 assert(c1 <= CHARCODE_MAX); 783 p += strspn(p, " \t"); 784 if (*p == ';') { 785 p++; 786 p += strspn(p, " \t"); 787 q = buf; 788 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { 789 if ((q - buf) < sizeof(buf) - 1) 790 *q++ = *p; 791 p++; 792 } 793 *q = '\0'; 794 i = find_name(unicode_script_name, 795 countof(unicode_script_name), buf); 796 if (i < 0) { 797 fprintf(stderr, "Unknown script: '%s'\n", buf); 798 exit(1); 799 } 800 for(c = c0; c <= c1; c++) 801 unicode_db[c].script = i; 802 } 803 } 804 fclose(f); 805 } 806 807 void parse_script_extensions(const char *filename) 808 { 809 FILE *f; 810 char line[4096], *p, buf[256], *q; 811 uint32_t c0, c1, c; 812 int i; 813 uint8_t script_ext[255]; 814 int script_ext_len; 815 816 f = fopen(filename, "rb"); 817 if (!f) { 818 perror(filename); 819 exit(1); 820 } 821 822 for(;;) { 823 if (!get_line(line, sizeof(line), f)) 824 break; 825 p = line; 826 while (isspace(*p)) 827 p++; 828 if (*p == '#' || *p == '@' || *p == '\0') 829 continue; 830 c0 = strtoul(p, (char **)&p, 16); 831 if (*p == '.' && p[1] == '.') { 832 p += 2; 833 c1 = strtoul(p, (char **)&p, 16); 834 } else { 835 c1 = c0; 836 } 837 assert(c1 <= CHARCODE_MAX); 838 p += strspn(p, " \t"); 839 script_ext_len = 0; 840 if (*p == ';') { 841 p++; 842 for(;;) { 843 p += strspn(p, " \t"); 844 q = buf; 845 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { 846 if ((q - buf) < sizeof(buf) - 1) 847 *q++ = *p; 848 p++; 849 } 850 *q = '\0'; 851 if (buf[0] == '\0') 852 break; 853 i = find_name(unicode_script_short_name, 854 countof(unicode_script_short_name), buf); 855 if (i < 0) { 856 fprintf(stderr, "Script not found: %s\n", buf); 857 exit(1); 858 } 859 assert(script_ext_len < sizeof(script_ext)); 860 script_ext[script_ext_len++] = i; 861 } 862 for(c = c0; c <= c1; c++) { 863 CCInfo *ci = &unicode_db[c]; 864 ci->script_ext_len = script_ext_len; 865 ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len); 866 for(i = 0; i < script_ext_len; i++) 867 ci->script_ext[i] = script_ext[i]; 868 } 869 } 870 } 871 fclose(f); 872 } 873 874 void dump_cc_info(CCInfo *ci, int i) 875 { 876 int j; 877 printf("%05x:", i); 878 if (ci->u_len != 0) { 879 printf(" U:"); 880 for(j = 0; j < ci->u_len; j++) 881 printf(" %05x", ci->u_data[j]); 882 } 883 if (ci->l_len != 0) { 884 printf(" L:"); 885 for(j = 0; j < ci->l_len; j++) 886 printf(" %05x", ci->l_data[j]); 887 } 888 if (ci->f_len != 0) { 889 printf(" F:"); 890 for(j = 0; j < ci->f_len; j++) 891 printf(" %05x", ci->f_data[j]); 892 } 893 printf("\n"); 894 } 895 896 void dump_unicode_data(CCInfo *tab) 897 { 898 int i; 899 CCInfo *ci; 900 for(i = 0; i <= CHARCODE_MAX; i++) { 901 ci = &tab[i]; 902 if (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0) { 903 dump_cc_info(ci, i); 904 } 905 } 906 } 907 908 BOOL is_complicated_case(const CCInfo *ci) 909 { 910 return (ci->u_len > 1 || ci->l_len > 1 || 911 (ci->u_len > 0 && ci->l_len > 0) || 912 (ci->f_len != ci->l_len) || 913 (memcmp(ci->f_data, ci->l_data, ci->f_len * sizeof(ci->f_data[0])) != 0)); 914 } 915 916 #ifndef USE_TEST 917 enum { 918 RUN_TYPE_U, 919 RUN_TYPE_L, 920 RUN_TYPE_UF, 921 RUN_TYPE_LF, 922 RUN_TYPE_UL, 923 RUN_TYPE_LSU, 924 RUN_TYPE_U2L_399_EXT2, 925 RUN_TYPE_UF_D20, 926 RUN_TYPE_UF_D1_EXT, 927 RUN_TYPE_U_EXT, 928 RUN_TYPE_LF_EXT, 929 RUN_TYPE_UF_EXT2, 930 RUN_TYPE_LF_EXT2, 931 RUN_TYPE_UF_EXT3, 932 }; 933 #endif 934 935 const char *run_type_str[] = { 936 "U", 937 "L", 938 "UF", 939 "LF", 940 "UL", 941 "LSU", 942 "U2L_399_EXT2", 943 "UF_D20", 944 "UF_D1_EXT", 945 "U_EXT", 946 "LF_EXT", 947 "UF_EXT2", 948 "LF_EXT2", 949 "UF_EXT3", 950 }; 951 952 typedef struct { 953 int code; 954 int len; 955 int type; 956 int data; 957 int ext_len; 958 int ext_data[3]; 959 int data_index; /* 'data' coming from the table */ 960 } TableEntry; 961 962 static int simple_to_lower(CCInfo *tab, int c) 963 { 964 if (tab[c].l_len != 1) 965 return c; 966 return tab[c].l_data[0]; 967 } 968 969 /* code (17), len (7), type (4) */ 970 971 void find_run_type(TableEntry *te, CCInfo *tab, int code) 972 { 973 int is_lower, len; 974 CCInfo *ci, *ci1, *ci2; 975 976 ci = &tab[code]; 977 ci1 = &tab[code + 1]; 978 ci2 = &tab[code + 2]; 979 te->code = code; 980 981 if (ci->l_len == 1 && ci->l_data[0] == code + 2 && 982 ci->f_len == 1 && ci->f_data[0] == ci->l_data[0] && 983 ci->u_len == 0 && 984 985 ci1->l_len == 1 && ci1->l_data[0] == code + 2 && 986 ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0] && 987 ci1->u_len == 1 && ci1->u_data[0] == code && 988 989 ci2->l_len == 0 && 990 ci2->f_len == 0 && 991 ci2->u_len == 1 && ci2->u_data[0] == code) { 992 te->len = 3; 993 te->data = 0; 994 te->type = RUN_TYPE_LSU; 995 return; 996 } 997 998 if (is_complicated_case(ci)) { 999 len = 1; 1000 while (code + len <= CHARCODE_MAX) { 1001 ci1 = &tab[code + len]; 1002 if (ci1->u_len != 1 || 1003 ci1->u_data[0] != ci->u_data[0] + len || 1004 ci1->l_len != 0 || 1005 ci1->f_len != 1 || ci1->f_data[0] != ci1->u_data[0]) 1006 break; 1007 len++; 1008 } 1009 if (len > 1) { 1010 te->len = len; 1011 te->type = RUN_TYPE_UF; 1012 te->data = ci->u_data[0]; 1013 return; 1014 } 1015 1016 if (ci->l_len == 0 && 1017 ci->u_len == 2 && ci->u_data[1] == 0x399 && 1018 ci->f_len == 2 && ci->f_data[1] == 0x3B9 && 1019 ci->f_data[0] == simple_to_lower(tab, ci->u_data[0])) { 1020 len = 1; 1021 while (code + len <= CHARCODE_MAX) { 1022 ci1 = &tab[code + len]; 1023 if (!(ci1->u_len == 2 && 1024 ci1->u_data[1] == ci->u_data[1] && 1025 ci1->u_data[0] == ci->u_data[0] + len && 1026 ci1->f_len == 2 && 1027 ci1->f_data[1] == ci->f_data[1] && 1028 ci1->f_data[0] == ci->f_data[0] + len && 1029 ci1->l_len == 0)) 1030 break; 1031 len++; 1032 } 1033 te->len = len; 1034 te->type = RUN_TYPE_UF_EXT2; 1035 te->ext_data[0] = ci->u_data[0]; 1036 te->ext_data[1] = ci->u_data[1]; 1037 te->ext_len = 2; 1038 return; 1039 } 1040 1041 if (ci->u_len == 2 && ci->u_data[1] == 0x399 && 1042 ci->l_len == 1 && 1043 ci->f_len == 1 && ci->f_data[0] == ci->l_data[0]) { 1044 len = 1; 1045 while (code + len <= CHARCODE_MAX) { 1046 ci1 = &tab[code + len]; 1047 if (!(ci1->u_len == 2 && 1048 ci1->u_data[1] == 0x399 && 1049 ci1->u_data[0] == ci->u_data[0] + len && 1050 ci1->l_len == 1 && 1051 ci1->l_data[0] == ci->l_data[0] + len && 1052 ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0])) 1053 break; 1054 len++; 1055 } 1056 te->len = len; 1057 te->type = RUN_TYPE_U2L_399_EXT2; 1058 te->ext_data[0] = ci->u_data[0]; 1059 te->ext_data[1] = ci->l_data[0]; 1060 te->ext_len = 2; 1061 return; 1062 } 1063 1064 if (ci->l_len == 1 && ci->u_len == 0 && ci->f_len == 0) { 1065 len = 1; 1066 while (code + len <= CHARCODE_MAX) { 1067 ci1 = &tab[code + len]; 1068 if (!(ci1->l_len == 1 && 1069 ci1->l_data[0] == ci->l_data[0] + len && 1070 ci1->u_len == 0 && ci1->f_len == 0)) 1071 break; 1072 len++; 1073 } 1074 te->len = len; 1075 te->type = RUN_TYPE_L; 1076 te->data = ci->l_data[0]; 1077 return; 1078 } 1079 1080 if (ci->l_len == 0 && 1081 ci->u_len == 1 && 1082 ci->u_data[0] < 0x1000 && 1083 ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 0x20) { 1084 te->len = 1; 1085 te->type = RUN_TYPE_UF_D20; 1086 te->data = ci->u_data[0]; 1087 } else if (ci->l_len == 0 && 1088 ci->u_len == 1 && 1089 ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 1) { 1090 te->len = 1; 1091 te->type = RUN_TYPE_UF_D1_EXT; 1092 te->ext_data[0] = ci->u_data[0]; 1093 te->ext_len = 1; 1094 } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_len == 2 && 1095 ci->l_data[0] == ci->f_data[0] && 1096 ci->l_data[1] == ci->f_data[1]) { 1097 te->len = 1; 1098 te->type = RUN_TYPE_LF_EXT2; 1099 te->ext_data[0] = ci->l_data[0]; 1100 te->ext_data[1] = ci->l_data[1]; 1101 te->ext_len = 2; 1102 } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 2 && 1103 ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) && 1104 ci->f_data[1] == simple_to_lower(tab, ci->u_data[1])) { 1105 te->len = 1; 1106 te->type = RUN_TYPE_UF_EXT2; 1107 te->ext_data[0] = ci->u_data[0]; 1108 te->ext_data[1] = ci->u_data[1]; 1109 te->ext_len = 2; 1110 } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 3 && 1111 ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) && 1112 ci->f_data[1] == simple_to_lower(tab, ci->u_data[1]) && 1113 ci->f_data[2] == simple_to_lower(tab, ci->u_data[2])) { 1114 te->len = 1; 1115 te->type = RUN_TYPE_UF_EXT3; 1116 te->ext_data[0] = ci->u_data[0]; 1117 te->ext_data[1] = ci->u_data[1]; 1118 te->ext_data[2] = ci->u_data[2]; 1119 te->ext_len = 3; 1120 } else { 1121 printf("unsupported encoding case:\n"); 1122 dump_cc_info(ci, code); 1123 abort(); 1124 } 1125 } else { 1126 /* look for a run of identical conversions */ 1127 len = 0; 1128 for(;;) { 1129 if (code >= CHARCODE_MAX || len >= 126) 1130 break; 1131 ci = &tab[code + len]; 1132 ci1 = &tab[code + len + 1]; 1133 if (is_complicated_case(ci) || is_complicated_case(ci1)) { 1134 break; 1135 } 1136 if (ci->l_len != 1 || ci->l_data[0] != code + len + 1) 1137 break; 1138 if (ci1->u_len != 1 || ci1->u_data[0] != code + len) 1139 break; 1140 len += 2; 1141 } 1142 if (len > 0) { 1143 te->len = len; 1144 te->type = RUN_TYPE_UL; 1145 te->data = 0; 1146 return; 1147 } 1148 1149 ci = &tab[code]; 1150 is_lower = ci->l_len > 0; 1151 len = 1; 1152 while (code + len <= CHARCODE_MAX) { 1153 ci1 = &tab[code + len]; 1154 if (is_complicated_case(ci1)) 1155 break; 1156 if (is_lower) { 1157 if (ci1->l_len != 1 || 1158 ci1->l_data[0] != ci->l_data[0] + len) 1159 break; 1160 } else { 1161 if (ci1->u_len != 1 || 1162 ci1->u_data[0] != ci->u_data[0] + len) 1163 break; 1164 } 1165 len++; 1166 } 1167 te->len = len; 1168 if (is_lower) { 1169 te->type = RUN_TYPE_LF; 1170 te->data = ci->l_data[0]; 1171 } else { 1172 te->type = RUN_TYPE_U; 1173 te->data = ci->u_data[0]; 1174 } 1175 } 1176 } 1177 1178 TableEntry conv_table[1000]; 1179 int conv_table_len; 1180 int ext_data[1000]; 1181 int ext_data_len; 1182 1183 void dump_case_conv_table1(void) 1184 { 1185 int i, j; 1186 const TableEntry *te; 1187 1188 for(i = 0; i < conv_table_len; i++) { 1189 te = &conv_table[i]; 1190 printf("%05x %02x %-10s %05x", 1191 te->code, te->len, run_type_str[te->type], te->data); 1192 for(j = 0; j < te->ext_len; j++) { 1193 printf(" %05x", te->ext_data[j]); 1194 } 1195 printf("\n"); 1196 } 1197 printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len); 1198 } 1199 1200 int find_data_index(const TableEntry *conv_table, int len, int data) 1201 { 1202 int i; 1203 const TableEntry *te; 1204 for(i = 0; i < len; i++) { 1205 te = &conv_table[i]; 1206 if (te->code == data) 1207 return i; 1208 } 1209 return -1; 1210 } 1211 1212 int find_ext_data_index(int data) 1213 { 1214 int i; 1215 for(i = 0; i < ext_data_len; i++) { 1216 if (ext_data[i] == data) 1217 return i; 1218 } 1219 assert(ext_data_len < countof(ext_data)); 1220 ext_data[ext_data_len++] = data; 1221 return ext_data_len - 1; 1222 } 1223 1224 void build_conv_table(CCInfo *tab) 1225 { 1226 int code, i, j; 1227 CCInfo *ci; 1228 TableEntry *te; 1229 1230 te = conv_table; 1231 for(code = 0; code <= CHARCODE_MAX; code++) { 1232 ci = &tab[code]; 1233 if (ci->u_len == 0 && ci->l_len == 0 && ci->f_len == 0) 1234 continue; 1235 assert(te - conv_table < countof(conv_table)); 1236 find_run_type(te, tab, code); 1237 #if 0 1238 if (te->type == RUN_TYPE_TODO) { 1239 printf("TODO: "); 1240 dump_cc_info(ci, code); 1241 } 1242 #endif 1243 assert(te->len <= 127); 1244 code += te->len - 1; 1245 te++; 1246 } 1247 conv_table_len = te - conv_table; 1248 1249 /* find the data index */ 1250 for(i = 0; i < conv_table_len; i++) { 1251 int data_index; 1252 te = &conv_table[i]; 1253 1254 switch(te->type) { 1255 case RUN_TYPE_U: 1256 case RUN_TYPE_L: 1257 case RUN_TYPE_UF: 1258 case RUN_TYPE_LF: 1259 data_index = find_data_index(conv_table, conv_table_len, te->data); 1260 if (data_index < 0) { 1261 switch(te->type) { 1262 case RUN_TYPE_U: 1263 te->type = RUN_TYPE_U_EXT; 1264 te->ext_len = 1; 1265 te->ext_data[0] = te->data; 1266 break; 1267 case RUN_TYPE_LF: 1268 te->type = RUN_TYPE_LF_EXT; 1269 te->ext_len = 1; 1270 te->ext_data[0] = te->data; 1271 break; 1272 default: 1273 printf("%05x: index not found\n", te->code); 1274 exit(1); 1275 } 1276 } else { 1277 te->data_index = data_index; 1278 } 1279 break; 1280 case RUN_TYPE_UF_D20: 1281 te->data_index = te->data; 1282 break; 1283 } 1284 } 1285 1286 /* find the data index for ext_data */ 1287 for(i = 0; i < conv_table_len; i++) { 1288 te = &conv_table[i]; 1289 if (te->type == RUN_TYPE_UF_EXT3) { 1290 int p, v; 1291 v = 0; 1292 for(j = 0; j < 3; j++) { 1293 p = find_ext_data_index(te->ext_data[j]); 1294 assert(p < 16); 1295 v = (v << 4) | p; 1296 } 1297 te->data_index = v; 1298 } 1299 } 1300 1301 for(i = 0; i < conv_table_len; i++) { 1302 te = &conv_table[i]; 1303 if (te->type == RUN_TYPE_LF_EXT2 || 1304 te->type == RUN_TYPE_UF_EXT2 || 1305 te->type == RUN_TYPE_U2L_399_EXT2) { 1306 int p, v; 1307 v = 0; 1308 for(j = 0; j < 2; j++) { 1309 p = find_ext_data_index(te->ext_data[j]); 1310 assert(p < 64); 1311 v = (v << 6) | p; 1312 } 1313 te->data_index = v; 1314 } 1315 } 1316 1317 for(i = 0; i < conv_table_len; i++) { 1318 te = &conv_table[i]; 1319 if (te->type == RUN_TYPE_UF_D1_EXT || 1320 te->type == RUN_TYPE_U_EXT || 1321 te->type == RUN_TYPE_LF_EXT) { 1322 te->data_index = find_ext_data_index(te->ext_data[0]); 1323 } 1324 } 1325 #ifdef DUMP_CASE_CONV_TABLE 1326 dump_case_conv_table1(); 1327 #endif 1328 } 1329 1330 void dump_case_conv_table(FILE *f) 1331 { 1332 int i; 1333 uint32_t v; 1334 const TableEntry *te; 1335 1336 total_tables++; 1337 total_table_bytes += conv_table_len * sizeof(uint32_t); 1338 fprintf(f, "static const uint32_t case_conv_table1[%d] = {", conv_table_len); 1339 for(i = 0; i < conv_table_len; i++) { 1340 if (i % 4 == 0) 1341 fprintf(f, "\n "); 1342 te = &conv_table[i]; 1343 v = te->code << (32 - 17); 1344 v |= te->len << (32 - 17 - 7); 1345 v |= te->type << (32 - 17 - 7 - 4); 1346 v |= te->data_index >> 8; 1347 fprintf(f, " 0x%08x,", v); 1348 } 1349 fprintf(f, "\n};\n\n"); 1350 1351 total_tables++; 1352 total_table_bytes += conv_table_len; 1353 fprintf(f, "static const uint8_t case_conv_table2[%d] = {", conv_table_len); 1354 for(i = 0; i < conv_table_len; i++) { 1355 if (i % 8 == 0) 1356 fprintf(f, "\n "); 1357 te = &conv_table[i]; 1358 fprintf(f, " 0x%02x,", te->data_index & 0xff); 1359 } 1360 fprintf(f, "\n};\n\n"); 1361 1362 total_tables++; 1363 total_table_bytes += ext_data_len * sizeof(uint16_t); 1364 fprintf(f, "static const uint16_t case_conv_ext[%d] = {", ext_data_len); 1365 for(i = 0; i < ext_data_len; i++) { 1366 if (i % 8 == 0) 1367 fprintf(f, "\n "); 1368 fprintf(f, " 0x%04x,", ext_data[i]); 1369 } 1370 fprintf(f, "\n};\n\n"); 1371 } 1372 1373 1374 static CCInfo *global_tab; 1375 1376 static int sp_cc_cmp(const void *p1, const void *p2) 1377 { 1378 CCInfo *c1 = &global_tab[*(const int *)p1]; 1379 CCInfo *c2 = &global_tab[*(const int *)p2]; 1380 if (c1->f_len < c2->f_len) { 1381 return -1; 1382 } else if (c2->f_len < c1->f_len) { 1383 return 1; 1384 } else { 1385 return memcmp(c1->f_data, c2->f_data, sizeof(c1->f_data[0]) * c1->f_len); 1386 } 1387 } 1388 1389 /* dump the case special cases (multi character results which are 1390 identical and need specific handling in lre_canonicalize() */ 1391 void dump_case_folding_special_cases(CCInfo *tab) 1392 { 1393 int i, len, j; 1394 int *perm; 1395 1396 perm = malloc(sizeof(perm[0]) * (CHARCODE_MAX + 1)); 1397 for(i = 0; i <= CHARCODE_MAX; i++) 1398 perm[i] = i; 1399 global_tab = tab; 1400 qsort(perm, CHARCODE_MAX + 1, sizeof(perm[0]), sp_cc_cmp); 1401 for(i = 0; i <= CHARCODE_MAX;) { 1402 if (tab[perm[i]].f_len <= 1) { 1403 i++; 1404 } else { 1405 len = 1; 1406 while ((i + len) <= CHARCODE_MAX && !sp_cc_cmp(&perm[i], &perm[i + len])) 1407 len++; 1408 1409 if (len > 1) { 1410 for(j = i; j < i + len; j++) 1411 dump_cc_info(&tab[perm[j]], perm[j]); 1412 } 1413 i += len; 1414 } 1415 } 1416 free(perm); 1417 global_tab = NULL; 1418 } 1419 1420 1421 int tabcmp(const int *tab1, const int *tab2, int n) 1422 { 1423 int i; 1424 for(i = 0; i < n; i++) { 1425 if (tab1[i] != tab2[i]) 1426 return -1; 1427 } 1428 return 0; 1429 } 1430 1431 void dump_str(const char *str, const int *buf, int len) 1432 { 1433 int i; 1434 printf("%s=", str); 1435 for(i = 0; i < len; i++) 1436 printf(" %05x", buf[i]); 1437 printf("\n"); 1438 } 1439 1440 void compute_internal_props(void) 1441 { 1442 int i; 1443 BOOL has_ul; 1444 1445 for(i = 0; i <= CHARCODE_MAX; i++) { 1446 CCInfo *ci = &unicode_db[i]; 1447 has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0); 1448 if (has_ul) { 1449 assert(get_prop(i, PROP_Cased)); 1450 } else { 1451 set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased)); 1452 } 1453 set_prop(i, PROP_ID_Continue1, 1454 get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1)); 1455 set_prop(i, PROP_XID_Start1, 1456 get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start)); 1457 set_prop(i, PROP_XID_Continue1, 1458 get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue)); 1459 set_prop(i, PROP_Changes_When_Titlecased1, 1460 get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0)); 1461 set_prop(i, PROP_Changes_When_Casefolded1, 1462 get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_len != 0)); 1463 /* XXX: reduce table size (438 bytes) */ 1464 set_prop(i, PROP_Changes_When_NFKC_Casefolded1, 1465 get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_len != 0)); 1466 #if 0 1467 /* TEST */ 1468 #define M(x) (1U << GCAT_ ## x) 1469 { 1470 int b; 1471 b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >> 1472 unicode_db[i].general_category) & 1; 1473 set_prop(i, PROP_Cased1, 1474 get_prop(i, PROP_Case_Ignorable) ^ b); 1475 } 1476 #undef M 1477 #endif 1478 } 1479 } 1480 1481 void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len) 1482 { 1483 int i; 1484 1485 total_tables++; 1486 total_table_bytes += len; 1487 fprintf(f, "static const uint8_t %s[%d] = {", cname, len); 1488 for(i = 0; i < len; i++) { 1489 if (i % 8 == 0) 1490 fprintf(f, "\n "); 1491 fprintf(f, " 0x%02x,", tab[i]); 1492 } 1493 fprintf(f, "\n};\n\n"); 1494 } 1495 1496 void dump_index_table(FILE *f, const char *cname, const uint8_t *tab, int len) 1497 { 1498 int i, code, offset; 1499 1500 total_index++; 1501 total_index_bytes += len; 1502 fprintf(f, "static const uint8_t %s[%d] = {\n", cname, len); 1503 for(i = 0; i < len; i += 3) { 1504 code = tab[i] + (tab[i+1] << 8) + ((tab[i+2] & 0x1f) << 16); 1505 offset = ((i / 3) + 1) * 32 + (tab[i+2] >> 5); 1506 fprintf(f, " 0x%02x, 0x%02x, 0x%02x,", tab[i], tab[i+1], tab[i+2]); 1507 fprintf(f, " // %6.5X at %d%s\n", code, offset, 1508 i == len - 3 ? " (upper bound)" : ""); 1509 } 1510 fprintf(f, "};\n\n"); 1511 } 1512 1513 #define PROP_BLOCK_LEN 32 1514 1515 void build_prop_table(FILE *f, const char *name, int prop_index, BOOL add_index) 1516 { 1517 int i, j, n, v, offset, code; 1518 DynBuf dbuf_s, *dbuf = &dbuf_s; 1519 DynBuf dbuf1_s, *dbuf1 = &dbuf1_s; 1520 DynBuf dbuf2_s, *dbuf2 = &dbuf2_s; 1521 const uint32_t *buf; 1522 int buf_len, block_end_pos, bit; 1523 char cname[128]; 1524 1525 dbuf_init(dbuf1); 1526 1527 for(i = 0; i <= CHARCODE_MAX;) { 1528 v = get_prop(i, prop_index); 1529 j = i + 1; 1530 while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) { 1531 j++; 1532 } 1533 n = j - i; 1534 if (j == (CHARCODE_MAX + 1) && v == 0) 1535 break; /* no need to encode last zero run */ 1536 //printf("%05x: %d %d\n", i, n, v); 1537 dbuf_put_u32(dbuf1, n - 1); 1538 i += n; 1539 } 1540 1541 dbuf_init(dbuf); 1542 dbuf_init(dbuf2); 1543 buf = (uint32_t *)dbuf1->buf; 1544 buf_len = dbuf1->size / sizeof(buf[0]); 1545 1546 /* the first value is assumed to be 0 */ 1547 assert(get_prop(0, prop_index) == 0); 1548 1549 block_end_pos = PROP_BLOCK_LEN; 1550 i = 0; 1551 code = 0; 1552 bit = 0; 1553 while (i < buf_len) { 1554 if (add_index && dbuf->size >= block_end_pos && bit == 0) { 1555 offset = (dbuf->size - block_end_pos); 1556 /* XXX: offset could be larger in case of runs of small 1557 lengths. Could add code to change the encoding to 1558 prevent it at the expense of one byte loss */ 1559 assert(offset <= 7); 1560 v = code | (offset << 21); 1561 dbuf_putc(dbuf2, v); 1562 dbuf_putc(dbuf2, v >> 8); 1563 dbuf_putc(dbuf2, v >> 16); 1564 block_end_pos += PROP_BLOCK_LEN; 1565 } 1566 1567 /* Compressed byte encoding: 1568 00..3F: 2 packed lengths: 3-bit + 3-bit 1569 40..5F: 5-bits plus extra byte for length 1570 60..7F: 5-bits plus 2 extra bytes for length 1571 80..FF: 7-bit length 1572 lengths must be incremented to get character count 1573 Ranges alternate between false and true return value. 1574 */ 1575 v = buf[i]; 1576 code += v + 1; 1577 bit ^= 1; 1578 if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) { 1579 code += buf[i + 1] + 1; 1580 bit ^= 1; 1581 dbuf_putc(dbuf, (v << 3) | buf[i + 1]); 1582 i += 2; 1583 } else if (v < 128) { 1584 dbuf_putc(dbuf, 0x80 + v); 1585 i++; 1586 } else if (v < (1 << 13)) { 1587 dbuf_putc(dbuf, 0x40 + (v >> 8)); 1588 dbuf_putc(dbuf, v); 1589 i++; 1590 } else { 1591 assert(v < (1 << 21)); 1592 dbuf_putc(dbuf, 0x60 + (v >> 16)); 1593 dbuf_putc(dbuf, v >> 8); 1594 dbuf_putc(dbuf, v); 1595 i++; 1596 } 1597 } 1598 1599 if (add_index) { 1600 /* last index entry */ 1601 v = code; 1602 dbuf_putc(dbuf2, v); 1603 dbuf_putc(dbuf2, v >> 8); 1604 dbuf_putc(dbuf2, v >> 16); 1605 } 1606 1607 #ifdef DUMP_TABLE_SIZE 1608 printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index], 1609 (int)(dbuf->size + dbuf2->size)); 1610 #endif 1611 snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]); 1612 dump_byte_table(f, cname, dbuf->buf, dbuf->size); 1613 if (add_index) { 1614 snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]); 1615 dump_index_table(f, cname, dbuf2->buf, dbuf2->size); 1616 } 1617 1618 dbuf_free(dbuf); 1619 dbuf_free(dbuf1); 1620 dbuf_free(dbuf2); 1621 } 1622 1623 void build_flags_tables(FILE *f) 1624 { 1625 build_prop_table(f, "Cased1", PROP_Cased1, TRUE); 1626 build_prop_table(f, "Case_Ignorable", PROP_Case_Ignorable, TRUE); 1627 build_prop_table(f, "ID_Start", PROP_ID_Start, TRUE); 1628 build_prop_table(f, "ID_Continue1", PROP_ID_Continue1, TRUE); 1629 } 1630 1631 void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len, 1632 const char **tab_short_name) 1633 { 1634 int i, w, maxw; 1635 1636 maxw = 0; 1637 for(i = 0; i < len; i++) { 1638 w = strlen(tab_name[i]); 1639 if (tab_short_name[i][0] != '\0') { 1640 w += 1 + strlen(tab_short_name[i]); 1641 } 1642 if (maxw < w) 1643 maxw = w; 1644 } 1645 1646 /* generate a sequence of strings terminated by an empty string */ 1647 fprintf(f, "static const char %s[] =\n", cname); 1648 for(i = 0; i < len; i++) { 1649 fprintf(f, " \""); 1650 w = fprintf(f, "%s", tab_name[i]); 1651 if (tab_short_name[i][0] != '\0') { 1652 w += fprintf(f, ",%s", tab_short_name[i]); 1653 } 1654 fprintf(f, "\"%*s\"\\0\"\n", 1 + maxw - w, ""); 1655 } 1656 fprintf(f, ";\n\n"); 1657 } 1658 1659 void build_general_category_table(FILE *f) 1660 { 1661 int i, v, j, n, n1; 1662 DynBuf dbuf_s, *dbuf = &dbuf_s; 1663 #ifdef DUMP_TABLE_SIZE 1664 int cw_count, cw_len_count[4], cw_start; 1665 #endif 1666 1667 fprintf(f, "typedef enum {\n"); 1668 for(i = 0; i < GCAT_COUNT; i++) 1669 fprintf(f, " UNICODE_GC_%s,\n", unicode_gc_name[i]); 1670 fprintf(f, " UNICODE_GC_COUNT,\n"); 1671 fprintf(f, "} UnicodeGCEnum;\n\n"); 1672 1673 dump_name_table(f, "unicode_gc_name_table", 1674 unicode_gc_name, GCAT_COUNT, 1675 unicode_gc_short_name); 1676 1677 1678 dbuf_init(dbuf); 1679 #ifdef DUMP_TABLE_SIZE 1680 cw_count = 0; 1681 for(i = 0; i < 4; i++) 1682 cw_len_count[i] = 0; 1683 #endif 1684 for(i = 0; i <= CHARCODE_MAX;) { 1685 v = unicode_db[i].general_category; 1686 j = i + 1; 1687 while (j <= CHARCODE_MAX && unicode_db[j].general_category == v) 1688 j++; 1689 n = j - i; 1690 /* compress Lu/Ll runs */ 1691 if (v == GCAT_Lu) { 1692 n1 = 1; 1693 while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) { 1694 n1++; 1695 } 1696 if (n1 > n) { 1697 v = 31; 1698 n = n1; 1699 } 1700 } 1701 // printf("%05x %05x %d\n", i, n, v); 1702 n--; 1703 #ifdef DUMP_TABLE_SIZE 1704 cw_count++; 1705 cw_start = dbuf->size; 1706 #endif 1707 if (n < 7) { 1708 dbuf_putc(dbuf, (n << 5) | v); 1709 } else if (n < 7 + 128) { 1710 n1 = n - 7; 1711 assert(n1 < 128); 1712 dbuf_putc(dbuf, (0xf << 5) | v); 1713 dbuf_putc(dbuf, n1); 1714 } else if (n < 7 + 128 + (1 << 14)) { 1715 n1 = n - (7 + 128); 1716 assert(n1 < (1 << 14)); 1717 dbuf_putc(dbuf, (0xf << 5) | v); 1718 dbuf_putc(dbuf, (n1 >> 8) + 128); 1719 dbuf_putc(dbuf, n1); 1720 } else { 1721 n1 = n - (7 + 128 + (1 << 14)); 1722 assert(n1 < (1 << 22)); 1723 dbuf_putc(dbuf, (0xf << 5) | v); 1724 dbuf_putc(dbuf, (n1 >> 16) + 128 + 64); 1725 dbuf_putc(dbuf, n1 >> 8); 1726 dbuf_putc(dbuf, n1); 1727 } 1728 #ifdef DUMP_TABLE_SIZE 1729 cw_len_count[dbuf->size - cw_start - 1]++; 1730 #endif 1731 i += n + 1; 1732 } 1733 #ifdef DUMP_TABLE_SIZE 1734 printf("general category: %d entries [", cw_count); 1735 for(i = 0; i < 4; i++) 1736 printf(" %d", cw_len_count[i]); 1737 printf(" ], length=%d bytes\n", (int)dbuf->size); 1738 #endif 1739 1740 dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size); 1741 1742 dbuf_free(dbuf); 1743 } 1744 1745 void build_script_table(FILE *f) 1746 { 1747 int i, v, j, n, n1, type; 1748 DynBuf dbuf_s, *dbuf = &dbuf_s; 1749 #ifdef DUMP_TABLE_SIZE 1750 int cw_count, cw_len_count[4], cw_start; 1751 #endif 1752 1753 fprintf(f, "typedef enum {\n"); 1754 for(i = 0; i < SCRIPT_COUNT; i++) 1755 fprintf(f, " UNICODE_SCRIPT_%s,\n", unicode_script_name[i]); 1756 fprintf(f, " UNICODE_SCRIPT_COUNT,\n"); 1757 fprintf(f, "} UnicodeScriptEnum;\n\n"); 1758 1759 i = 1; 1760 dump_name_table(f, "unicode_script_name_table", 1761 unicode_script_name + i, SCRIPT_COUNT - i, 1762 unicode_script_short_name + i); 1763 1764 dbuf_init(dbuf); 1765 #ifdef DUMP_TABLE_SIZE 1766 cw_count = 0; 1767 for(i = 0; i < 4; i++) 1768 cw_len_count[i] = 0; 1769 #endif 1770 for(i = 0; i <= CHARCODE_MAX;) { 1771 v = unicode_db[i].script; 1772 j = i + 1; 1773 while (j <= CHARCODE_MAX && unicode_db[j].script == v) 1774 j++; 1775 n = j - i; 1776 if (v == 0 && j == (CHARCODE_MAX + 1)) 1777 break; 1778 // printf("%05x %05x %d\n", i, n, v); 1779 n--; 1780 #ifdef DUMP_TABLE_SIZE 1781 cw_count++; 1782 cw_start = dbuf->size; 1783 #endif 1784 if (v == 0) 1785 type = 0; 1786 else 1787 type = 1; 1788 if (n < 96) { 1789 dbuf_putc(dbuf, n | (type << 7)); 1790 } else if (n < 96 + (1 << 12)) { 1791 n1 = n - 96; 1792 assert(n1 < (1 << 12)); 1793 dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7)); 1794 dbuf_putc(dbuf, n1); 1795 } else { 1796 n1 = n - (96 + (1 << 12)); 1797 assert(n1 < (1 << 20)); 1798 dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7)); 1799 dbuf_putc(dbuf, n1 >> 8); 1800 dbuf_putc(dbuf, n1); 1801 } 1802 if (type != 0) 1803 dbuf_putc(dbuf, v); 1804 1805 #ifdef DUMP_TABLE_SIZE 1806 cw_len_count[dbuf->size - cw_start - 1]++; 1807 #endif 1808 i += n + 1; 1809 } 1810 #ifdef DUMP_TABLE_SIZE 1811 printf("script: %d entries [", cw_count); 1812 for(i = 0; i < 4; i++) 1813 printf(" %d", cw_len_count[i]); 1814 printf(" ], length=%d bytes\n", (int)dbuf->size); 1815 #endif 1816 1817 dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size); 1818 1819 dbuf_free(dbuf); 1820 } 1821 1822 void build_script_ext_table(FILE *f) 1823 { 1824 int i, j, n, n1, script_ext_len; 1825 DynBuf dbuf_s, *dbuf = &dbuf_s; 1826 #if defined(DUMP_TABLE_SIZE) 1827 int cw_count = 0; 1828 #endif 1829 1830 dbuf_init(dbuf); 1831 for(i = 0; i <= CHARCODE_MAX;) { 1832 script_ext_len = unicode_db[i].script_ext_len; 1833 j = i + 1; 1834 while (j <= CHARCODE_MAX && 1835 unicode_db[j].script_ext_len == script_ext_len && 1836 !memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext, 1837 script_ext_len)) { 1838 j++; 1839 } 1840 n = j - i; 1841 #if defined(DUMP_TABLE_SIZE) 1842 cw_count++; 1843 #endif 1844 n--; 1845 if (n < 128) { 1846 dbuf_putc(dbuf, n); 1847 } else if (n < 128 + (1 << 14)) { 1848 n1 = n - 128; 1849 assert(n1 < (1 << 14)); 1850 dbuf_putc(dbuf, (n1 >> 8) + 128); 1851 dbuf_putc(dbuf, n1); 1852 } else { 1853 n1 = n - (128 + (1 << 14)); 1854 assert(n1 < (1 << 22)); 1855 dbuf_putc(dbuf, (n1 >> 16) + 128 + 64); 1856 dbuf_putc(dbuf, n1 >> 8); 1857 dbuf_putc(dbuf, n1); 1858 } 1859 dbuf_putc(dbuf, script_ext_len); 1860 for(j = 0; j < script_ext_len; j++) 1861 dbuf_putc(dbuf, unicode_db[i].script_ext[j]); 1862 i += n + 1; 1863 } 1864 #ifdef DUMP_TABLE_SIZE 1865 printf("script_ext: %d entries", cw_count); 1866 printf(", length=%d bytes\n", (int)dbuf->size); 1867 #endif 1868 1869 dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size); 1870 1871 dbuf_free(dbuf); 1872 } 1873 1874 /* the following properties are synthetized so no table is necessary */ 1875 #define PROP_TABLE_COUNT PROP_ASCII 1876 1877 void build_prop_list_table(FILE *f) 1878 { 1879 int i; 1880 1881 for(i = 0; i < PROP_TABLE_COUNT; i++) { 1882 if (i == PROP_ID_Start || 1883 i == PROP_Case_Ignorable || 1884 i == PROP_ID_Continue1) { 1885 /* already generated */ 1886 } else { 1887 build_prop_table(f, unicode_prop_name[i], i, FALSE); 1888 } 1889 } 1890 1891 fprintf(f, "typedef enum {\n"); 1892 for(i = 0; i < PROP_COUNT; i++) 1893 fprintf(f, " UNICODE_PROP_%s,\n", unicode_prop_name[i]); 1894 fprintf(f, " UNICODE_PROP_COUNT,\n"); 1895 fprintf(f, "} UnicodePropertyEnum;\n\n"); 1896 1897 i = PROP_ASCII_Hex_Digit; 1898 dump_name_table(f, "unicode_prop_name_table", 1899 unicode_prop_name + i, PROP_XID_Start - i + 1, 1900 unicode_prop_short_name + i); 1901 1902 fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n"); 1903 for(i = 0; i < PROP_TABLE_COUNT; i++) { 1904 fprintf(f, " unicode_prop_%s_table,\n", unicode_prop_name[i]); 1905 } 1906 fprintf(f, "};\n\n"); 1907 1908 fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n"); 1909 for(i = 0; i < PROP_TABLE_COUNT; i++) { 1910 fprintf(f, " countof(unicode_prop_%s_table),\n", unicode_prop_name[i]); 1911 } 1912 fprintf(f, "};\n\n"); 1913 } 1914 1915 #ifdef USE_TEST 1916 int check_conv(uint32_t *res, uint32_t c, int conv_type) 1917 { 1918 return lre_case_conv(res, c, conv_type); 1919 } 1920 1921 void check_case_conv(void) 1922 { 1923 CCInfo *tab = unicode_db; 1924 uint32_t res[3]; 1925 int l, error; 1926 CCInfo ci_s, *ci1, *ci = &ci_s; 1927 int code; 1928 1929 for(code = 0; code <= CHARCODE_MAX; code++) { 1930 ci1 = &tab[code]; 1931 *ci = *ci1; 1932 if (ci->l_len == 0) { 1933 ci->l_len = 1; 1934 ci->l_data[0] = code; 1935 } 1936 if (ci->u_len == 0) { 1937 ci->u_len = 1; 1938 ci->u_data[0] = code; 1939 } 1940 if (ci->f_len == 0) { 1941 ci->f_len = 1; 1942 ci->f_data[0] = code; 1943 } 1944 1945 error = 0; 1946 l = check_conv(res, code, 0); 1947 if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) { 1948 printf("ERROR: L\n"); 1949 error++; 1950 } 1951 l = check_conv(res, code, 1); 1952 if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) { 1953 printf("ERROR: U\n"); 1954 error++; 1955 } 1956 l = check_conv(res, code, 2); 1957 if (l != ci->f_len || tabcmp((int *)res, ci->f_data, l)) { 1958 printf("ERROR: F\n"); 1959 error++; 1960 } 1961 if (error) { 1962 dump_cc_info(ci, code); 1963 exit(1); 1964 } 1965 } 1966 } 1967 1968 #ifdef PROFILE 1969 static int64_t get_time_ns(void) 1970 { 1971 struct timespec ts; 1972 clock_gettime(CLOCK_MONOTONIC, &ts); 1973 return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec; 1974 } 1975 #endif 1976 1977 1978 void check_flags(void) 1979 { 1980 int c; 1981 BOOL flag_ref, flag; 1982 for(c = 0; c <= CHARCODE_MAX; c++) { 1983 flag_ref = get_prop(c, PROP_Cased); 1984 flag = !!lre_is_cased(c); 1985 if (flag != flag_ref) { 1986 printf("ERROR: c=%05x cased=%d ref=%d\n", 1987 c, flag, flag_ref); 1988 exit(1); 1989 } 1990 1991 flag_ref = get_prop(c, PROP_Case_Ignorable); 1992 flag = !!lre_is_case_ignorable(c); 1993 if (flag != flag_ref) { 1994 printf("ERROR: c=%05x case_ignorable=%d ref=%d\n", 1995 c, flag, flag_ref); 1996 exit(1); 1997 } 1998 1999 flag_ref = get_prop(c, PROP_ID_Start); 2000 flag = !!lre_is_id_start(c); 2001 if (flag != flag_ref) { 2002 printf("ERROR: c=%05x id_start=%d ref=%d\n", 2003 c, flag, flag_ref); 2004 exit(1); 2005 } 2006 2007 flag_ref = get_prop(c, PROP_ID_Continue); 2008 flag = !!lre_is_id_continue(c); 2009 if (flag != flag_ref) { 2010 printf("ERROR: c=%05x id_cont=%d ref=%d\n", 2011 c, flag, flag_ref); 2012 exit(1); 2013 } 2014 } 2015 #ifdef PROFILE 2016 { 2017 int64_t ti, count; 2018 ti = get_time_ns(); 2019 count = 0; 2020 for(c = 0x20; c <= 0xffff; c++) { 2021 flag_ref = get_prop(c, PROP_ID_Start); 2022 flag = !!lre_is_id_start(c); 2023 assert(flag == flag_ref); 2024 count++; 2025 } 2026 ti = get_time_ns() - ti; 2027 printf("flags time=%0.1f ns/char\n", 2028 (double)ti / count); 2029 } 2030 #endif 2031 } 2032 2033 #endif 2034 2035 #define CC_BLOCK_LEN 32 2036 2037 void build_cc_table(FILE *f) 2038 { 2039 // Compress combining class table 2040 // see: https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values 2041 int i, cc, n, type, n1, block_end_pos; 2042 DynBuf dbuf_s, *dbuf = &dbuf_s; 2043 DynBuf dbuf1_s, *dbuf1 = &dbuf1_s; 2044 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) 2045 int cw_len_tab[3], cw_start, cc_table_len; 2046 #endif 2047 uint32_t v; 2048 2049 dbuf_init(dbuf); 2050 dbuf_init(dbuf1); 2051 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) 2052 cc_table_len = 0; 2053 for(i = 0; i < countof(cw_len_tab); i++) 2054 cw_len_tab[i] = 0; 2055 #endif 2056 block_end_pos = CC_BLOCK_LEN; 2057 for(i = 0; i <= CHARCODE_MAX;) { 2058 cc = unicode_db[i].combining_class; 2059 assert(cc <= 255); 2060 /* check increasing values */ 2061 n = 1; 2062 while ((i + n) <= CHARCODE_MAX && 2063 unicode_db[i + n].combining_class == (cc + n)) 2064 n++; 2065 if (n >= 2) { 2066 type = 1; 2067 } else { 2068 type = 0; 2069 n = 1; 2070 while ((i + n) <= CHARCODE_MAX && 2071 unicode_db[i + n].combining_class == cc) 2072 n++; 2073 } 2074 /* no need to encode the last run */ 2075 if (cc == 0 && (i + n - 1) == CHARCODE_MAX) 2076 break; 2077 #ifdef DUMP_CC_TABLE 2078 printf("%05x %6d %d %d\n", i, n, type, cc); 2079 #endif 2080 if (type == 0) { 2081 if (cc == 0) 2082 type = 2; 2083 else if (cc == 230) 2084 type = 3; 2085 } 2086 n1 = n - 1; 2087 2088 /* add an entry to the index if necessary */ 2089 if (dbuf->size >= block_end_pos) { 2090 v = i | ((dbuf->size - block_end_pos) << 21); 2091 dbuf_putc(dbuf1, v); 2092 dbuf_putc(dbuf1, v >> 8); 2093 dbuf_putc(dbuf1, v >> 16); 2094 block_end_pos += CC_BLOCK_LEN; 2095 } 2096 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) 2097 cw_start = dbuf->size; 2098 #endif 2099 /* Compressed run length encoding: 2100 - 2 high order bits are combining class type 2101 - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte 2102 - 00..2F: range length (add 1) 2103 - 30..37: 3-bit range-length + 1 extra byte 2104 - 38..3F: 3-bit range-length + 2 extra byte 2105 */ 2106 if (n1 < 48) { 2107 dbuf_putc(dbuf, n1 | (type << 6)); 2108 } else if (n1 < 48 + (1 << 11)) { 2109 n1 -= 48; 2110 dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6)); 2111 dbuf_putc(dbuf, n1); 2112 } else { 2113 n1 -= 48 + (1 << 11); 2114 assert(n1 < (1 << 20)); 2115 dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6)); 2116 dbuf_putc(dbuf, n1 >> 8); 2117 dbuf_putc(dbuf, n1); 2118 } 2119 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) 2120 cw_len_tab[dbuf->size - cw_start - 1]++; 2121 cc_table_len++; 2122 #endif 2123 if (type == 0 || type == 1) 2124 dbuf_putc(dbuf, cc); 2125 i += n; 2126 } 2127 2128 /* last index entry */ 2129 v = i; 2130 dbuf_putc(dbuf1, v); 2131 dbuf_putc(dbuf1, v >> 8); 2132 dbuf_putc(dbuf1, v >> 16); 2133 2134 dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size); 2135 dump_index_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size); 2136 2137 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) 2138 printf("CC table: size=%d (%d entries) [", 2139 (int)(dbuf->size + dbuf1->size), 2140 cc_table_len); 2141 for(i = 0; i < countof(cw_len_tab); i++) 2142 printf(" %d", cw_len_tab[i]); 2143 printf(" ]\n"); 2144 #endif 2145 dbuf_free(dbuf); 2146 dbuf_free(dbuf1); 2147 } 2148 2149 /* maximum length of decomposition: 18 chars (1), then 8 */ 2150 #ifndef USE_TEST 2151 typedef enum { 2152 DECOMP_TYPE_C1, /* 16 bit char */ 2153 DECOMP_TYPE_L1, /* 16 bit char table */ 2154 DECOMP_TYPE_L2, 2155 DECOMP_TYPE_L3, 2156 DECOMP_TYPE_L4, 2157 DECOMP_TYPE_L5, /* XXX: not used */ 2158 DECOMP_TYPE_L6, /* XXX: could remove */ 2159 DECOMP_TYPE_L7, /* XXX: could remove */ 2160 DECOMP_TYPE_LL1, /* 18 bit char table */ 2161 DECOMP_TYPE_LL2, 2162 DECOMP_TYPE_S1, /* 8 bit char table */ 2163 DECOMP_TYPE_S2, 2164 DECOMP_TYPE_S3, 2165 DECOMP_TYPE_S4, 2166 DECOMP_TYPE_S5, 2167 DECOMP_TYPE_I1, /* increment 16 bit char value */ 2168 DECOMP_TYPE_I2_0, 2169 DECOMP_TYPE_I2_1, 2170 DECOMP_TYPE_I3_1, 2171 DECOMP_TYPE_I3_2, 2172 DECOMP_TYPE_I4_1, 2173 DECOMP_TYPE_I4_2, 2174 DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */ 2175 DECOMP_TYPE_B2, 2176 DECOMP_TYPE_B3, 2177 DECOMP_TYPE_B4, 2178 DECOMP_TYPE_B5, 2179 DECOMP_TYPE_B6, 2180 DECOMP_TYPE_B7, 2181 DECOMP_TYPE_B8, 2182 DECOMP_TYPE_B18, 2183 DECOMP_TYPE_LS2, 2184 DECOMP_TYPE_PAT3, 2185 DECOMP_TYPE_S2_UL, 2186 DECOMP_TYPE_LS2_UL, 2187 } DecompTypeEnum; 2188 #endif 2189 2190 const char *decomp_type_str[] = { 2191 "C1", 2192 "L1", 2193 "L2", 2194 "L3", 2195 "L4", 2196 "L5", 2197 "L6", 2198 "L7", 2199 "LL1", 2200 "LL2", 2201 "S1", 2202 "S2", 2203 "S3", 2204 "S4", 2205 "S5", 2206 "I1", 2207 "I2_0", 2208 "I2_1", 2209 "I3_1", 2210 "I3_2", 2211 "I4_1", 2212 "I4_2", 2213 "B1", 2214 "B2", 2215 "B3", 2216 "B4", 2217 "B5", 2218 "B6", 2219 "B7", 2220 "B8", 2221 "B18", 2222 "LS2", 2223 "PAT3", 2224 "S2_UL", 2225 "LS2_UL", 2226 }; 2227 2228 const int decomp_incr_tab[4][4] = { 2229 { DECOMP_TYPE_I1, 0, -1 }, 2230 { DECOMP_TYPE_I2_0, 0, 1, -1 }, 2231 { DECOMP_TYPE_I3_1, 1, 2, -1 }, 2232 { DECOMP_TYPE_I4_1, 1, 2, -1 }, 2233 }; 2234 2235 /* 2236 entry size: 2237 type bits 2238 code 18 2239 len 7 2240 compat 1 2241 type 5 2242 index 16 2243 total 47 2244 */ 2245 2246 typedef struct { 2247 int code; 2248 uint8_t len; 2249 uint8_t type; 2250 uint8_t c_len; 2251 uint16_t c_min; 2252 uint16_t data_index; 2253 int cost; /* size in bytes from this entry to the end */ 2254 } DecompEntry; 2255 2256 int get_decomp_run_size(const DecompEntry *de) 2257 { 2258 int s; 2259 s = 6; 2260 if (de->type <= DECOMP_TYPE_C1) { 2261 /* nothing more */ 2262 } else if (de->type <= DECOMP_TYPE_L7) { 2263 s += de->len * de->c_len * 2; 2264 } else if (de->type <= DECOMP_TYPE_LL2) { 2265 /* 18 bits per char */ 2266 s += (de->len * de->c_len * 18 + 7) / 8; 2267 } else if (de->type <= DECOMP_TYPE_S5) { 2268 s += de->len * de->c_len; 2269 } else if (de->type <= DECOMP_TYPE_I4_2) { 2270 s += de->c_len * 2; 2271 } else if (de->type <= DECOMP_TYPE_B18) { 2272 s += 2 + de->len * de->c_len; 2273 } else if (de->type <= DECOMP_TYPE_LS2) { 2274 s += de->len * 3; 2275 } else if (de->type <= DECOMP_TYPE_PAT3) { 2276 s += 4 + de->len * 2; 2277 } else if (de->type <= DECOMP_TYPE_S2_UL) { 2278 s += de->len; 2279 } else if (de->type <= DECOMP_TYPE_LS2_UL) { 2280 s += (de->len / 2) * 3; 2281 } else { 2282 abort(); 2283 } 2284 return s; 2285 } 2286 2287 static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 }; 2288 2289 /* return -1 if not found */ 2290 int get_short_code(int c) 2291 { 2292 int i; 2293 if (c < 0x80) { 2294 return c; 2295 } else if (c >= 0x300 && c < 0x350) { 2296 return c - 0x300 + 0x80; 2297 } else { 2298 for(i = 0; i < countof(unicode_short_table); i++) { 2299 if (c == unicode_short_table[i]) 2300 return i + 0x80 + 0x50; 2301 } 2302 return -1; 2303 } 2304 } 2305 2306 static BOOL is_short(int code) 2307 { 2308 return get_short_code(code) >= 0; 2309 } 2310 2311 static BOOL is_short_tab(const int *tab, int len) 2312 { 2313 int i; 2314 for(i = 0; i < len; i++) { 2315 if (!is_short(tab[i])) 2316 return FALSE; 2317 } 2318 return TRUE; 2319 } 2320 2321 static BOOL is_16bit(const int *tab, int len) 2322 { 2323 int i; 2324 for(i = 0; i < len; i++) { 2325 if (tab[i] > 0xffff) 2326 return FALSE; 2327 } 2328 return TRUE; 2329 } 2330 2331 static uint32_t to_lower_simple(uint32_t c) 2332 { 2333 /* Latin1 and Cyrillic */ 2334 if (c < 0x100 || (c >= 0x410 && c <= 0x42f)) 2335 c += 0x20; 2336 else 2337 c++; 2338 return c; 2339 } 2340 2341 /* select best encoding with dynamic programming */ 2342 void find_decomp_run(DecompEntry *tab_de, int i) 2343 { 2344 DecompEntry de_s, *de = &de_s; 2345 CCInfo *ci, *ci1, *ci2; 2346 int l, j, n, len_max; 2347 2348 ci = &unicode_db[i]; 2349 l = ci->decomp_len; 2350 if (l == 0) { 2351 tab_de[i].cost = tab_de[i + 1].cost; 2352 return; 2353 } 2354 2355 /* the offset for the compose table has only 6 bits, so we must 2356 limit if it can be used by the compose table */ 2357 if (!ci->is_compat && !ci->is_excluded && l == 2) 2358 len_max = 64; 2359 else 2360 len_max = 127; 2361 2362 tab_de[i].cost = 0x7fffffff; 2363 2364 if (!is_16bit(ci->decomp_data, l)) { 2365 assert(l <= 2); 2366 2367 n = 1; 2368 for(;;) { 2369 de->code = i; 2370 de->len = n; 2371 de->type = DECOMP_TYPE_LL1 + l - 1; 2372 de->c_len = l; 2373 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; 2374 if (de->cost < tab_de[i].cost) { 2375 tab_de[i] = *de; 2376 } 2377 if (!((i + n) <= CHARCODE_MAX && n < len_max)) 2378 break; 2379 ci1 = &unicode_db[i + n]; 2380 /* Note: we accept a hole */ 2381 if (!(ci1->decomp_len == 0 || 2382 (ci1->decomp_len == l && 2383 ci1->is_compat == ci->is_compat))) 2384 break; 2385 n++; 2386 } 2387 return; 2388 } 2389 2390 if (l <= 7) { 2391 n = 1; 2392 for(;;) { 2393 de->code = i; 2394 de->len = n; 2395 if (l == 1 && n == 1) { 2396 de->type = DECOMP_TYPE_C1; 2397 } else { 2398 assert(l <= 8); 2399 de->type = DECOMP_TYPE_L1 + l - 1; 2400 } 2401 de->c_len = l; 2402 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; 2403 if (de->cost < tab_de[i].cost) { 2404 tab_de[i] = *de; 2405 } 2406 2407 if (!((i + n) <= CHARCODE_MAX && n < len_max)) 2408 break; 2409 ci1 = &unicode_db[i + n]; 2410 /* Note: we accept a hole */ 2411 if (!(ci1->decomp_len == 0 || 2412 (ci1->decomp_len == l && 2413 ci1->is_compat == ci->is_compat && 2414 is_16bit(ci1->decomp_data, l)))) 2415 break; 2416 n++; 2417 } 2418 } 2419 2420 if (l <= 8 || l == 18) { 2421 int c_min, c_max, c; 2422 c_min = c_max = -1; 2423 n = 1; 2424 for(;;) { 2425 ci1 = &unicode_db[i + n - 1]; 2426 for(j = 0; j < l; j++) { 2427 c = ci1->decomp_data[j]; 2428 if (c == 0x20) { 2429 /* we accept space for Arabic */ 2430 } else if (c_min == -1) { 2431 c_min = c_max = c; 2432 } else { 2433 c_min = min_int(c_min, c); 2434 c_max = max_int(c_max, c); 2435 } 2436 } 2437 if ((c_max - c_min) > 254) 2438 break; 2439 de->code = i; 2440 de->len = n; 2441 if (l == 18) 2442 de->type = DECOMP_TYPE_B18; 2443 else 2444 de->type = DECOMP_TYPE_B1 + l - 1; 2445 de->c_len = l; 2446 de->c_min = c_min; 2447 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; 2448 if (de->cost < tab_de[i].cost) { 2449 tab_de[i] = *de; 2450 } 2451 if (!((i + n) <= CHARCODE_MAX && n < len_max)) 2452 break; 2453 ci1 = &unicode_db[i + n]; 2454 if (!(ci1->decomp_len == l && 2455 ci1->is_compat == ci->is_compat)) 2456 break; 2457 n++; 2458 } 2459 } 2460 2461 /* find an ascii run */ 2462 if (l <= 5 && is_short_tab(ci->decomp_data, l)) { 2463 n = 1; 2464 for(;;) { 2465 de->code = i; 2466 de->len = n; 2467 de->type = DECOMP_TYPE_S1 + l - 1; 2468 de->c_len = l; 2469 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; 2470 if (de->cost < tab_de[i].cost) { 2471 tab_de[i] = *de; 2472 } 2473 2474 if (!((i + n) <= CHARCODE_MAX && n < len_max)) 2475 break; 2476 ci1 = &unicode_db[i + n]; 2477 /* Note: we accept a hole */ 2478 if (!(ci1->decomp_len == 0 || 2479 (ci1->decomp_len == l && 2480 ci1->is_compat == ci->is_compat && 2481 is_short_tab(ci1->decomp_data, l)))) 2482 break; 2483 n++; 2484 } 2485 } 2486 2487 /* check if a single char is increasing */ 2488 if (l <= 4) { 2489 int idx1, idx; 2490 2491 for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) { 2492 n = 1; 2493 for(;;) { 2494 de->code = i; 2495 de->len = n; 2496 de->type = decomp_incr_tab[l - 1][0] + idx1 - 1; 2497 de->c_len = l; 2498 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; 2499 if (de->cost < tab_de[i].cost) { 2500 tab_de[i] = *de; 2501 } 2502 2503 if (!((i + n) <= CHARCODE_MAX && n < len_max)) 2504 break; 2505 ci1 = &unicode_db[i + n]; 2506 if (!(ci1->decomp_len == l && 2507 ci1->is_compat == ci->is_compat)) 2508 goto next1; 2509 for(j = 0; j < l; j++) { 2510 if (j == idx) { 2511 if (ci1->decomp_data[j] != ci->decomp_data[j] + n) 2512 goto next1; 2513 } else { 2514 if (ci1->decomp_data[j] != ci->decomp_data[j]) 2515 goto next1; 2516 } 2517 } 2518 n++; 2519 } 2520 next1: ; 2521 } 2522 } 2523 2524 if (l == 3) { 2525 n = 1; 2526 for(;;) { 2527 de->code = i; 2528 de->len = n; 2529 de->type = DECOMP_TYPE_PAT3; 2530 de->c_len = l; 2531 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; 2532 if (de->cost < tab_de[i].cost) { 2533 tab_de[i] = *de; 2534 } 2535 if (!((i + n) <= CHARCODE_MAX && n < len_max)) 2536 break; 2537 ci1 = &unicode_db[i + n]; 2538 if (!(ci1->decomp_len == l && 2539 ci1->is_compat == ci->is_compat && 2540 ci1->decomp_data[1] <= 0xffff && 2541 ci1->decomp_data[0] == ci->decomp_data[0] && 2542 ci1->decomp_data[l - 1] == ci->decomp_data[l - 1])) 2543 break; 2544 n++; 2545 } 2546 } 2547 2548 if (l == 2 && is_short(ci->decomp_data[1])) { 2549 n = 1; 2550 for(;;) { 2551 de->code = i; 2552 de->len = n; 2553 de->type = DECOMP_TYPE_LS2; 2554 de->c_len = l; 2555 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; 2556 if (de->cost < tab_de[i].cost) { 2557 tab_de[i] = *de; 2558 } 2559 if (!((i + n) <= CHARCODE_MAX && n < len_max)) 2560 break; 2561 ci1 = &unicode_db[i + n]; 2562 if (!(ci1->decomp_len == 0 || 2563 (ci1->decomp_len == l && 2564 ci1->is_compat == ci->is_compat && 2565 ci1->decomp_data[0] <= 0xffff && 2566 is_short(ci1->decomp_data[1])))) 2567 break; 2568 n++; 2569 } 2570 } 2571 2572 if (l == 2) { 2573 BOOL is_16bit; 2574 2575 n = 0; 2576 is_16bit = FALSE; 2577 for(;;) { 2578 if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max)) 2579 break; 2580 ci1 = &unicode_db[i + n]; 2581 if (!(ci1->decomp_len == l && 2582 ci1->is_compat == ci->is_compat && 2583 is_short(ci1->decomp_data[1]))) 2584 break; 2585 if (!is_16bit && !is_short(ci1->decomp_data[0])) 2586 is_16bit = TRUE; 2587 ci2 = &unicode_db[i + n + 1]; 2588 if (!(ci2->decomp_len == l && 2589 ci2->is_compat == ci->is_compat && 2590 ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0]) && 2591 ci2->decomp_data[1] == ci1->decomp_data[1])) 2592 break; 2593 n += 2; 2594 de->code = i; 2595 de->len = n; 2596 de->type = DECOMP_TYPE_S2_UL + is_16bit; 2597 de->c_len = l; 2598 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; 2599 if (de->cost < tab_de[i].cost) { 2600 tab_de[i] = *de; 2601 } 2602 } 2603 } 2604 } 2605 2606 void put16(uint8_t *data_buf, int *pidx, uint16_t c) 2607 { 2608 int idx; 2609 idx = *pidx; 2610 data_buf[idx++] = c; 2611 data_buf[idx++] = c >> 8; 2612 *pidx = idx; 2613 } 2614 2615 void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de) 2616 { 2617 int i, j, idx, c; 2618 CCInfo *ci; 2619 2620 idx = *pidx; 2621 de->data_index = idx; 2622 if (de->type <= DECOMP_TYPE_C1) { 2623 ci = &unicode_db[de->code]; 2624 assert(ci->decomp_len == 1); 2625 de->data_index = ci->decomp_data[0]; 2626 } else if (de->type <= DECOMP_TYPE_L7) { 2627 for(i = 0; i < de->len; i++) { 2628 ci = &unicode_db[de->code + i]; 2629 for(j = 0; j < de->c_len; j++) { 2630 if (ci->decomp_len == 0) 2631 c = 0; 2632 else 2633 c = ci->decomp_data[j]; 2634 put16(data_buf, &idx, c); 2635 } 2636 } 2637 } else if (de->type <= DECOMP_TYPE_LL2) { 2638 int n, p, k; 2639 n = (de->len * de->c_len * 18 + 7) / 8; 2640 p = de->len * de->c_len * 2; 2641 memset(data_buf + idx, 0, n); 2642 k = 0; 2643 for(i = 0; i < de->len; i++) { 2644 ci = &unicode_db[de->code + i]; 2645 for(j = 0; j < de->c_len; j++) { 2646 if (ci->decomp_len == 0) 2647 c = 0; 2648 else 2649 c = ci->decomp_data[j]; 2650 data_buf[idx + k * 2] = c; 2651 data_buf[idx + k * 2 + 1] = c >> 8; 2652 data_buf[idx + p + (k / 4)] |= (c >> 16) << ((k % 4) * 2); 2653 k++; 2654 } 2655 } 2656 idx += n; 2657 } else if (de->type <= DECOMP_TYPE_S5) { 2658 for(i = 0; i < de->len; i++) { 2659 ci = &unicode_db[de->code + i]; 2660 for(j = 0; j < de->c_len; j++) { 2661 if (ci->decomp_len == 0) 2662 c = 0; 2663 else 2664 c = ci->decomp_data[j]; 2665 c = get_short_code(c); 2666 assert(c >= 0); 2667 data_buf[idx++] = c; 2668 } 2669 } 2670 } else if (de->type <= DECOMP_TYPE_I4_2) { 2671 ci = &unicode_db[de->code]; 2672 assert(ci->decomp_len == de->c_len); 2673 for(j = 0; j < de->c_len; j++) 2674 put16(data_buf, &idx, ci->decomp_data[j]); 2675 } else if (de->type <= DECOMP_TYPE_B18) { 2676 c = de->c_min; 2677 data_buf[idx++] = c; 2678 data_buf[idx++] = c >> 8; 2679 for(i = 0; i < de->len; i++) { 2680 ci = &unicode_db[de->code + i]; 2681 for(j = 0; j < de->c_len; j++) { 2682 assert(ci->decomp_len == de->c_len); 2683 c = ci->decomp_data[j]; 2684 if (c == 0x20) { 2685 c = 0xff; 2686 } else { 2687 c -= de->c_min; 2688 assert((uint32_t)c <= 254); 2689 } 2690 data_buf[idx++] = c; 2691 } 2692 } 2693 } else if (de->type <= DECOMP_TYPE_LS2) { 2694 assert(de->c_len == 2); 2695 for(i = 0; i < de->len; i++) { 2696 ci = &unicode_db[de->code + i]; 2697 if (ci->decomp_len == 0) 2698 c = 0; 2699 else 2700 c = ci->decomp_data[0]; 2701 put16(data_buf, &idx, c); 2702 2703 if (ci->decomp_len == 0) 2704 c = 0; 2705 else 2706 c = ci->decomp_data[1]; 2707 c = get_short_code(c); 2708 assert(c >= 0); 2709 data_buf[idx++] = c; 2710 } 2711 } else if (de->type <= DECOMP_TYPE_PAT3) { 2712 ci = &unicode_db[de->code]; 2713 assert(ci->decomp_len == 3); 2714 put16(data_buf, &idx, ci->decomp_data[0]); 2715 put16(data_buf, &idx, ci->decomp_data[2]); 2716 for(i = 0; i < de->len; i++) { 2717 ci = &unicode_db[de->code + i]; 2718 assert(ci->decomp_len == 3); 2719 put16(data_buf, &idx, ci->decomp_data[1]); 2720 } 2721 } else if (de->type <= DECOMP_TYPE_S2_UL) { 2722 for(i = 0; i < de->len; i += 2) { 2723 ci = &unicode_db[de->code + i]; 2724 c = ci->decomp_data[0]; 2725 c = get_short_code(c); 2726 assert(c >= 0); 2727 data_buf[idx++] = c; 2728 c = ci->decomp_data[1]; 2729 c = get_short_code(c); 2730 assert(c >= 0); 2731 data_buf[idx++] = c; 2732 } 2733 } else if (de->type <= DECOMP_TYPE_LS2_UL) { 2734 for(i = 0; i < de->len; i += 2) { 2735 ci = &unicode_db[de->code + i]; 2736 c = ci->decomp_data[0]; 2737 put16(data_buf, &idx, c); 2738 c = ci->decomp_data[1]; 2739 c = get_short_code(c); 2740 assert(c >= 0); 2741 data_buf[idx++] = c; 2742 } 2743 } else { 2744 abort(); 2745 } 2746 *pidx = idx; 2747 } 2748 2749 #if 0 2750 void dump_large_char(void) 2751 { 2752 int i, j; 2753 for(i = 0; i <= CHARCODE_MAX; i++) { 2754 CCInfo *ci = &unicode_db[i]; 2755 for(j = 0; j < ci->decomp_len; j++) { 2756 if (ci->decomp_data[j] > 0xffff) 2757 printf("%05x\n", ci->decomp_data[j]); 2758 } 2759 } 2760 } 2761 #endif 2762 2763 void build_compose_table(FILE *f, const DecompEntry *tab_de); 2764 2765 void build_decompose_table(FILE *f) 2766 { 2767 int i, array_len, code_max, data_len, count; 2768 DecompEntry *tab_de, de_s, *de = &de_s; 2769 uint8_t *data_buf; 2770 2771 code_max = CHARCODE_MAX; 2772 2773 tab_de = mallocz((code_max + 2) * sizeof(*tab_de)); 2774 2775 for(i = code_max; i >= 0; i--) { 2776 find_decomp_run(tab_de, i); 2777 } 2778 2779 /* build the data buffer */ 2780 data_buf = malloc(100000); 2781 data_len = 0; 2782 array_len = 0; 2783 for(i = 0; i <= code_max; i++) { 2784 de = &tab_de[i]; 2785 if (de->len != 0) { 2786 add_decomp_data(data_buf, &data_len, de); 2787 i += de->len - 1; 2788 array_len++; 2789 } 2790 } 2791 2792 #ifdef DUMP_DECOMP_TABLE 2793 /* dump */ 2794 { 2795 int size, size1; 2796 2797 printf("START LEN TYPE L C SIZE\n"); 2798 size = 0; 2799 for(i = 0; i <= code_max; i++) { 2800 de = &tab_de[i]; 2801 if (de->len != 0) { 2802 size1 = get_decomp_run_size(de); 2803 printf("%05x %3d %6s %2d %1d %4d\n", i, de->len, 2804 decomp_type_str[de->type], de->c_len, 2805 unicode_db[i].is_compat, size1); 2806 i += de->len - 1; 2807 size += size1; 2808 } 2809 } 2810 2811 printf("array_len=%d estimated size=%d bytes actual=%d bytes\n", 2812 array_len, size, array_len * 6 + data_len); 2813 } 2814 #endif 2815 2816 total_tables++; 2817 total_table_bytes += array_len * sizeof(uint32_t); 2818 fprintf(f, "static const uint32_t unicode_decomp_table1[%d] = {", array_len); 2819 count = 0; 2820 for(i = 0; i <= code_max; i++) { 2821 de = &tab_de[i]; 2822 if (de->len != 0) { 2823 uint32_t v; 2824 if (count++ % 4 == 0) 2825 fprintf(f, "\n "); 2826 v = (de->code << (32 - 18)) | 2827 (de->len << (32 - 18 - 7)) | 2828 (de->type << (32 - 18 - 7 - 6)) | 2829 unicode_db[de->code].is_compat; 2830 fprintf(f, " 0x%08x,", v); 2831 i += de->len - 1; 2832 } 2833 } 2834 fprintf(f, "\n};\n\n"); 2835 2836 total_tables++; 2837 total_table_bytes += array_len * sizeof(uint16_t); 2838 fprintf(f, "static const uint16_t unicode_decomp_table2[%d] = {", array_len); 2839 count = 0; 2840 for(i = 0; i <= code_max; i++) { 2841 de = &tab_de[i]; 2842 if (de->len != 0) { 2843 if (count++ % 8 == 0) 2844 fprintf(f, "\n "); 2845 fprintf(f, " 0x%04x,", de->data_index); 2846 i += de->len - 1; 2847 } 2848 } 2849 fprintf(f, "\n};\n\n"); 2850 2851 total_tables++; 2852 total_table_bytes += data_len; 2853 fprintf(f, "static const uint8_t unicode_decomp_data[%d] = {", data_len); 2854 for(i = 0; i < data_len; i++) { 2855 if (i % 8 == 0) 2856 fprintf(f, "\n "); 2857 fprintf(f, " 0x%02x,", data_buf[i]); 2858 } 2859 fprintf(f, "\n};\n\n"); 2860 2861 build_compose_table(f, tab_de); 2862 2863 free(data_buf); 2864 2865 free(tab_de); 2866 } 2867 2868 typedef struct { 2869 uint32_t c[2]; 2870 uint32_t p; 2871 } ComposeEntry; 2872 2873 #define COMPOSE_LEN_MAX 10000 2874 2875 static int ce_cmp(const void *p1, const void *p2) 2876 { 2877 const ComposeEntry *ce1 = p1; 2878 const ComposeEntry *ce2 = p2; 2879 int i; 2880 2881 for(i = 0; i < 2; i++) { 2882 if (ce1->c[i] < ce2->c[i]) 2883 return -1; 2884 else if (ce1->c[i] > ce2->c[i]) 2885 return 1; 2886 } 2887 return 0; 2888 } 2889 2890 2891 static int get_decomp_pos(const DecompEntry *tab_de, int c) 2892 { 2893 int i, v, k; 2894 const DecompEntry *de; 2895 2896 k = 0; 2897 for(i = 0; i <= CHARCODE_MAX; i++) { 2898 de = &tab_de[i]; 2899 if (de->len != 0) { 2900 if (c >= de->code && c < de->code + de->len) { 2901 v = c - de->code; 2902 assert(v < 64); 2903 v |= k << 6; 2904 assert(v < 65536); 2905 return v; 2906 } 2907 i += de->len - 1; 2908 k++; 2909 } 2910 } 2911 return -1; 2912 } 2913 2914 void build_compose_table(FILE *f, const DecompEntry *tab_de) 2915 { 2916 int i, v, tab_ce_len; 2917 ComposeEntry *ce, *tab_ce; 2918 2919 tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX); 2920 tab_ce_len = 0; 2921 for(i = 0; i <= CHARCODE_MAX; i++) { 2922 CCInfo *ci = &unicode_db[i]; 2923 if (ci->decomp_len == 2 && !ci->is_compat && 2924 !ci->is_excluded) { 2925 assert(tab_ce_len < COMPOSE_LEN_MAX); 2926 ce = &tab_ce[tab_ce_len++]; 2927 ce->c[0] = ci->decomp_data[0]; 2928 ce->c[1] = ci->decomp_data[1]; 2929 ce->p = i; 2930 } 2931 } 2932 qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp); 2933 2934 #if 0 2935 { 2936 printf("tab_ce_len=%d\n", tab_ce_len); 2937 for(i = 0; i < tab_ce_len; i++) { 2938 ce = &tab_ce[i]; 2939 printf("%05x %05x %05x\n", ce->c[0], ce->c[1], ce->p); 2940 } 2941 } 2942 #endif 2943 2944 total_tables++; 2945 total_table_bytes += tab_ce_len * sizeof(uint16_t); 2946 fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", tab_ce_len); 2947 for(i = 0; i < tab_ce_len; i++) { 2948 if (i % 8 == 0) 2949 fprintf(f, "\n "); 2950 v = get_decomp_pos(tab_de, tab_ce[i].p); 2951 if (v < 0) { 2952 printf("ERROR: entry for c=%04x not found\n", 2953 tab_ce[i].p); 2954 exit(1); 2955 } 2956 fprintf(f, " 0x%04x,", v); 2957 } 2958 fprintf(f, "\n};\n\n"); 2959 2960 free(tab_ce); 2961 } 2962 2963 #ifdef USE_TEST 2964 void check_decompose_table(void) 2965 { 2966 int c; 2967 CCInfo *ci; 2968 int res[UNICODE_DECOMP_LEN_MAX], *ref; 2969 int len, ref_len, is_compat; 2970 2971 for(is_compat = 0; is_compat <= 1; is_compat++) { 2972 for(c = 0; c < CHARCODE_MAX; c++) { 2973 ci = &unicode_db[c]; 2974 ref_len = ci->decomp_len; 2975 ref = ci->decomp_data; 2976 if (!is_compat && ci->is_compat) { 2977 ref_len = 0; 2978 } 2979 len = unicode_decomp_char((uint32_t *)res, c, is_compat); 2980 if (len != ref_len || 2981 tabcmp(res, ref, ref_len) != 0) { 2982 printf("ERROR c=%05x compat=%d\n", c, is_compat); 2983 dump_str("res", res, len); 2984 dump_str("ref", ref, ref_len); 2985 exit(1); 2986 } 2987 } 2988 } 2989 } 2990 2991 void check_compose_table(void) 2992 { 2993 int i, p; 2994 /* XXX: we don't test all the cases */ 2995 2996 for(i = 0; i <= CHARCODE_MAX; i++) { 2997 CCInfo *ci = &unicode_db[i]; 2998 if (ci->decomp_len == 2 && !ci->is_compat && 2999 !ci->is_excluded) { 3000 p = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]); 3001 if (p != i) { 3002 printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n", 3003 ci->decomp_data[0], ci->decomp_data[1], p, i); 3004 exit(1); 3005 } 3006 } 3007 } 3008 3009 3010 3011 } 3012 3013 #endif 3014 3015 3016 3017 #ifdef USE_TEST 3018 3019 void check_str(const char *msg, int num, const int *in_buf, int in_len, 3020 const int *buf1, int len1, 3021 const int *buf2, int len2) 3022 { 3023 if (len1 != len2 || tabcmp(buf1, buf2, len1) != 0) { 3024 printf("%d: ERROR %s:\n", num, msg); 3025 dump_str(" in", in_buf, in_len); 3026 dump_str("res", buf1, len1); 3027 dump_str("ref", buf2, len2); 3028 exit(1); 3029 } 3030 } 3031 3032 void check_cc_table(void) 3033 { 3034 int cc, cc_ref, c; 3035 3036 for(c = 0; c <= CHARCODE_MAX; c++) { 3037 cc_ref = unicode_db[c].combining_class; 3038 cc = unicode_get_cc(c); 3039 if (cc != cc_ref) { 3040 printf("ERROR: c=%04x cc=%d cc_ref=%d\n", 3041 c, cc, cc_ref); 3042 exit(1); 3043 } 3044 } 3045 #ifdef PROFILE 3046 { 3047 int64_t ti, count; 3048 3049 ti = get_time_ns(); 3050 count = 0; 3051 /* only do it on meaningful chars */ 3052 for(c = 0x20; c <= 0xffff; c++) { 3053 cc_ref = unicode_db[c].combining_class; 3054 cc = unicode_get_cc(c); 3055 count++; 3056 } 3057 ti = get_time_ns() - ti; 3058 printf("cc time=%0.1f ns/char\n", 3059 (double)ti / count); 3060 } 3061 #endif 3062 } 3063 3064 void normalization_test(const char *filename) 3065 { 3066 FILE *f; 3067 char line[4096], *p; 3068 int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str; 3069 int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len; 3070 int *buf, buf_len, pos; 3071 3072 f = fopen(filename, "rb"); 3073 if (!f) { 3074 perror(filename); 3075 exit(1); 3076 } 3077 pos = 0; 3078 for(;;) { 3079 if (!get_line(line, sizeof(line), f)) 3080 break; 3081 pos++; 3082 p = line; 3083 while (isspace(*p)) 3084 p++; 3085 if (*p == '#' || *p == '@') 3086 continue; 3087 in_str = get_field_str(&in_len, p, 0); 3088 nfc_str = get_field_str(&nfc_len, p, 1); 3089 nfd_str = get_field_str(&nfd_len, p, 2); 3090 nfkc_str = get_field_str(&nfkc_len, p, 3); 3091 nfkd_str = get_field_str(&nfkd_len, p, 4); 3092 3093 // dump_str("in", in_str, in_len); 3094 3095 buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL); 3096 check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len); 3097 free(buf); 3098 3099 buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL); 3100 check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len); 3101 free(buf); 3102 3103 buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL); 3104 check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len); 3105 free(buf); 3106 3107 buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL); 3108 check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len); 3109 free(buf); 3110 3111 free(in_str); 3112 free(nfc_str); 3113 free(nfd_str); 3114 free(nfkc_str); 3115 free(nfkd_str); 3116 } 3117 fclose(f); 3118 } 3119 #endif 3120 3121 int main(int argc, char *argv[]) 3122 { 3123 const char *unicode_db_path, *outfilename; 3124 char filename[1024]; 3125 int arg = 1; 3126 3127 if (arg >= argc || (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help"))) { 3128 printf("usage: %s PATH [OUTPUT]\n" 3129 " PATH path to the Unicode database directory\n" 3130 " OUTPUT name of the output file. If omitted, a self test is performed\n" 3131 " using the files from the Unicode library\n" 3132 , argv[0]); 3133 return 1; 3134 } 3135 unicode_db_path = argv[arg++]; 3136 outfilename = NULL; 3137 if (arg < argc) 3138 outfilename = argv[arg++]; 3139 3140 unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1)); 3141 3142 snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path); 3143 3144 parse_unicode_data(filename); 3145 3146 snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path); 3147 parse_special_casing(unicode_db, filename); 3148 3149 snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path); 3150 parse_case_folding(unicode_db, filename); 3151 3152 snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path); 3153 parse_composition_exclusions(filename); 3154 3155 snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path); 3156 parse_derived_core_properties(filename); 3157 3158 snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path); 3159 parse_derived_norm_properties(filename); 3160 3161 snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path); 3162 parse_prop_list(filename); 3163 3164 snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path); 3165 parse_scripts(filename); 3166 3167 snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt", 3168 unicode_db_path); 3169 parse_script_extensions(filename); 3170 3171 snprintf(filename, sizeof(filename), "%s/emoji-data.txt", 3172 unicode_db_path); 3173 parse_prop_list(filename); 3174 3175 // dump_unicode_data(unicode_db); 3176 build_conv_table(unicode_db); 3177 3178 #ifdef DUMP_CASE_FOLDING_SPECIAL_CASES 3179 dump_case_folding_special_cases(unicode_db); 3180 #endif 3181 3182 if (!outfilename) { 3183 #ifdef USE_TEST 3184 check_case_conv(); 3185 check_flags(); 3186 check_decompose_table(); 3187 check_compose_table(); 3188 check_cc_table(); 3189 snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path); 3190 normalization_test(filename); 3191 #else 3192 fprintf(stderr, "Tests are not compiled\n"); 3193 exit(1); 3194 #endif 3195 } else 3196 { 3197 FILE *fo = fopen(outfilename, "wb"); 3198 3199 if (!fo) { 3200 perror(outfilename); 3201 exit(1); 3202 } 3203 fprintf(fo, 3204 "/* Compressed unicode tables */\n" 3205 "/* Automatically generated file - do not edit */\n" 3206 "\n" 3207 "#include <stdint.h>\n" 3208 "\n"); 3209 dump_case_conv_table(fo); 3210 compute_internal_props(); 3211 build_flags_tables(fo); 3212 fprintf(fo, "#ifdef CONFIG_ALL_UNICODE\n\n"); 3213 build_cc_table(fo); 3214 build_decompose_table(fo); 3215 build_general_category_table(fo); 3216 build_script_table(fo); 3217 build_script_ext_table(fo); 3218 build_prop_list_table(fo); 3219 fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n"); 3220 fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n", 3221 total_tables, total_table_bytes, total_index, total_index_bytes); 3222 fclose(fo); 3223 } 3224 return 0; 3225 }