libunicode.c (56291B)
1 /* 2 * Unicode utilities 3 * 4 * Copyright (c) 2017-2018 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <stdarg.h> 27 #include <string.h> 28 #include <assert.h> 29 30 #include "cutils.h" 31 #include "libunicode.h" 32 #include "libunicode-table.h" 33 34 enum { 35 RUN_TYPE_U, 36 RUN_TYPE_L, 37 RUN_TYPE_UF, 38 RUN_TYPE_LF, 39 RUN_TYPE_UL, 40 RUN_TYPE_LSU, 41 RUN_TYPE_U2L_399_EXT2, 42 RUN_TYPE_UF_D20, 43 RUN_TYPE_UF_D1_EXT, 44 RUN_TYPE_U_EXT, 45 RUN_TYPE_LF_EXT, 46 RUN_TYPE_UF_EXT2, 47 RUN_TYPE_LF_EXT2, 48 RUN_TYPE_UF_EXT3, 49 }; 50 51 static int lre_case_conv1(uint32_t c, int conv_type) 52 { 53 uint32_t res[LRE_CC_RES_LEN_MAX]; 54 lre_case_conv(res, c, conv_type); 55 return res[0]; 56 } 57 58 /* case conversion using the table entry 'idx' with value 'v' */ 59 static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v) 60 { 61 uint32_t code, data, type, a, is_lower; 62 is_lower = (conv_type != 0); 63 type = (v >> (32 - 17 - 7 - 4)) & 0xf; 64 data = ((v & 0xf) << 8) | case_conv_table2[idx]; 65 code = v >> (32 - 17); 66 switch(type) { 67 case RUN_TYPE_U: 68 case RUN_TYPE_L: 69 case RUN_TYPE_UF: 70 case RUN_TYPE_LF: 71 if (conv_type == (type & 1) || 72 (type >= RUN_TYPE_UF && conv_type == 2)) { 73 c = c - code + (case_conv_table1[data] >> (32 - 17)); 74 } 75 break; 76 case RUN_TYPE_UL: 77 a = c - code; 78 if ((a & 1) != (1 - is_lower)) 79 break; 80 c = (a ^ 1) + code; 81 break; 82 case RUN_TYPE_LSU: 83 a = c - code; 84 if (a == 1) { 85 c += 2 * is_lower - 1; 86 } else if (a == (1 - is_lower) * 2) { 87 c += (2 * is_lower - 1) * 2; 88 } 89 break; 90 case RUN_TYPE_U2L_399_EXT2: 91 if (!is_lower) { 92 res[0] = c - code + case_conv_ext[data >> 6]; 93 res[1] = 0x399; 94 return 2; 95 } else { 96 c = c - code + case_conv_ext[data & 0x3f]; 97 } 98 break; 99 case RUN_TYPE_UF_D20: 100 if (conv_type == 1) 101 break; 102 c = data + (conv_type == 2) * 0x20; 103 break; 104 case RUN_TYPE_UF_D1_EXT: 105 if (conv_type == 1) 106 break; 107 c = case_conv_ext[data] + (conv_type == 2); 108 break; 109 case RUN_TYPE_U_EXT: 110 case RUN_TYPE_LF_EXT: 111 if (is_lower != (type - RUN_TYPE_U_EXT)) 112 break; 113 c = case_conv_ext[data]; 114 break; 115 case RUN_TYPE_LF_EXT2: 116 if (!is_lower) 117 break; 118 res[0] = c - code + case_conv_ext[data >> 6]; 119 res[1] = case_conv_ext[data & 0x3f]; 120 return 2; 121 case RUN_TYPE_UF_EXT2: 122 if (conv_type == 1) 123 break; 124 res[0] = c - code + case_conv_ext[data >> 6]; 125 res[1] = case_conv_ext[data & 0x3f]; 126 if (conv_type == 2) { 127 /* convert to lower */ 128 res[0] = lre_case_conv1(res[0], 1); 129 res[1] = lre_case_conv1(res[1], 1); 130 } 131 return 2; 132 default: 133 case RUN_TYPE_UF_EXT3: 134 if (conv_type == 1) 135 break; 136 res[0] = case_conv_ext[data >> 8]; 137 res[1] = case_conv_ext[(data >> 4) & 0xf]; 138 res[2] = case_conv_ext[data & 0xf]; 139 if (conv_type == 2) { 140 /* convert to lower */ 141 res[0] = lre_case_conv1(res[0], 1); 142 res[1] = lre_case_conv1(res[1], 1); 143 res[2] = lre_case_conv1(res[2], 1); 144 } 145 return 3; 146 } 147 res[0] = c; 148 return 1; 149 } 150 151 /* conv_type: 152 0 = to upper 153 1 = to lower 154 2 = case folding (= to lower with modifications) 155 */ 156 int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) 157 { 158 if (c < 128) { 159 if (conv_type) { 160 if (c >= 'A' && c <= 'Z') { 161 c = c - 'A' + 'a'; 162 } 163 } else { 164 if (c >= 'a' && c <= 'z') { 165 c = c - 'a' + 'A'; 166 } 167 } 168 } else { 169 uint32_t v, code, len; 170 int idx, idx_min, idx_max; 171 172 idx_min = 0; 173 idx_max = countof(case_conv_table1) - 1; 174 while (idx_min <= idx_max) { 175 idx = (unsigned)(idx_max + idx_min) / 2; 176 v = case_conv_table1[idx]; 177 code = v >> (32 - 17); 178 len = (v >> (32 - 17 - 7)) & 0x7f; 179 if (c < code) { 180 idx_max = idx - 1; 181 } else if (c >= code + len) { 182 idx_min = idx + 1; 183 } else { 184 return lre_case_conv_entry(res, c, conv_type, idx, v); 185 } 186 } 187 } 188 res[0] = c; 189 return 1; 190 } 191 192 static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode) 193 { 194 uint32_t res[LRE_CC_RES_LEN_MAX]; 195 int len; 196 197 if (is_unicode) { 198 len = lre_case_conv_entry(res, c, 2, idx, v); 199 if (len == 1) { 200 c = res[0]; 201 } else { 202 /* handle the few specific multi-character cases (see 203 unicode_gen.c:dump_case_folding_special_cases()) */ 204 if (c == 0xfb06) { 205 c = 0xfb05; 206 } else if (c == 0x01fd3) { 207 c = 0x390; 208 } else if (c == 0x01fe3) { 209 c = 0x3b0; 210 } 211 } 212 } else { 213 if (likely(c < 128)) { 214 if (c >= 'a' && c <= 'z') 215 c = c - 'a' + 'A'; 216 } else { 217 /* legacy regexp: to upper case if single char >= 128 */ 218 len = lre_case_conv_entry(res, c, FALSE, idx, v); 219 if (len == 1 && res[0] >= 128) 220 c = res[0]; 221 } 222 } 223 return c; 224 } 225 226 /* JS regexp specific rules for case folding */ 227 int lre_canonicalize(uint32_t c, BOOL is_unicode) 228 { 229 if (c < 128) { 230 /* fast case */ 231 if (is_unicode) { 232 if (c >= 'A' && c <= 'Z') { 233 c = c - 'A' + 'a'; 234 } 235 } else { 236 if (c >= 'a' && c <= 'z') { 237 c = c - 'a' + 'A'; 238 } 239 } 240 } else { 241 uint32_t v, code, len; 242 int idx, idx_min, idx_max; 243 244 idx_min = 0; 245 idx_max = countof(case_conv_table1) - 1; 246 while (idx_min <= idx_max) { 247 idx = (unsigned)(idx_max + idx_min) / 2; 248 v = case_conv_table1[idx]; 249 code = v >> (32 - 17); 250 len = (v >> (32 - 17 - 7)) & 0x7f; 251 if (c < code) { 252 idx_max = idx - 1; 253 } else if (c >= code + len) { 254 idx_min = idx + 1; 255 } else { 256 return lre_case_folding_entry(c, idx, v, is_unicode); 257 } 258 } 259 } 260 return c; 261 } 262 263 static uint32_t get_le24(const uint8_t *ptr) 264 { 265 return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16); 266 } 267 268 #define UNICODE_INDEX_BLOCK_LEN 32 269 270 /* return -1 if not in table, otherwise the offset in the block */ 271 static int get_index_pos(uint32_t *pcode, uint32_t c, 272 const uint8_t *index_table, int index_table_len) 273 { 274 uint32_t code, v; 275 int idx_min, idx_max, idx; 276 277 idx_min = 0; 278 v = get_le24(index_table); 279 code = v & ((1 << 21) - 1); 280 if (c < code) { 281 *pcode = 0; 282 return 0; 283 } 284 idx_max = index_table_len - 1; 285 code = get_le24(index_table + idx_max * 3); 286 if (c >= code) 287 return -1; 288 /* invariant: tab[idx_min] <= c < tab2[idx_max] */ 289 while ((idx_max - idx_min) > 1) { 290 idx = (idx_max + idx_min) / 2; 291 v = get_le24(index_table + idx * 3); 292 code = v & ((1 << 21) - 1); 293 if (c < code) { 294 idx_max = idx; 295 } else { 296 idx_min = idx; 297 } 298 } 299 v = get_le24(index_table + idx_min * 3); 300 *pcode = v & ((1 << 21) - 1); 301 return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21); 302 } 303 304 static BOOL lre_is_in_table(uint32_t c, const uint8_t *table, 305 const uint8_t *index_table, int index_table_len) 306 { 307 uint32_t code, b, bit; 308 int pos; 309 const uint8_t *p; 310 311 pos = get_index_pos(&code, c, index_table, index_table_len); 312 if (pos < 0) 313 return FALSE; /* outside the table */ 314 p = table + pos; 315 bit = 0; 316 /* Compressed run length encoding: 317 00..3F: 2 packed lengths: 3-bit + 3-bit 318 40..5F: 5-bits plus extra byte for length 319 60..7F: 5-bits plus 2 extra bytes for length 320 80..FF: 7-bit length 321 lengths must be incremented to get character count 322 Ranges alternate between false and true return value. 323 */ 324 for(;;) { 325 b = *p++; 326 if (b < 64) { 327 code += (b >> 3) + 1; 328 if (c < code) 329 return bit; 330 bit ^= 1; 331 code += (b & 7) + 1; 332 } else if (b >= 0x80) { 333 code += b - 0x80 + 1; 334 } else if (b < 0x60) { 335 code += (((b - 0x40) << 8) | p[0]) + 1; 336 p++; 337 } else { 338 code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1; 339 p += 2; 340 } 341 if (c < code) 342 return bit; 343 bit ^= 1; 344 } 345 } 346 347 BOOL lre_is_cased(uint32_t c) 348 { 349 uint32_t v, code, len; 350 int idx, idx_min, idx_max; 351 352 idx_min = 0; 353 idx_max = countof(case_conv_table1) - 1; 354 while (idx_min <= idx_max) { 355 idx = (unsigned)(idx_max + idx_min) / 2; 356 v = case_conv_table1[idx]; 357 code = v >> (32 - 17); 358 len = (v >> (32 - 17 - 7)) & 0x7f; 359 if (c < code) { 360 idx_max = idx - 1; 361 } else if (c >= code + len) { 362 idx_min = idx + 1; 363 } else { 364 return TRUE; 365 } 366 } 367 return lre_is_in_table(c, unicode_prop_Cased1_table, 368 unicode_prop_Cased1_index, 369 sizeof(unicode_prop_Cased1_index) / 3); 370 } 371 372 BOOL lre_is_case_ignorable(uint32_t c) 373 { 374 return lre_is_in_table(c, unicode_prop_Case_Ignorable_table, 375 unicode_prop_Case_Ignorable_index, 376 sizeof(unicode_prop_Case_Ignorable_index) / 3); 377 } 378 379 /* character range */ 380 381 static __maybe_unused void cr_dump(CharRange *cr) 382 { 383 int i; 384 for(i = 0; i < cr->len; i++) 385 printf("%d: 0x%04x\n", i, cr->points[i]); 386 } 387 388 static void *cr_default_realloc(void *opaque, void *ptr, size_t size) 389 { 390 return realloc(ptr, size); 391 } 392 393 void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func) 394 { 395 cr->len = cr->size = 0; 396 cr->points = NULL; 397 cr->mem_opaque = mem_opaque; 398 cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc; 399 } 400 401 void cr_free(CharRange *cr) 402 { 403 cr->realloc_func(cr->mem_opaque, cr->points, 0); 404 } 405 406 int cr_realloc(CharRange *cr, int size) 407 { 408 int new_size; 409 uint32_t *new_buf; 410 411 if (size > cr->size) { 412 new_size = max_int(size, cr->size * 3 / 2); 413 new_buf = cr->realloc_func(cr->mem_opaque, cr->points, 414 new_size * sizeof(cr->points[0])); 415 if (!new_buf) 416 return -1; 417 cr->points = new_buf; 418 cr->size = new_size; 419 } 420 return 0; 421 } 422 423 int cr_copy(CharRange *cr, const CharRange *cr1) 424 { 425 if (cr_realloc(cr, cr1->len)) 426 return -1; 427 memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len); 428 cr->len = cr1->len; 429 return 0; 430 } 431 432 /* merge consecutive intervals and remove empty intervals */ 433 static void cr_compress(CharRange *cr) 434 { 435 int i, j, k, len; 436 uint32_t *pt; 437 438 pt = cr->points; 439 len = cr->len; 440 i = 0; 441 j = 0; 442 k = 0; 443 while ((i + 1) < len) { 444 if (pt[i] == pt[i + 1]) { 445 /* empty interval */ 446 i += 2; 447 } else { 448 j = i; 449 while ((j + 3) < len && pt[j + 1] == pt[j + 2]) 450 j += 2; 451 /* just copy */ 452 pt[k] = pt[i]; 453 pt[k + 1] = pt[j + 1]; 454 k += 2; 455 i = j + 2; 456 } 457 } 458 cr->len = k; 459 } 460 461 /* union or intersection */ 462 int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, 463 const uint32_t *b_pt, int b_len, int op) 464 { 465 int a_idx, b_idx, is_in; 466 uint32_t v; 467 468 a_idx = 0; 469 b_idx = 0; 470 for(;;) { 471 /* get one more point from a or b in increasing order */ 472 if (a_idx < a_len && b_idx < b_len) { 473 if (a_pt[a_idx] < b_pt[b_idx]) { 474 goto a_add; 475 } else if (a_pt[a_idx] == b_pt[b_idx]) { 476 v = a_pt[a_idx]; 477 a_idx++; 478 b_idx++; 479 } else { 480 goto b_add; 481 } 482 } else if (a_idx < a_len) { 483 a_add: 484 v = a_pt[a_idx++]; 485 } else if (b_idx < b_len) { 486 b_add: 487 v = b_pt[b_idx++]; 488 } else { 489 break; 490 } 491 /* add the point if the in/out status changes */ 492 switch(op) { 493 case CR_OP_UNION: 494 is_in = (a_idx & 1) | (b_idx & 1); 495 break; 496 case CR_OP_INTER: 497 is_in = (a_idx & 1) & (b_idx & 1); 498 break; 499 case CR_OP_XOR: 500 is_in = (a_idx & 1) ^ (b_idx & 1); 501 break; 502 default: 503 abort(); 504 } 505 if (is_in != (cr->len & 1)) { 506 if (cr_add_point(cr, v)) 507 return -1; 508 } 509 } 510 cr_compress(cr); 511 return 0; 512 } 513 514 int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len) 515 { 516 CharRange a = *cr; 517 int ret; 518 cr->len = 0; 519 cr->size = 0; 520 cr->points = NULL; 521 ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION); 522 cr_free(&a); 523 return ret; 524 } 525 526 int cr_invert(CharRange *cr) 527 { 528 int len; 529 len = cr->len; 530 if (cr_realloc(cr, len + 2)) 531 return -1; 532 memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0])); 533 cr->points[0] = 0; 534 cr->points[len + 1] = UINT32_MAX; 535 cr->len = len + 2; 536 cr_compress(cr); 537 return 0; 538 } 539 540 #ifdef CONFIG_ALL_UNICODE 541 542 BOOL lre_is_id_start(uint32_t c) 543 { 544 return lre_is_in_table(c, unicode_prop_ID_Start_table, 545 unicode_prop_ID_Start_index, 546 sizeof(unicode_prop_ID_Start_index) / 3); 547 } 548 549 BOOL lre_is_id_continue(uint32_t c) 550 { 551 return lre_is_id_start(c) || 552 lre_is_in_table(c, unicode_prop_ID_Continue1_table, 553 unicode_prop_ID_Continue1_index, 554 sizeof(unicode_prop_ID_Continue1_index) / 3); 555 } 556 557 #define UNICODE_DECOMP_LEN_MAX 18 558 559 typedef enum { 560 DECOMP_TYPE_C1, /* 16 bit char */ 561 DECOMP_TYPE_L1, /* 16 bit char table */ 562 DECOMP_TYPE_L2, 563 DECOMP_TYPE_L3, 564 DECOMP_TYPE_L4, 565 DECOMP_TYPE_L5, /* XXX: not used */ 566 DECOMP_TYPE_L6, /* XXX: could remove */ 567 DECOMP_TYPE_L7, /* XXX: could remove */ 568 DECOMP_TYPE_LL1, /* 18 bit char table */ 569 DECOMP_TYPE_LL2, 570 DECOMP_TYPE_S1, /* 8 bit char table */ 571 DECOMP_TYPE_S2, 572 DECOMP_TYPE_S3, 573 DECOMP_TYPE_S4, 574 DECOMP_TYPE_S5, 575 DECOMP_TYPE_I1, /* increment 16 bit char value */ 576 DECOMP_TYPE_I2_0, 577 DECOMP_TYPE_I2_1, 578 DECOMP_TYPE_I3_1, 579 DECOMP_TYPE_I3_2, 580 DECOMP_TYPE_I4_1, 581 DECOMP_TYPE_I4_2, 582 DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */ 583 DECOMP_TYPE_B2, 584 DECOMP_TYPE_B3, 585 DECOMP_TYPE_B4, 586 DECOMP_TYPE_B5, 587 DECOMP_TYPE_B6, 588 DECOMP_TYPE_B7, 589 DECOMP_TYPE_B8, 590 DECOMP_TYPE_B18, 591 DECOMP_TYPE_LS2, 592 DECOMP_TYPE_PAT3, 593 DECOMP_TYPE_S2_UL, 594 DECOMP_TYPE_LS2_UL, 595 } DecompTypeEnum; 596 597 static uint32_t unicode_get_short_code(uint32_t c) 598 { 599 static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 }; 600 601 if (c < 0x80) 602 return c; 603 else if (c < 0x80 + 0x50) 604 return c - 0x80 + 0x300; 605 else 606 return unicode_short_table[c - 0x80 - 0x50]; 607 } 608 609 static uint32_t unicode_get_lower_simple(uint32_t c) 610 { 611 if (c < 0x100 || (c >= 0x410 && c <= 0x42f)) 612 c += 0x20; 613 else 614 c++; 615 return c; 616 } 617 618 static uint16_t unicode_get16(const uint8_t *p) 619 { 620 return p[0] | (p[1] << 8); 621 } 622 623 static int unicode_decomp_entry(uint32_t *res, uint32_t c, 624 int idx, uint32_t code, uint32_t len, 625 uint32_t type) 626 { 627 uint32_t c1; 628 int l, i, p; 629 const uint8_t *d; 630 631 if (type == DECOMP_TYPE_C1) { 632 res[0] = unicode_decomp_table2[idx]; 633 return 1; 634 } else { 635 d = unicode_decomp_data + unicode_decomp_table2[idx]; 636 switch(type) { 637 case DECOMP_TYPE_L1: 638 case DECOMP_TYPE_L2: 639 case DECOMP_TYPE_L3: 640 case DECOMP_TYPE_L4: 641 case DECOMP_TYPE_L5: 642 case DECOMP_TYPE_L6: 643 case DECOMP_TYPE_L7: 644 l = type - DECOMP_TYPE_L1 + 1; 645 d += (c - code) * l * 2; 646 for(i = 0; i < l; i++) { 647 if ((res[i] = unicode_get16(d + 2 * i)) == 0) 648 return 0; 649 } 650 return l; 651 case DECOMP_TYPE_LL1: 652 case DECOMP_TYPE_LL2: 653 { 654 uint32_t k, p; 655 l = type - DECOMP_TYPE_LL1 + 1; 656 k = (c - code) * l; 657 p = len * l * 2; 658 for(i = 0; i < l; i++) { 659 c1 = unicode_get16(d + 2 * k) | 660 (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16); 661 if (!c1) 662 return 0; 663 res[i] = c1; 664 k++; 665 } 666 } 667 return l; 668 case DECOMP_TYPE_S1: 669 case DECOMP_TYPE_S2: 670 case DECOMP_TYPE_S3: 671 case DECOMP_TYPE_S4: 672 case DECOMP_TYPE_S5: 673 l = type - DECOMP_TYPE_S1 + 1; 674 d += (c - code) * l; 675 for(i = 0; i < l; i++) { 676 if ((res[i] = unicode_get_short_code(d[i])) == 0) 677 return 0; 678 } 679 return l; 680 case DECOMP_TYPE_I1: 681 l = 1; 682 p = 0; 683 goto decomp_type_i; 684 case DECOMP_TYPE_I2_0: 685 case DECOMP_TYPE_I2_1: 686 case DECOMP_TYPE_I3_1: 687 case DECOMP_TYPE_I3_2: 688 case DECOMP_TYPE_I4_1: 689 case DECOMP_TYPE_I4_2: 690 l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1); 691 p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2); 692 decomp_type_i: 693 for(i = 0; i < l; i++) { 694 c1 = unicode_get16(d + 2 * i); 695 if (i == p) 696 c1 += c - code; 697 res[i] = c1; 698 } 699 return l; 700 case DECOMP_TYPE_B18: 701 l = 18; 702 goto decomp_type_b; 703 case DECOMP_TYPE_B1: 704 case DECOMP_TYPE_B2: 705 case DECOMP_TYPE_B3: 706 case DECOMP_TYPE_B4: 707 case DECOMP_TYPE_B5: 708 case DECOMP_TYPE_B6: 709 case DECOMP_TYPE_B7: 710 case DECOMP_TYPE_B8: 711 l = type - DECOMP_TYPE_B1 + 1; 712 decomp_type_b: 713 { 714 uint32_t c_min; 715 c_min = unicode_get16(d); 716 d += 2 + (c - code) * l; 717 for(i = 0; i < l; i++) { 718 c1 = d[i]; 719 if (c1 == 0xff) 720 c1 = 0x20; 721 else 722 c1 += c_min; 723 res[i] = c1; 724 } 725 } 726 return l; 727 case DECOMP_TYPE_LS2: 728 d += (c - code) * 3; 729 if (!(res[0] = unicode_get16(d))) 730 return 0; 731 res[1] = unicode_get_short_code(d[2]); 732 return 2; 733 case DECOMP_TYPE_PAT3: 734 res[0] = unicode_get16(d); 735 res[2] = unicode_get16(d + 2); 736 d += 4 + (c - code) * 2; 737 res[1] = unicode_get16(d); 738 return 3; 739 case DECOMP_TYPE_S2_UL: 740 case DECOMP_TYPE_LS2_UL: 741 c1 = c - code; 742 if (type == DECOMP_TYPE_S2_UL) { 743 d += c1 & ~1; 744 c = unicode_get_short_code(*d); 745 d++; 746 } else { 747 d += (c1 >> 1) * 3; 748 c = unicode_get16(d); 749 d += 2; 750 } 751 if (c1 & 1) 752 c = unicode_get_lower_simple(c); 753 res[0] = c; 754 res[1] = unicode_get_short_code(*d); 755 return 2; 756 } 757 } 758 return 0; 759 } 760 761 762 /* return the length of the decomposition (length <= 763 UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */ 764 static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1) 765 { 766 uint32_t v, type, is_compat, code, len; 767 int idx_min, idx_max, idx; 768 769 idx_min = 0; 770 idx_max = countof(unicode_decomp_table1) - 1; 771 while (idx_min <= idx_max) { 772 idx = (idx_max + idx_min) / 2; 773 v = unicode_decomp_table1[idx]; 774 code = v >> (32 - 18); 775 len = (v >> (32 - 18 - 7)) & 0x7f; 776 // printf("idx=%d code=%05x len=%d\n", idx, code, len); 777 if (c < code) { 778 idx_max = idx - 1; 779 } else if (c >= code + len) { 780 idx_min = idx + 1; 781 } else { 782 is_compat = v & 1; 783 if (is_compat1 < is_compat) 784 break; 785 type = (v >> (32 - 18 - 7 - 6)) & 0x3f; 786 return unicode_decomp_entry(res, c, idx, code, len, type); 787 } 788 } 789 return 0; 790 } 791 792 /* return 0 if no pair found */ 793 static int unicode_compose_pair(uint32_t c0, uint32_t c1) 794 { 795 uint32_t code, len, type, v, idx1, d_idx, d_offset, ch; 796 int idx_min, idx_max, idx, d; 797 uint32_t pair[2]; 798 799 idx_min = 0; 800 idx_max = countof(unicode_comp_table) - 1; 801 while (idx_min <= idx_max) { 802 idx = (idx_max + idx_min) / 2; 803 idx1 = unicode_comp_table[idx]; 804 805 /* idx1 represent an entry of the decomposition table */ 806 d_idx = idx1 >> 6; 807 d_offset = idx1 & 0x3f; 808 v = unicode_decomp_table1[d_idx]; 809 code = v >> (32 - 18); 810 len = (v >> (32 - 18 - 7)) & 0x7f; 811 type = (v >> (32 - 18 - 7 - 6)) & 0x3f; 812 ch = code + d_offset; 813 unicode_decomp_entry(pair, ch, d_idx, code, len, type); 814 d = c0 - pair[0]; 815 if (d == 0) 816 d = c1 - pair[1]; 817 if (d < 0) { 818 idx_max = idx - 1; 819 } else if (d > 0) { 820 idx_min = idx + 1; 821 } else { 822 return ch; 823 } 824 } 825 return 0; 826 } 827 828 /* return the combining class of character c (between 0 and 255) */ 829 static int unicode_get_cc(uint32_t c) 830 { 831 uint32_t code, n, type, cc, c1, b; 832 int pos; 833 const uint8_t *p; 834 835 pos = get_index_pos(&code, c, 836 unicode_cc_index, sizeof(unicode_cc_index) / 3); 837 if (pos < 0) 838 return 0; 839 p = unicode_cc_table + pos; 840 /* Compressed run length encoding: 841 - 2 high order bits are combining class type 842 - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte 843 - 00..2F: range length (add 1) 844 - 30..37: 3-bit range-length + 1 extra byte 845 - 38..3F: 3-bit range-length + 2 extra byte 846 */ 847 for(;;) { 848 b = *p++; 849 type = b >> 6; 850 n = b & 0x3f; 851 if (n < 48) { 852 } else if (n < 56) { 853 n = (n - 48) << 8; 854 n |= *p++; 855 n += 48; 856 } else { 857 n = (n - 56) << 8; 858 n |= *p++ << 8; 859 n |= *p++; 860 n += 48 + (1 << 11); 861 } 862 if (type <= 1) 863 p++; 864 c1 = code + n + 1; 865 if (c < c1) { 866 switch(type) { 867 case 0: 868 cc = p[-1]; 869 break; 870 case 1: 871 cc = p[-1] + c - code; 872 break; 873 case 2: 874 cc = 0; 875 break; 876 default: 877 case 3: 878 cc = 230; 879 break; 880 } 881 return cc; 882 } 883 code = c1; 884 } 885 } 886 887 static void sort_cc(int *buf, int len) 888 { 889 int i, j, k, cc, cc1, start, ch1; 890 891 for(i = 0; i < len; i++) { 892 cc = unicode_get_cc(buf[i]); 893 if (cc != 0) { 894 start = i; 895 j = i + 1; 896 while (j < len) { 897 ch1 = buf[j]; 898 cc1 = unicode_get_cc(ch1); 899 if (cc1 == 0) 900 break; 901 k = j - 1; 902 while (k >= start) { 903 if (unicode_get_cc(buf[k]) <= cc1) 904 break; 905 buf[k + 1] = buf[k]; 906 k--; 907 } 908 buf[k + 1] = ch1; 909 j++; 910 } 911 #if 0 912 printf("cc:"); 913 for(k = start; k < j; k++) { 914 printf(" %3d", unicode_get_cc(buf[k])); 915 } 916 printf("\n"); 917 #endif 918 i = j; 919 } 920 } 921 } 922 923 static void to_nfd_rec(DynBuf *dbuf, 924 const int *src, int src_len, int is_compat) 925 { 926 uint32_t c, v; 927 int i, l; 928 uint32_t res[UNICODE_DECOMP_LEN_MAX]; 929 930 for(i = 0; i < src_len; i++) { 931 c = src[i]; 932 if (c >= 0xac00 && c < 0xd7a4) { 933 /* Hangul decomposition */ 934 c -= 0xac00; 935 dbuf_put_u32(dbuf, 0x1100 + c / 588); 936 dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28); 937 v = c % 28; 938 if (v != 0) 939 dbuf_put_u32(dbuf, 0x11a7 + v); 940 } else { 941 l = unicode_decomp_char(res, c, is_compat); 942 if (l) { 943 to_nfd_rec(dbuf, (int *)res, l, is_compat); 944 } else { 945 dbuf_put_u32(dbuf, c); 946 } 947 } 948 } 949 } 950 951 /* return 0 if not found */ 952 static int compose_pair(uint32_t c0, uint32_t c1) 953 { 954 /* Hangul composition */ 955 if (c0 >= 0x1100 && c0 < 0x1100 + 19 && 956 c1 >= 0x1161 && c1 < 0x1161 + 21) { 957 return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28; 958 } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 && 959 (c0 - 0xac00) % 28 == 0 && 960 c1 >= 0x11a7 && c1 < 0x11a7 + 28) { 961 return c0 + c1 - 0x11a7; 962 } else { 963 return unicode_compose_pair(c0, c1); 964 } 965 } 966 967 int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, 968 UnicodeNormalizationEnum n_type, 969 void *opaque, DynBufReallocFunc *realloc_func) 970 { 971 int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len; 972 BOOL is_compat; 973 DynBuf dbuf_s, *dbuf = &dbuf_s; 974 975 is_compat = n_type >> 1; 976 977 dbuf_init2(dbuf, opaque, realloc_func); 978 if (dbuf_realloc(dbuf, sizeof(int) * src_len)) 979 goto fail; 980 981 /* common case: latin1 is unaffected by NFC */ 982 if (n_type == UNICODE_NFC) { 983 for(i = 0; i < src_len; i++) { 984 if (src[i] >= 0x100) 985 goto not_latin1; 986 } 987 buf = (int *)dbuf->buf; 988 memcpy(buf, src, src_len * sizeof(int)); 989 *pdst = (uint32_t *)buf; 990 return src_len; 991 not_latin1: ; 992 } 993 994 to_nfd_rec(dbuf, (const int *)src, src_len, is_compat); 995 if (dbuf_error(dbuf)) { 996 fail: 997 *pdst = NULL; 998 return -1; 999 } 1000 buf = (int *)dbuf->buf; 1001 buf_len = dbuf->size / sizeof(int); 1002 1003 sort_cc(buf, buf_len); 1004 1005 if (buf_len <= 1 || (n_type & 1) != 0) { 1006 /* NFD / NFKD */ 1007 *pdst = (uint32_t *)buf; 1008 return buf_len; 1009 } 1010 1011 i = 1; 1012 out_len = 1; 1013 while (i < buf_len) { 1014 /* find the starter character and test if it is blocked from 1015 the character at 'i' */ 1016 last_cc = unicode_get_cc(buf[i]); 1017 starter_pos = out_len - 1; 1018 while (starter_pos >= 0) { 1019 cc = unicode_get_cc(buf[starter_pos]); 1020 if (cc == 0) 1021 break; 1022 if (cc >= last_cc) 1023 goto next; 1024 last_cc = 256; 1025 starter_pos--; 1026 } 1027 if (starter_pos >= 0 && 1028 (p = compose_pair(buf[starter_pos], buf[i])) != 0) { 1029 buf[starter_pos] = p; 1030 i++; 1031 } else { 1032 next: 1033 buf[out_len++] = buf[i++]; 1034 } 1035 } 1036 *pdst = (uint32_t *)buf; 1037 return out_len; 1038 } 1039 1040 /* char ranges for various unicode properties */ 1041 1042 static int unicode_find_name(const char *name_table, const char *name) 1043 { 1044 const char *p, *r; 1045 int pos; 1046 size_t name_len, len; 1047 1048 p = name_table; 1049 pos = 0; 1050 name_len = strlen(name); 1051 while (*p) { 1052 for(;;) { 1053 r = strchr(p, ','); 1054 if (!r) 1055 len = strlen(p); 1056 else 1057 len = r - p; 1058 if (len == name_len && !memcmp(p, name, name_len)) 1059 return pos; 1060 p += len + 1; 1061 if (!r) 1062 break; 1063 } 1064 pos++; 1065 } 1066 return -1; 1067 } 1068 1069 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 1070 if not found */ 1071 int unicode_script(CharRange *cr, 1072 const char *script_name, BOOL is_ext) 1073 { 1074 int script_idx; 1075 const uint8_t *p, *p_end; 1076 uint32_t c, c1, b, n, v, v_len, i, type; 1077 CharRange cr1_s, *cr1; 1078 CharRange cr2_s, *cr2 = &cr2_s; 1079 BOOL is_common; 1080 1081 script_idx = unicode_find_name(unicode_script_name_table, script_name); 1082 if (script_idx < 0) 1083 return -2; 1084 /* Note: we remove the "Unknown" Script */ 1085 script_idx += UNICODE_SCRIPT_Unknown + 1; 1086 1087 is_common = (script_idx == UNICODE_SCRIPT_Common || 1088 script_idx == UNICODE_SCRIPT_Inherited); 1089 if (is_ext) { 1090 cr1 = &cr1_s; 1091 cr_init(cr1, cr->mem_opaque, cr->realloc_func); 1092 cr_init(cr2, cr->mem_opaque, cr->realloc_func); 1093 } else { 1094 cr1 = cr; 1095 } 1096 1097 p = unicode_script_table; 1098 p_end = unicode_script_table + countof(unicode_script_table); 1099 c = 0; 1100 while (p < p_end) { 1101 b = *p++; 1102 type = b >> 7; 1103 n = b & 0x7f; 1104 if (n < 96) { 1105 } else if (n < 112) { 1106 n = (n - 96) << 8; 1107 n |= *p++; 1108 n += 96; 1109 } else { 1110 n = (n - 112) << 16; 1111 n |= *p++ << 8; 1112 n |= *p++; 1113 n += 96 + (1 << 12); 1114 } 1115 if (type == 0) 1116 v = 0; 1117 else 1118 v = *p++; 1119 c1 = c + n + 1; 1120 if (v == script_idx) { 1121 if (cr_add_interval(cr1, c, c1)) 1122 goto fail; 1123 } 1124 c = c1; 1125 } 1126 1127 if (is_ext) { 1128 /* add the script extensions */ 1129 p = unicode_script_ext_table; 1130 p_end = unicode_script_ext_table + countof(unicode_script_ext_table); 1131 c = 0; 1132 while (p < p_end) { 1133 b = *p++; 1134 if (b < 128) { 1135 n = b; 1136 } else if (b < 128 + 64) { 1137 n = (b - 128) << 8; 1138 n |= *p++; 1139 n += 128; 1140 } else { 1141 n = (b - 128 - 64) << 16; 1142 n |= *p++ << 8; 1143 n |= *p++; 1144 n += 128 + (1 << 14); 1145 } 1146 c1 = c + n + 1; 1147 v_len = *p++; 1148 if (is_common) { 1149 if (v_len != 0) { 1150 if (cr_add_interval(cr2, c, c1)) 1151 goto fail; 1152 } 1153 } else { 1154 for(i = 0; i < v_len; i++) { 1155 if (p[i] == script_idx) { 1156 if (cr_add_interval(cr2, c, c1)) 1157 goto fail; 1158 break; 1159 } 1160 } 1161 } 1162 p += v_len; 1163 c = c1; 1164 } 1165 if (is_common) { 1166 /* remove all the characters with script extensions */ 1167 if (cr_invert(cr2)) 1168 goto fail; 1169 if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len, 1170 CR_OP_INTER)) 1171 goto fail; 1172 } else { 1173 if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len, 1174 CR_OP_UNION)) 1175 goto fail; 1176 } 1177 cr_free(cr1); 1178 cr_free(cr2); 1179 } 1180 return 0; 1181 fail: 1182 if (is_ext) { 1183 cr_free(cr1); 1184 cr_free(cr2); 1185 } 1186 goto fail; 1187 } 1188 1189 #define M(id) (1U << UNICODE_GC_ ## id) 1190 1191 static int unicode_general_category1(CharRange *cr, uint32_t gc_mask) 1192 { 1193 const uint8_t *p, *p_end; 1194 uint32_t c, c0, b, n, v; 1195 1196 p = unicode_gc_table; 1197 p_end = unicode_gc_table + countof(unicode_gc_table); 1198 c = 0; 1199 /* Compressed range encoding: 1200 initial byte: 1201 bits 0..4: category number (special case 31) 1202 bits 5..7: range length (add 1) 1203 special case bits 5..7 == 7: read an extra byte 1204 - 00..7F: range length (add 7 + 1) 1205 - 80..BF: 6-bits plus extra byte for range length (add 7 + 128) 1206 - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384) 1207 */ 1208 while (p < p_end) { 1209 b = *p++; 1210 n = b >> 5; 1211 v = b & 0x1f; 1212 if (n == 7) { 1213 n = *p++; 1214 if (n < 128) { 1215 n += 7; 1216 } else if (n < 128 + 64) { 1217 n = (n - 128) << 8; 1218 n |= *p++; 1219 n += 7 + 128; 1220 } else { 1221 n = (n - 128 - 64) << 16; 1222 n |= *p++ << 8; 1223 n |= *p++; 1224 n += 7 + 128 + (1 << 14); 1225 } 1226 } 1227 c0 = c; 1228 c += n + 1; 1229 if (v == 31) { 1230 /* run of Lu / Ll */ 1231 b = gc_mask & (M(Lu) | M(Ll)); 1232 if (b != 0) { 1233 if (b == (M(Lu) | M(Ll))) { 1234 goto add_range; 1235 } else { 1236 c0 += ((gc_mask & M(Ll)) != 0); 1237 for(; c0 < c; c0 += 2) { 1238 if (cr_add_interval(cr, c0, c0 + 1)) 1239 return -1; 1240 } 1241 } 1242 } 1243 } else if ((gc_mask >> v) & 1) { 1244 add_range: 1245 if (cr_add_interval(cr, c0, c)) 1246 return -1; 1247 } 1248 } 1249 return 0; 1250 } 1251 1252 static int unicode_prop1(CharRange *cr, int prop_idx) 1253 { 1254 const uint8_t *p, *p_end; 1255 uint32_t c, c0, b, bit; 1256 1257 p = unicode_prop_table[prop_idx]; 1258 p_end = p + unicode_prop_len_table[prop_idx]; 1259 c = 0; 1260 bit = 0; 1261 /* Compressed range encoding: 1262 00..3F: 2 packed lengths: 3-bit + 3-bit 1263 40..5F: 5-bits plus extra byte for length 1264 60..7F: 5-bits plus 2 extra bytes for length 1265 80..FF: 7-bit length 1266 lengths must be incremented to get character count 1267 Ranges alternate between false and true return value. 1268 */ 1269 while (p < p_end) { 1270 c0 = c; 1271 b = *p++; 1272 if (b < 64) { 1273 c += (b >> 3) + 1; 1274 if (bit) { 1275 if (cr_add_interval(cr, c0, c)) 1276 return -1; 1277 } 1278 bit ^= 1; 1279 c0 = c; 1280 c += (b & 7) + 1; 1281 } else if (b >= 0x80) { 1282 c += b - 0x80 + 1; 1283 } else if (b < 0x60) { 1284 c += (((b - 0x40) << 8) | p[0]) + 1; 1285 p++; 1286 } else { 1287 c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1; 1288 p += 2; 1289 } 1290 if (bit) { 1291 if (cr_add_interval(cr, c0, c)) 1292 return -1; 1293 } 1294 bit ^= 1; 1295 } 1296 return 0; 1297 } 1298 1299 #define CASE_U (1 << 0) 1300 #define CASE_L (1 << 1) 1301 #define CASE_F (1 << 2) 1302 1303 /* use the case conversion table to generate range of characters. 1304 CASE_U: set char if modified by uppercasing, 1305 CASE_L: set char if modified by lowercasing, 1306 CASE_F: set char if modified by case folding, 1307 */ 1308 static int unicode_case1(CharRange *cr, int case_mask) 1309 { 1310 #define MR(x) (1 << RUN_TYPE_ ## x) 1311 const uint32_t tab_run_mask[3] = { 1312 MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) | 1313 MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3), 1314 1315 MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2), 1316 1317 MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3), 1318 }; 1319 #undef MR 1320 uint32_t mask, v, code, type, len, i, idx; 1321 1322 if (case_mask == 0) 1323 return 0; 1324 mask = 0; 1325 for(i = 0; i < 3; i++) { 1326 if ((case_mask >> i) & 1) 1327 mask |= tab_run_mask[i]; 1328 } 1329 for(idx = 0; idx < countof(case_conv_table1); idx++) { 1330 v = case_conv_table1[idx]; 1331 type = (v >> (32 - 17 - 7 - 4)) & 0xf; 1332 code = v >> (32 - 17); 1333 len = (v >> (32 - 17 - 7)) & 0x7f; 1334 if ((mask >> type) & 1) { 1335 // printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1); 1336 switch(type) { 1337 case RUN_TYPE_UL: 1338 if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F))) 1339 goto def_case; 1340 code += ((case_mask & CASE_U) != 0); 1341 for(i = 0; i < len; i += 2) { 1342 if (cr_add_interval(cr, code + i, code + i + 1)) 1343 return -1; 1344 } 1345 break; 1346 case RUN_TYPE_LSU: 1347 if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F))) 1348 goto def_case; 1349 if (!(case_mask & CASE_U)) { 1350 if (cr_add_interval(cr, code, code + 1)) 1351 return -1; 1352 } 1353 if (cr_add_interval(cr, code + 1, code + 2)) 1354 return -1; 1355 if (case_mask & CASE_U) { 1356 if (cr_add_interval(cr, code + 2, code + 3)) 1357 return -1; 1358 } 1359 break; 1360 default: 1361 def_case: 1362 if (cr_add_interval(cr, code, code + len)) 1363 return -1; 1364 break; 1365 } 1366 } 1367 } 1368 return 0; 1369 } 1370 1371 static int point_cmp(const void *p1, const void *p2, void *arg) 1372 { 1373 uint32_t v1 = *(uint32_t *)p1; 1374 uint32_t v2 = *(uint32_t *)p2; 1375 return (v1 > v2) - (v1 < v2); 1376 } 1377 1378 static void cr_sort_and_remove_overlap(CharRange *cr) 1379 { 1380 uint32_t start, end, start1, end1, i, j; 1381 1382 /* the resulting ranges are not necessarily sorted and may overlap */ 1383 rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL); 1384 j = 0; 1385 for(i = 0; i < cr->len; ) { 1386 start = cr->points[i]; 1387 end = cr->points[i + 1]; 1388 i += 2; 1389 while (i < cr->len) { 1390 start1 = cr->points[i]; 1391 end1 = cr->points[i + 1]; 1392 if (start1 > end) { 1393 /* |------| 1394 * |-------| */ 1395 break; 1396 } else if (end1 <= end) { 1397 /* |------| 1398 * |--| */ 1399 i += 2; 1400 } else { 1401 /* |------| 1402 * |-------| */ 1403 end = end1; 1404 i += 2; 1405 } 1406 } 1407 cr->points[j] = start; 1408 cr->points[j + 1] = end; 1409 j += 2; 1410 } 1411 cr->len = j; 1412 } 1413 1414 /* canonicalize a character set using the JS regex case folding rules 1415 (see lre_canonicalize()) */ 1416 int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode) 1417 { 1418 CharRange cr_inter, cr_mask, cr_result, cr_sub; 1419 uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d; 1420 1421 cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func); 1422 cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func); 1423 cr_init(&cr_result, cr->mem_opaque, cr->realloc_func); 1424 cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func); 1425 1426 if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U)) 1427 goto fail; 1428 if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) 1429 goto fail; 1430 1431 if (cr_invert(&cr_mask)) 1432 goto fail; 1433 if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) 1434 goto fail; 1435 1436 /* cr_inter = cr & cr_mask */ 1437 /* cr_sub = cr & ~cr_mask */ 1438 1439 /* use the case conversion table to compute the result */ 1440 d_start = -1; 1441 d_end = -1; 1442 idx = 0; 1443 v = case_conv_table1[idx]; 1444 code = v >> (32 - 17); 1445 len = (v >> (32 - 17 - 7)) & 0x7f; 1446 for(i = 0; i < cr_inter.len; i += 2) { 1447 start = cr_inter.points[i]; 1448 end = cr_inter.points[i + 1]; 1449 1450 for(c = start; c < end; c++) { 1451 for(;;) { 1452 if (c >= code && c < code + len) 1453 break; 1454 idx++; 1455 assert(idx < countof(case_conv_table1)); 1456 v = case_conv_table1[idx]; 1457 code = v >> (32 - 17); 1458 len = (v >> (32 - 17 - 7)) & 0x7f; 1459 } 1460 d = lre_case_folding_entry(c, idx, v, is_unicode); 1461 /* try to merge with the current interval */ 1462 if (d_start == -1) { 1463 d_start = d; 1464 d_end = d + 1; 1465 } else if (d_end == d) { 1466 d_end++; 1467 } else { 1468 cr_add_interval(&cr_result, d_start, d_end); 1469 d_start = d; 1470 d_end = d + 1; 1471 } 1472 } 1473 } 1474 if (d_start != -1) { 1475 if (cr_add_interval(&cr_result, d_start, d_end)) 1476 goto fail; 1477 } 1478 1479 /* the resulting ranges are not necessarily sorted and may overlap */ 1480 cr_sort_and_remove_overlap(&cr_result); 1481 1482 /* or with the character not affected by the case folding */ 1483 cr->len = 0; 1484 if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION)) 1485 goto fail; 1486 1487 cr_free(&cr_inter); 1488 cr_free(&cr_mask); 1489 cr_free(&cr_result); 1490 cr_free(&cr_sub); 1491 return 0; 1492 fail: 1493 cr_free(&cr_inter); 1494 cr_free(&cr_mask); 1495 cr_free(&cr_result); 1496 cr_free(&cr_sub); 1497 return -1; 1498 } 1499 1500 typedef enum { 1501 POP_GC, 1502 POP_PROP, 1503 POP_CASE, 1504 POP_UNION, 1505 POP_INTER, 1506 POP_XOR, 1507 POP_INVERT, 1508 POP_END, 1509 } PropOPEnum; 1510 1511 #define POP_STACK_LEN_MAX 4 1512 1513 static int unicode_prop_ops(CharRange *cr, ...) 1514 { 1515 va_list ap; 1516 CharRange stack[POP_STACK_LEN_MAX]; 1517 int stack_len, op, ret, i; 1518 uint32_t a; 1519 1520 va_start(ap, cr); 1521 stack_len = 0; 1522 for(;;) { 1523 op = va_arg(ap, int); 1524 switch(op) { 1525 case POP_GC: 1526 assert(stack_len < POP_STACK_LEN_MAX); 1527 a = va_arg(ap, int); 1528 cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); 1529 if (unicode_general_category1(&stack[stack_len - 1], a)) 1530 goto fail; 1531 break; 1532 case POP_PROP: 1533 assert(stack_len < POP_STACK_LEN_MAX); 1534 a = va_arg(ap, int); 1535 cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); 1536 if (unicode_prop1(&stack[stack_len - 1], a)) 1537 goto fail; 1538 break; 1539 case POP_CASE: 1540 assert(stack_len < POP_STACK_LEN_MAX); 1541 a = va_arg(ap, int); 1542 cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); 1543 if (unicode_case1(&stack[stack_len - 1], a)) 1544 goto fail; 1545 break; 1546 case POP_UNION: 1547 case POP_INTER: 1548 case POP_XOR: 1549 { 1550 CharRange *cr1, *cr2, *cr3; 1551 assert(stack_len >= 2); 1552 assert(stack_len < POP_STACK_LEN_MAX); 1553 cr1 = &stack[stack_len - 2]; 1554 cr2 = &stack[stack_len - 1]; 1555 cr3 = &stack[stack_len++]; 1556 cr_init(cr3, cr->mem_opaque, cr->realloc_func); 1557 if (cr_op(cr3, cr1->points, cr1->len, 1558 cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION)) 1559 goto fail; 1560 cr_free(cr1); 1561 cr_free(cr2); 1562 *cr1 = *cr3; 1563 stack_len -= 2; 1564 } 1565 break; 1566 case POP_INVERT: 1567 assert(stack_len >= 1); 1568 if (cr_invert(&stack[stack_len - 1])) 1569 goto fail; 1570 break; 1571 case POP_END: 1572 goto done; 1573 default: 1574 abort(); 1575 } 1576 } 1577 done: 1578 assert(stack_len == 1); 1579 ret = cr_copy(cr, &stack[0]); 1580 cr_free(&stack[0]); 1581 return ret; 1582 fail: 1583 for(i = 0; i < stack_len; i++) 1584 cr_free(&stack[i]); 1585 return -1; 1586 } 1587 1588 static const uint32_t unicode_gc_mask_table[] = { 1589 M(Lu) | M(Ll) | M(Lt), /* LC */ 1590 M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */ 1591 M(Mn) | M(Mc) | M(Me), /* M */ 1592 M(Nd) | M(Nl) | M(No), /* N */ 1593 M(Sm) | M(Sc) | M(Sk) | M(So), /* S */ 1594 M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */ 1595 M(Zs) | M(Zl) | M(Zp), /* Z */ 1596 M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */ 1597 }; 1598 1599 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 1600 if not found */ 1601 int unicode_general_category(CharRange *cr, const char *gc_name) 1602 { 1603 int gc_idx; 1604 uint32_t gc_mask; 1605 1606 gc_idx = unicode_find_name(unicode_gc_name_table, gc_name); 1607 if (gc_idx < 0) 1608 return -2; 1609 if (gc_idx <= UNICODE_GC_Co) { 1610 gc_mask = (uint64_t)1 << gc_idx; 1611 } else { 1612 gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC]; 1613 } 1614 return unicode_general_category1(cr, gc_mask); 1615 } 1616 1617 1618 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 1619 if not found */ 1620 int unicode_prop(CharRange *cr, const char *prop_name) 1621 { 1622 int prop_idx, ret; 1623 1624 prop_idx = unicode_find_name(unicode_prop_name_table, prop_name); 1625 if (prop_idx < 0) 1626 return -2; 1627 prop_idx += UNICODE_PROP_ASCII_Hex_Digit; 1628 1629 ret = 0; 1630 switch(prop_idx) { 1631 case UNICODE_PROP_ASCII: 1632 if (cr_add_interval(cr, 0x00, 0x7f + 1)) 1633 return -1; 1634 break; 1635 case UNICODE_PROP_Any: 1636 if (cr_add_interval(cr, 0x00000, 0x10ffff + 1)) 1637 return -1; 1638 break; 1639 case UNICODE_PROP_Assigned: 1640 ret = unicode_prop_ops(cr, 1641 POP_GC, M(Cn), 1642 POP_INVERT, 1643 POP_END); 1644 break; 1645 case UNICODE_PROP_Math: 1646 ret = unicode_prop_ops(cr, 1647 POP_GC, M(Sm), 1648 POP_PROP, UNICODE_PROP_Other_Math, 1649 POP_UNION, 1650 POP_END); 1651 break; 1652 case UNICODE_PROP_Lowercase: 1653 ret = unicode_prop_ops(cr, 1654 POP_GC, M(Ll), 1655 POP_PROP, UNICODE_PROP_Other_Lowercase, 1656 POP_UNION, 1657 POP_END); 1658 break; 1659 case UNICODE_PROP_Uppercase: 1660 ret = unicode_prop_ops(cr, 1661 POP_GC, M(Lu), 1662 POP_PROP, UNICODE_PROP_Other_Uppercase, 1663 POP_UNION, 1664 POP_END); 1665 break; 1666 case UNICODE_PROP_Cased: 1667 ret = unicode_prop_ops(cr, 1668 POP_GC, M(Lu) | M(Ll) | M(Lt), 1669 POP_PROP, UNICODE_PROP_Other_Uppercase, 1670 POP_UNION, 1671 POP_PROP, UNICODE_PROP_Other_Lowercase, 1672 POP_UNION, 1673 POP_END); 1674 break; 1675 case UNICODE_PROP_Alphabetic: 1676 ret = unicode_prop_ops(cr, 1677 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), 1678 POP_PROP, UNICODE_PROP_Other_Uppercase, 1679 POP_UNION, 1680 POP_PROP, UNICODE_PROP_Other_Lowercase, 1681 POP_UNION, 1682 POP_PROP, UNICODE_PROP_Other_Alphabetic, 1683 POP_UNION, 1684 POP_END); 1685 break; 1686 case UNICODE_PROP_Grapheme_Base: 1687 ret = unicode_prop_ops(cr, 1688 POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn), 1689 POP_PROP, UNICODE_PROP_Other_Grapheme_Extend, 1690 POP_UNION, 1691 POP_INVERT, 1692 POP_END); 1693 break; 1694 case UNICODE_PROP_Grapheme_Extend: 1695 ret = unicode_prop_ops(cr, 1696 POP_GC, M(Me) | M(Mn), 1697 POP_PROP, UNICODE_PROP_Other_Grapheme_Extend, 1698 POP_UNION, 1699 POP_END); 1700 break; 1701 case UNICODE_PROP_XID_Start: 1702 ret = unicode_prop_ops(cr, 1703 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), 1704 POP_PROP, UNICODE_PROP_Other_ID_Start, 1705 POP_UNION, 1706 POP_PROP, UNICODE_PROP_Pattern_Syntax, 1707 POP_PROP, UNICODE_PROP_Pattern_White_Space, 1708 POP_UNION, 1709 POP_PROP, UNICODE_PROP_XID_Start1, 1710 POP_UNION, 1711 POP_INVERT, 1712 POP_INTER, 1713 POP_END); 1714 break; 1715 case UNICODE_PROP_XID_Continue: 1716 ret = unicode_prop_ops(cr, 1717 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) | 1718 M(Mn) | M(Mc) | M(Nd) | M(Pc), 1719 POP_PROP, UNICODE_PROP_Other_ID_Start, 1720 POP_UNION, 1721 POP_PROP, UNICODE_PROP_Other_ID_Continue, 1722 POP_UNION, 1723 POP_PROP, UNICODE_PROP_Pattern_Syntax, 1724 POP_PROP, UNICODE_PROP_Pattern_White_Space, 1725 POP_UNION, 1726 POP_PROP, UNICODE_PROP_XID_Continue1, 1727 POP_UNION, 1728 POP_INVERT, 1729 POP_INTER, 1730 POP_END); 1731 break; 1732 case UNICODE_PROP_Changes_When_Uppercased: 1733 ret = unicode_case1(cr, CASE_U); 1734 break; 1735 case UNICODE_PROP_Changes_When_Lowercased: 1736 ret = unicode_case1(cr, CASE_L); 1737 break; 1738 case UNICODE_PROP_Changes_When_Casemapped: 1739 ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F); 1740 break; 1741 case UNICODE_PROP_Changes_When_Titlecased: 1742 ret = unicode_prop_ops(cr, 1743 POP_CASE, CASE_U, 1744 POP_PROP, UNICODE_PROP_Changes_When_Titlecased1, 1745 POP_XOR, 1746 POP_END); 1747 break; 1748 case UNICODE_PROP_Changes_When_Casefolded: 1749 ret = unicode_prop_ops(cr, 1750 POP_CASE, CASE_F, 1751 POP_PROP, UNICODE_PROP_Changes_When_Casefolded1, 1752 POP_XOR, 1753 POP_END); 1754 break; 1755 case UNICODE_PROP_Changes_When_NFKC_Casefolded: 1756 ret = unicode_prop_ops(cr, 1757 POP_CASE, CASE_F, 1758 POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1, 1759 POP_XOR, 1760 POP_END); 1761 break; 1762 #if 0 1763 case UNICODE_PROP_ID_Start: 1764 ret = unicode_prop_ops(cr, 1765 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), 1766 POP_PROP, UNICODE_PROP_Other_ID_Start, 1767 POP_UNION, 1768 POP_PROP, UNICODE_PROP_Pattern_Syntax, 1769 POP_PROP, UNICODE_PROP_Pattern_White_Space, 1770 POP_UNION, 1771 POP_INVERT, 1772 POP_INTER, 1773 POP_END); 1774 break; 1775 case UNICODE_PROP_ID_Continue: 1776 ret = unicode_prop_ops(cr, 1777 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) | 1778 M(Mn) | M(Mc) | M(Nd) | M(Pc), 1779 POP_PROP, UNICODE_PROP_Other_ID_Start, 1780 POP_UNION, 1781 POP_PROP, UNICODE_PROP_Other_ID_Continue, 1782 POP_UNION, 1783 POP_PROP, UNICODE_PROP_Pattern_Syntax, 1784 POP_PROP, UNICODE_PROP_Pattern_White_Space, 1785 POP_UNION, 1786 POP_INVERT, 1787 POP_INTER, 1788 POP_END); 1789 break; 1790 case UNICODE_PROP_Case_Ignorable: 1791 ret = unicode_prop_ops(cr, 1792 POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk), 1793 POP_PROP, UNICODE_PROP_Case_Ignorable1, 1794 POP_XOR, 1795 POP_END); 1796 break; 1797 #else 1798 /* we use the existing tables */ 1799 case UNICODE_PROP_ID_Continue: 1800 ret = unicode_prop_ops(cr, 1801 POP_PROP, UNICODE_PROP_ID_Start, 1802 POP_PROP, UNICODE_PROP_ID_Continue1, 1803 POP_XOR, 1804 POP_END); 1805 break; 1806 #endif 1807 default: 1808 if (prop_idx >= countof(unicode_prop_table)) 1809 return -2; 1810 ret = unicode_prop1(cr, prop_idx); 1811 break; 1812 } 1813 return ret; 1814 } 1815 1816 #endif /* CONFIG_ALL_UNICODE */ 1817 1818 /*---- lre codepoint categorizing functions ----*/ 1819 1820 #define S UNICODE_C_SPACE 1821 #define D UNICODE_C_DIGIT 1822 #define X UNICODE_C_XDIGIT 1823 #define U UNICODE_C_UPPER 1824 #define L UNICODE_C_LOWER 1825 #define _ UNICODE_C_UNDER 1826 #define d UNICODE_C_DOLLAR 1827 1828 uint8_t const lre_ctype_bits[256] = { 1829 0, 0, 0, 0, 0, 0, 0, 0, 1830 0, S, S, S, S, S, 0, 0, 1831 0, 0, 0, 0, 0, 0, 0, 0, 1832 0, 0, 0, 0, 0, 0, 0, 0, 1833 1834 S, 0, 0, 0, d, 0, 0, 0, 1835 0, 0, 0, 0, 0, 0, 0, 0, 1836 X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D, 1837 X|D, X|D, 0, 0, 0, 0, 0, 0, 1838 1839 0, X|U, X|U, X|U, X|U, X|U, X|U, U, 1840 U, U, U, U, U, U, U, U, 1841 U, U, U, U, U, U, U, U, 1842 U, U, U, 0, 0, 0, 0, _, 1843 1844 0, X|L, X|L, X|L, X|L, X|L, X|L, L, 1845 L, L, L, L, L, L, L, L, 1846 L, L, L, L, L, L, L, L, 1847 L, L, L, 0, 0, 0, 0, 0, 1848 1849 0, 0, 0, 0, 0, 0, 0, 0, 1850 0, 0, 0, 0, 0, 0, 0, 0, 1851 0, 0, 0, 0, 0, 0, 0, 0, 1852 0, 0, 0, 0, 0, 0, 0, 0, 1853 1854 S, 0, 0, 0, 0, 0, 0, 0, 1855 0, 0, 0, 0, 0, 0, 0, 0, 1856 0, 0, 0, 0, 0, 0, 0, 0, 1857 0, 0, 0, 0, 0, 0, 0, 0, 1858 1859 0, 0, 0, 0, 0, 0, 0, 0, 1860 0, 0, 0, 0, 0, 0, 0, 0, 1861 0, 0, 0, 0, 0, 0, 0, 0, 1862 0, 0, 0, 0, 0, 0, 0, 0, 1863 1864 0, 0, 0, 0, 0, 0, 0, 0, 1865 0, 0, 0, 0, 0, 0, 0, 0, 1866 0, 0, 0, 0, 0, 0, 0, 0, 1867 0, 0, 0, 0, 0, 0, 0, 0, 1868 }; 1869 1870 #undef S 1871 #undef D 1872 #undef X 1873 #undef U 1874 #undef L 1875 #undef _ 1876 #undef d 1877 1878 /* code point ranges for Zs,Zl or Zp property */ 1879 static const uint16_t char_range_s[] = { 1880 10, 1881 0x0009, 0x000D + 1, 1882 0x0020, 0x0020 + 1, 1883 0x00A0, 0x00A0 + 1, 1884 0x1680, 0x1680 + 1, 1885 0x2000, 0x200A + 1, 1886 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ 1887 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ 1888 0x2028, 0x2029 + 1, 1889 0x202F, 0x202F + 1, 1890 0x205F, 0x205F + 1, 1891 0x3000, 0x3000 + 1, 1892 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ 1893 0xFEFF, 0xFEFF + 1, 1894 }; 1895 1896 BOOL lre_is_space_non_ascii(uint32_t c) 1897 { 1898 size_t i, n; 1899 1900 n = countof(char_range_s); 1901 for(i = 5; i < n; i += 2) { 1902 uint32_t low = char_range_s[i]; 1903 uint32_t high = char_range_s[i + 1]; 1904 if (c < low) 1905 return FALSE; 1906 if (c < high) 1907 return TRUE; 1908 } 1909 return FALSE; 1910 }