libregexp.c (81779B)
1 /* 2 * Regular Expression Engine 3 * 4 * Copyright (c) 2017-2018 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <stdarg.h> 27 #include <inttypes.h> 28 #include <string.h> 29 #include <assert.h> 30 31 #include "cutils.h" 32 #include "libregexp.h" 33 #include "libunicode.h" 34 35 /* 36 TODO: 37 38 - Add a lock step execution mode (=linear time execution guaranteed) 39 when the regular expression is "simple" i.e. no backreference nor 40 complicated lookahead. The opcodes are designed for this execution 41 model. 42 */ 43 44 #if defined(TEST) 45 #define DUMP_REOP 46 #endif 47 48 typedef enum { 49 #define DEF(id, size) REOP_ ## id, 50 #include "libregexp-opcode.h" 51 #undef DEF 52 REOP_COUNT, 53 } REOPCodeEnum; 54 55 #define CAPTURE_COUNT_MAX 255 56 #define STACK_SIZE_MAX 255 57 58 /* unicode code points */ 59 #define CP_LS 0x2028 60 #define CP_PS 0x2029 61 62 #define TMP_BUF_SIZE 128 63 64 typedef struct { 65 DynBuf byte_code; 66 const uint8_t *buf_ptr; 67 const uint8_t *buf_end; 68 const uint8_t *buf_start; 69 int re_flags; 70 BOOL is_unicode; 71 BOOL ignore_case; 72 BOOL dotall; 73 int capture_count; 74 int total_capture_count; /* -1 = not computed yet */ 75 int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */ 76 void *opaque; 77 DynBuf group_names; 78 union { 79 char error_msg[TMP_BUF_SIZE]; 80 char tmp_buf[TMP_BUF_SIZE]; 81 } u; 82 } REParseState; 83 84 typedef struct { 85 #ifdef DUMP_REOP 86 const char *name; 87 #endif 88 uint8_t size; 89 } REOpCode; 90 91 static const REOpCode reopcode_info[REOP_COUNT] = { 92 #ifdef DUMP_REOP 93 #define DEF(id, size) { #id, size }, 94 #else 95 #define DEF(id, size) { size }, 96 #endif 97 #include "libregexp-opcode.h" 98 #undef DEF 99 }; 100 101 #define RE_HEADER_FLAGS 0 102 #define RE_HEADER_CAPTURE_COUNT 1 103 #define RE_HEADER_STACK_SIZE 2 104 #define RE_HEADER_BYTECODE_LEN 3 105 106 #define RE_HEADER_LEN 7 107 108 static inline int is_digit(int c) { 109 return c >= '0' && c <= '9'; 110 } 111 112 /* insert 'len' bytes at position 'pos'. Return < 0 if error. */ 113 static int dbuf_insert(DynBuf *s, int pos, int len) 114 { 115 if (dbuf_realloc(s, s->size + len)) 116 return -1; 117 memmove(s->buf + pos + len, s->buf + pos, s->size - pos); 118 s->size += len; 119 return 0; 120 } 121 122 static const uint16_t char_range_d[] = { 123 1, 124 0x0030, 0x0039 + 1, 125 }; 126 127 /* code point ranges for Zs,Zl or Zp property */ 128 static const uint16_t char_range_s[] = { 129 10, 130 0x0009, 0x000D + 1, 131 0x0020, 0x0020 + 1, 132 0x00A0, 0x00A0 + 1, 133 0x1680, 0x1680 + 1, 134 0x2000, 0x200A + 1, 135 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ 136 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ 137 0x2028, 0x2029 + 1, 138 0x202F, 0x202F + 1, 139 0x205F, 0x205F + 1, 140 0x3000, 0x3000 + 1, 141 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ 142 0xFEFF, 0xFEFF + 1, 143 }; 144 145 static const uint16_t char_range_w[] = { 146 4, 147 0x0030, 0x0039 + 1, 148 0x0041, 0x005A + 1, 149 0x005F, 0x005F + 1, 150 0x0061, 0x007A + 1, 151 }; 152 153 #define CLASS_RANGE_BASE 0x40000000 154 155 typedef enum { 156 CHAR_RANGE_d, 157 CHAR_RANGE_D, 158 CHAR_RANGE_s, 159 CHAR_RANGE_S, 160 CHAR_RANGE_w, 161 CHAR_RANGE_W, 162 } CharRangeEnum; 163 164 static const uint16_t * const char_range_table[] = { 165 char_range_d, 166 char_range_s, 167 char_range_w, 168 }; 169 170 static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) 171 { 172 BOOL invert; 173 const uint16_t *c_pt; 174 int len, i; 175 176 invert = c & 1; 177 c_pt = char_range_table[c >> 1]; 178 len = *c_pt++; 179 cr_init(cr, s->opaque, lre_realloc); 180 for(i = 0; i < len * 2; i++) { 181 if (cr_add_point(cr, c_pt[i])) 182 goto fail; 183 } 184 if (invert) { 185 if (cr_invert(cr)) 186 goto fail; 187 } 188 return 0; 189 fail: 190 cr_free(cr); 191 return -1; 192 } 193 194 #ifdef DUMP_REOP 195 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, 196 int buf_len) 197 { 198 int pos, len, opcode, bc_len, re_flags, i; 199 uint32_t val; 200 201 assert(buf_len >= RE_HEADER_LEN); 202 203 re_flags = lre_get_flags(buf); 204 bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN); 205 assert(bc_len + RE_HEADER_LEN <= buf_len); 206 printf("flags: 0x%x capture_count=%d stack_size=%d\n", 207 re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]); 208 if (re_flags & LRE_FLAG_NAMED_GROUPS) { 209 const char *p; 210 p = (char *)buf + RE_HEADER_LEN + bc_len; 211 printf("named groups: "); 212 for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) { 213 if (i != 1) 214 printf(","); 215 printf("<%s>", p); 216 p += strlen(p) + 1; 217 } 218 printf("\n"); 219 assert(p == (char *)(buf + buf_len)); 220 } 221 printf("bytecode_len=%d\n", bc_len); 222 223 buf += RE_HEADER_LEN; 224 pos = 0; 225 while (pos < bc_len) { 226 printf("%5u: ", pos); 227 opcode = buf[pos]; 228 len = reopcode_info[opcode].size; 229 if (opcode >= REOP_COUNT) { 230 printf(" invalid opcode=0x%02x\n", opcode); 231 break; 232 } 233 if ((pos + len) > bc_len) { 234 printf(" buffer overflow (opcode=0x%02x)\n", opcode); 235 break; 236 } 237 printf("%s", reopcode_info[opcode].name); 238 switch(opcode) { 239 case REOP_char: 240 val = get_u16(buf + pos + 1); 241 if (val >= ' ' && val <= 126) 242 printf(" '%c'", val); 243 else 244 printf(" 0x%04x", val); 245 break; 246 case REOP_char32: 247 val = get_u32(buf + pos + 1); 248 if (val >= ' ' && val <= 126) 249 printf(" '%c'", val); 250 else 251 printf(" 0x%08x", val); 252 break; 253 case REOP_goto: 254 case REOP_split_goto_first: 255 case REOP_split_next_first: 256 case REOP_loop: 257 case REOP_lookahead: 258 case REOP_negative_lookahead: 259 val = get_u32(buf + pos + 1); 260 val += (pos + 5); 261 printf(" %u", val); 262 break; 263 case REOP_simple_greedy_quant: 264 printf(" %u %u %u %u", 265 get_u32(buf + pos + 1) + (pos + 17), 266 get_u32(buf + pos + 1 + 4), 267 get_u32(buf + pos + 1 + 8), 268 get_u32(buf + pos + 1 + 12)); 269 break; 270 case REOP_save_start: 271 case REOP_save_end: 272 case REOP_back_reference: 273 case REOP_backward_back_reference: 274 printf(" %u", buf[pos + 1]); 275 break; 276 case REOP_save_reset: 277 printf(" %u %u", buf[pos + 1], buf[pos + 2]); 278 break; 279 case REOP_push_i32: 280 val = get_u32(buf + pos + 1); 281 printf(" %d", val); 282 break; 283 case REOP_range: 284 { 285 int n, i; 286 n = get_u16(buf + pos + 1); 287 len += n * 4; 288 for(i = 0; i < n * 2; i++) { 289 val = get_u16(buf + pos + 3 + i * 2); 290 printf(" 0x%04x", val); 291 } 292 } 293 break; 294 case REOP_range32: 295 { 296 int n, i; 297 n = get_u16(buf + pos + 1); 298 len += n * 8; 299 for(i = 0; i < n * 2; i++) { 300 val = get_u32(buf + pos + 3 + i * 4); 301 printf(" 0x%08x", val); 302 } 303 } 304 break; 305 default: 306 break; 307 } 308 printf("\n"); 309 pos += len; 310 } 311 } 312 #endif 313 314 static void re_emit_op(REParseState *s, int op) 315 { 316 dbuf_putc(&s->byte_code, op); 317 } 318 319 /* return the offset of the u32 value */ 320 static int re_emit_op_u32(REParseState *s, int op, uint32_t val) 321 { 322 int pos; 323 dbuf_putc(&s->byte_code, op); 324 pos = s->byte_code.size; 325 dbuf_put_u32(&s->byte_code, val); 326 return pos; 327 } 328 329 static int re_emit_goto(REParseState *s, int op, uint32_t val) 330 { 331 int pos; 332 dbuf_putc(&s->byte_code, op); 333 pos = s->byte_code.size; 334 dbuf_put_u32(&s->byte_code, val - (pos + 4)); 335 return pos; 336 } 337 338 static void re_emit_op_u8(REParseState *s, int op, uint32_t val) 339 { 340 dbuf_putc(&s->byte_code, op); 341 dbuf_putc(&s->byte_code, val); 342 } 343 344 static void re_emit_op_u16(REParseState *s, int op, uint32_t val) 345 { 346 dbuf_putc(&s->byte_code, op); 347 dbuf_put_u16(&s->byte_code, val); 348 } 349 350 static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...) 351 { 352 va_list ap; 353 va_start(ap, fmt); 354 vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap); 355 va_end(ap); 356 return -1; 357 } 358 359 static int re_parse_out_of_memory(REParseState *s) 360 { 361 return re_parse_error(s, "out of memory"); 362 } 363 364 /* If allow_overflow is false, return -1 in case of 365 overflow. Otherwise return INT32_MAX. */ 366 static int parse_digits(const uint8_t **pp, BOOL allow_overflow) 367 { 368 const uint8_t *p; 369 uint64_t v; 370 int c; 371 372 p = *pp; 373 v = 0; 374 for(;;) { 375 c = *p; 376 if (c < '0' || c > '9') 377 break; 378 v = v * 10 + c - '0'; 379 if (v >= INT32_MAX) { 380 if (allow_overflow) 381 v = INT32_MAX; 382 else 383 return -1; 384 } 385 p++; 386 } 387 *pp = p; 388 return v; 389 } 390 391 static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) 392 { 393 const uint8_t *p; 394 p = *pp; 395 if (*p != c) 396 return re_parse_error(s, "expecting '%c'", c); 397 p++; 398 *pp = p; 399 return 0; 400 } 401 402 /* Parse an escape sequence, *pp points after the '\': 403 allow_utf16 value: 404 0 : no UTF-16 escapes allowed 405 1 : UTF-16 escapes allowed 406 2 : UTF-16 escapes allowed and escapes of surrogate pairs are 407 converted to a unicode character (unicode regexp case). 408 409 Return the unicode char and update *pp if recognized, 410 return -1 if malformed escape, 411 return -2 otherwise. */ 412 int lre_parse_escape(const uint8_t **pp, int allow_utf16) 413 { 414 const uint8_t *p; 415 uint32_t c; 416 417 p = *pp; 418 c = *p++; 419 switch(c) { 420 case 'b': 421 c = '\b'; 422 break; 423 case 'f': 424 c = '\f'; 425 break; 426 case 'n': 427 c = '\n'; 428 break; 429 case 'r': 430 c = '\r'; 431 break; 432 case 't': 433 c = '\t'; 434 break; 435 case 'v': 436 c = '\v'; 437 break; 438 case 'x': 439 case 'u': 440 { 441 int h, n, i; 442 uint32_t c1; 443 444 if (*p == '{' && allow_utf16) { 445 p++; 446 c = 0; 447 for(;;) { 448 h = from_hex(*p++); 449 if (h < 0) 450 return -1; 451 c = (c << 4) | h; 452 if (c > 0x10FFFF) 453 return -1; 454 if (*p == '}') 455 break; 456 } 457 p++; 458 } else { 459 if (c == 'x') { 460 n = 2; 461 } else { 462 n = 4; 463 } 464 465 c = 0; 466 for(i = 0; i < n; i++) { 467 h = from_hex(*p++); 468 if (h < 0) { 469 return -1; 470 } 471 c = (c << 4) | h; 472 } 473 if (is_hi_surrogate(c) && 474 allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { 475 /* convert an escaped surrogate pair into a 476 unicode char */ 477 c1 = 0; 478 for(i = 0; i < 4; i++) { 479 h = from_hex(p[2 + i]); 480 if (h < 0) 481 break; 482 c1 = (c1 << 4) | h; 483 } 484 if (i == 4 && is_lo_surrogate(c1)) { 485 p += 6; 486 c = from_surrogate(c, c1); 487 } 488 } 489 } 490 } 491 break; 492 case '0': case '1': case '2': case '3': 493 case '4': case '5': case '6': case '7': 494 c -= '0'; 495 if (allow_utf16 == 2) { 496 /* only accept \0 not followed by digit */ 497 if (c != 0 || is_digit(*p)) 498 return -1; 499 } else { 500 /* parse a legacy octal sequence */ 501 uint32_t v; 502 v = *p - '0'; 503 if (v > 7) 504 break; 505 c = (c << 3) | v; 506 p++; 507 if (c >= 32) 508 break; 509 v = *p - '0'; 510 if (v > 7) 511 break; 512 c = (c << 3) | v; 513 p++; 514 } 515 break; 516 default: 517 return -2; 518 } 519 *pp = p; 520 return c; 521 } 522 523 #ifdef CONFIG_ALL_UNICODE 524 /* XXX: we use the same chars for name and value */ 525 static BOOL is_unicode_char(int c) 526 { 527 return ((c >= '0' && c <= '9') || 528 (c >= 'A' && c <= 'Z') || 529 (c >= 'a' && c <= 'z') || 530 (c == '_')); 531 } 532 533 static int parse_unicode_property(REParseState *s, CharRange *cr, 534 const uint8_t **pp, BOOL is_inv) 535 { 536 const uint8_t *p; 537 char name[64], value[64]; 538 char *q; 539 BOOL script_ext; 540 int ret; 541 542 p = *pp; 543 if (*p != '{') 544 return re_parse_error(s, "expecting '{' after \\p"); 545 p++; 546 q = name; 547 while (is_unicode_char(*p)) { 548 if ((q - name) >= sizeof(name) - 1) 549 goto unknown_property_name; 550 *q++ = *p++; 551 } 552 *q = '\0'; 553 q = value; 554 if (*p == '=') { 555 p++; 556 while (is_unicode_char(*p)) { 557 if ((q - value) >= sizeof(value) - 1) 558 return re_parse_error(s, "unknown unicode property value"); 559 *q++ = *p++; 560 } 561 } 562 *q = '\0'; 563 if (*p != '}') 564 return re_parse_error(s, "expecting '}'"); 565 p++; 566 // printf("name=%s value=%s\n", name, value); 567 568 if (!strcmp(name, "Script") || !strcmp(name, "sc")) { 569 script_ext = FALSE; 570 goto do_script; 571 } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) { 572 script_ext = TRUE; 573 do_script: 574 cr_init(cr, s->opaque, lre_realloc); 575 ret = unicode_script(cr, value, script_ext); 576 if (ret) { 577 cr_free(cr); 578 if (ret == -2) 579 return re_parse_error(s, "unknown unicode script"); 580 else 581 goto out_of_memory; 582 } 583 } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) { 584 cr_init(cr, s->opaque, lre_realloc); 585 ret = unicode_general_category(cr, value); 586 if (ret) { 587 cr_free(cr); 588 if (ret == -2) 589 return re_parse_error(s, "unknown unicode general category"); 590 else 591 goto out_of_memory; 592 } 593 } else if (value[0] == '\0') { 594 cr_init(cr, s->opaque, lre_realloc); 595 ret = unicode_general_category(cr, name); 596 if (ret == -1) { 597 cr_free(cr); 598 goto out_of_memory; 599 } 600 if (ret < 0) { 601 ret = unicode_prop(cr, name); 602 if (ret) { 603 cr_free(cr); 604 if (ret == -2) 605 goto unknown_property_name; 606 else 607 goto out_of_memory; 608 } 609 } 610 } else { 611 unknown_property_name: 612 return re_parse_error(s, "unknown unicode property name"); 613 } 614 615 if (is_inv) { 616 if (cr_invert(cr)) { 617 cr_free(cr); 618 return -1; 619 } 620 } 621 *pp = p; 622 return 0; 623 out_of_memory: 624 return re_parse_out_of_memory(s); 625 } 626 #endif /* CONFIG_ALL_UNICODE */ 627 628 /* return -1 if error otherwise the character or a class range 629 (CLASS_RANGE_BASE). In case of class range, 'cr' is 630 initialized. Otherwise, it is ignored. */ 631 static int get_class_atom(REParseState *s, CharRange *cr, 632 const uint8_t **pp, BOOL inclass) 633 { 634 const uint8_t *p; 635 uint32_t c; 636 int ret; 637 638 p = *pp; 639 640 c = *p; 641 switch(c) { 642 case '\\': 643 p++; 644 if (p >= s->buf_end) 645 goto unexpected_end; 646 c = *p++; 647 switch(c) { 648 case 'd': 649 c = CHAR_RANGE_d; 650 goto class_range; 651 case 'D': 652 c = CHAR_RANGE_D; 653 goto class_range; 654 case 's': 655 c = CHAR_RANGE_s; 656 goto class_range; 657 case 'S': 658 c = CHAR_RANGE_S; 659 goto class_range; 660 case 'w': 661 c = CHAR_RANGE_w; 662 goto class_range; 663 case 'W': 664 c = CHAR_RANGE_W; 665 class_range: 666 if (cr_init_char_range(s, cr, c)) 667 return -1; 668 c = CLASS_RANGE_BASE; 669 break; 670 case 'c': 671 c = *p; 672 if ((c >= 'a' && c <= 'z') || 673 (c >= 'A' && c <= 'Z') || 674 (((c >= '0' && c <= '9') || c == '_') && 675 inclass && !s->is_unicode)) { /* Annex B.1.4 */ 676 c &= 0x1f; 677 p++; 678 } else if (s->is_unicode) { 679 goto invalid_escape; 680 } else { 681 /* otherwise return '\' and 'c' */ 682 p--; 683 c = '\\'; 684 } 685 break; 686 #ifdef CONFIG_ALL_UNICODE 687 case 'p': 688 case 'P': 689 if (s->is_unicode) { 690 if (parse_unicode_property(s, cr, &p, (c == 'P'))) 691 return -1; 692 c = CLASS_RANGE_BASE; 693 break; 694 } 695 /* fall thru */ 696 #endif 697 default: 698 p--; 699 ret = lre_parse_escape(&p, s->is_unicode * 2); 700 if (ret >= 0) { 701 c = ret; 702 } else { 703 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) { 704 /* always valid to escape these characters */ 705 goto normal_char; 706 } else if (s->is_unicode) { 707 invalid_escape: 708 return re_parse_error(s, "invalid escape sequence in regular expression"); 709 } else { 710 /* just ignore the '\' */ 711 goto normal_char; 712 } 713 } 714 break; 715 } 716 break; 717 case '\0': 718 if (p >= s->buf_end) { 719 unexpected_end: 720 return re_parse_error(s, "unexpected end"); 721 } 722 /* fall thru */ 723 default: 724 normal_char: 725 /* normal char */ 726 if (c >= 128) { 727 c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); 728 if ((unsigned)c > 0xffff && !s->is_unicode) { 729 /* XXX: should handle non BMP-1 code points */ 730 return re_parse_error(s, "malformed unicode char"); 731 } 732 } else { 733 p++; 734 } 735 break; 736 } 737 *pp = p; 738 return c; 739 } 740 741 static int re_emit_range(REParseState *s, const CharRange *cr) 742 { 743 int len, i; 744 uint32_t high; 745 746 len = (unsigned)cr->len / 2; 747 if (len >= 65535) 748 return re_parse_error(s, "too many ranges"); 749 if (len == 0) { 750 /* not sure it can really happen. Emit a match that is always 751 false */ 752 re_emit_op_u32(s, REOP_char32, -1); 753 } else { 754 high = cr->points[cr->len - 1]; 755 if (high == UINT32_MAX) 756 high = cr->points[cr->len - 2]; 757 if (high <= 0xffff) { 758 /* can use 16 bit ranges with the conversion that 0xffff = 759 infinity */ 760 re_emit_op_u16(s, REOP_range, len); 761 for(i = 0; i < cr->len; i += 2) { 762 dbuf_put_u16(&s->byte_code, cr->points[i]); 763 high = cr->points[i + 1] - 1; 764 if (high == UINT32_MAX - 1) 765 high = 0xffff; 766 dbuf_put_u16(&s->byte_code, high); 767 } 768 } else { 769 re_emit_op_u16(s, REOP_range32, len); 770 for(i = 0; i < cr->len; i += 2) { 771 dbuf_put_u32(&s->byte_code, cr->points[i]); 772 dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1); 773 } 774 } 775 } 776 return 0; 777 } 778 779 static int re_parse_char_class(REParseState *s, const uint8_t **pp) 780 { 781 const uint8_t *p; 782 uint32_t c1, c2; 783 CharRange cr_s, *cr = &cr_s; 784 CharRange cr1_s, *cr1 = &cr1_s; 785 BOOL invert; 786 787 cr_init(cr, s->opaque, lre_realloc); 788 p = *pp; 789 p++; /* skip '[' */ 790 791 invert = FALSE; 792 if (*p == '^') { 793 p++; 794 invert = TRUE; 795 } 796 797 for(;;) { 798 if (*p == ']') 799 break; 800 c1 = get_class_atom(s, cr1, &p, TRUE); 801 if ((int)c1 < 0) 802 goto fail; 803 if (*p == '-' && p[1] != ']') { 804 const uint8_t *p0 = p + 1; 805 if (c1 >= CLASS_RANGE_BASE) { 806 if (s->is_unicode) { 807 cr_free(cr1); 808 goto invalid_class_range; 809 } 810 /* Annex B: match '-' character */ 811 goto class_atom; 812 } 813 c2 = get_class_atom(s, cr1, &p0, TRUE); 814 if ((int)c2 < 0) 815 goto fail; 816 if (c2 >= CLASS_RANGE_BASE) { 817 cr_free(cr1); 818 if (s->is_unicode) { 819 goto invalid_class_range; 820 } 821 /* Annex B: match '-' character */ 822 goto class_atom; 823 } 824 p = p0; 825 if (c2 < c1) { 826 invalid_class_range: 827 re_parse_error(s, "invalid class range"); 828 goto fail; 829 } 830 if (cr_union_interval(cr, c1, c2)) 831 goto memory_error; 832 } else { 833 class_atom: 834 if (c1 >= CLASS_RANGE_BASE) { 835 int ret; 836 ret = cr_union1(cr, cr1->points, cr1->len); 837 cr_free(cr1); 838 if (ret) 839 goto memory_error; 840 } else { 841 if (cr_union_interval(cr, c1, c1)) 842 goto memory_error; 843 } 844 } 845 } 846 if (s->ignore_case) { 847 if (cr_regexp_canonicalize(cr, s->is_unicode)) 848 goto memory_error; 849 } 850 if (invert) { 851 if (cr_invert(cr)) 852 goto memory_error; 853 } 854 if (re_emit_range(s, cr)) 855 goto fail; 856 cr_free(cr); 857 p++; /* skip ']' */ 858 *pp = p; 859 return 0; 860 memory_error: 861 re_parse_out_of_memory(s); 862 fail: 863 cr_free(cr); 864 return -1; 865 } 866 867 /* Return: 868 - true if the opcodes may not advance the char pointer 869 - false if the opcodes always advance the char pointer 870 */ 871 static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len) 872 { 873 int pos, opcode, len; 874 uint32_t val; 875 BOOL ret; 876 877 ret = TRUE; 878 pos = 0; 879 while (pos < bc_buf_len) { 880 opcode = bc_buf[pos]; 881 len = reopcode_info[opcode].size; 882 switch(opcode) { 883 case REOP_range: 884 val = get_u16(bc_buf + pos + 1); 885 len += val * 4; 886 goto simple_char; 887 case REOP_range32: 888 val = get_u16(bc_buf + pos + 1); 889 len += val * 8; 890 goto simple_char; 891 case REOP_char: 892 case REOP_char32: 893 case REOP_dot: 894 case REOP_any: 895 simple_char: 896 ret = FALSE; 897 break; 898 case REOP_line_start: 899 case REOP_line_end: 900 case REOP_push_i32: 901 case REOP_push_char_pos: 902 case REOP_drop: 903 case REOP_word_boundary: 904 case REOP_not_word_boundary: 905 case REOP_prev: 906 /* no effect */ 907 break; 908 case REOP_save_start: 909 case REOP_save_end: 910 case REOP_save_reset: 911 case REOP_back_reference: 912 case REOP_backward_back_reference: 913 break; 914 default: 915 /* safe behavior: we cannot predict the outcome */ 916 return TRUE; 917 } 918 pos += len; 919 } 920 return ret; 921 } 922 923 /* return -1 if a simple quantifier cannot be used. Otherwise return 924 the number of characters in the atom. */ 925 static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) 926 { 927 int pos, opcode, len, count; 928 uint32_t val; 929 930 count = 0; 931 pos = 0; 932 while (pos < bc_buf_len) { 933 opcode = bc_buf[pos]; 934 len = reopcode_info[opcode].size; 935 switch(opcode) { 936 case REOP_range: 937 val = get_u16(bc_buf + pos + 1); 938 len += val * 4; 939 goto simple_char; 940 case REOP_range32: 941 val = get_u16(bc_buf + pos + 1); 942 len += val * 8; 943 goto simple_char; 944 case REOP_char: 945 case REOP_char32: 946 case REOP_dot: 947 case REOP_any: 948 simple_char: 949 count++; 950 break; 951 case REOP_line_start: 952 case REOP_line_end: 953 case REOP_word_boundary: 954 case REOP_not_word_boundary: 955 break; 956 default: 957 return -1; 958 } 959 pos += len; 960 } 961 return count; 962 } 963 964 /* '*pp' is the first char after '<' */ 965 static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) 966 { 967 const uint8_t *p, *p1; 968 uint32_t c, d; 969 char *q; 970 971 p = *pp; 972 q = buf; 973 for(;;) { 974 c = *p; 975 if (c == '\\') { 976 p++; 977 if (*p != 'u') 978 return -1; 979 c = lre_parse_escape(&p, 2); // accept surrogate pairs 980 } else if (c == '>') { 981 break; 982 } else if (c >= 128) { 983 c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); 984 if (is_hi_surrogate(c)) { 985 d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); 986 if (is_lo_surrogate(d)) { 987 c = from_surrogate(c, d); 988 p = p1; 989 } 990 } 991 } else { 992 p++; 993 } 994 if (c > 0x10FFFF) 995 return -1; 996 if (q == buf) { 997 if (!lre_js_is_ident_first(c)) 998 return -1; 999 } else { 1000 if (!lre_js_is_ident_next(c)) 1001 return -1; 1002 } 1003 if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) 1004 return -1; 1005 if (c < 128) { 1006 *q++ = c; 1007 } else { 1008 q += unicode_to_utf8((uint8_t*)q, c); 1009 } 1010 } 1011 if (q == buf) 1012 return -1; 1013 *q = '\0'; 1014 p++; 1015 *pp = p; 1016 return 0; 1017 } 1018 1019 /* if capture_name = NULL: return the number of captures + 1. 1020 Otherwise, return the capture index corresponding to capture_name 1021 or -1 if none */ 1022 static int re_parse_captures(REParseState *s, int *phas_named_captures, 1023 const char *capture_name) 1024 { 1025 const uint8_t *p; 1026 int capture_index; 1027 char name[TMP_BUF_SIZE]; 1028 1029 capture_index = 1; 1030 *phas_named_captures = 0; 1031 for (p = s->buf_start; p < s->buf_end; p++) { 1032 switch (*p) { 1033 case '(': 1034 if (p[1] == '?') { 1035 if (p[2] == '<' && p[3] != '=' && p[3] != '!') { 1036 *phas_named_captures = 1; 1037 /* potential named capture */ 1038 if (capture_name) { 1039 p += 3; 1040 if (re_parse_group_name(name, sizeof(name), &p) == 0) { 1041 if (!strcmp(name, capture_name)) 1042 return capture_index; 1043 } 1044 } 1045 capture_index++; 1046 if (capture_index >= CAPTURE_COUNT_MAX) 1047 goto done; 1048 } 1049 } else { 1050 capture_index++; 1051 if (capture_index >= CAPTURE_COUNT_MAX) 1052 goto done; 1053 } 1054 break; 1055 case '\\': 1056 p++; 1057 break; 1058 case '[': 1059 for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) { 1060 if (*p == '\\') 1061 p++; 1062 } 1063 break; 1064 } 1065 } 1066 done: 1067 if (capture_name) 1068 return -1; 1069 else 1070 return capture_index; 1071 } 1072 1073 static int re_count_captures(REParseState *s) 1074 { 1075 if (s->total_capture_count < 0) { 1076 s->total_capture_count = re_parse_captures(s, &s->has_named_captures, 1077 NULL); 1078 } 1079 return s->total_capture_count; 1080 } 1081 1082 static BOOL re_has_named_captures(REParseState *s) 1083 { 1084 if (s->has_named_captures < 0) 1085 re_count_captures(s); 1086 return s->has_named_captures; 1087 } 1088 1089 static int find_group_name(REParseState *s, const char *name) 1090 { 1091 const char *p, *buf_end; 1092 size_t len, name_len; 1093 int capture_index; 1094 1095 p = (char *)s->group_names.buf; 1096 if (!p) return -1; 1097 buf_end = (char *)s->group_names.buf + s->group_names.size; 1098 name_len = strlen(name); 1099 capture_index = 1; 1100 while (p < buf_end) { 1101 len = strlen(p); 1102 if (len == name_len && memcmp(name, p, name_len) == 0) 1103 return capture_index; 1104 p += len + 1; 1105 capture_index++; 1106 } 1107 return -1; 1108 } 1109 1110 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir); 1111 1112 static int re_parse_term(REParseState *s, BOOL is_backward_dir) 1113 { 1114 const uint8_t *p; 1115 int c, last_atom_start, quant_min, quant_max, last_capture_count; 1116 BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead; 1117 CharRange cr_s, *cr = &cr_s; 1118 1119 last_atom_start = -1; 1120 last_capture_count = 0; 1121 p = s->buf_ptr; 1122 c = *p; 1123 switch(c) { 1124 case '^': 1125 p++; 1126 re_emit_op(s, REOP_line_start); 1127 break; 1128 case '$': 1129 p++; 1130 re_emit_op(s, REOP_line_end); 1131 break; 1132 case '.': 1133 p++; 1134 last_atom_start = s->byte_code.size; 1135 last_capture_count = s->capture_count; 1136 if (is_backward_dir) 1137 re_emit_op(s, REOP_prev); 1138 re_emit_op(s, s->dotall ? REOP_any : REOP_dot); 1139 if (is_backward_dir) 1140 re_emit_op(s, REOP_prev); 1141 break; 1142 case '{': 1143 if (s->is_unicode) { 1144 return re_parse_error(s, "syntax error"); 1145 } else if (!is_digit(p[1])) { 1146 /* Annex B: we accept '{' not followed by digits as a 1147 normal atom */ 1148 goto parse_class_atom; 1149 } else { 1150 const uint8_t *p1 = p + 1; 1151 /* Annex B: error if it is like a repetition count */ 1152 parse_digits(&p1, TRUE); 1153 if (*p1 == ',') { 1154 p1++; 1155 if (is_digit(*p1)) { 1156 parse_digits(&p1, TRUE); 1157 } 1158 } 1159 if (*p1 != '}') { 1160 goto parse_class_atom; 1161 } 1162 } 1163 /* fall thru */ 1164 case '*': 1165 case '+': 1166 case '?': 1167 return re_parse_error(s, "nothing to repeat"); 1168 case '(': 1169 if (p[1] == '?') { 1170 if (p[2] == ':') { 1171 p += 3; 1172 last_atom_start = s->byte_code.size; 1173 last_capture_count = s->capture_count; 1174 s->buf_ptr = p; 1175 if (re_parse_disjunction(s, is_backward_dir)) 1176 return -1; 1177 p = s->buf_ptr; 1178 if (re_parse_expect(s, &p, ')')) 1179 return -1; 1180 } else if ((p[2] == '=' || p[2] == '!')) { 1181 is_neg = (p[2] == '!'); 1182 is_backward_lookahead = FALSE; 1183 p += 3; 1184 goto lookahead; 1185 } else if (p[2] == '<' && 1186 (p[3] == '=' || p[3] == '!')) { 1187 int pos; 1188 is_neg = (p[3] == '!'); 1189 is_backward_lookahead = TRUE; 1190 p += 4; 1191 /* lookahead */ 1192 lookahead: 1193 /* Annex B allows lookahead to be used as an atom for 1194 the quantifiers */ 1195 if (!s->is_unicode && !is_backward_lookahead) { 1196 last_atom_start = s->byte_code.size; 1197 last_capture_count = s->capture_count; 1198 } 1199 pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0); 1200 s->buf_ptr = p; 1201 if (re_parse_disjunction(s, is_backward_lookahead)) 1202 return -1; 1203 p = s->buf_ptr; 1204 if (re_parse_expect(s, &p, ')')) 1205 return -1; 1206 re_emit_op(s, REOP_match); 1207 /* jump after the 'match' after the lookahead is successful */ 1208 if (dbuf_error(&s->byte_code)) 1209 return -1; 1210 put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4)); 1211 } else if (p[2] == '<') { 1212 p += 3; 1213 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), 1214 &p)) { 1215 return re_parse_error(s, "invalid group name"); 1216 } 1217 if (find_group_name(s, s->u.tmp_buf) > 0) { 1218 return re_parse_error(s, "duplicate group name"); 1219 } 1220 /* group name with a trailing zero */ 1221 dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf, 1222 strlen(s->u.tmp_buf) + 1); 1223 s->has_named_captures = 1; 1224 goto parse_capture; 1225 } else { 1226 return re_parse_error(s, "invalid group"); 1227 } 1228 } else { 1229 int capture_index; 1230 p++; 1231 /* capture without group name */ 1232 dbuf_putc(&s->group_names, 0); 1233 parse_capture: 1234 if (s->capture_count >= CAPTURE_COUNT_MAX) 1235 return re_parse_error(s, "too many captures"); 1236 last_atom_start = s->byte_code.size; 1237 last_capture_count = s->capture_count; 1238 capture_index = s->capture_count++; 1239 re_emit_op_u8(s, REOP_save_start + is_backward_dir, 1240 capture_index); 1241 1242 s->buf_ptr = p; 1243 if (re_parse_disjunction(s, is_backward_dir)) 1244 return -1; 1245 p = s->buf_ptr; 1246 1247 re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir, 1248 capture_index); 1249 1250 if (re_parse_expect(s, &p, ')')) 1251 return -1; 1252 } 1253 break; 1254 case '\\': 1255 switch(p[1]) { 1256 case 'b': 1257 case 'B': 1258 re_emit_op(s, REOP_word_boundary + (p[1] != 'b')); 1259 p += 2; 1260 break; 1261 case 'k': 1262 { 1263 const uint8_t *p1; 1264 int dummy_res; 1265 1266 p1 = p; 1267 if (p1[2] != '<') { 1268 /* annex B: we tolerate invalid group names in non 1269 unicode mode if there is no named capture 1270 definition */ 1271 if (s->is_unicode || re_has_named_captures(s)) 1272 return re_parse_error(s, "expecting group name"); 1273 else 1274 goto parse_class_atom; 1275 } 1276 p1 += 3; 1277 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), 1278 &p1)) { 1279 if (s->is_unicode || re_has_named_captures(s)) 1280 return re_parse_error(s, "invalid group name"); 1281 else 1282 goto parse_class_atom; 1283 } 1284 c = find_group_name(s, s->u.tmp_buf); 1285 if (c < 0) { 1286 /* no capture name parsed before, try to look 1287 after (inefficient, but hopefully not common */ 1288 c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); 1289 if (c < 0) { 1290 if (s->is_unicode || re_has_named_captures(s)) 1291 return re_parse_error(s, "group name not defined"); 1292 else 1293 goto parse_class_atom; 1294 } 1295 } 1296 p = p1; 1297 } 1298 goto emit_back_reference; 1299 case '0': 1300 p += 2; 1301 c = 0; 1302 if (s->is_unicode) { 1303 if (is_digit(*p)) { 1304 return re_parse_error(s, "invalid decimal escape in regular expression"); 1305 } 1306 } else { 1307 /* Annex B.1.4: accept legacy octal */ 1308 if (*p >= '0' && *p <= '7') { 1309 c = *p++ - '0'; 1310 if (*p >= '0' && *p <= '7') { 1311 c = (c << 3) + *p++ - '0'; 1312 } 1313 } 1314 } 1315 goto normal_char; 1316 case '1': case '2': case '3': case '4': 1317 case '5': case '6': case '7': case '8': 1318 case '9': 1319 { 1320 const uint8_t *q = ++p; 1321 1322 c = parse_digits(&p, FALSE); 1323 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { 1324 if (!s->is_unicode) { 1325 /* Annex B.1.4: accept legacy octal */ 1326 p = q; 1327 if (*p <= '7') { 1328 c = 0; 1329 if (*p <= '3') 1330 c = *p++ - '0'; 1331 if (*p >= '0' && *p <= '7') { 1332 c = (c << 3) + *p++ - '0'; 1333 if (*p >= '0' && *p <= '7') { 1334 c = (c << 3) + *p++ - '0'; 1335 } 1336 } 1337 } else { 1338 c = *p++; 1339 } 1340 goto normal_char; 1341 } 1342 return re_parse_error(s, "back reference out of range in regular expression"); 1343 } 1344 emit_back_reference: 1345 last_atom_start = s->byte_code.size; 1346 last_capture_count = s->capture_count; 1347 re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c); 1348 } 1349 break; 1350 default: 1351 goto parse_class_atom; 1352 } 1353 break; 1354 case '[': 1355 last_atom_start = s->byte_code.size; 1356 last_capture_count = s->capture_count; 1357 if (is_backward_dir) 1358 re_emit_op(s, REOP_prev); 1359 if (re_parse_char_class(s, &p)) 1360 return -1; 1361 if (is_backward_dir) 1362 re_emit_op(s, REOP_prev); 1363 break; 1364 case ']': 1365 case '}': 1366 if (s->is_unicode) 1367 return re_parse_error(s, "syntax error"); 1368 goto parse_class_atom; 1369 default: 1370 parse_class_atom: 1371 c = get_class_atom(s, cr, &p, FALSE); 1372 if ((int)c < 0) 1373 return -1; 1374 normal_char: 1375 last_atom_start = s->byte_code.size; 1376 last_capture_count = s->capture_count; 1377 if (is_backward_dir) 1378 re_emit_op(s, REOP_prev); 1379 if (c >= CLASS_RANGE_BASE) { 1380 int ret; 1381 /* Note: canonicalization is not needed */ 1382 ret = re_emit_range(s, cr); 1383 cr_free(cr); 1384 if (ret) 1385 return -1; 1386 } else { 1387 if (s->ignore_case) 1388 c = lre_canonicalize(c, s->is_unicode); 1389 if (c <= 0xffff) 1390 re_emit_op_u16(s, REOP_char, c); 1391 else 1392 re_emit_op_u32(s, REOP_char32, c); 1393 } 1394 if (is_backward_dir) 1395 re_emit_op(s, REOP_prev); 1396 break; 1397 } 1398 1399 /* quantifier */ 1400 if (last_atom_start >= 0) { 1401 c = *p; 1402 switch(c) { 1403 case '*': 1404 p++; 1405 quant_min = 0; 1406 quant_max = INT32_MAX; 1407 goto quantifier; 1408 case '+': 1409 p++; 1410 quant_min = 1; 1411 quant_max = INT32_MAX; 1412 goto quantifier; 1413 case '?': 1414 p++; 1415 quant_min = 0; 1416 quant_max = 1; 1417 goto quantifier; 1418 case '{': 1419 { 1420 const uint8_t *p1 = p; 1421 /* As an extension (see ES6 annex B), we accept '{' not 1422 followed by digits as a normal atom */ 1423 if (!is_digit(p[1])) { 1424 if (s->is_unicode) 1425 goto invalid_quant_count; 1426 break; 1427 } 1428 p++; 1429 quant_min = parse_digits(&p, TRUE); 1430 quant_max = quant_min; 1431 if (*p == ',') { 1432 p++; 1433 if (is_digit(*p)) { 1434 quant_max = parse_digits(&p, TRUE); 1435 if (quant_max < quant_min) { 1436 invalid_quant_count: 1437 return re_parse_error(s, "invalid repetition count"); 1438 } 1439 } else { 1440 quant_max = INT32_MAX; /* infinity */ 1441 } 1442 } 1443 if (*p != '}' && !s->is_unicode) { 1444 /* Annex B: normal atom if invalid '{' syntax */ 1445 p = p1; 1446 break; 1447 } 1448 if (re_parse_expect(s, &p, '}')) 1449 return -1; 1450 } 1451 quantifier: 1452 greedy = TRUE; 1453 if (*p == '?') { 1454 p++; 1455 greedy = FALSE; 1456 } 1457 if (last_atom_start < 0) { 1458 return re_parse_error(s, "nothing to repeat"); 1459 } 1460 if (greedy) { 1461 int len, pos; 1462 1463 if (quant_max > 0) { 1464 /* specific optimization for simple quantifiers */ 1465 if (dbuf_error(&s->byte_code)) 1466 goto out_of_memory; 1467 len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start, 1468 s->byte_code.size - last_atom_start); 1469 if (len > 0) { 1470 re_emit_op(s, REOP_match); 1471 1472 if (dbuf_insert(&s->byte_code, last_atom_start, 17)) 1473 goto out_of_memory; 1474 pos = last_atom_start; 1475 s->byte_code.buf[pos++] = REOP_simple_greedy_quant; 1476 put_u32(&s->byte_code.buf[pos], 1477 s->byte_code.size - last_atom_start - 17); 1478 pos += 4; 1479 put_u32(&s->byte_code.buf[pos], quant_min); 1480 pos += 4; 1481 put_u32(&s->byte_code.buf[pos], quant_max); 1482 pos += 4; 1483 put_u32(&s->byte_code.buf[pos], len); 1484 pos += 4; 1485 goto done; 1486 } 1487 } 1488 1489 if (dbuf_error(&s->byte_code)) 1490 goto out_of_memory; 1491 } 1492 /* the spec tells that if there is no advance when 1493 running the atom after the first quant_min times, 1494 then there is no match. We remove this test when we 1495 are sure the atom always advances the position. */ 1496 add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start, 1497 s->byte_code.size - last_atom_start); 1498 1499 { 1500 int len, pos; 1501 len = s->byte_code.size - last_atom_start; 1502 if (quant_min == 0) { 1503 /* need to reset the capture in case the atom is 1504 not executed */ 1505 if (last_capture_count != s->capture_count) { 1506 if (dbuf_insert(&s->byte_code, last_atom_start, 3)) 1507 goto out_of_memory; 1508 s->byte_code.buf[last_atom_start++] = REOP_save_reset; 1509 s->byte_code.buf[last_atom_start++] = last_capture_count; 1510 s->byte_code.buf[last_atom_start++] = s->capture_count - 1; 1511 } 1512 if (quant_max == 0) { 1513 s->byte_code.size = last_atom_start; 1514 } else if (quant_max == 1 || quant_max == INT32_MAX) { 1515 BOOL has_goto = (quant_max == INT32_MAX); 1516 if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check)) 1517 goto out_of_memory; 1518 s->byte_code.buf[last_atom_start] = REOP_split_goto_first + 1519 greedy; 1520 put_u32(s->byte_code.buf + last_atom_start + 1, 1521 len + 5 * has_goto + add_zero_advance_check * 2); 1522 if (add_zero_advance_check) { 1523 s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; 1524 re_emit_op(s, REOP_check_advance); 1525 } 1526 if (has_goto) 1527 re_emit_goto(s, REOP_goto, last_atom_start); 1528 } else { 1529 if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check)) 1530 goto out_of_memory; 1531 pos = last_atom_start; 1532 s->byte_code.buf[pos++] = REOP_push_i32; 1533 put_u32(s->byte_code.buf + pos, quant_max); 1534 pos += 4; 1535 s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; 1536 put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2); 1537 pos += 4; 1538 if (add_zero_advance_check) { 1539 s->byte_code.buf[pos++] = REOP_push_char_pos; 1540 re_emit_op(s, REOP_check_advance); 1541 } 1542 re_emit_goto(s, REOP_loop, last_atom_start + 5); 1543 re_emit_op(s, REOP_drop); 1544 } 1545 } else if (quant_min == 1 && quant_max == INT32_MAX && 1546 !add_zero_advance_check) { 1547 re_emit_goto(s, REOP_split_next_first - greedy, 1548 last_atom_start); 1549 } else { 1550 if (quant_min == 1) { 1551 /* nothing to add */ 1552 } else { 1553 if (dbuf_insert(&s->byte_code, last_atom_start, 5)) 1554 goto out_of_memory; 1555 s->byte_code.buf[last_atom_start] = REOP_push_i32; 1556 put_u32(s->byte_code.buf + last_atom_start + 1, 1557 quant_min); 1558 last_atom_start += 5; 1559 re_emit_goto(s, REOP_loop, last_atom_start); 1560 re_emit_op(s, REOP_drop); 1561 } 1562 if (quant_max == INT32_MAX) { 1563 pos = s->byte_code.size; 1564 re_emit_op_u32(s, REOP_split_goto_first + greedy, 1565 len + 5 + add_zero_advance_check * 2); 1566 if (add_zero_advance_check) 1567 re_emit_op(s, REOP_push_char_pos); 1568 /* copy the atom */ 1569 dbuf_put_self(&s->byte_code, last_atom_start, len); 1570 if (add_zero_advance_check) 1571 re_emit_op(s, REOP_check_advance); 1572 re_emit_goto(s, REOP_goto, pos); 1573 } else if (quant_max > quant_min) { 1574 re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); 1575 pos = s->byte_code.size; 1576 re_emit_op_u32(s, REOP_split_goto_first + greedy, 1577 len + 5 + add_zero_advance_check * 2); 1578 if (add_zero_advance_check) 1579 re_emit_op(s, REOP_push_char_pos); 1580 /* copy the atom */ 1581 dbuf_put_self(&s->byte_code, last_atom_start, len); 1582 if (add_zero_advance_check) 1583 re_emit_op(s, REOP_check_advance); 1584 re_emit_goto(s, REOP_loop, pos); 1585 re_emit_op(s, REOP_drop); 1586 } 1587 } 1588 last_atom_start = -1; 1589 } 1590 break; 1591 default: 1592 break; 1593 } 1594 } 1595 done: 1596 s->buf_ptr = p; 1597 return 0; 1598 out_of_memory: 1599 return re_parse_out_of_memory(s); 1600 } 1601 1602 static int re_parse_alternative(REParseState *s, BOOL is_backward_dir) 1603 { 1604 const uint8_t *p; 1605 int ret; 1606 size_t start, term_start, end, term_size; 1607 1608 start = s->byte_code.size; 1609 for(;;) { 1610 p = s->buf_ptr; 1611 if (p >= s->buf_end) 1612 break; 1613 if (*p == '|' || *p == ')') 1614 break; 1615 term_start = s->byte_code.size; 1616 ret = re_parse_term(s, is_backward_dir); 1617 if (ret) 1618 return ret; 1619 if (is_backward_dir) { 1620 /* reverse the order of the terms (XXX: inefficient, but 1621 speed is not really critical here) */ 1622 end = s->byte_code.size; 1623 term_size = end - term_start; 1624 if (dbuf_realloc(&s->byte_code, end + term_size)) 1625 return -1; 1626 memmove(s->byte_code.buf + start + term_size, 1627 s->byte_code.buf + start, 1628 end - start); 1629 memcpy(s->byte_code.buf + start, s->byte_code.buf + end, 1630 term_size); 1631 } 1632 } 1633 return 0; 1634 } 1635 1636 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir) 1637 { 1638 int start, len, pos; 1639 1640 if (lre_check_stack_overflow(s->opaque, 0)) 1641 return re_parse_error(s, "stack overflow"); 1642 1643 start = s->byte_code.size; 1644 if (re_parse_alternative(s, is_backward_dir)) 1645 return -1; 1646 while (*s->buf_ptr == '|') { 1647 s->buf_ptr++; 1648 1649 len = s->byte_code.size - start; 1650 1651 /* insert a split before the first alternative */ 1652 if (dbuf_insert(&s->byte_code, start, 5)) { 1653 return re_parse_out_of_memory(s); 1654 } 1655 s->byte_code.buf[start] = REOP_split_next_first; 1656 put_u32(s->byte_code.buf + start + 1, len + 5); 1657 1658 pos = re_emit_op_u32(s, REOP_goto, 0); 1659 1660 if (re_parse_alternative(s, is_backward_dir)) 1661 return -1; 1662 1663 /* patch the goto */ 1664 len = s->byte_code.size - (pos + 4); 1665 put_u32(s->byte_code.buf + pos, len); 1666 } 1667 return 0; 1668 } 1669 1670 /* the control flow is recursive so the analysis can be linear */ 1671 static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) 1672 { 1673 int stack_size, stack_size_max, pos, opcode, len; 1674 uint32_t val; 1675 1676 stack_size = 0; 1677 stack_size_max = 0; 1678 bc_buf += RE_HEADER_LEN; 1679 bc_buf_len -= RE_HEADER_LEN; 1680 pos = 0; 1681 while (pos < bc_buf_len) { 1682 opcode = bc_buf[pos]; 1683 len = reopcode_info[opcode].size; 1684 assert(opcode < REOP_COUNT); 1685 assert((pos + len) <= bc_buf_len); 1686 switch(opcode) { 1687 case REOP_push_i32: 1688 case REOP_push_char_pos: 1689 stack_size++; 1690 if (stack_size > stack_size_max) { 1691 if (stack_size > STACK_SIZE_MAX) 1692 return -1; 1693 stack_size_max = stack_size; 1694 } 1695 break; 1696 case REOP_drop: 1697 case REOP_check_advance: 1698 assert(stack_size > 0); 1699 stack_size--; 1700 break; 1701 case REOP_range: 1702 val = get_u16(bc_buf + pos + 1); 1703 len += val * 4; 1704 break; 1705 case REOP_range32: 1706 val = get_u16(bc_buf + pos + 1); 1707 len += val * 8; 1708 break; 1709 } 1710 pos += len; 1711 } 1712 return stack_size_max; 1713 } 1714 1715 /* 'buf' must be a zero terminated UTF-8 string of length buf_len. 1716 Return NULL if error and allocate an error message in *perror_msg, 1717 otherwise the compiled bytecode and its length in plen. 1718 */ 1719 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, 1720 const char *buf, size_t buf_len, int re_flags, 1721 void *opaque) 1722 { 1723 REParseState s_s, *s = &s_s; 1724 int stack_size; 1725 BOOL is_sticky; 1726 1727 memset(s, 0, sizeof(*s)); 1728 s->opaque = opaque; 1729 s->buf_ptr = (const uint8_t *)buf; 1730 s->buf_end = s->buf_ptr + buf_len; 1731 s->buf_start = s->buf_ptr; 1732 s->re_flags = re_flags; 1733 s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0); 1734 is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); 1735 s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); 1736 s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); 1737 s->capture_count = 1; 1738 s->total_capture_count = -1; 1739 s->has_named_captures = -1; 1740 1741 dbuf_init2(&s->byte_code, opaque, lre_realloc); 1742 dbuf_init2(&s->group_names, opaque, lre_realloc); 1743 1744 dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */ 1745 dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */ 1746 dbuf_putc(&s->byte_code, 0); /* stack size */ 1747 dbuf_put_u32(&s->byte_code, 0); /* bytecode length */ 1748 1749 if (!is_sticky) { 1750 /* iterate thru all positions (about the same as .*?( ... ) ) 1751 . We do it without an explicit loop so that lock step 1752 thread execution will be possible in an optimized 1753 implementation */ 1754 re_emit_op_u32(s, REOP_split_goto_first, 1 + 5); 1755 re_emit_op(s, REOP_any); 1756 re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5)); 1757 } 1758 re_emit_op_u8(s, REOP_save_start, 0); 1759 1760 if (re_parse_disjunction(s, FALSE)) { 1761 error: 1762 dbuf_free(&s->byte_code); 1763 dbuf_free(&s->group_names); 1764 pstrcpy(error_msg, error_msg_size, s->u.error_msg); 1765 *plen = 0; 1766 return NULL; 1767 } 1768 1769 re_emit_op_u8(s, REOP_save_end, 0); 1770 1771 re_emit_op(s, REOP_match); 1772 1773 if (*s->buf_ptr != '\0') { 1774 re_parse_error(s, "extraneous characters at the end"); 1775 goto error; 1776 } 1777 1778 if (dbuf_error(&s->byte_code)) { 1779 re_parse_out_of_memory(s); 1780 goto error; 1781 } 1782 1783 stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); 1784 if (stack_size < 0) { 1785 re_parse_error(s, "too many imbricated quantifiers"); 1786 goto error; 1787 } 1788 1789 s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; 1790 s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; 1791 put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN, 1792 s->byte_code.size - RE_HEADER_LEN); 1793 1794 /* add the named groups if needed */ 1795 if (s->group_names.size > (s->capture_count - 1)) { 1796 dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size); 1797 s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS; 1798 } 1799 dbuf_free(&s->group_names); 1800 1801 #ifdef DUMP_REOP 1802 lre_dump_bytecode(s->byte_code.buf, s->byte_code.size); 1803 #endif 1804 1805 error_msg[0] = '\0'; 1806 *plen = s->byte_code.size; 1807 return s->byte_code.buf; 1808 } 1809 1810 static BOOL is_line_terminator(uint32_t c) 1811 { 1812 return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); 1813 } 1814 1815 static BOOL is_word_char(uint32_t c) 1816 { 1817 return ((c >= '0' && c <= '9') || 1818 (c >= 'a' && c <= 'z') || 1819 (c >= 'A' && c <= 'Z') || 1820 (c == '_')); 1821 } 1822 1823 #define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \ 1824 do { \ 1825 if (cbuf_type == 0) { \ 1826 c = *cptr++; \ 1827 } else { \ 1828 const uint16_t *_p = (const uint16_t *)cptr; \ 1829 const uint16_t *_end = (const uint16_t *)cbuf_end; \ 1830 c = *_p++; \ 1831 if (is_hi_surrogate(c) && cbuf_type == 2) { \ 1832 if (_p < _end && is_lo_surrogate(*_p)) { \ 1833 c = from_surrogate(c, *_p++); \ 1834 } \ 1835 } \ 1836 cptr = (const void *)_p; \ 1837 } \ 1838 } while (0) 1839 1840 #define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \ 1841 do { \ 1842 if (cbuf_type == 0) { \ 1843 c = cptr[0]; \ 1844 } else { \ 1845 const uint16_t *_p = (const uint16_t *)cptr; \ 1846 const uint16_t *_end = (const uint16_t *)cbuf_end; \ 1847 c = *_p++; \ 1848 if (is_hi_surrogate(c) && cbuf_type == 2) { \ 1849 if (_p < _end && is_lo_surrogate(*_p)) { \ 1850 c = from_surrogate(c, *_p); \ 1851 } \ 1852 } \ 1853 } \ 1854 } while (0) 1855 1856 #define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ 1857 do { \ 1858 if (cbuf_type == 0) { \ 1859 c = cptr[-1]; \ 1860 } else { \ 1861 const uint16_t *_p = (const uint16_t *)cptr - 1; \ 1862 const uint16_t *_start = (const uint16_t *)cbuf_start; \ 1863 c = *_p; \ 1864 if (is_lo_surrogate(c) && cbuf_type == 2) { \ 1865 if (_p > _start && is_hi_surrogate(_p[-1])) { \ 1866 c = from_surrogate(*--_p, c); \ 1867 } \ 1868 } \ 1869 } \ 1870 } while (0) 1871 1872 #define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ 1873 do { \ 1874 if (cbuf_type == 0) { \ 1875 cptr--; \ 1876 c = cptr[0]; \ 1877 } else { \ 1878 const uint16_t *_p = (const uint16_t *)cptr - 1; \ 1879 const uint16_t *_start = (const uint16_t *)cbuf_start; \ 1880 c = *_p; \ 1881 if (is_lo_surrogate(c) && cbuf_type == 2) { \ 1882 if (_p > _start && is_hi_surrogate(_p[-1])) { \ 1883 c = from_surrogate(*--_p, c); \ 1884 } \ 1885 } \ 1886 cptr = (const void *)_p; \ 1887 } \ 1888 } while (0) 1889 1890 #define PREV_CHAR(cptr, cbuf_start, cbuf_type) \ 1891 do { \ 1892 if (cbuf_type == 0) { \ 1893 cptr--; \ 1894 } else { \ 1895 const uint16_t *_p = (const uint16_t *)cptr - 1; \ 1896 const uint16_t *_start = (const uint16_t *)cbuf_start; \ 1897 if (is_lo_surrogate(*_p) && cbuf_type == 2) { \ 1898 if (_p > _start && is_hi_surrogate(_p[-1])) { \ 1899 --_p; \ 1900 } \ 1901 } \ 1902 cptr = (const void *)_p; \ 1903 } \ 1904 } while (0) 1905 1906 typedef uintptr_t StackInt; 1907 1908 typedef enum { 1909 RE_EXEC_STATE_SPLIT, 1910 RE_EXEC_STATE_LOOKAHEAD, 1911 RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, 1912 RE_EXEC_STATE_GREEDY_QUANT, 1913 } REExecStateEnum; 1914 1915 typedef struct REExecState { 1916 REExecStateEnum type : 8; 1917 uint8_t stack_len; 1918 size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */ 1919 const uint8_t *cptr; 1920 const uint8_t *pc; 1921 void *buf[0]; 1922 } REExecState; 1923 1924 typedef struct { 1925 const uint8_t *cbuf; 1926 const uint8_t *cbuf_end; 1927 /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */ 1928 int cbuf_type; 1929 int capture_count; 1930 int stack_size_max; 1931 BOOL multi_line; 1932 BOOL ignore_case; 1933 BOOL is_unicode; 1934 void *opaque; /* used for stack overflow check */ 1935 1936 size_t state_size; 1937 uint8_t *state_stack; 1938 size_t state_stack_size; 1939 size_t state_stack_len; 1940 } REExecContext; 1941 1942 static int push_state(REExecContext *s, 1943 uint8_t **capture, 1944 StackInt *stack, size_t stack_len, 1945 const uint8_t *pc, const uint8_t *cptr, 1946 REExecStateEnum type, size_t count) 1947 { 1948 REExecState *rs; 1949 uint8_t *new_stack; 1950 size_t new_size, i, n; 1951 StackInt *stack_buf; 1952 1953 if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) { 1954 /* reallocate the stack */ 1955 new_size = s->state_stack_size * 3 / 2; 1956 if (new_size < 8) 1957 new_size = 8; 1958 new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size); 1959 if (!new_stack) 1960 return -1; 1961 s->state_stack_size = new_size; 1962 s->state_stack = new_stack; 1963 } 1964 rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size); 1965 s->state_stack_len++; 1966 rs->type = type; 1967 rs->count = count; 1968 rs->stack_len = stack_len; 1969 rs->cptr = cptr; 1970 rs->pc = pc; 1971 n = 2 * s->capture_count; 1972 for(i = 0; i < n; i++) 1973 rs->buf[i] = capture[i]; 1974 stack_buf = (StackInt *)(rs->buf + n); 1975 for(i = 0; i < stack_len; i++) 1976 stack_buf[i] = stack[i]; 1977 return 0; 1978 } 1979 1980 /* return 1 if match, 0 if not match or -1 if error. */ 1981 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, 1982 StackInt *stack, int stack_len, 1983 const uint8_t *pc, const uint8_t *cptr, 1984 BOOL no_recurse) 1985 { 1986 int opcode, ret; 1987 int cbuf_type; 1988 uint32_t val, c; 1989 const uint8_t *cbuf_end; 1990 1991 cbuf_type = s->cbuf_type; 1992 cbuf_end = s->cbuf_end; 1993 1994 for(;;) { 1995 // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN))); 1996 opcode = *pc++; 1997 switch(opcode) { 1998 case REOP_match: 1999 { 2000 REExecState *rs; 2001 if (no_recurse) 2002 return (intptr_t)cptr; 2003 ret = 1; 2004 goto recurse; 2005 no_match: 2006 if (no_recurse) 2007 return 0; 2008 ret = 0; 2009 recurse: 2010 for(;;) { 2011 if (s->state_stack_len == 0) 2012 return ret; 2013 rs = (REExecState *)(s->state_stack + 2014 (s->state_stack_len - 1) * s->state_size); 2015 if (rs->type == RE_EXEC_STATE_SPLIT) { 2016 if (!ret) { 2017 pop_state: 2018 memcpy(capture, rs->buf, 2019 sizeof(capture[0]) * 2 * s->capture_count); 2020 pop_state1: 2021 pc = rs->pc; 2022 cptr = rs->cptr; 2023 stack_len = rs->stack_len; 2024 memcpy(stack, rs->buf + 2 * s->capture_count, 2025 stack_len * sizeof(stack[0])); 2026 s->state_stack_len--; 2027 break; 2028 } 2029 } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) { 2030 if (!ret) { 2031 uint32_t char_count, i; 2032 memcpy(capture, rs->buf, 2033 sizeof(capture[0]) * 2 * s->capture_count); 2034 stack_len = rs->stack_len; 2035 memcpy(stack, rs->buf + 2 * s->capture_count, 2036 stack_len * sizeof(stack[0])); 2037 pc = rs->pc; 2038 cptr = rs->cptr; 2039 /* go backward */ 2040 char_count = get_u32(pc + 12); 2041 for(i = 0; i < char_count; i++) { 2042 PREV_CHAR(cptr, s->cbuf, cbuf_type); 2043 } 2044 pc = (pc + 16) + (int)get_u32(pc); 2045 rs->cptr = cptr; 2046 rs->count--; 2047 if (rs->count == 0) { 2048 s->state_stack_len--; 2049 } 2050 break; 2051 } 2052 } else { 2053 ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) || 2054 (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret)); 2055 if (ret) { 2056 /* keep the capture in case of positive lookahead */ 2057 if (rs->type == RE_EXEC_STATE_LOOKAHEAD) 2058 goto pop_state1; 2059 else 2060 goto pop_state; 2061 } 2062 } 2063 s->state_stack_len--; 2064 } 2065 } 2066 break; 2067 case REOP_char32: 2068 val = get_u32(pc); 2069 pc += 4; 2070 goto test_char; 2071 case REOP_char: 2072 val = get_u16(pc); 2073 pc += 2; 2074 test_char: 2075 if (cptr >= cbuf_end) 2076 goto no_match; 2077 GET_CHAR(c, cptr, cbuf_end, cbuf_type); 2078 if (s->ignore_case) { 2079 c = lre_canonicalize(c, s->is_unicode); 2080 } 2081 if (val != c) 2082 goto no_match; 2083 break; 2084 case REOP_split_goto_first: 2085 case REOP_split_next_first: 2086 { 2087 const uint8_t *pc1; 2088 2089 val = get_u32(pc); 2090 pc += 4; 2091 if (opcode == REOP_split_next_first) { 2092 pc1 = pc + (int)val; 2093 } else { 2094 pc1 = pc; 2095 pc = pc + (int)val; 2096 } 2097 ret = push_state(s, capture, stack, stack_len, 2098 pc1, cptr, RE_EXEC_STATE_SPLIT, 0); 2099 if (ret < 0) 2100 return -1; 2101 break; 2102 } 2103 case REOP_lookahead: 2104 case REOP_negative_lookahead: 2105 val = get_u32(pc); 2106 pc += 4; 2107 ret = push_state(s, capture, stack, stack_len, 2108 pc + (int)val, cptr, 2109 RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, 2110 0); 2111 if (ret < 0) 2112 return -1; 2113 break; 2114 2115 case REOP_goto: 2116 val = get_u32(pc); 2117 pc += 4 + (int)val; 2118 break; 2119 case REOP_line_start: 2120 if (cptr == s->cbuf) 2121 break; 2122 if (!s->multi_line) 2123 goto no_match; 2124 PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); 2125 if (!is_line_terminator(c)) 2126 goto no_match; 2127 break; 2128 case REOP_line_end: 2129 if (cptr == cbuf_end) 2130 break; 2131 if (!s->multi_line) 2132 goto no_match; 2133 PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); 2134 if (!is_line_terminator(c)) 2135 goto no_match; 2136 break; 2137 case REOP_dot: 2138 if (cptr == cbuf_end) 2139 goto no_match; 2140 GET_CHAR(c, cptr, cbuf_end, cbuf_type); 2141 if (is_line_terminator(c)) 2142 goto no_match; 2143 break; 2144 case REOP_any: 2145 if (cptr == cbuf_end) 2146 goto no_match; 2147 GET_CHAR(c, cptr, cbuf_end, cbuf_type); 2148 break; 2149 case REOP_save_start: 2150 case REOP_save_end: 2151 val = *pc++; 2152 assert(val < s->capture_count); 2153 capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr; 2154 break; 2155 case REOP_save_reset: 2156 { 2157 uint32_t val2; 2158 val = pc[0]; 2159 val2 = pc[1]; 2160 pc += 2; 2161 assert(val2 < s->capture_count); 2162 while (val <= val2) { 2163 capture[2 * val] = NULL; 2164 capture[2 * val + 1] = NULL; 2165 val++; 2166 } 2167 } 2168 break; 2169 case REOP_push_i32: 2170 val = get_u32(pc); 2171 pc += 4; 2172 stack[stack_len++] = val; 2173 break; 2174 case REOP_drop: 2175 stack_len--; 2176 break; 2177 case REOP_loop: 2178 val = get_u32(pc); 2179 pc += 4; 2180 if (--stack[stack_len - 1] != 0) { 2181 pc += (int)val; 2182 } 2183 break; 2184 case REOP_push_char_pos: 2185 stack[stack_len++] = (uintptr_t)cptr; 2186 break; 2187 case REOP_check_advance: 2188 if (stack[--stack_len] == (uintptr_t)cptr) 2189 goto no_match; 2190 break; 2191 case REOP_word_boundary: 2192 case REOP_not_word_boundary: 2193 { 2194 BOOL v1, v2; 2195 /* char before */ 2196 if (cptr == s->cbuf) { 2197 v1 = FALSE; 2198 } else { 2199 PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); 2200 v1 = is_word_char(c); 2201 } 2202 /* current char */ 2203 if (cptr >= cbuf_end) { 2204 v2 = FALSE; 2205 } else { 2206 PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); 2207 v2 = is_word_char(c); 2208 } 2209 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) 2210 goto no_match; 2211 } 2212 break; 2213 case REOP_back_reference: 2214 case REOP_backward_back_reference: 2215 { 2216 const uint8_t *cptr1, *cptr1_end, *cptr1_start; 2217 uint32_t c1, c2; 2218 2219 val = *pc++; 2220 if (val >= s->capture_count) 2221 goto no_match; 2222 cptr1_start = capture[2 * val]; 2223 cptr1_end = capture[2 * val + 1]; 2224 if (!cptr1_start || !cptr1_end) 2225 break; 2226 if (opcode == REOP_back_reference) { 2227 cptr1 = cptr1_start; 2228 while (cptr1 < cptr1_end) { 2229 if (cptr >= cbuf_end) 2230 goto no_match; 2231 GET_CHAR(c1, cptr1, cptr1_end, cbuf_type); 2232 GET_CHAR(c2, cptr, cbuf_end, cbuf_type); 2233 if (s->ignore_case) { 2234 c1 = lre_canonicalize(c1, s->is_unicode); 2235 c2 = lre_canonicalize(c2, s->is_unicode); 2236 } 2237 if (c1 != c2) 2238 goto no_match; 2239 } 2240 } else { 2241 cptr1 = cptr1_end; 2242 while (cptr1 > cptr1_start) { 2243 if (cptr == s->cbuf) 2244 goto no_match; 2245 GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type); 2246 GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type); 2247 if (s->ignore_case) { 2248 c1 = lre_canonicalize(c1, s->is_unicode); 2249 c2 = lre_canonicalize(c2, s->is_unicode); 2250 } 2251 if (c1 != c2) 2252 goto no_match; 2253 } 2254 } 2255 } 2256 break; 2257 case REOP_range: 2258 { 2259 int n; 2260 uint32_t low, high, idx_min, idx_max, idx; 2261 2262 n = get_u16(pc); /* n must be >= 1 */ 2263 pc += 2; 2264 if (cptr >= cbuf_end) 2265 goto no_match; 2266 GET_CHAR(c, cptr, cbuf_end, cbuf_type); 2267 if (s->ignore_case) { 2268 c = lre_canonicalize(c, s->is_unicode); 2269 } 2270 idx_min = 0; 2271 low = get_u16(pc + 0 * 4); 2272 if (c < low) 2273 goto no_match; 2274 idx_max = n - 1; 2275 high = get_u16(pc + idx_max * 4 + 2); 2276 /* 0xffff in for last value means +infinity */ 2277 if (unlikely(c >= 0xffff) && high == 0xffff) 2278 goto range_match; 2279 if (c > high) 2280 goto no_match; 2281 while (idx_min <= idx_max) { 2282 idx = (idx_min + idx_max) / 2; 2283 low = get_u16(pc + idx * 4); 2284 high = get_u16(pc + idx * 4 + 2); 2285 if (c < low) 2286 idx_max = idx - 1; 2287 else if (c > high) 2288 idx_min = idx + 1; 2289 else 2290 goto range_match; 2291 } 2292 goto no_match; 2293 range_match: 2294 pc += 4 * n; 2295 } 2296 break; 2297 case REOP_range32: 2298 { 2299 int n; 2300 uint32_t low, high, idx_min, idx_max, idx; 2301 2302 n = get_u16(pc); /* n must be >= 1 */ 2303 pc += 2; 2304 if (cptr >= cbuf_end) 2305 goto no_match; 2306 GET_CHAR(c, cptr, cbuf_end, cbuf_type); 2307 if (s->ignore_case) { 2308 c = lre_canonicalize(c, s->is_unicode); 2309 } 2310 idx_min = 0; 2311 low = get_u32(pc + 0 * 8); 2312 if (c < low) 2313 goto no_match; 2314 idx_max = n - 1; 2315 high = get_u32(pc + idx_max * 8 + 4); 2316 if (c > high) 2317 goto no_match; 2318 while (idx_min <= idx_max) { 2319 idx = (idx_min + idx_max) / 2; 2320 low = get_u32(pc + idx * 8); 2321 high = get_u32(pc + idx * 8 + 4); 2322 if (c < low) 2323 idx_max = idx - 1; 2324 else if (c > high) 2325 idx_min = idx + 1; 2326 else 2327 goto range32_match; 2328 } 2329 goto no_match; 2330 range32_match: 2331 pc += 8 * n; 2332 } 2333 break; 2334 case REOP_prev: 2335 /* go to the previous char */ 2336 if (cptr == s->cbuf) 2337 goto no_match; 2338 PREV_CHAR(cptr, s->cbuf, cbuf_type); 2339 break; 2340 case REOP_simple_greedy_quant: 2341 { 2342 uint32_t next_pos, quant_min, quant_max; 2343 size_t q; 2344 intptr_t res; 2345 const uint8_t *pc1; 2346 2347 next_pos = get_u32(pc); 2348 quant_min = get_u32(pc + 4); 2349 quant_max = get_u32(pc + 8); 2350 pc += 16; 2351 pc1 = pc; 2352 pc += (int)next_pos; 2353 2354 q = 0; 2355 for(;;) { 2356 res = lre_exec_backtrack(s, capture, stack, stack_len, 2357 pc1, cptr, TRUE); 2358 if (res == -1) 2359 return res; 2360 if (!res) 2361 break; 2362 cptr = (uint8_t *)res; 2363 q++; 2364 if (q >= quant_max && quant_max != INT32_MAX) 2365 break; 2366 } 2367 if (q < quant_min) 2368 goto no_match; 2369 if (q > quant_min) { 2370 /* will examine all matches down to quant_min */ 2371 ret = push_state(s, capture, stack, stack_len, 2372 pc1 - 16, cptr, 2373 RE_EXEC_STATE_GREEDY_QUANT, 2374 q - quant_min); 2375 if (ret < 0) 2376 return -1; 2377 } 2378 } 2379 break; 2380 default: 2381 abort(); 2382 } 2383 } 2384 } 2385 2386 /* Return 1 if match, 0 if not match or -1 if error. cindex is the 2387 starting position of the match and must be such as 0 <= cindex <= 2388 clen. */ 2389 int lre_exec(uint8_t **capture, 2390 const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen, 2391 int cbuf_type, void *opaque) 2392 { 2393 REExecContext s_s, *s = &s_s; 2394 int re_flags, i, alloca_size, ret; 2395 StackInt *stack_buf; 2396 2397 re_flags = lre_get_flags(bc_buf); 2398 s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; 2399 s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; 2400 s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0; 2401 s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; 2402 s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; 2403 s->cbuf = cbuf; 2404 s->cbuf_end = cbuf + (clen << cbuf_type); 2405 s->cbuf_type = cbuf_type; 2406 if (s->cbuf_type == 1 && s->is_unicode) 2407 s->cbuf_type = 2; 2408 s->opaque = opaque; 2409 2410 s->state_size = sizeof(REExecState) + 2411 s->capture_count * sizeof(capture[0]) * 2 + 2412 s->stack_size_max * sizeof(stack_buf[0]); 2413 s->state_stack = NULL; 2414 s->state_stack_len = 0; 2415 s->state_stack_size = 0; 2416 2417 for(i = 0; i < s->capture_count * 2; i++) 2418 capture[i] = NULL; 2419 alloca_size = s->stack_size_max * sizeof(stack_buf[0]); 2420 stack_buf = alloca(alloca_size); 2421 ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN, 2422 cbuf + (cindex << cbuf_type), FALSE); 2423 lre_realloc(s->opaque, s->state_stack, 0); 2424 return ret; 2425 } 2426 2427 int lre_get_capture_count(const uint8_t *bc_buf) 2428 { 2429 return bc_buf[RE_HEADER_CAPTURE_COUNT]; 2430 } 2431 2432 int lre_get_flags(const uint8_t *bc_buf) 2433 { 2434 return bc_buf[RE_HEADER_FLAGS]; 2435 } 2436 2437 /* Return NULL if no group names. Otherwise, return a pointer to 2438 'capture_count - 1' zero terminated UTF-8 strings. */ 2439 const char *lre_get_groupnames(const uint8_t *bc_buf) 2440 { 2441 uint32_t re_bytecode_len; 2442 if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0) 2443 return NULL; 2444 re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN); 2445 return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len); 2446 } 2447 2448 #ifdef TEST 2449 2450 BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) 2451 { 2452 return FALSE; 2453 } 2454 2455 void *lre_realloc(void *opaque, void *ptr, size_t size) 2456 { 2457 return realloc(ptr, size); 2458 } 2459 2460 int main(int argc, char **argv) 2461 { 2462 int len, flags, ret, i; 2463 uint8_t *bc; 2464 char error_msg[64]; 2465 uint8_t *capture[CAPTURE_COUNT_MAX * 2]; 2466 const char *input; 2467 int input_len, capture_count; 2468 2469 if (argc < 4) { 2470 printf("usage: %s regexp flags input\n", argv[0]); 2471 return 1; 2472 } 2473 flags = atoi(argv[2]); 2474 bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], 2475 strlen(argv[1]), flags, NULL); 2476 if (!bc) { 2477 fprintf(stderr, "error: %s\n", error_msg); 2478 exit(1); 2479 } 2480 2481 input = argv[3]; 2482 input_len = strlen(input); 2483 2484 ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); 2485 printf("ret=%d\n", ret); 2486 if (ret == 1) { 2487 capture_count = lre_get_capture_count(bc); 2488 for(i = 0; i < 2 * capture_count; i++) { 2489 uint8_t *ptr; 2490 ptr = capture[i]; 2491 printf("%d: ", i); 2492 if (!ptr) 2493 printf("<nil>"); 2494 else 2495 printf("%u", (int)(ptr - (uint8_t *)input)); 2496 printf("\n"); 2497 } 2498 } 2499 return 0; 2500 } 2501 #endif