libregexp.c - quickjs-tart - quickjs-based runtime for wallet-core logic

libregexp.c (81779B)
      1 /*
      2  * Regular Expression Engine
      3  *
      4  * Copyright (c) 2017-2018 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 #include <stdlib.h>
     25 #include <stdio.h>
     26 #include <stdarg.h>
     27 #include <inttypes.h>
     28 #include <string.h>
     29 #include <assert.h>
     30 
     31 #include "cutils.h"
     32 #include "libregexp.h"
     33 #include "libunicode.h"
     34 
     35 /*
     36   TODO:
     37 
     38   - Add a lock step execution mode (=linear time execution guaranteed)
     39     when the regular expression is "simple" i.e. no backreference nor
     40     complicated lookahead. The opcodes are designed for this execution
     41     model.
     42 */
     43 
     44 #if defined(TEST)
     45 #define DUMP_REOP
     46 #endif
     47 
     48 typedef enum {
     49 #define DEF(id, size) REOP_ ## id,
     50 #include "libregexp-opcode.h"
     51 #undef DEF
     52     REOP_COUNT,
     53 } REOPCodeEnum;
     54 
     55 #define CAPTURE_COUNT_MAX 255
     56 #define STACK_SIZE_MAX 255
     57 
     58 /* unicode code points */
     59 #define CP_LS   0x2028
     60 #define CP_PS   0x2029
     61 
     62 #define TMP_BUF_SIZE 128
     63 
     64 typedef struct {
     65     DynBuf byte_code;
     66     const uint8_t *buf_ptr;
     67     const uint8_t *buf_end;
     68     const uint8_t *buf_start;
     69     int re_flags;
     70     BOOL is_unicode;
     71     BOOL ignore_case;
     72     BOOL dotall;
     73     int capture_count;
     74     int total_capture_count; /* -1 = not computed yet */
     75     int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
     76     void *opaque;
     77     DynBuf group_names;
     78     union {
     79         char error_msg[TMP_BUF_SIZE];
     80         char tmp_buf[TMP_BUF_SIZE];
     81     } u;
     82 } REParseState;
     83 
     84 typedef struct {
     85 #ifdef DUMP_REOP
     86     const char *name;
     87 #endif
     88     uint8_t size;
     89 } REOpCode;
     90 
     91 static const REOpCode reopcode_info[REOP_COUNT] = {
     92 #ifdef DUMP_REOP
     93 #define DEF(id, size) { #id, size },
     94 #else
     95 #define DEF(id, size) { size },
     96 #endif
     97 #include "libregexp-opcode.h"
     98 #undef DEF
     99 };
    100 
    101 #define RE_HEADER_FLAGS         0
    102 #define RE_HEADER_CAPTURE_COUNT 1
    103 #define RE_HEADER_STACK_SIZE    2
    104 #define RE_HEADER_BYTECODE_LEN  3
    105 
    106 #define RE_HEADER_LEN 7
    107 
    108 static inline int is_digit(int c) {
    109     return c >= '0' && c <= '9';
    110 }
    111 
    112 /* insert 'len' bytes at position 'pos'. Return < 0 if error. */
    113 static int dbuf_insert(DynBuf *s, int pos, int len)
    114 {
    115     if (dbuf_realloc(s, s->size + len))
    116         return -1;
    117     memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
    118     s->size += len;
    119     return 0;
    120 }
    121 
    122 static const uint16_t char_range_d[] = {
    123     1,
    124     0x0030, 0x0039 + 1,
    125 };
    126 
    127 /* code point ranges for Zs,Zl or Zp property */
    128 static const uint16_t char_range_s[] = {
    129     10,
    130     0x0009, 0x000D + 1,
    131     0x0020, 0x0020 + 1,
    132     0x00A0, 0x00A0 + 1,
    133     0x1680, 0x1680 + 1,
    134     0x2000, 0x200A + 1,
    135     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
    136     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
    137     0x2028, 0x2029 + 1,
    138     0x202F, 0x202F + 1,
    139     0x205F, 0x205F + 1,
    140     0x3000, 0x3000 + 1,
    141     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
    142     0xFEFF, 0xFEFF + 1,
    143 };
    144 
    145 static const uint16_t char_range_w[] = {
    146     4,
    147     0x0030, 0x0039 + 1,
    148     0x0041, 0x005A + 1,
    149     0x005F, 0x005F + 1,
    150     0x0061, 0x007A + 1,
    151 };
    152 
    153 #define CLASS_RANGE_BASE 0x40000000
    154 
    155 typedef enum {
    156     CHAR_RANGE_d,
    157     CHAR_RANGE_D,
    158     CHAR_RANGE_s,
    159     CHAR_RANGE_S,
    160     CHAR_RANGE_w,
    161     CHAR_RANGE_W,
    162 } CharRangeEnum;
    163 
    164 static const uint16_t * const char_range_table[] = {
    165     char_range_d,
    166     char_range_s,
    167     char_range_w,
    168 };
    169 
    170 static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
    171 {
    172     BOOL invert;
    173     const uint16_t *c_pt;
    174     int len, i;
    175 
    176     invert = c & 1;
    177     c_pt = char_range_table[c >> 1];
    178     len = *c_pt++;
    179     cr_init(cr, s->opaque, lre_realloc);
    180     for(i = 0; i < len * 2; i++) {
    181         if (cr_add_point(cr, c_pt[i]))
    182             goto fail;
    183     }
    184     if (invert) {
    185         if (cr_invert(cr))
    186             goto fail;
    187     }
    188     return 0;
    189  fail:
    190     cr_free(cr);
    191     return -1;
    192 }
    193 
    194 #ifdef DUMP_REOP
    195 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
    196                                                      int buf_len)
    197 {
    198     int pos, len, opcode, bc_len, re_flags, i;
    199     uint32_t val;
    200 
    201     assert(buf_len >= RE_HEADER_LEN);
    202 
    203     re_flags = lre_get_flags(buf);
    204     bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
    205     assert(bc_len + RE_HEADER_LEN <= buf_len);
    206     printf("flags: 0x%x capture_count=%d stack_size=%d\n",
    207            re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
    208     if (re_flags & LRE_FLAG_NAMED_GROUPS) {
    209         const char *p;
    210         p = (char *)buf + RE_HEADER_LEN + bc_len;
    211         printf("named groups: ");
    212         for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
    213             if (i != 1)
    214                 printf(",");
    215             printf("<%s>", p);
    216             p += strlen(p) + 1;
    217         }
    218         printf("\n");
    219         assert(p == (char *)(buf + buf_len));
    220     }
    221     printf("bytecode_len=%d\n", bc_len);
    222 
    223     buf += RE_HEADER_LEN;
    224     pos = 0;
    225     while (pos < bc_len) {
    226         printf("%5u: ", pos);
    227         opcode = buf[pos];
    228         len = reopcode_info[opcode].size;
    229         if (opcode >= REOP_COUNT) {
    230             printf(" invalid opcode=0x%02x\n", opcode);
    231             break;
    232         }
    233         if ((pos + len) > bc_len) {
    234             printf(" buffer overflow (opcode=0x%02x)\n", opcode);
    235             break;
    236         }
    237         printf("%s", reopcode_info[opcode].name);
    238         switch(opcode) {
    239         case REOP_char:
    240             val = get_u16(buf + pos + 1);
    241             if (val >= ' ' && val <= 126)
    242                 printf(" '%c'", val);
    243             else
    244                 printf(" 0x%04x", val);
    245             break;
    246         case REOP_char32:
    247             val = get_u32(buf + pos + 1);
    248             if (val >= ' ' && val <= 126)
    249                 printf(" '%c'", val);
    250             else
    251                 printf(" 0x%08x", val);
    252             break;
    253         case REOP_goto:
    254         case REOP_split_goto_first:
    255         case REOP_split_next_first:
    256         case REOP_loop:
    257         case REOP_lookahead:
    258         case REOP_negative_lookahead:
    259             val = get_u32(buf + pos + 1);
    260             val += (pos + 5);
    261             printf(" %u", val);
    262             break;
    263         case REOP_simple_greedy_quant:
    264             printf(" %u %u %u %u",
    265                    get_u32(buf + pos + 1) + (pos + 17),
    266                    get_u32(buf + pos + 1 + 4),
    267                    get_u32(buf + pos + 1 + 8),
    268                    get_u32(buf + pos + 1 + 12));
    269             break;
    270         case REOP_save_start:
    271         case REOP_save_end:
    272         case REOP_back_reference:
    273         case REOP_backward_back_reference:
    274             printf(" %u", buf[pos + 1]);
    275             break;
    276         case REOP_save_reset:
    277             printf(" %u %u", buf[pos + 1], buf[pos + 2]);
    278             break;
    279         case REOP_push_i32:
    280             val = get_u32(buf + pos + 1);
    281             printf(" %d", val);
    282             break;
    283         case REOP_range:
    284             {
    285                 int n, i;
    286                 n = get_u16(buf + pos + 1);
    287                 len += n * 4;
    288                 for(i = 0; i < n * 2; i++) {
    289                     val = get_u16(buf + pos + 3 + i * 2);
    290                     printf(" 0x%04x", val);
    291                 }
    292             }
    293             break;
    294         case REOP_range32:
    295             {
    296                 int n, i;
    297                 n = get_u16(buf + pos + 1);
    298                 len += n * 8;
    299                 for(i = 0; i < n * 2; i++) {
    300                     val = get_u32(buf + pos + 3 + i * 4);
    301                     printf(" 0x%08x", val);
    302                 }
    303             }
    304             break;
    305         default:
    306             break;
    307         }
    308         printf("\n");
    309         pos += len;
    310     }
    311 }
    312 #endif
    313 
    314 static void re_emit_op(REParseState *s, int op)
    315 {
    316     dbuf_putc(&s->byte_code, op);
    317 }
    318 
    319 /* return the offset of the u32 value */
    320 static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
    321 {
    322     int pos;
    323     dbuf_putc(&s->byte_code, op);
    324     pos = s->byte_code.size;
    325     dbuf_put_u32(&s->byte_code, val);
    326     return pos;
    327 }
    328 
    329 static int re_emit_goto(REParseState *s, int op, uint32_t val)
    330 {
    331     int pos;
    332     dbuf_putc(&s->byte_code, op);
    333     pos = s->byte_code.size;
    334     dbuf_put_u32(&s->byte_code, val - (pos + 4));
    335     return pos;
    336 }
    337 
    338 static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
    339 {
    340     dbuf_putc(&s->byte_code, op);
    341     dbuf_putc(&s->byte_code, val);
    342 }
    343 
    344 static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
    345 {
    346     dbuf_putc(&s->byte_code, op);
    347     dbuf_put_u16(&s->byte_code, val);
    348 }
    349 
    350 static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
    351 {
    352     va_list ap;
    353     va_start(ap, fmt);
    354     vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
    355     va_end(ap);
    356     return -1;
    357 }
    358 
    359 static int re_parse_out_of_memory(REParseState *s)
    360 {
    361     return re_parse_error(s, "out of memory");
    362 }
    363 
    364 /* If allow_overflow is false, return -1 in case of
    365    overflow. Otherwise return INT32_MAX. */
    366 static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
    367 {
    368     const uint8_t *p;
    369     uint64_t v;
    370     int c;
    371 
    372     p = *pp;
    373     v = 0;
    374     for(;;) {
    375         c = *p;
    376         if (c < '0' || c > '9')
    377             break;
    378         v = v * 10 + c - '0';
    379         if (v >= INT32_MAX) {
    380             if (allow_overflow)
    381                 v = INT32_MAX;
    382             else
    383                 return -1;
    384         }
    385         p++;
    386     }
    387     *pp = p;
    388     return v;
    389 }
    390 
    391 static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
    392 {
    393     const uint8_t *p;
    394     p = *pp;
    395     if (*p != c)
    396         return re_parse_error(s, "expecting '%c'", c);
    397     p++;
    398     *pp = p;
    399     return 0;
    400 }
    401 
    402 /* Parse an escape sequence, *pp points after the '\':
    403    allow_utf16 value:
    404    0 : no UTF-16 escapes allowed
    405    1 : UTF-16 escapes allowed
    406    2 : UTF-16 escapes allowed and escapes of surrogate pairs are
    407    converted to a unicode character (unicode regexp case).
    408 
    409    Return the unicode char and update *pp if recognized,
    410    return -1 if malformed escape,
    411    return -2 otherwise. */
    412 int lre_parse_escape(const uint8_t **pp, int allow_utf16)
    413 {
    414     const uint8_t *p;
    415     uint32_t c;
    416 
    417     p = *pp;
    418     c = *p++;
    419     switch(c) {
    420     case 'b':
    421         c = '\b';
    422         break;
    423     case 'f':
    424         c = '\f';
    425         break;
    426     case 'n':
    427         c = '\n';
    428         break;
    429     case 'r':
    430         c = '\r';
    431         break;
    432     case 't':
    433         c = '\t';
    434         break;
    435     case 'v':
    436         c = '\v';
    437         break;
    438     case 'x':
    439     case 'u':
    440         {
    441             int h, n, i;
    442             uint32_t c1;
    443 
    444             if (*p == '{' && allow_utf16) {
    445                 p++;
    446                 c = 0;
    447                 for(;;) {
    448                     h = from_hex(*p++);
    449                     if (h < 0)
    450                         return -1;
    451                     c = (c << 4) | h;
    452                     if (c > 0x10FFFF)
    453                         return -1;
    454                     if (*p == '}')
    455                         break;
    456                 }
    457                 p++;
    458             } else {
    459                 if (c == 'x') {
    460                     n = 2;
    461                 } else {
    462                     n = 4;
    463                 }
    464 
    465                 c = 0;
    466                 for(i = 0; i < n; i++) {
    467                     h = from_hex(*p++);
    468                     if (h < 0) {
    469                         return -1;
    470                     }
    471                     c = (c << 4) | h;
    472                 }
    473                 if (is_hi_surrogate(c) &&
    474                     allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
    475                     /* convert an escaped surrogate pair into a
    476                        unicode char */
    477                     c1 = 0;
    478                     for(i = 0; i < 4; i++) {
    479                         h = from_hex(p[2 + i]);
    480                         if (h < 0)
    481                             break;
    482                         c1 = (c1 << 4) | h;
    483                     }
    484                     if (i == 4 && is_lo_surrogate(c1)) {
    485                         p += 6;
    486                         c = from_surrogate(c, c1);
    487                     }
    488                 }
    489             }
    490         }
    491         break;
    492     case '0': case '1': case '2': case '3':
    493     case '4': case '5': case '6': case '7':
    494         c -= '0';
    495         if (allow_utf16 == 2) {
    496             /* only accept \0 not followed by digit */
    497             if (c != 0 || is_digit(*p))
    498                 return -1;
    499         } else {
    500             /* parse a legacy octal sequence */
    501             uint32_t v;
    502             v = *p - '0';
    503             if (v > 7)
    504                 break;
    505             c = (c << 3) | v;
    506             p++;
    507             if (c >= 32)
    508                 break;
    509             v = *p - '0';
    510             if (v > 7)
    511                 break;
    512             c = (c << 3) | v;
    513             p++;
    514         }
    515         break;
    516     default:
    517         return -2;
    518     }
    519     *pp = p;
    520     return c;
    521 }
    522 
    523 #ifdef CONFIG_ALL_UNICODE
    524 /* XXX: we use the same chars for name and value */
    525 static BOOL is_unicode_char(int c)
    526 {
    527     return ((c >= '0' && c <= '9') ||
    528             (c >= 'A' && c <= 'Z') ||
    529             (c >= 'a' && c <= 'z') ||
    530             (c == '_'));
    531 }
    532 
    533 static int parse_unicode_property(REParseState *s, CharRange *cr,
    534                                   const uint8_t **pp, BOOL is_inv)
    535 {
    536     const uint8_t *p;
    537     char name[64], value[64];
    538     char *q;
    539     BOOL script_ext;
    540     int ret;
    541 
    542     p = *pp;
    543     if (*p != '{')
    544         return re_parse_error(s, "expecting '{' after \\p");
    545     p++;
    546     q = name;
    547     while (is_unicode_char(*p)) {
    548         if ((q - name) >= sizeof(name) - 1)
    549             goto unknown_property_name;
    550         *q++ = *p++;
    551     }
    552     *q = '\0';
    553     q = value;
    554     if (*p == '=') {
    555         p++;
    556         while (is_unicode_char(*p)) {
    557             if ((q - value) >= sizeof(value) - 1)
    558                 return re_parse_error(s, "unknown unicode property value");
    559             *q++ = *p++;
    560         }
    561     }
    562     *q = '\0';
    563     if (*p != '}')
    564         return re_parse_error(s, "expecting '}'");
    565     p++;
    566     //    printf("name=%s value=%s\n", name, value);
    567 
    568     if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
    569         script_ext = FALSE;
    570         goto do_script;
    571     } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
    572         script_ext = TRUE;
    573     do_script:
    574         cr_init(cr, s->opaque, lre_realloc);
    575         ret = unicode_script(cr, value, script_ext);
    576         if (ret) {
    577             cr_free(cr);
    578             if (ret == -2)
    579                 return re_parse_error(s, "unknown unicode script");
    580             else
    581                 goto out_of_memory;
    582         }
    583     } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
    584         cr_init(cr, s->opaque, lre_realloc);
    585         ret = unicode_general_category(cr, value);
    586         if (ret) {
    587             cr_free(cr);
    588             if (ret == -2)
    589                 return re_parse_error(s, "unknown unicode general category");
    590             else
    591                 goto out_of_memory;
    592         }
    593     } else if (value[0] == '\0') {
    594         cr_init(cr, s->opaque, lre_realloc);
    595         ret = unicode_general_category(cr, name);
    596         if (ret == -1) {
    597             cr_free(cr);
    598             goto out_of_memory;
    599         }
    600         if (ret < 0) {
    601             ret = unicode_prop(cr, name);
    602             if (ret) {
    603                 cr_free(cr);
    604                 if (ret == -2)
    605                     goto unknown_property_name;
    606                 else
    607                     goto out_of_memory;
    608             }
    609         }
    610     } else {
    611     unknown_property_name:
    612         return re_parse_error(s, "unknown unicode property name");
    613     }
    614 
    615     if (is_inv) {
    616         if (cr_invert(cr)) {
    617             cr_free(cr);
    618             return -1;
    619         }
    620     }
    621     *pp = p;
    622     return 0;
    623  out_of_memory:
    624     return re_parse_out_of_memory(s);
    625 }
    626 #endif /* CONFIG_ALL_UNICODE */
    627 
    628 /* return -1 if error otherwise the character or a class range
    629    (CLASS_RANGE_BASE). In case of class range, 'cr' is
    630    initialized. Otherwise, it is ignored. */
    631 static int get_class_atom(REParseState *s, CharRange *cr,
    632                           const uint8_t **pp, BOOL inclass)
    633 {
    634     const uint8_t *p;
    635     uint32_t c;
    636     int ret;
    637 
    638     p = *pp;
    639 
    640     c = *p;
    641     switch(c) {
    642     case '\\':
    643         p++;
    644         if (p >= s->buf_end)
    645             goto unexpected_end;
    646         c = *p++;
    647         switch(c) {
    648         case 'd':
    649             c = CHAR_RANGE_d;
    650             goto class_range;
    651         case 'D':
    652             c = CHAR_RANGE_D;
    653             goto class_range;
    654         case 's':
    655             c = CHAR_RANGE_s;
    656             goto class_range;
    657         case 'S':
    658             c = CHAR_RANGE_S;
    659             goto class_range;
    660         case 'w':
    661             c = CHAR_RANGE_w;
    662             goto class_range;
    663         case 'W':
    664             c = CHAR_RANGE_W;
    665         class_range:
    666             if (cr_init_char_range(s, cr, c))
    667                 return -1;
    668             c = CLASS_RANGE_BASE;
    669             break;
    670         case 'c':
    671             c = *p;
    672             if ((c >= 'a' && c <= 'z') ||
    673                 (c >= 'A' && c <= 'Z') ||
    674                 (((c >= '0' && c <= '9') || c == '_') &&
    675                  inclass && !s->is_unicode)) {   /* Annex B.1.4 */
    676                 c &= 0x1f;
    677                 p++;
    678             } else if (s->is_unicode) {
    679                 goto invalid_escape;
    680             } else {
    681                 /* otherwise return '\' and 'c' */
    682                 p--;
    683                 c = '\\';
    684             }
    685             break;
    686 #ifdef CONFIG_ALL_UNICODE
    687         case 'p':
    688         case 'P':
    689             if (s->is_unicode) {
    690                 if (parse_unicode_property(s, cr, &p, (c == 'P')))
    691                     return -1;
    692                 c = CLASS_RANGE_BASE;
    693                 break;
    694             }
    695             /* fall thru */
    696 #endif
    697         default:
    698             p--;
    699             ret = lre_parse_escape(&p, s->is_unicode * 2);
    700             if (ret >= 0) {
    701                 c = ret;
    702             } else {
    703                 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
    704                     /* always valid to escape these characters */
    705                     goto normal_char;
    706                 } else if (s->is_unicode) {
    707                 invalid_escape:
    708                     return re_parse_error(s, "invalid escape sequence in regular expression");
    709                 } else {
    710                     /* just ignore the '\' */
    711                     goto normal_char;
    712                 }
    713             }
    714             break;
    715         }
    716         break;
    717     case '\0':
    718         if (p >= s->buf_end) {
    719         unexpected_end:
    720             return re_parse_error(s, "unexpected end");
    721         }
    722         /* fall thru */
    723     default:
    724     normal_char:
    725         /* normal char */
    726         if (c >= 128) {
    727             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
    728             if ((unsigned)c > 0xffff && !s->is_unicode) {
    729                 /* XXX: should handle non BMP-1 code points */
    730                 return re_parse_error(s, "malformed unicode char");
    731             }
    732         } else {
    733             p++;
    734         }
    735         break;
    736     }
    737     *pp = p;
    738     return c;
    739 }
    740 
    741 static int re_emit_range(REParseState *s, const CharRange *cr)
    742 {
    743     int len, i;
    744     uint32_t high;
    745 
    746     len = (unsigned)cr->len / 2;
    747     if (len >= 65535)
    748         return re_parse_error(s, "too many ranges");
    749     if (len == 0) {
    750         /* not sure it can really happen. Emit a match that is always
    751            false */
    752         re_emit_op_u32(s, REOP_char32, -1);
    753     } else {
    754         high = cr->points[cr->len - 1];
    755         if (high == UINT32_MAX)
    756             high = cr->points[cr->len - 2];
    757         if (high <= 0xffff) {
    758             /* can use 16 bit ranges with the conversion that 0xffff =
    759                infinity */
    760             re_emit_op_u16(s, REOP_range, len);
    761             for(i = 0; i < cr->len; i += 2) {
    762                 dbuf_put_u16(&s->byte_code, cr->points[i]);
    763                 high = cr->points[i + 1] - 1;
    764                 if (high == UINT32_MAX - 1)
    765                     high = 0xffff;
    766                 dbuf_put_u16(&s->byte_code, high);
    767             }
    768         } else {
    769             re_emit_op_u16(s, REOP_range32, len);
    770             for(i = 0; i < cr->len; i += 2) {
    771                 dbuf_put_u32(&s->byte_code, cr->points[i]);
    772                 dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
    773             }
    774         }
    775     }
    776     return 0;
    777 }
    778 
    779 static int re_parse_char_class(REParseState *s, const uint8_t **pp)
    780 {
    781     const uint8_t *p;
    782     uint32_t c1, c2;
    783     CharRange cr_s, *cr = &cr_s;
    784     CharRange cr1_s, *cr1 = &cr1_s;
    785     BOOL invert;
    786 
    787     cr_init(cr, s->opaque, lre_realloc);
    788     p = *pp;
    789     p++;    /* skip '[' */
    790 
    791     invert = FALSE;
    792     if (*p == '^') {
    793         p++;
    794         invert = TRUE;
    795     }
    796 
    797     for(;;) {
    798         if (*p == ']')
    799             break;
    800         c1 = get_class_atom(s, cr1, &p, TRUE);
    801         if ((int)c1 < 0)
    802             goto fail;
    803         if (*p == '-' && p[1] != ']') {
    804             const uint8_t *p0 = p + 1;
    805             if (c1 >= CLASS_RANGE_BASE) {
    806                 if (s->is_unicode) {
    807                     cr_free(cr1);
    808                     goto invalid_class_range;
    809                 }
    810                 /* Annex B: match '-' character */
    811                 goto class_atom;
    812             }
    813             c2 = get_class_atom(s, cr1, &p0, TRUE);
    814             if ((int)c2 < 0)
    815                 goto fail;
    816             if (c2 >= CLASS_RANGE_BASE) {
    817                 cr_free(cr1);
    818                 if (s->is_unicode) {
    819                     goto invalid_class_range;
    820                 }
    821                 /* Annex B: match '-' character */
    822                 goto class_atom;
    823             }
    824             p = p0;
    825             if (c2 < c1) {
    826             invalid_class_range:
    827                 re_parse_error(s, "invalid class range");
    828                 goto fail;
    829             }
    830             if (cr_union_interval(cr, c1, c2))
    831                 goto memory_error;
    832         } else {
    833         class_atom:
    834             if (c1 >= CLASS_RANGE_BASE) {
    835                 int ret;
    836                 ret = cr_union1(cr, cr1->points, cr1->len);
    837                 cr_free(cr1);
    838                 if (ret)
    839                     goto memory_error;
    840             } else {
    841                 if (cr_union_interval(cr, c1, c1))
    842                     goto memory_error;
    843             }
    844         }
    845     }
    846     if (s->ignore_case) {
    847         if (cr_regexp_canonicalize(cr, s->is_unicode))
    848             goto memory_error;
    849     }
    850     if (invert) {
    851         if (cr_invert(cr))
    852             goto memory_error;
    853     }
    854     if (re_emit_range(s, cr))
    855         goto fail;
    856     cr_free(cr);
    857     p++;    /* skip ']' */
    858     *pp = p;
    859     return 0;
    860  memory_error:
    861     re_parse_out_of_memory(s);
    862  fail:
    863     cr_free(cr);
    864     return -1;
    865 }
    866 
    867 /* Return:
    868    - true if the opcodes may not advance the char pointer
    869    - false if the opcodes always advance the char pointer
    870 */
    871 static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
    872 {
    873     int pos, opcode, len;
    874     uint32_t val;
    875     BOOL ret;
    876 
    877     ret = TRUE;
    878     pos = 0;
    879     while (pos < bc_buf_len) {
    880         opcode = bc_buf[pos];
    881         len = reopcode_info[opcode].size;
    882         switch(opcode) {
    883         case REOP_range:
    884             val = get_u16(bc_buf + pos + 1);
    885             len += val * 4;
    886             goto simple_char;
    887         case REOP_range32:
    888             val = get_u16(bc_buf + pos + 1);
    889             len += val * 8;
    890             goto simple_char;
    891         case REOP_char:
    892         case REOP_char32:
    893         case REOP_dot:
    894         case REOP_any:
    895         simple_char:
    896             ret = FALSE;
    897             break;
    898         case REOP_line_start:
    899         case REOP_line_end:
    900         case REOP_push_i32:
    901         case REOP_push_char_pos:
    902         case REOP_drop:
    903         case REOP_word_boundary:
    904         case REOP_not_word_boundary:
    905         case REOP_prev:
    906             /* no effect */
    907             break;
    908         case REOP_save_start:
    909         case REOP_save_end:
    910         case REOP_save_reset:
    911         case REOP_back_reference:
    912         case REOP_backward_back_reference:
    913             break;
    914         default:
    915             /* safe behavior: we cannot predict the outcome */
    916             return TRUE;
    917         }
    918         pos += len;
    919     }
    920     return ret;
    921 }
    922 
    923 /* return -1 if a simple quantifier cannot be used. Otherwise return
    924    the number of characters in the atom. */
    925 static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
    926 {
    927     int pos, opcode, len, count;
    928     uint32_t val;
    929 
    930     count = 0;
    931     pos = 0;
    932     while (pos < bc_buf_len) {
    933         opcode = bc_buf[pos];
    934         len = reopcode_info[opcode].size;
    935         switch(opcode) {
    936         case REOP_range:
    937             val = get_u16(bc_buf + pos + 1);
    938             len += val * 4;
    939             goto simple_char;
    940         case REOP_range32:
    941             val = get_u16(bc_buf + pos + 1);
    942             len += val * 8;
    943             goto simple_char;
    944         case REOP_char:
    945         case REOP_char32:
    946         case REOP_dot:
    947         case REOP_any:
    948         simple_char:
    949             count++;
    950             break;
    951         case REOP_line_start:
    952         case REOP_line_end:
    953         case REOP_word_boundary:
    954         case REOP_not_word_boundary:
    955             break;
    956         default:
    957             return -1;
    958         }
    959         pos += len;
    960     }
    961     return count;
    962 }
    963 
    964 /* '*pp' is the first char after '<' */
    965 static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
    966 {
    967     const uint8_t *p, *p1;
    968     uint32_t c, d;
    969     char *q;
    970 
    971     p = *pp;
    972     q = buf;
    973     for(;;) {
    974         c = *p;
    975         if (c == '\\') {
    976             p++;
    977             if (*p != 'u')
    978                 return -1;
    979             c = lre_parse_escape(&p, 2); // accept surrogate pairs
    980         } else if (c == '>') {
    981             break;
    982         } else if (c >= 128) {
    983             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
    984             if (is_hi_surrogate(c)) {
    985                 d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
    986                 if (is_lo_surrogate(d)) {
    987                     c = from_surrogate(c, d);
    988                     p = p1;
    989                 }
    990             }
    991         } else {
    992             p++;
    993         }
    994         if (c > 0x10FFFF)
    995             return -1;
    996         if (q == buf) {
    997             if (!lre_js_is_ident_first(c))
    998                 return -1;
    999         } else {
   1000             if (!lre_js_is_ident_next(c))
   1001                 return -1;
   1002         }
   1003         if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
   1004             return -1;
   1005         if (c < 128) {
   1006             *q++ = c;
   1007         } else {
   1008             q += unicode_to_utf8((uint8_t*)q, c);
   1009         }
   1010     }
   1011     if (q == buf)
   1012         return -1;
   1013     *q = '\0';
   1014     p++;
   1015     *pp = p;
   1016     return 0;
   1017 }
   1018 
   1019 /* if capture_name = NULL: return the number of captures + 1.
   1020    Otherwise, return the capture index corresponding to capture_name
   1021    or -1 if none */
   1022 static int re_parse_captures(REParseState *s, int *phas_named_captures,
   1023                              const char *capture_name)
   1024 {
   1025     const uint8_t *p;
   1026     int capture_index;
   1027     char name[TMP_BUF_SIZE];
   1028 
   1029     capture_index = 1;
   1030     *phas_named_captures = 0;
   1031     for (p = s->buf_start; p < s->buf_end; p++) {
   1032         switch (*p) {
   1033         case '(':
   1034             if (p[1] == '?') {
   1035                 if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
   1036                     *phas_named_captures = 1;
   1037                     /* potential named capture */
   1038                     if (capture_name) {
   1039                         p += 3;
   1040                         if (re_parse_group_name(name, sizeof(name), &p) == 0) {
   1041                             if (!strcmp(name, capture_name))
   1042                                 return capture_index;
   1043                         }
   1044                     }
   1045                     capture_index++;
   1046                     if (capture_index >= CAPTURE_COUNT_MAX)
   1047                         goto done;
   1048                 }
   1049             } else {
   1050                 capture_index++;
   1051                 if (capture_index >= CAPTURE_COUNT_MAX)
   1052                     goto done;
   1053             }
   1054             break;
   1055         case '\\':
   1056             p++;
   1057             break;
   1058         case '[':
   1059             for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
   1060                 if (*p == '\\')
   1061                     p++;
   1062             }
   1063             break;
   1064         }
   1065     }
   1066  done:
   1067     if (capture_name)
   1068         return -1;
   1069     else
   1070         return capture_index;
   1071 }
   1072 
   1073 static int re_count_captures(REParseState *s)
   1074 {
   1075     if (s->total_capture_count < 0) {
   1076         s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
   1077                                                    NULL);
   1078     }
   1079     return s->total_capture_count;
   1080 }
   1081 
   1082 static BOOL re_has_named_captures(REParseState *s)
   1083 {
   1084     if (s->has_named_captures < 0)
   1085         re_count_captures(s);
   1086     return s->has_named_captures;
   1087 }
   1088 
   1089 static int find_group_name(REParseState *s, const char *name)
   1090 {
   1091     const char *p, *buf_end;
   1092     size_t len, name_len;
   1093     int capture_index;
   1094 
   1095     p = (char *)s->group_names.buf;
   1096     if (!p) return -1;
   1097     buf_end = (char *)s->group_names.buf + s->group_names.size;
   1098     name_len = strlen(name);
   1099     capture_index = 1;
   1100     while (p < buf_end) {
   1101         len = strlen(p);
   1102         if (len == name_len && memcmp(name, p, name_len) == 0)
   1103             return capture_index;
   1104         p += len + 1;
   1105         capture_index++;
   1106     }
   1107     return -1;
   1108 }
   1109 
   1110 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
   1111 
   1112 static int re_parse_term(REParseState *s, BOOL is_backward_dir)
   1113 {
   1114     const uint8_t *p;
   1115     int c, last_atom_start, quant_min, quant_max, last_capture_count;
   1116     BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
   1117     CharRange cr_s, *cr = &cr_s;
   1118 
   1119     last_atom_start = -1;
   1120     last_capture_count = 0;
   1121     p = s->buf_ptr;
   1122     c = *p;
   1123     switch(c) {
   1124     case '^':
   1125         p++;
   1126         re_emit_op(s, REOP_line_start);
   1127         break;
   1128     case '$':
   1129         p++;
   1130         re_emit_op(s, REOP_line_end);
   1131         break;
   1132     case '.':
   1133         p++;
   1134         last_atom_start = s->byte_code.size;
   1135         last_capture_count = s->capture_count;
   1136         if (is_backward_dir)
   1137             re_emit_op(s, REOP_prev);
   1138         re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
   1139         if (is_backward_dir)
   1140             re_emit_op(s, REOP_prev);
   1141         break;
   1142     case '{':
   1143         if (s->is_unicode) {
   1144             return re_parse_error(s, "syntax error");
   1145         } else if (!is_digit(p[1])) {
   1146             /* Annex B: we accept '{' not followed by digits as a
   1147                normal atom */
   1148             goto parse_class_atom;
   1149         } else {
   1150             const uint8_t *p1 = p + 1;
   1151             /* Annex B: error if it is like a repetition count */
   1152             parse_digits(&p1, TRUE);
   1153             if (*p1 == ',') {
   1154                 p1++;
   1155                 if (is_digit(*p1)) {
   1156                     parse_digits(&p1, TRUE);
   1157                 }
   1158             }
   1159             if (*p1 != '}') {
   1160                 goto parse_class_atom;
   1161             }
   1162         }
   1163         /* fall thru */
   1164     case '*':
   1165     case '+':
   1166     case '?':
   1167         return re_parse_error(s, "nothing to repeat");
   1168     case '(':
   1169         if (p[1] == '?') {
   1170             if (p[2] == ':') {
   1171                 p += 3;
   1172                 last_atom_start = s->byte_code.size;
   1173                 last_capture_count = s->capture_count;
   1174                 s->buf_ptr = p;
   1175                 if (re_parse_disjunction(s, is_backward_dir))
   1176                     return -1;
   1177                 p = s->buf_ptr;
   1178                 if (re_parse_expect(s, &p, ')'))
   1179                     return -1;
   1180             } else if ((p[2] == '=' || p[2] == '!')) {
   1181                 is_neg = (p[2] == '!');
   1182                 is_backward_lookahead = FALSE;
   1183                 p += 3;
   1184                 goto lookahead;
   1185             } else if (p[2] == '<' &&
   1186                        (p[3] == '=' || p[3] == '!')) {
   1187                 int pos;
   1188                 is_neg = (p[3] == '!');
   1189                 is_backward_lookahead = TRUE;
   1190                 p += 4;
   1191                 /* lookahead */
   1192             lookahead:
   1193                 /* Annex B allows lookahead to be used as an atom for
   1194                    the quantifiers */
   1195                 if (!s->is_unicode && !is_backward_lookahead)  {
   1196                     last_atom_start = s->byte_code.size;
   1197                     last_capture_count = s->capture_count;
   1198                 }
   1199                 pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
   1200                 s->buf_ptr = p;
   1201                 if (re_parse_disjunction(s, is_backward_lookahead))
   1202                     return -1;
   1203                 p = s->buf_ptr;
   1204                 if (re_parse_expect(s, &p, ')'))
   1205                     return -1;
   1206                 re_emit_op(s, REOP_match);
   1207                 /* jump after the 'match' after the lookahead is successful */
   1208                 if (dbuf_error(&s->byte_code))
   1209                     return -1;
   1210                 put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
   1211             } else if (p[2] == '<') {
   1212                 p += 3;
   1213                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
   1214                                         &p)) {
   1215                     return re_parse_error(s, "invalid group name");
   1216                 }
   1217                 if (find_group_name(s, s->u.tmp_buf) > 0) {
   1218                     return re_parse_error(s, "duplicate group name");
   1219                 }
   1220                 /* group name with a trailing zero */
   1221                 dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
   1222                          strlen(s->u.tmp_buf) + 1);
   1223                 s->has_named_captures = 1;
   1224                 goto parse_capture;
   1225             } else {
   1226                 return re_parse_error(s, "invalid group");
   1227             }
   1228         } else {
   1229             int capture_index;
   1230             p++;
   1231             /* capture without group name */
   1232             dbuf_putc(&s->group_names, 0);
   1233         parse_capture:
   1234             if (s->capture_count >= CAPTURE_COUNT_MAX)
   1235                 return re_parse_error(s, "too many captures");
   1236             last_atom_start = s->byte_code.size;
   1237             last_capture_count = s->capture_count;
   1238             capture_index = s->capture_count++;
   1239             re_emit_op_u8(s, REOP_save_start + is_backward_dir,
   1240                           capture_index);
   1241 
   1242             s->buf_ptr = p;
   1243             if (re_parse_disjunction(s, is_backward_dir))
   1244                 return -1;
   1245             p = s->buf_ptr;
   1246 
   1247             re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
   1248                           capture_index);
   1249 
   1250             if (re_parse_expect(s, &p, ')'))
   1251                 return -1;
   1252         }
   1253         break;
   1254     case '\\':
   1255         switch(p[1]) {
   1256         case 'b':
   1257         case 'B':
   1258             re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
   1259             p += 2;
   1260             break;
   1261         case 'k':
   1262             {
   1263                 const uint8_t *p1;
   1264                 int dummy_res;
   1265 
   1266                 p1 = p;
   1267                 if (p1[2] != '<') {
   1268                     /* annex B: we tolerate invalid group names in non
   1269                        unicode mode if there is no named capture
   1270                        definition */
   1271                     if (s->is_unicode || re_has_named_captures(s))
   1272                         return re_parse_error(s, "expecting group name");
   1273                     else
   1274                         goto parse_class_atom;
   1275                 }
   1276                 p1 += 3;
   1277                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
   1278                                         &p1)) {
   1279                     if (s->is_unicode || re_has_named_captures(s))
   1280                         return re_parse_error(s, "invalid group name");
   1281                     else
   1282                         goto parse_class_atom;
   1283                 }
   1284                 c = find_group_name(s, s->u.tmp_buf);
   1285                 if (c < 0) {
   1286                     /* no capture name parsed before, try to look
   1287                        after (inefficient, but hopefully not common */
   1288                     c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
   1289                     if (c < 0) {
   1290                         if (s->is_unicode || re_has_named_captures(s))
   1291                             return re_parse_error(s, "group name not defined");
   1292                         else
   1293                             goto parse_class_atom;
   1294                     }
   1295                 }
   1296                 p = p1;
   1297             }
   1298             goto emit_back_reference;
   1299         case '0':
   1300             p += 2;
   1301             c = 0;
   1302             if (s->is_unicode) {
   1303                 if (is_digit(*p)) {
   1304                     return re_parse_error(s, "invalid decimal escape in regular expression");
   1305                 }
   1306             } else {
   1307                 /* Annex B.1.4: accept legacy octal */
   1308                 if (*p >= '0' && *p <= '7') {
   1309                     c = *p++ - '0';
   1310                     if (*p >= '0' && *p <= '7') {
   1311                         c = (c << 3) + *p++ - '0';
   1312                     }
   1313                 }
   1314             }
   1315             goto normal_char;
   1316         case '1': case '2': case '3': case '4':
   1317         case '5': case '6': case '7': case '8':
   1318         case '9':
   1319             {
   1320                 const uint8_t *q = ++p;
   1321 
   1322                 c = parse_digits(&p, FALSE);
   1323                 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
   1324                     if (!s->is_unicode) {
   1325                         /* Annex B.1.4: accept legacy octal */
   1326                         p = q;
   1327                         if (*p <= '7') {
   1328                             c = 0;
   1329                             if (*p <= '3')
   1330                                 c = *p++ - '0';
   1331                             if (*p >= '0' && *p <= '7') {
   1332                                 c = (c << 3) + *p++ - '0';
   1333                                 if (*p >= '0' && *p <= '7') {
   1334                                     c = (c << 3) + *p++ - '0';
   1335                                 }
   1336                             }
   1337                         } else {
   1338                             c = *p++;
   1339                         }
   1340                         goto normal_char;
   1341                     }
   1342                     return re_parse_error(s, "back reference out of range in regular expression");
   1343                 }
   1344             emit_back_reference:
   1345                 last_atom_start = s->byte_code.size;
   1346                 last_capture_count = s->capture_count;
   1347                 re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
   1348             }
   1349             break;
   1350         default:
   1351             goto parse_class_atom;
   1352         }
   1353         break;
   1354     case '[':
   1355         last_atom_start = s->byte_code.size;
   1356         last_capture_count = s->capture_count;
   1357         if (is_backward_dir)
   1358             re_emit_op(s, REOP_prev);
   1359         if (re_parse_char_class(s, &p))
   1360             return -1;
   1361         if (is_backward_dir)
   1362             re_emit_op(s, REOP_prev);
   1363         break;
   1364     case ']':
   1365     case '}':
   1366         if (s->is_unicode)
   1367             return re_parse_error(s, "syntax error");
   1368         goto parse_class_atom;
   1369     default:
   1370     parse_class_atom:
   1371         c = get_class_atom(s, cr, &p, FALSE);
   1372         if ((int)c < 0)
   1373             return -1;
   1374     normal_char:
   1375         last_atom_start = s->byte_code.size;
   1376         last_capture_count = s->capture_count;
   1377         if (is_backward_dir)
   1378             re_emit_op(s, REOP_prev);
   1379         if (c >= CLASS_RANGE_BASE) {
   1380             int ret;
   1381             /* Note: canonicalization is not needed */
   1382             ret = re_emit_range(s, cr);
   1383             cr_free(cr);
   1384             if (ret)
   1385                 return -1;
   1386         } else {
   1387             if (s->ignore_case)
   1388                 c = lre_canonicalize(c, s->is_unicode);
   1389             if (c <= 0xffff)
   1390                 re_emit_op_u16(s, REOP_char, c);
   1391             else
   1392                 re_emit_op_u32(s, REOP_char32, c);
   1393         }
   1394         if (is_backward_dir)
   1395             re_emit_op(s, REOP_prev);
   1396         break;
   1397     }
   1398 
   1399     /* quantifier */
   1400     if (last_atom_start >= 0) {
   1401         c = *p;
   1402         switch(c) {
   1403         case '*':
   1404             p++;
   1405             quant_min = 0;
   1406             quant_max = INT32_MAX;
   1407             goto quantifier;
   1408         case '+':
   1409             p++;
   1410             quant_min = 1;
   1411             quant_max = INT32_MAX;
   1412             goto quantifier;
   1413         case '?':
   1414             p++;
   1415             quant_min = 0;
   1416             quant_max = 1;
   1417             goto quantifier;
   1418         case '{':
   1419             {
   1420                 const uint8_t *p1 = p;
   1421                 /* As an extension (see ES6 annex B), we accept '{' not
   1422                    followed by digits as a normal atom */
   1423                 if (!is_digit(p[1])) {
   1424                     if (s->is_unicode)
   1425                         goto invalid_quant_count;
   1426                     break;
   1427                 }
   1428                 p++;
   1429                 quant_min = parse_digits(&p, TRUE);
   1430                 quant_max = quant_min;
   1431                 if (*p == ',') {
   1432                     p++;
   1433                     if (is_digit(*p)) {
   1434                         quant_max = parse_digits(&p, TRUE);
   1435                         if (quant_max < quant_min) {
   1436                         invalid_quant_count:
   1437                             return re_parse_error(s, "invalid repetition count");
   1438                         }
   1439                     } else {
   1440                         quant_max = INT32_MAX; /* infinity */
   1441                     }
   1442                 }
   1443                 if (*p != '}' && !s->is_unicode) {
   1444                     /* Annex B: normal atom if invalid '{' syntax */
   1445                     p = p1;
   1446                     break;
   1447                 }
   1448                 if (re_parse_expect(s, &p, '}'))
   1449                     return -1;
   1450             }
   1451         quantifier:
   1452             greedy = TRUE;
   1453             if (*p == '?') {
   1454                 p++;
   1455                 greedy = FALSE;
   1456             }
   1457             if (last_atom_start < 0) {
   1458                 return re_parse_error(s, "nothing to repeat");
   1459             }
   1460             if (greedy) {
   1461                 int len, pos;
   1462 
   1463                 if (quant_max > 0) {
   1464                     /* specific optimization for simple quantifiers */
   1465                     if (dbuf_error(&s->byte_code))
   1466                         goto out_of_memory;
   1467                     len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
   1468                                                  s->byte_code.size - last_atom_start);
   1469                     if (len > 0) {
   1470                         re_emit_op(s, REOP_match);
   1471 
   1472                         if (dbuf_insert(&s->byte_code, last_atom_start, 17))
   1473                             goto out_of_memory;
   1474                         pos = last_atom_start;
   1475                         s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
   1476                         put_u32(&s->byte_code.buf[pos],
   1477                                 s->byte_code.size - last_atom_start - 17);
   1478                         pos += 4;
   1479                         put_u32(&s->byte_code.buf[pos], quant_min);
   1480                         pos += 4;
   1481                         put_u32(&s->byte_code.buf[pos], quant_max);
   1482                         pos += 4;
   1483                         put_u32(&s->byte_code.buf[pos], len);
   1484                         pos += 4;
   1485                         goto done;
   1486                     }
   1487                 }
   1488 
   1489                 if (dbuf_error(&s->byte_code))
   1490                     goto out_of_memory;
   1491             }
   1492             /* the spec tells that if there is no advance when
   1493                running the atom after the first quant_min times,
   1494                then there is no match. We remove this test when we
   1495                are sure the atom always advances the position. */
   1496             add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
   1497                                                            s->byte_code.size - last_atom_start);
   1498 
   1499             {
   1500                 int len, pos;
   1501                 len = s->byte_code.size - last_atom_start;
   1502                 if (quant_min == 0) {
   1503                     /* need to reset the capture in case the atom is
   1504                        not executed */
   1505                     if (last_capture_count != s->capture_count) {
   1506                         if (dbuf_insert(&s->byte_code, last_atom_start, 3))
   1507                             goto out_of_memory;
   1508                         s->byte_code.buf[last_atom_start++] = REOP_save_reset;
   1509                         s->byte_code.buf[last_atom_start++] = last_capture_count;
   1510                         s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
   1511                     }
   1512                     if (quant_max == 0) {
   1513                         s->byte_code.size = last_atom_start;
   1514                     } else if (quant_max == 1 || quant_max == INT32_MAX) {
   1515                         BOOL has_goto = (quant_max == INT32_MAX);
   1516                         if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
   1517                             goto out_of_memory;
   1518                         s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
   1519                             greedy;
   1520                         put_u32(s->byte_code.buf + last_atom_start + 1,
   1521                                 len + 5 * has_goto + add_zero_advance_check * 2);
   1522                         if (add_zero_advance_check) {
   1523                             s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
   1524                             re_emit_op(s, REOP_check_advance);
   1525                         }
   1526                         if (has_goto)
   1527                             re_emit_goto(s, REOP_goto, last_atom_start);
   1528                     } else {
   1529                         if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
   1530                             goto out_of_memory;
   1531                         pos = last_atom_start;
   1532                         s->byte_code.buf[pos++] = REOP_push_i32;
   1533                         put_u32(s->byte_code.buf + pos, quant_max);
   1534                         pos += 4;
   1535                         s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
   1536                         put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
   1537                         pos += 4;
   1538                         if (add_zero_advance_check) {
   1539                             s->byte_code.buf[pos++] = REOP_push_char_pos;
   1540                             re_emit_op(s, REOP_check_advance);
   1541                         }
   1542                         re_emit_goto(s, REOP_loop, last_atom_start + 5);
   1543                         re_emit_op(s, REOP_drop);
   1544                     }
   1545                 } else if (quant_min == 1 && quant_max == INT32_MAX &&
   1546                            !add_zero_advance_check) {
   1547                     re_emit_goto(s, REOP_split_next_first - greedy,
   1548                                  last_atom_start);
   1549                 } else {
   1550                     if (quant_min == 1) {
   1551                         /* nothing to add */
   1552                     } else {
   1553                         if (dbuf_insert(&s->byte_code, last_atom_start, 5))
   1554                             goto out_of_memory;
   1555                         s->byte_code.buf[last_atom_start] = REOP_push_i32;
   1556                         put_u32(s->byte_code.buf + last_atom_start + 1,
   1557                                 quant_min);
   1558                         last_atom_start += 5;
   1559                         re_emit_goto(s, REOP_loop, last_atom_start);
   1560                         re_emit_op(s, REOP_drop);
   1561                     }
   1562                     if (quant_max == INT32_MAX) {
   1563                         pos = s->byte_code.size;
   1564                         re_emit_op_u32(s, REOP_split_goto_first + greedy,
   1565                                        len + 5 + add_zero_advance_check * 2);
   1566                         if (add_zero_advance_check)
   1567                             re_emit_op(s, REOP_push_char_pos);
   1568                         /* copy the atom */
   1569                         dbuf_put_self(&s->byte_code, last_atom_start, len);
   1570                         if (add_zero_advance_check)
   1571                             re_emit_op(s, REOP_check_advance);
   1572                         re_emit_goto(s, REOP_goto, pos);
   1573                     } else if (quant_max > quant_min) {
   1574                         re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
   1575                         pos = s->byte_code.size;
   1576                         re_emit_op_u32(s, REOP_split_goto_first + greedy,
   1577                                        len + 5 + add_zero_advance_check * 2);
   1578                         if (add_zero_advance_check)
   1579                             re_emit_op(s, REOP_push_char_pos);
   1580                         /* copy the atom */
   1581                         dbuf_put_self(&s->byte_code, last_atom_start, len);
   1582                         if (add_zero_advance_check)
   1583                             re_emit_op(s, REOP_check_advance);
   1584                         re_emit_goto(s, REOP_loop, pos);
   1585                         re_emit_op(s, REOP_drop);
   1586                     }
   1587                 }
   1588                 last_atom_start = -1;
   1589             }
   1590             break;
   1591         default:
   1592             break;
   1593         }
   1594     }
   1595  done:
   1596     s->buf_ptr = p;
   1597     return 0;
   1598  out_of_memory:
   1599     return re_parse_out_of_memory(s);
   1600 }
   1601 
   1602 static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
   1603 {
   1604     const uint8_t *p;
   1605     int ret;
   1606     size_t start, term_start, end, term_size;
   1607 
   1608     start = s->byte_code.size;
   1609     for(;;) {
   1610         p = s->buf_ptr;
   1611         if (p >= s->buf_end)
   1612             break;
   1613         if (*p == '|' || *p == ')')
   1614             break;
   1615         term_start = s->byte_code.size;
   1616         ret = re_parse_term(s, is_backward_dir);
   1617         if (ret)
   1618             return ret;
   1619         if (is_backward_dir) {
   1620             /* reverse the order of the terms (XXX: inefficient, but
   1621                speed is not really critical here) */
   1622             end = s->byte_code.size;
   1623             term_size = end - term_start;
   1624             if (dbuf_realloc(&s->byte_code, end + term_size))
   1625                 return -1;
   1626             memmove(s->byte_code.buf + start + term_size,
   1627                     s->byte_code.buf + start,
   1628                     end - start);
   1629             memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
   1630                    term_size);
   1631         }
   1632     }
   1633     return 0;
   1634 }
   1635 
   1636 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
   1637 {
   1638     int start, len, pos;
   1639 
   1640     if (lre_check_stack_overflow(s->opaque, 0))
   1641         return re_parse_error(s, "stack overflow");
   1642 
   1643     start = s->byte_code.size;
   1644     if (re_parse_alternative(s, is_backward_dir))
   1645         return -1;
   1646     while (*s->buf_ptr == '|') {
   1647         s->buf_ptr++;
   1648 
   1649         len = s->byte_code.size - start;
   1650 
   1651         /* insert a split before the first alternative */
   1652         if (dbuf_insert(&s->byte_code, start, 5)) {
   1653             return re_parse_out_of_memory(s);
   1654         }
   1655         s->byte_code.buf[start] = REOP_split_next_first;
   1656         put_u32(s->byte_code.buf + start + 1, len + 5);
   1657 
   1658         pos = re_emit_op_u32(s, REOP_goto, 0);
   1659 
   1660         if (re_parse_alternative(s, is_backward_dir))
   1661             return -1;
   1662 
   1663         /* patch the goto */
   1664         len = s->byte_code.size - (pos + 4);
   1665         put_u32(s->byte_code.buf + pos, len);
   1666     }
   1667     return 0;
   1668 }
   1669 
   1670 /* the control flow is recursive so the analysis can be linear */
   1671 static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
   1672 {
   1673     int stack_size, stack_size_max, pos, opcode, len;
   1674     uint32_t val;
   1675 
   1676     stack_size = 0;
   1677     stack_size_max = 0;
   1678     bc_buf += RE_HEADER_LEN;
   1679     bc_buf_len -= RE_HEADER_LEN;
   1680     pos = 0;
   1681     while (pos < bc_buf_len) {
   1682         opcode = bc_buf[pos];
   1683         len = reopcode_info[opcode].size;
   1684         assert(opcode < REOP_COUNT);
   1685         assert((pos + len) <= bc_buf_len);
   1686         switch(opcode) {
   1687         case REOP_push_i32:
   1688         case REOP_push_char_pos:
   1689             stack_size++;
   1690             if (stack_size > stack_size_max) {
   1691                 if (stack_size > STACK_SIZE_MAX)
   1692                     return -1;
   1693                 stack_size_max = stack_size;
   1694             }
   1695             break;
   1696         case REOP_drop:
   1697         case REOP_check_advance:
   1698             assert(stack_size > 0);
   1699             stack_size--;
   1700             break;
   1701         case REOP_range:
   1702             val = get_u16(bc_buf + pos + 1);
   1703             len += val * 4;
   1704             break;
   1705         case REOP_range32:
   1706             val = get_u16(bc_buf + pos + 1);
   1707             len += val * 8;
   1708             break;
   1709         }
   1710         pos += len;
   1711     }
   1712     return stack_size_max;
   1713 }
   1714 
   1715 /* 'buf' must be a zero terminated UTF-8 string of length buf_len.
   1716    Return NULL if error and allocate an error message in *perror_msg,
   1717    otherwise the compiled bytecode and its length in plen.
   1718 */
   1719 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
   1720                      const char *buf, size_t buf_len, int re_flags,
   1721                      void *opaque)
   1722 {
   1723     REParseState s_s, *s = &s_s;
   1724     int stack_size;
   1725     BOOL is_sticky;
   1726 
   1727     memset(s, 0, sizeof(*s));
   1728     s->opaque = opaque;
   1729     s->buf_ptr = (const uint8_t *)buf;
   1730     s->buf_end = s->buf_ptr + buf_len;
   1731     s->buf_start = s->buf_ptr;
   1732     s->re_flags = re_flags;
   1733     s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
   1734     is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
   1735     s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
   1736     s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
   1737     s->capture_count = 1;
   1738     s->total_capture_count = -1;
   1739     s->has_named_captures = -1;
   1740 
   1741     dbuf_init2(&s->byte_code, opaque, lre_realloc);
   1742     dbuf_init2(&s->group_names, opaque, lre_realloc);
   1743 
   1744     dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
   1745     dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
   1746     dbuf_putc(&s->byte_code, 0); /* stack size */
   1747     dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
   1748 
   1749     if (!is_sticky) {
   1750         /* iterate thru all positions (about the same as .*?( ... ) )
   1751            .  We do it without an explicit loop so that lock step
   1752            thread execution will be possible in an optimized
   1753            implementation */
   1754         re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
   1755         re_emit_op(s, REOP_any);
   1756         re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
   1757     }
   1758     re_emit_op_u8(s, REOP_save_start, 0);
   1759 
   1760     if (re_parse_disjunction(s, FALSE)) {
   1761     error:
   1762         dbuf_free(&s->byte_code);
   1763         dbuf_free(&s->group_names);
   1764         pstrcpy(error_msg, error_msg_size, s->u.error_msg);
   1765         *plen = 0;
   1766         return NULL;
   1767     }
   1768 
   1769     re_emit_op_u8(s, REOP_save_end, 0);
   1770 
   1771     re_emit_op(s, REOP_match);
   1772 
   1773     if (*s->buf_ptr != '\0') {
   1774         re_parse_error(s, "extraneous characters at the end");
   1775         goto error;
   1776     }
   1777 
   1778     if (dbuf_error(&s->byte_code)) {
   1779         re_parse_out_of_memory(s);
   1780         goto error;
   1781     }
   1782 
   1783     stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
   1784     if (stack_size < 0) {
   1785         re_parse_error(s, "too many imbricated quantifiers");
   1786         goto error;
   1787     }
   1788 
   1789     s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
   1790     s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
   1791     put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
   1792             s->byte_code.size - RE_HEADER_LEN);
   1793 
   1794     /* add the named groups if needed */
   1795     if (s->group_names.size > (s->capture_count - 1)) {
   1796         dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
   1797         s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
   1798     }
   1799     dbuf_free(&s->group_names);
   1800 
   1801 #ifdef DUMP_REOP
   1802     lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
   1803 #endif
   1804 
   1805     error_msg[0] = '\0';
   1806     *plen = s->byte_code.size;
   1807     return s->byte_code.buf;
   1808 }
   1809 
   1810 static BOOL is_line_terminator(uint32_t c)
   1811 {
   1812     return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
   1813 }
   1814 
   1815 static BOOL is_word_char(uint32_t c)
   1816 {
   1817     return ((c >= '0' && c <= '9') ||
   1818             (c >= 'a' && c <= 'z') ||
   1819             (c >= 'A' && c <= 'Z') ||
   1820             (c == '_'));
   1821 }
   1822 
   1823 #define GET_CHAR(c, cptr, cbuf_end, cbuf_type)                          \
   1824     do {                                                                \
   1825         if (cbuf_type == 0) {                                           \
   1826             c = *cptr++;                                                \
   1827         } else {                                                        \
   1828             const uint16_t *_p = (const uint16_t *)cptr;                \
   1829             const uint16_t *_end = (const uint16_t *)cbuf_end;          \
   1830             c = *_p++;                                                  \
   1831             if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
   1832                 if (_p < _end && is_lo_surrogate(*_p)) {                \
   1833                     c = from_surrogate(c, *_p++);                       \
   1834                 }                                                       \
   1835             }                                                           \
   1836             cptr = (const void *)_p;                                    \
   1837         }                                                               \
   1838     } while (0)
   1839 
   1840 #define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type)                         \
   1841     do {                                                                \
   1842         if (cbuf_type == 0) {                                           \
   1843             c = cptr[0];                                                \
   1844         } else {                                                        \
   1845             const uint16_t *_p = (const uint16_t *)cptr;                \
   1846             const uint16_t *_end = (const uint16_t *)cbuf_end;          \
   1847             c = *_p++;                                                  \
   1848             if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
   1849                 if (_p < _end && is_lo_surrogate(*_p)) {                \
   1850                     c = from_surrogate(c, *_p);                         \
   1851                 }                                                       \
   1852             }                                                           \
   1853         }                                                               \
   1854     } while (0)
   1855 
   1856 #define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                  \
   1857     do {                                                                \
   1858         if (cbuf_type == 0) {                                           \
   1859             c = cptr[-1];                                               \
   1860         } else {                                                        \
   1861             const uint16_t *_p = (const uint16_t *)cptr - 1;            \
   1862             const uint16_t *_start = (const uint16_t *)cbuf_start;      \
   1863             c = *_p;                                                    \
   1864             if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
   1865                 if (_p > _start && is_hi_surrogate(_p[-1])) {           \
   1866                     c = from_surrogate(*--_p, c);                       \
   1867                 }                                                       \
   1868             }                                                           \
   1869         }                                                               \
   1870     } while (0)
   1871 
   1872 #define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                   \
   1873     do {                                                                \
   1874         if (cbuf_type == 0) {                                           \
   1875             cptr--;                                                     \
   1876             c = cptr[0];                                                \
   1877         } else {                                                        \
   1878             const uint16_t *_p = (const uint16_t *)cptr - 1;            \
   1879             const uint16_t *_start = (const uint16_t *)cbuf_start;      \
   1880             c = *_p;                                                    \
   1881             if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
   1882                 if (_p > _start && is_hi_surrogate(_p[-1])) {           \
   1883                     c = from_surrogate(*--_p, c);                       \
   1884                 }                                                       \
   1885             }                                                           \
   1886             cptr = (const void *)_p;                                    \
   1887         }                                                               \
   1888     } while (0)
   1889 
   1890 #define PREV_CHAR(cptr, cbuf_start, cbuf_type)                          \
   1891     do {                                                                \
   1892         if (cbuf_type == 0) {                                           \
   1893             cptr--;                                                     \
   1894         } else {                                                        \
   1895             const uint16_t *_p = (const uint16_t *)cptr - 1;            \
   1896             const uint16_t *_start = (const uint16_t *)cbuf_start;      \
   1897             if (is_lo_surrogate(*_p) && cbuf_type == 2) {               \
   1898                 if (_p > _start && is_hi_surrogate(_p[-1])) {           \
   1899                     --_p;                                               \
   1900                 }                                                       \
   1901             }                                                           \
   1902             cptr = (const void *)_p;                                    \
   1903         }                                                               \
   1904     } while (0)
   1905 
   1906 typedef uintptr_t StackInt;
   1907 
   1908 typedef enum {
   1909     RE_EXEC_STATE_SPLIT,
   1910     RE_EXEC_STATE_LOOKAHEAD,
   1911     RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
   1912     RE_EXEC_STATE_GREEDY_QUANT,
   1913 } REExecStateEnum;
   1914 
   1915 typedef struct REExecState {
   1916     REExecStateEnum type : 8;
   1917     uint8_t stack_len;
   1918     size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
   1919     const uint8_t *cptr;
   1920     const uint8_t *pc;
   1921     void *buf[0];
   1922 } REExecState;
   1923 
   1924 typedef struct {
   1925     const uint8_t *cbuf;
   1926     const uint8_t *cbuf_end;
   1927     /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
   1928     int cbuf_type;
   1929     int capture_count;
   1930     int stack_size_max;
   1931     BOOL multi_line;
   1932     BOOL ignore_case;
   1933     BOOL is_unicode;
   1934     void *opaque; /* used for stack overflow check */
   1935 
   1936     size_t state_size;
   1937     uint8_t *state_stack;
   1938     size_t state_stack_size;
   1939     size_t state_stack_len;
   1940 } REExecContext;
   1941 
   1942 static int push_state(REExecContext *s,
   1943                       uint8_t **capture,
   1944                       StackInt *stack, size_t stack_len,
   1945                       const uint8_t *pc, const uint8_t *cptr,
   1946                       REExecStateEnum type, size_t count)
   1947 {
   1948     REExecState *rs;
   1949     uint8_t *new_stack;
   1950     size_t new_size, i, n;
   1951     StackInt *stack_buf;
   1952 
   1953     if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
   1954         /* reallocate the stack */
   1955         new_size = s->state_stack_size * 3 / 2;
   1956         if (new_size < 8)
   1957             new_size = 8;
   1958         new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
   1959         if (!new_stack)
   1960             return -1;
   1961         s->state_stack_size = new_size;
   1962         s->state_stack = new_stack;
   1963     }
   1964     rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
   1965     s->state_stack_len++;
   1966     rs->type = type;
   1967     rs->count = count;
   1968     rs->stack_len = stack_len;
   1969     rs->cptr = cptr;
   1970     rs->pc = pc;
   1971     n = 2 * s->capture_count;
   1972     for(i = 0; i < n; i++)
   1973         rs->buf[i] = capture[i];
   1974     stack_buf = (StackInt *)(rs->buf + n);
   1975     for(i = 0; i < stack_len; i++)
   1976         stack_buf[i] = stack[i];
   1977     return 0;
   1978 }
   1979 
   1980 /* return 1 if match, 0 if not match or -1 if error. */
   1981 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
   1982                                    StackInt *stack, int stack_len,
   1983                                    const uint8_t *pc, const uint8_t *cptr,
   1984                                    BOOL no_recurse)
   1985 {
   1986     int opcode, ret;
   1987     int cbuf_type;
   1988     uint32_t val, c;
   1989     const uint8_t *cbuf_end;
   1990 
   1991     cbuf_type = s->cbuf_type;
   1992     cbuf_end = s->cbuf_end;
   1993 
   1994     for(;;) {
   1995         //        printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
   1996         opcode = *pc++;
   1997         switch(opcode) {
   1998         case REOP_match:
   1999             {
   2000                 REExecState *rs;
   2001                 if (no_recurse)
   2002                     return (intptr_t)cptr;
   2003                 ret = 1;
   2004                 goto recurse;
   2005             no_match:
   2006                 if (no_recurse)
   2007                     return 0;
   2008                 ret = 0;
   2009             recurse:
   2010                 for(;;) {
   2011                     if (s->state_stack_len == 0)
   2012                         return ret;
   2013                     rs = (REExecState *)(s->state_stack +
   2014                                          (s->state_stack_len - 1) * s->state_size);
   2015                     if (rs->type == RE_EXEC_STATE_SPLIT) {
   2016                         if (!ret) {
   2017                         pop_state:
   2018                             memcpy(capture, rs->buf,
   2019                                    sizeof(capture[0]) * 2 * s->capture_count);
   2020                         pop_state1:
   2021                             pc = rs->pc;
   2022                             cptr = rs->cptr;
   2023                             stack_len = rs->stack_len;
   2024                             memcpy(stack, rs->buf + 2 * s->capture_count,
   2025                                    stack_len * sizeof(stack[0]));
   2026                             s->state_stack_len--;
   2027                             break;
   2028                         }
   2029                     } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
   2030                         if (!ret) {
   2031                             uint32_t char_count, i;
   2032                             memcpy(capture, rs->buf,
   2033                                    sizeof(capture[0]) * 2 * s->capture_count);
   2034                             stack_len = rs->stack_len;
   2035                             memcpy(stack, rs->buf + 2 * s->capture_count,
   2036                                    stack_len * sizeof(stack[0]));
   2037                             pc = rs->pc;
   2038                             cptr = rs->cptr;
   2039                             /* go backward */
   2040                             char_count = get_u32(pc + 12);
   2041                             for(i = 0; i < char_count; i++) {
   2042                                 PREV_CHAR(cptr, s->cbuf, cbuf_type);
   2043                             }
   2044                             pc = (pc + 16) + (int)get_u32(pc);
   2045                             rs->cptr = cptr;
   2046                             rs->count--;
   2047                             if (rs->count == 0) {
   2048                                 s->state_stack_len--;
   2049                             }
   2050                             break;
   2051                         }
   2052                     } else {
   2053                         ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
   2054                                (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
   2055                         if (ret) {
   2056                             /* keep the capture in case of positive lookahead */
   2057                             if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
   2058                                 goto pop_state1;
   2059                             else
   2060                                 goto pop_state;
   2061                         }
   2062                     }
   2063                     s->state_stack_len--;
   2064                 }
   2065             }
   2066             break;
   2067         case REOP_char32:
   2068             val = get_u32(pc);
   2069             pc += 4;
   2070             goto test_char;
   2071         case REOP_char:
   2072             val = get_u16(pc);
   2073             pc += 2;
   2074         test_char:
   2075             if (cptr >= cbuf_end)
   2076                 goto no_match;
   2077             GET_CHAR(c, cptr, cbuf_end, cbuf_type);
   2078             if (s->ignore_case) {
   2079                 c = lre_canonicalize(c, s->is_unicode);
   2080             }
   2081             if (val != c)
   2082                 goto no_match;
   2083             break;
   2084         case REOP_split_goto_first:
   2085         case REOP_split_next_first:
   2086             {
   2087                 const uint8_t *pc1;
   2088 
   2089                 val = get_u32(pc);
   2090                 pc += 4;
   2091                 if (opcode == REOP_split_next_first) {
   2092                     pc1 = pc + (int)val;
   2093                 } else {
   2094                     pc1 = pc;
   2095                     pc = pc + (int)val;
   2096                 }
   2097                 ret = push_state(s, capture, stack, stack_len,
   2098                                  pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
   2099                 if (ret < 0)
   2100                     return -1;
   2101                 break;
   2102             }
   2103         case REOP_lookahead:
   2104         case REOP_negative_lookahead:
   2105             val = get_u32(pc);
   2106             pc += 4;
   2107             ret = push_state(s, capture, stack, stack_len,
   2108                              pc + (int)val, cptr,
   2109                              RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
   2110                              0);
   2111             if (ret < 0)
   2112                 return -1;
   2113             break;
   2114 
   2115         case REOP_goto:
   2116             val = get_u32(pc);
   2117             pc += 4 + (int)val;
   2118             break;
   2119         case REOP_line_start:
   2120             if (cptr == s->cbuf)
   2121                 break;
   2122             if (!s->multi_line)
   2123                 goto no_match;
   2124             PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
   2125             if (!is_line_terminator(c))
   2126                 goto no_match;
   2127             break;
   2128         case REOP_line_end:
   2129             if (cptr == cbuf_end)
   2130                 break;
   2131             if (!s->multi_line)
   2132                 goto no_match;
   2133             PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
   2134             if (!is_line_terminator(c))
   2135                 goto no_match;
   2136             break;
   2137         case REOP_dot:
   2138             if (cptr == cbuf_end)
   2139                 goto no_match;
   2140             GET_CHAR(c, cptr, cbuf_end, cbuf_type);
   2141             if (is_line_terminator(c))
   2142                 goto no_match;
   2143             break;
   2144         case REOP_any:
   2145             if (cptr == cbuf_end)
   2146                 goto no_match;
   2147             GET_CHAR(c, cptr, cbuf_end, cbuf_type);
   2148             break;
   2149         case REOP_save_start:
   2150         case REOP_save_end:
   2151             val = *pc++;
   2152             assert(val < s->capture_count);
   2153             capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
   2154             break;
   2155         case REOP_save_reset:
   2156             {
   2157                 uint32_t val2;
   2158                 val = pc[0];
   2159                 val2 = pc[1];
   2160                 pc += 2;
   2161                 assert(val2 < s->capture_count);
   2162                 while (val <= val2) {
   2163                     capture[2 * val] = NULL;
   2164                     capture[2 * val + 1] = NULL;
   2165                     val++;
   2166                 }
   2167             }
   2168             break;
   2169         case REOP_push_i32:
   2170             val = get_u32(pc);
   2171             pc += 4;
   2172             stack[stack_len++] = val;
   2173             break;
   2174         case REOP_drop:
   2175             stack_len--;
   2176             break;
   2177         case REOP_loop:
   2178             val = get_u32(pc);
   2179             pc += 4;
   2180             if (--stack[stack_len - 1] != 0) {
   2181                 pc += (int)val;
   2182             }
   2183             break;
   2184         case REOP_push_char_pos:
   2185             stack[stack_len++] = (uintptr_t)cptr;
   2186             break;
   2187         case REOP_check_advance:
   2188             if (stack[--stack_len] == (uintptr_t)cptr)
   2189                 goto no_match;
   2190             break;
   2191         case REOP_word_boundary:
   2192         case REOP_not_word_boundary:
   2193             {
   2194                 BOOL v1, v2;
   2195                 /* char before */
   2196                 if (cptr == s->cbuf) {
   2197                     v1 = FALSE;
   2198                 } else {
   2199                     PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
   2200                     v1 = is_word_char(c);
   2201                 }
   2202                 /* current char */
   2203                 if (cptr >= cbuf_end) {
   2204                     v2 = FALSE;
   2205                 } else {
   2206                     PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
   2207                     v2 = is_word_char(c);
   2208                 }
   2209                 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
   2210                     goto no_match;
   2211             }
   2212             break;
   2213         case REOP_back_reference:
   2214         case REOP_backward_back_reference:
   2215             {
   2216                 const uint8_t *cptr1, *cptr1_end, *cptr1_start;
   2217                 uint32_t c1, c2;
   2218 
   2219                 val = *pc++;
   2220                 if (val >= s->capture_count)
   2221                     goto no_match;
   2222                 cptr1_start = capture[2 * val];
   2223                 cptr1_end = capture[2 * val + 1];
   2224                 if (!cptr1_start || !cptr1_end)
   2225                     break;
   2226                 if (opcode == REOP_back_reference) {
   2227                     cptr1 = cptr1_start;
   2228                     while (cptr1 < cptr1_end) {
   2229                         if (cptr >= cbuf_end)
   2230                             goto no_match;
   2231                         GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
   2232                         GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
   2233                         if (s->ignore_case) {
   2234                             c1 = lre_canonicalize(c1, s->is_unicode);
   2235                             c2 = lre_canonicalize(c2, s->is_unicode);
   2236                         }
   2237                         if (c1 != c2)
   2238                             goto no_match;
   2239                     }
   2240                 } else {
   2241                     cptr1 = cptr1_end;
   2242                     while (cptr1 > cptr1_start) {
   2243                         if (cptr == s->cbuf)
   2244                             goto no_match;
   2245                         GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
   2246                         GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
   2247                         if (s->ignore_case) {
   2248                             c1 = lre_canonicalize(c1, s->is_unicode);
   2249                             c2 = lre_canonicalize(c2, s->is_unicode);
   2250                         }
   2251                         if (c1 != c2)
   2252                             goto no_match;
   2253                     }
   2254                 }
   2255             }
   2256             break;
   2257         case REOP_range:
   2258             {
   2259                 int n;
   2260                 uint32_t low, high, idx_min, idx_max, idx;
   2261 
   2262                 n = get_u16(pc); /* n must be >= 1 */
   2263                 pc += 2;
   2264                 if (cptr >= cbuf_end)
   2265                     goto no_match;
   2266                 GET_CHAR(c, cptr, cbuf_end, cbuf_type);
   2267                 if (s->ignore_case) {
   2268                     c = lre_canonicalize(c, s->is_unicode);
   2269                 }
   2270                 idx_min = 0;
   2271                 low = get_u16(pc + 0 * 4);
   2272                 if (c < low)
   2273                     goto no_match;
   2274                 idx_max = n - 1;
   2275                 high = get_u16(pc + idx_max * 4 + 2);
   2276                 /* 0xffff in for last value means +infinity */
   2277                 if (unlikely(c >= 0xffff) && high == 0xffff)
   2278                     goto range_match;
   2279                 if (c > high)
   2280                     goto no_match;
   2281                 while (idx_min <= idx_max) {
   2282                     idx = (idx_min + idx_max) / 2;
   2283                     low = get_u16(pc + idx * 4);
   2284                     high = get_u16(pc + idx * 4 + 2);
   2285                     if (c < low)
   2286                         idx_max = idx - 1;
   2287                     else if (c > high)
   2288                         idx_min = idx + 1;
   2289                     else
   2290                         goto range_match;
   2291                 }
   2292                 goto no_match;
   2293             range_match:
   2294                 pc += 4 * n;
   2295             }
   2296             break;
   2297         case REOP_range32:
   2298             {
   2299                 int n;
   2300                 uint32_t low, high, idx_min, idx_max, idx;
   2301 
   2302                 n = get_u16(pc); /* n must be >= 1 */
   2303                 pc += 2;
   2304                 if (cptr >= cbuf_end)
   2305                     goto no_match;
   2306                 GET_CHAR(c, cptr, cbuf_end, cbuf_type);
   2307                 if (s->ignore_case) {
   2308                     c = lre_canonicalize(c, s->is_unicode);
   2309                 }
   2310                 idx_min = 0;
   2311                 low = get_u32(pc + 0 * 8);
   2312                 if (c < low)
   2313                     goto no_match;
   2314                 idx_max = n - 1;
   2315                 high = get_u32(pc + idx_max * 8 + 4);
   2316                 if (c > high)
   2317                     goto no_match;
   2318                 while (idx_min <= idx_max) {
   2319                     idx = (idx_min + idx_max) / 2;
   2320                     low = get_u32(pc + idx * 8);
   2321                     high = get_u32(pc + idx * 8 + 4);
   2322                     if (c < low)
   2323                         idx_max = idx - 1;
   2324                     else if (c > high)
   2325                         idx_min = idx + 1;
   2326                     else
   2327                         goto range32_match;
   2328                 }
   2329                 goto no_match;
   2330             range32_match:
   2331                 pc += 8 * n;
   2332             }
   2333             break;
   2334         case REOP_prev:
   2335             /* go to the previous char */
   2336             if (cptr == s->cbuf)
   2337                 goto no_match;
   2338             PREV_CHAR(cptr, s->cbuf, cbuf_type);
   2339             break;
   2340         case REOP_simple_greedy_quant:
   2341             {
   2342                 uint32_t next_pos, quant_min, quant_max;
   2343                 size_t q;
   2344                 intptr_t res;
   2345                 const uint8_t *pc1;
   2346 
   2347                 next_pos = get_u32(pc);
   2348                 quant_min = get_u32(pc + 4);
   2349                 quant_max = get_u32(pc + 8);
   2350                 pc += 16;
   2351                 pc1 = pc;
   2352                 pc += (int)next_pos;
   2353 
   2354                 q = 0;
   2355                 for(;;) {
   2356                     res = lre_exec_backtrack(s, capture, stack, stack_len,
   2357                                              pc1, cptr, TRUE);
   2358                     if (res == -1)
   2359                         return res;
   2360                     if (!res)
   2361                         break;
   2362                     cptr = (uint8_t *)res;
   2363                     q++;
   2364                     if (q >= quant_max && quant_max != INT32_MAX)
   2365                         break;
   2366                 }
   2367                 if (q < quant_min)
   2368                     goto no_match;
   2369                 if (q > quant_min) {
   2370                     /* will examine all matches down to quant_min */
   2371                     ret = push_state(s, capture, stack, stack_len,
   2372                                      pc1 - 16, cptr,
   2373                                      RE_EXEC_STATE_GREEDY_QUANT,
   2374                                      q - quant_min);
   2375                     if (ret < 0)
   2376                         return -1;
   2377                 }
   2378             }
   2379             break;
   2380         default:
   2381             abort();
   2382         }
   2383     }
   2384 }
   2385 
   2386 /* Return 1 if match, 0 if not match or -1 if error. cindex is the
   2387    starting position of the match and must be such as 0 <= cindex <=
   2388    clen. */
   2389 int lre_exec(uint8_t **capture,
   2390              const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
   2391              int cbuf_type, void *opaque)
   2392 {
   2393     REExecContext s_s, *s = &s_s;
   2394     int re_flags, i, alloca_size, ret;
   2395     StackInt *stack_buf;
   2396 
   2397     re_flags = lre_get_flags(bc_buf);
   2398     s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
   2399     s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
   2400     s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
   2401     s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
   2402     s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
   2403     s->cbuf = cbuf;
   2404     s->cbuf_end = cbuf + (clen << cbuf_type);
   2405     s->cbuf_type = cbuf_type;
   2406     if (s->cbuf_type == 1 && s->is_unicode)
   2407         s->cbuf_type = 2;
   2408     s->opaque = opaque;
   2409 
   2410     s->state_size = sizeof(REExecState) +
   2411         s->capture_count * sizeof(capture[0]) * 2 +
   2412         s->stack_size_max * sizeof(stack_buf[0]);
   2413     s->state_stack = NULL;
   2414     s->state_stack_len = 0;
   2415     s->state_stack_size = 0;
   2416 
   2417     for(i = 0; i < s->capture_count * 2; i++)
   2418         capture[i] = NULL;
   2419     alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
   2420     stack_buf = alloca(alloca_size);
   2421     ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
   2422                              cbuf + (cindex << cbuf_type), FALSE);
   2423     lre_realloc(s->opaque, s->state_stack, 0);
   2424     return ret;
   2425 }
   2426 
   2427 int lre_get_capture_count(const uint8_t *bc_buf)
   2428 {
   2429     return bc_buf[RE_HEADER_CAPTURE_COUNT];
   2430 }
   2431 
   2432 int lre_get_flags(const uint8_t *bc_buf)
   2433 {
   2434     return bc_buf[RE_HEADER_FLAGS];
   2435 }
   2436 
   2437 /* Return NULL if no group names. Otherwise, return a pointer to
   2438    'capture_count - 1' zero terminated UTF-8 strings. */
   2439 const char *lre_get_groupnames(const uint8_t *bc_buf)
   2440 {
   2441     uint32_t re_bytecode_len;
   2442     if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
   2443         return NULL;
   2444     re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
   2445     return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
   2446 }
   2447 
   2448 #ifdef TEST
   2449 
   2450 BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
   2451 {
   2452     return FALSE;
   2453 }
   2454 
   2455 void *lre_realloc(void *opaque, void *ptr, size_t size)
   2456 {
   2457     return realloc(ptr, size);
   2458 }
   2459 
   2460 int main(int argc, char **argv)
   2461 {
   2462     int len, flags, ret, i;
   2463     uint8_t *bc;
   2464     char error_msg[64];
   2465     uint8_t *capture[CAPTURE_COUNT_MAX * 2];
   2466     const char *input;
   2467     int input_len, capture_count;
   2468 
   2469     if (argc < 4) {
   2470         printf("usage: %s regexp flags input\n", argv[0]);
   2471         return 1;
   2472     }
   2473     flags = atoi(argv[2]);
   2474     bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
   2475                      strlen(argv[1]), flags, NULL);
   2476     if (!bc) {
   2477         fprintf(stderr, "error: %s\n", error_msg);
   2478         exit(1);
   2479     }
   2480 
   2481     input = argv[3];
   2482     input_len = strlen(input);
   2483 
   2484     ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
   2485     printf("ret=%d\n", ret);
   2486     if (ret == 1) {
   2487         capture_count = lre_get_capture_count(bc);
   2488         for(i = 0; i < 2 * capture_count; i++) {
   2489             uint8_t *ptr;
   2490             ptr = capture[i];
   2491             printf("%d: ", i);
   2492             if (!ptr)
   2493                 printf("<nil>");
   2494             else
   2495                 printf("%u", (int)(ptr - (uint8_t *)input));
   2496             printf("\n");
   2497         }
   2498     }
   2499     return 0;
   2500 }
   2501 #endif
	quickjs-tart quickjs-based runtime for wallet-core logic
	Log \| Files \| Refs \| README \| LICENSE