quickjs-tart

quickjs-based runtime for wallet-core logic
Log | Files | Refs | README | LICENSE

libunicode.h (5388B)


      1 /*
      2  * Unicode utilities
      3  *
      4  * Copyright (c) 2017-2018 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 #ifndef LIBUNICODE_H
     25 #define LIBUNICODE_H
     26 
#include <stddef.h>
#include <stdint.h>
     28 
/* define it to include all the unicode tables (40KB larger) */
/* When defined, lre_js_is_ident_first() and lre_js_is_ident_next() in this
   header use lre_is_id_start() / lre_is_id_continue() for code points >= 128;
   otherwise they fall back to !lre_is_space_non_ascii(). */
#define CONFIG_ALL_UNICODE

/* NOTE(review): presumably the maximum number of code points that
   lre_case_conv() stores in 'res' -- confirm against the implementation. */
#define LRE_CC_RES_LEN_MAX 3
     33 
     34 /* char ranges */
     35 
     36 typedef struct {
     37     int len; /* in points, always even */
     38     int size;
     39     uint32_t *points; /* points sorted by increasing value */
     40     void *mem_opaque;
     41     void *(*realloc_func)(void *opaque, void *ptr, size_t size);
     42 } CharRange;
     43 
     44 typedef enum {
     45     CR_OP_UNION,
     46     CR_OP_INTER,
     47     CR_OP_XOR,
     48 } CharRangeOpEnum;
     49 
     50 void cr_init(CharRange *cr, void *mem_opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
     51 void cr_free(CharRange *cr);
     52 int cr_realloc(CharRange *cr, int size);
     53 int cr_copy(CharRange *cr, const CharRange *cr1);
     54 
     55 static inline int cr_add_point(CharRange *cr, uint32_t v)
     56 {
     57     if (cr->len >= cr->size) {
     58         if (cr_realloc(cr, cr->len + 1))
     59             return -1;
     60     }
     61     cr->points[cr->len++] = v;
     62     return 0;
     63 }
     64 
     65 static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2)
     66 {
     67     if ((cr->len + 2) > cr->size) {
     68         if (cr_realloc(cr, cr->len + 2))
     69             return -1;
     70     }
     71     cr->points[cr->len++] = c1;
     72     cr->points[cr->len++] = c2;
     73     return 0;
     74 }
     75 
     76 int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len);
     77 
     78 static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2)
     79 {
     80     uint32_t b_pt[2];
     81     b_pt[0] = c1;
     82     b_pt[1] = c2 + 1;
     83     return cr_union1(cr, b_pt, 2);
     84 }
     85 
     86 int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
     87           const uint32_t *b_pt, int b_len, int op);
     88 
     89 int cr_invert(CharRange *cr);
     90 
     91 int cr_regexp_canonicalize(CharRange *cr, int is_unicode);
     92 
     93 typedef enum {
     94     UNICODE_NFC,
     95     UNICODE_NFD,
     96     UNICODE_NFKC,
     97     UNICODE_NFKD,
     98 } UnicodeNormalizationEnum;
     99 
    100 int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
    101                       UnicodeNormalizationEnum n_type,
    102                       void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
    103 
    104 /* Unicode character range functions */
    105 
    106 int unicode_script(CharRange *cr, const char *script_name, int is_ext);
    107 int unicode_general_category(CharRange *cr, const char *gc_name);
    108 int unicode_prop(CharRange *cr, const char *prop_name);
    109 
    110 int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
    111 int lre_canonicalize(uint32_t c, int is_unicode);
    112 
    113 /* Code point type categories */
    114 enum {
    115     UNICODE_C_SPACE  = (1 << 0),
    116     UNICODE_C_DIGIT  = (1 << 1),
    117     UNICODE_C_UPPER  = (1 << 2),
    118     UNICODE_C_LOWER  = (1 << 3),
    119     UNICODE_C_UNDER  = (1 << 4),
    120     UNICODE_C_DOLLAR = (1 << 5),
    121     UNICODE_C_XDIGIT = (1 << 6),
    122 };
    123 extern uint8_t const lre_ctype_bits[256];
    124 
    125 /* zero or non-zero return value */
    126 int lre_is_cased(uint32_t c);
    127 int lre_is_case_ignorable(uint32_t c);
    128 int lre_is_id_start(uint32_t c);
    129 int lre_is_id_continue(uint32_t c);
    130 
    131 static inline int lre_is_space_byte(uint8_t c) {
    132     return lre_ctype_bits[c] & UNICODE_C_SPACE;
    133 }
    134 
    135 static inline int lre_is_id_start_byte(uint8_t c) {
    136     return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
    137                                 UNICODE_C_UNDER | UNICODE_C_DOLLAR);
    138 }
    139 
    140 static inline int lre_is_id_continue_byte(uint8_t c) {
    141     return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
    142                                 UNICODE_C_UNDER | UNICODE_C_DOLLAR |
    143                                 UNICODE_C_DIGIT);
    144 }
    145 
    146 int lre_is_space_non_ascii(uint32_t c);
    147 
    148 static inline int lre_is_space(uint32_t c) {
    149     if (c < 256)
    150         return lre_is_space_byte(c);
    151     else
    152         return lre_is_space_non_ascii(c);
    153 }
    154 
    155 static inline int lre_js_is_ident_first(uint32_t c) {
    156     if (c < 128) {
    157         return lre_is_id_start_byte(c);
    158     } else {
    159 #ifdef CONFIG_ALL_UNICODE
    160         return lre_is_id_start(c);
    161 #else
    162         return !lre_is_space_non_ascii(c);
    163 #endif
    164     }
    165 }
    166 
    167 static inline int lre_js_is_ident_next(uint32_t c) {
    168     if (c < 128) {
    169         return lre_is_id_continue_byte(c);
    170     } else {
    171         /* ZWNJ and ZWJ are accepted in identifiers */
    172         if (c >= 0x200C && c <= 0x200D)
    173             return TRUE;
    174 #ifdef CONFIG_ALL_UNICODE
    175         return lre_is_id_continue(c);
    176 #else
    177         return !lre_is_space_non_ascii(c);
    178 #endif
    179     }
    180 }
    181 
    182 #endif /* LIBUNICODE_H */