libunicode.h (5388B)
/*
 * Unicode utilities
 *
 * Copyright (c) 2017-2018 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#ifndef LIBUNICODE_H
#define LIBUNICODE_H

#include <stddef.h> /* size_t, used by the realloc callbacks below */
#include <stdint.h>

/* define it to include all the unicode tables (40KB larger) */
#define CONFIG_ALL_UNICODE

#define LRE_CC_RES_LEN_MAX 3

/* char ranges */

/*
 * Growable array of code points describing a set of characters.
 * Memory is obtained through the user supplied realloc_func, which
 * receives mem_opaque as its first argument.
 */
typedef struct {
    int len; /* in points, always even */
    int size; /* capacity of 'points'; compared against 'len' before appending */
    uint32_t *points; /* points sorted by increasing value */
    void *mem_opaque;
    void *(*realloc_func)(void *opaque, void *ptr, size_t size);
} CharRange;

/* Set operations; presumably the values accepted as 'op' by cr_op() */
typedef enum {
    CR_OP_UNION,
    CR_OP_INTER,
    CR_OP_XOR,
} CharRangeOpEnum;

/* CharRange life cycle and storage management (defined outside this header) */
void cr_init(CharRange *cr, void *mem_opaque,
             void *(*realloc_func)(void *opaque, void *ptr, size_t size));
void cr_free(CharRange *cr);
int cr_realloc(CharRange *cr, int size);
int cr_copy(CharRange *cr, const CharRange *cr1);

/*
 * Append the single point 'v' at the end of 'cr', growing the storage
 * with cr_realloc() when it is full.
 * Return 0 on success, -1 if cr_realloc() reported a failure.
 * No sorting is done here: keeping 'points' ordered is up to the caller.
 */
static inline int cr_add_point(CharRange *cr, uint32_t v)
{
    if (cr->len >= cr->size) {
        if (cr_realloc(cr, cr->len + 1))
            return -1;
    }
    cr->points[cr->len++] = v;
    return 0;
}

/*
 * Append the two points 'c1' and 'c2' (stored as given) at the end of
 * 'cr', growing the storage with cr_realloc() when needed.
 * Return 0 on success, -1 if cr_realloc() reported a failure.
 */
static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2)
{
    if ((cr->len + 2) > cr->size) {
        if (cr_realloc(cr, cr->len + 2))
            return -1;
    }
    cr->points[cr->len++] = c1;
    cr->points[cr->len++] = c2;
    return 0;
}

int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len);

/*
 * Merge the interval c1..c2 into 'cr' through cr_union1().
 * 'c2' is treated as inclusive: the end point handed to cr_union1() is
 * c2 + 1. NOTE: the addition is unsigned, so c2 == UINT32_MAX wraps to 0.
 * Return the value returned by cr_union1().
 */
static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2)
{
    uint32_t b_pt[2];
    b_pt[0] = c1;
    b_pt[1] = c2 + 1;
    return cr_union1(cr, b_pt, 2);
}

int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
          const uint32_t *b_pt, int b_len, int op);

int cr_invert(CharRange *cr);

int cr_regexp_canonicalize(CharRange *cr, int is_unicode);

/* Normalization forms selectable in unicode_normalize() */
typedef enum {
    UNICODE_NFC,
    UNICODE_NFD,
    UNICODE_NFKC,
    UNICODE_NFKD,
} UnicodeNormalizationEnum;

int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                      UnicodeNormalizationEnum n_type,
                      void *opaque,
                      void *(*realloc_func)(void *opaque, void *ptr, size_t size));

/* Unicode character range functions */

int unicode_script(CharRange *cr, const char *script_name, int is_ext);
int unicode_general_category(CharRange *cr, const char *gc_name);
int unicode_prop(CharRange *cr, const char *prop_name);

int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
int lre_canonicalize(uint32_t c, int is_unicode);

/* Code point type categories */
enum {
    UNICODE_C_SPACE  = (1 << 0),
    UNICODE_C_DIGIT  = (1 << 1),
    UNICODE_C_UPPER  = (1 << 2),
    UNICODE_C_LOWER  = (1 << 3),
    UNICODE_C_UNDER  = (1 << 4),
    UNICODE_C_DOLLAR = (1 << 5),
    UNICODE_C_XDIGIT = (1 << 6),
};
/* Per byte bit mask of the UNICODE_C_x categories (defined outside this header) */
extern uint8_t const lre_ctype_bits[256];

/* zero or non-zero return value */
int lre_is_cased(uint32_t c);
int lre_is_case_ignorable(uint32_t c);
int lre_is_id_start(uint32_t c);
int lre_is_id_continue(uint32_t c);

/* Non-zero if the UNICODE_C_SPACE bit is set for byte 'c' */
static inline int lre_is_space_byte(uint8_t c) {
    return lre_ctype_bits[c] & UNICODE_C_SPACE;
}

/* Non-zero if byte 'c' is flagged as an upper/lower case letter, '_' or '$' */
static inline int lre_is_id_start_byte(uint8_t c) {
    return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
                                UNICODE_C_UNDER | UNICODE_C_DOLLAR);
}

/* Same categories as lre_is_id_start_byte() plus UNICODE_C_DIGIT */
static inline int lre_is_id_continue_byte(uint8_t c) {
    return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
                                UNICODE_C_UNDER | UNICODE_C_DOLLAR |
                                UNICODE_C_DIGIT);
}

int lre_is_space_non_ascii(uint32_t c);

/*
 * Space test for any code point: values below 256 use the byte table,
 * the others are forwarded to lre_is_space_non_ascii().
 */
static inline int lre_is_space(uint32_t c) {
    if (c < 256)
        return lre_is_space_byte(c);
    else
        return lre_is_space_non_ascii(c);
}

/*
 * Non-zero if 'c' can start a JS identifier. ASCII uses the byte table.
 * Other code points use lre_is_id_start() when CONFIG_ALL_UNICODE is
 * defined; without it, anything that is not a space is accepted.
 */
static inline int lre_js_is_ident_first(uint32_t c) {
    if (c < 128) {
        return lre_is_id_start_byte(c);
    } else {
#ifdef CONFIG_ALL_UNICODE
        return lre_is_id_start(c);
#else
        return !lre_is_space_non_ascii(c);
#endif
    }
}

/*
 * Non-zero if 'c' can continue a JS identifier. ASCII uses the byte table.
 * Other code points use lre_is_id_continue() when CONFIG_ALL_UNICODE is
 * defined; without it, anything that is not a space is accepted.
 */
static inline int lre_js_is_ident_next(uint32_t c) {
    if (c < 128) {
        return lre_is_id_continue_byte(c);
    } else {
        /* ZWNJ and ZWJ are accepted in identifiers */
        if (c >= 0x200C && c <= 0x200D)
            return 1; /* plain 1: no TRUE macro is defined or included by this header */
#ifdef CONFIG_ALL_UNICODE
        return lre_is_id_continue(c);
#else
        return !lre_is_space_non_ascii(c);
#endif
    }
}

#endif /* LIBUNICODE_H */