diff options
author | Ben Noordhuis <info@bnoordhuis.nl> | 2015-06-19 13:23:56 +0200 |
---|---|---|
committer | Rod Vagg <rod@vagg.org> | 2015-08-04 11:56:14 -0700 |
commit | 70d1f32f5605465a1a630a64f6f0d35f96c7709d (patch) | |
tree | 0a349040a686eafcb0a09943ebc733477dce2781 /deps/v8/src/unicode.cc | |
parent | 4643b8b6671607a7aff60cbbd0b384dcf2f6959e (diff) | |
download | android-node-v8-70d1f32f5605465a1a630a64f6f0d35f96c7709d.tar.gz android-node-v8-70d1f32f5605465a1a630a64f6f0d35f96c7709d.tar.bz2 android-node-v8-70d1f32f5605465a1a630a64f6f0d35f96c7709d.zip |
deps: update v8 to 4.4.63.9
Upgrade the bundled V8 and update code in src/ and lib/ to the new API.
Notable backwards incompatible changes are the removal of the smalloc
module and dropped support for CESU-8 decoding. CESU-8 support can be
brought back if necessary by doing UTF-8 decoding ourselves.
This commit includes https://codereview.chromium.org/1192973004 to fix
a build error on python 2.6 systems. The original commit log follows:
Use optparse in js2c.py for python compatibility
Without this change, V8 won't build on RHEL/CentOS 6 because the
distro python is too old to know about the argparse module.
PR-URL: https://github.com/nodejs/io.js/pull/2022
Reviewed-By: Rod Vagg <rod@vagg.org>
Reviewed-By: Trevor Norris <trev.norris@gmail.com>
Diffstat (limited to 'deps/v8/src/unicode.cc')
-rw-r--r-- | deps/v8/src/unicode.cc | 139 |
1 file changed, 93 insertions, 46 deletions
```diff
diff --git a/deps/v8/src/unicode.cc b/deps/v8/src/unicode.cc
index 0d0d63d177..df45697bde 100644
--- a/deps/v8/src/unicode.cc
+++ b/deps/v8/src/unicode.cc
@@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table,
 }
 
 
-uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) {
-  // We only get called for non-ASCII characters.
-  if (length == 1) {
-    *cursor += 1;
-    return kBadChar;
-  }
-  byte first = str[0];
-  byte second = str[1] ^ 0x80;
-  if (second & 0xC0) {
+static inline size_t NonASCIISequenceLength(byte first) {
+  // clang-format off
+  static const uint8_t lengths[256] = {
+      // The first 128 entries correspond to ASCII characters.
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      // The following 64 entries correspond to continuation bytes.
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      // The next are two invalid overlong encodings and 30 two-byte sequences.
+      0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      // 16 three-byte sequences.
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      // 5 four-byte sequences, followed by sequences that could only encode
+      // code points outside of the unicode range.
+      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  // clang-format on
+  return lengths[first];
+}
+
+
+static inline bool IsContinuationCharacter(byte chr) {
+  return chr >= 0x80 && chr <= 0xBF;
+}
+
+
+// This method decodes an UTF-8 value according to RFC 3629.
+uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
+  size_t length = NonASCIISequenceLength(str[0]);
+  if (length == 0 || max_length < length) {
     *cursor += 1;
     return kBadChar;
   }
-  if (first < 0xE0) {
-    if (first < 0xC0) {
-      *cursor += 1;
-      return kBadChar;
-    }
-    uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;
-    if (code_point <= kMaxOneByteChar) {
+  if (length == 2) {
+    if (!IsContinuationCharacter(str[1])) {
       *cursor += 1;
       return kBadChar;
     }
     *cursor += 2;
-    return code_point;
+    return ((str[0] << 6) + str[1]) - 0x00003080;
   }
-  if (length == 2) {
-    *cursor += 1;
-    return kBadChar;
-  }
-  byte third = str[2] ^ 0x80;
-  if (third & 0xC0) {
-    *cursor += 1;
-    return kBadChar;
-  }
-  if (first < 0xF0) {
-    uchar code_point = ((((first << 6) | second) << 6) | third)
-        & kMaxThreeByteChar;
-    if (code_point <= kMaxTwoByteChar) {
+  if (length == 3) {
+    switch (str[0]) {
+      case 0xE0:
+        // Overlong three-byte sequence.
+        if (str[1] < 0xA0 || str[1] > 0xBF) {
+          *cursor += 1;
+          return kBadChar;
+        }
+        break;
+      case 0xED:
+        // High and low surrogate halves.
+        if (str[1] < 0x80 || str[1] > 0x9F) {
+          *cursor += 1;
+          return kBadChar;
+        }
+        break;
+      default:
+        if (!IsContinuationCharacter(str[1])) {
+          *cursor += 1;
+          return kBadChar;
+        }
+    }
+    if (!IsContinuationCharacter(str[2])) {
       *cursor += 1;
       return kBadChar;
     }
     *cursor += 3;
-    return code_point;
+    return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
   }
-  if (length == 3) {
+  DCHECK(length == 4);
+  switch (str[0]) {
+    case 0xF0:
+      // Overlong four-byte sequence.
+      if (str[1] < 0x90 || str[1] > 0xBF) {
+        *cursor += 1;
+        return kBadChar;
+      }
+      break;
+    case 0xF4:
+      // Code points outside of the unicode range.
+      if (str[1] < 0x80 || str[1] > 0x8F) {
+        *cursor += 1;
+        return kBadChar;
+      }
+      break;
+    default:
+      if (!IsContinuationCharacter(str[1])) {
+        *cursor += 1;
+        return kBadChar;
+      }
+  }
+  if (!IsContinuationCharacter(str[2])) {
     *cursor += 1;
     return kBadChar;
   }
-  byte fourth = str[3] ^ 0x80;
-  if (fourth & 0xC0) {
+  if (!IsContinuationCharacter(str[3])) {
     *cursor += 1;
     return kBadChar;
   }
-  if (first < 0xF8) {
-    uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth)
-        & kMaxFourByteChar;
-    if (code_point <= kMaxThreeByteChar) {
-      *cursor += 1;
-      return kBadChar;
-    }
-    *cursor += 4;
-    return code_point;
-  }
-  *cursor += 1;
-  return kBadChar;
+  *cursor += 4;
+  return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
+         0x03C82080;
 }
 
 
```