summaryrefslogtreecommitdiff
path: root/deps/v8/src/strings/unicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'deps/v8/src/strings/unicode.h')
-rw-r--r--deps/v8/src/strings/unicode.h257
1 files changed, 257 insertions, 0 deletions
diff --git a/deps/v8/src/strings/unicode.h b/deps/v8/src/strings/unicode.h
new file mode 100644
index 0000000000..bd94300e34
--- /dev/null
+++ b/deps/v8/src/strings/unicode.h
@@ -0,0 +1,257 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_STRINGS_UNICODE_H_
+#define V8_STRINGS_UNICODE_H_
+
+#include <sys/types.h>
+#include "src/common/globals.h"
+#include "src/third_party/utf8-decoder/utf8-decoder.h"
+#include "src/utils/utils.h"
+/**
+ * \file
+ * Definitions and convenience functions for working with unicode.
+ */
+
+namespace unibrow {
+
+using uchar = unsigned int;
+using byte = unsigned char;
+
+/**
+ * The max length of the result of converting the case of a single
+ * character.
+ */
+const int kMaxMappingSize = 4;
+
+#ifndef V8_INTL_SUPPORT
+template <class T, int size = 256>
+class Predicate {
+ public:
+ inline Predicate() = default;
+ inline bool get(uchar c);
+
+ private:
+ friend class Test;
+ bool CalculateValue(uchar c);
+ class CacheEntry {
+ public:
+ inline CacheEntry()
+ : bit_field_(CodePointField::encode(0) | ValueField::encode(0)) {}
+ inline CacheEntry(uchar code_point, bool value)
+ : bit_field_(
+ CodePointField::encode(CodePointField::kMask & code_point) |
+ ValueField::encode(value)) {
+ DCHECK_IMPLIES((CodePointField::kMask & code_point) != code_point,
+ code_point == static_cast<uchar>(-1));
+ }
+
+ uchar code_point() const { return CodePointField::decode(bit_field_); }
+ bool value() const { return ValueField::decode(bit_field_); }
+
+ private:
+ class CodePointField : public v8::internal::BitField<uchar, 0, 21> {};
+ class ValueField : public v8::internal::BitField<bool, 21, 1> {};
+
+ uint32_t bit_field_;
+ };
+ static const int kSize = size;
+ static const int kMask = kSize - 1;
+ CacheEntry entries_[kSize];
+};
+
+// A cache used in case conversion. It caches the value for characters
+// that either have no mapping or map to a single character independent
+// of context. Characters that map to more than one character or that
+// map differently depending on context are always looked up.
+template <class T, int size = 256>
+class Mapping {
+ public:
+ inline Mapping() = default;
+ inline int get(uchar c, uchar n, uchar* result);
+
+ private:
+ friend class Test;
+ int CalculateValue(uchar c, uchar n, uchar* result);
+ struct CacheEntry {
+ inline CacheEntry() : code_point_(kNoChar), offset_(0) {}
+ inline CacheEntry(uchar code_point, signed offset)
+ : code_point_(code_point), offset_(offset) {}
+ uchar code_point_;
+ signed offset_;
+ static const int kNoChar = (1 << 21) - 1;
+ };
+ static const int kSize = size;
+ static const int kMask = kSize - 1;
+ CacheEntry entries_[kSize];
+};
+
+class UnicodeData {
+ private:
+ friend class Test;
+ static int GetByteCount();
+ static const uchar kMaxCodePoint;
+};
+
+#endif // !V8_INTL_SUPPORT
+
+class Utf16 {
+ public:
+ static const int kNoPreviousCharacter = -1;
+ static inline bool IsSurrogatePair(int lead, int trail) {
+ return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
+ }
+ static inline bool IsLeadSurrogate(int code) {
+ return (code & 0xfc00) == 0xd800;
+ }
+ static inline bool IsTrailSurrogate(int code) {
+ return (code & 0xfc00) == 0xdc00;
+ }
+
+ static inline int CombineSurrogatePair(uchar lead, uchar trail) {
+ return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
+ }
+ static const uchar kMaxNonSurrogateCharCode = 0xffff;
+ // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
+ // of UTF-8 data. The special case where the unit is a surrogate
+ // trail produces 1 byte net, because the encoding of the pair is
+ // 4 bytes and the 3 bytes that were used to encode the lead surrogate
+ // can be reclaimed.
+ static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
+ // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
+ // The illegality stems from the surrogate not being part of a pair.
+ static const int kUtf8BytesToCodeASurrogate = 3;
+ static inline uint16_t LeadSurrogate(uint32_t char_code) {
+ return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
+ }
+ static inline uint16_t TrailSurrogate(uint32_t char_code) {
+ return 0xdc00 + (char_code & 0x3ff);
+ }
+};
+
+class Latin1 {
+ public:
+ static const uint16_t kMaxChar = 0xff;
+ // Convert the character to Latin-1 case equivalent if possible.
+ static inline uint16_t TryConvertToLatin1(uint16_t c) {
+ switch (c) {
+ // This are equivalent characters in unicode.
+ case 0x39c:
+ case 0x3bc:
+ return 0xb5;
+ // This is an uppercase of a Latin-1 character
+ // outside of Latin-1.
+ case 0x178:
+ return 0xff;
+ }
+ return c;
+ }
+};
+
+class V8_EXPORT_PRIVATE Utf8 {
+ public:
+ using State = Utf8DfaDecoder::State;
+
+ static inline uchar Length(uchar chr, int previous);
+ static inline unsigned EncodeOneByte(char* out, uint8_t c);
+ static inline unsigned Encode(char* out, uchar c, int previous,
+ bool replace_invalid = false);
+ static uchar CalculateValue(const byte* str, size_t length, size_t* cursor);
+
+ // The unicode replacement character, used to signal invalid unicode
+ // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
+ static const uchar kBadChar = 0xFFFD;
+ static const uchar kBufferEmpty = 0x0;
+ static const uchar kIncomplete = 0xFFFFFFFC; // any non-valid code point.
+ static const unsigned kMaxEncodedSize = 4;
+ static const unsigned kMaxOneByteChar = 0x7f;
+ static const unsigned kMaxTwoByteChar = 0x7ff;
+ static const unsigned kMaxThreeByteChar = 0xffff;
+ static const unsigned kMaxFourByteChar = 0x1fffff;
+
+ // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
+ // that match are coded as a 4 byte UTF-8 sequence.
+ static const unsigned kBytesSavedByCombiningSurrogates = 2;
+ static const unsigned kSizeOfUnmatchedSurrogate = 3;
+ // The maximum size a single UTF-16 code unit may take up when encoded as
+ // UTF-8.
+ static const unsigned kMax16BitCodeUnitSize = 3;
+ static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
+
+ using Utf8IncrementalBuffer = uint32_t;
+ static inline uchar ValueOfIncremental(const byte** cursor, State* state,
+ Utf8IncrementalBuffer* buffer);
+ static uchar ValueOfIncrementalFinish(State* state);
+
+ // Excludes non-characters from the set of valid code points.
+ static inline bool IsValidCharacter(uchar c);
+
+ // Validate if the input has a valid utf-8 encoding. Unlike JS source code
+ // this validation function will accept any unicode code point, including
+ // kBadChar and BOMs.
+ //
+ // This method checks for:
+ // - valid utf-8 endcoding (e.g. no over-long encodings),
+ // - absence of surrogates,
+ // - valid code point range.
+ static bool ValidateEncoding(const byte* str, size_t length);
+};
+
+struct Uppercase {
+ static bool Is(uchar c);
+};
+struct Letter {
+ static bool Is(uchar c);
+};
+#ifndef V8_INTL_SUPPORT
+struct V8_EXPORT_PRIVATE ID_Start {
+ static bool Is(uchar c);
+};
+struct V8_EXPORT_PRIVATE ID_Continue {
+ static bool Is(uchar c);
+};
+struct V8_EXPORT_PRIVATE WhiteSpace {
+ static bool Is(uchar c);
+};
+#endif // !V8_INTL_SUPPORT
+
+// LineTerminator: 'JS_Line_Terminator' in point.properties
+// ES#sec-line-terminators lists exactly 4 code points:
+// LF (U+000A), CR (U+000D), LS(U+2028), PS(U+2029)
+V8_INLINE bool IsLineTerminator(uchar c) {
+ return c == 0x000A || c == 0x000D || c == 0x2028 || c == 0x2029;
+}
+
+V8_INLINE bool IsStringLiteralLineTerminator(uchar c) {
+ return c == 0x000A || c == 0x000D;
+}
+
+#ifndef V8_INTL_SUPPORT
+struct ToLowercase {
+ static const int kMaxWidth = 3;
+ static const bool kIsToLower = true;
+ static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr);
+};
+struct ToUppercase {
+ static const int kMaxWidth = 3;
+ static const bool kIsToLower = false;
+ static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr);
+};
+struct V8_EXPORT_PRIVATE Ecma262Canonicalize {
+ static const int kMaxWidth = 1;
+ static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr);
+};
+struct V8_EXPORT_PRIVATE Ecma262UnCanonicalize {
+ static const int kMaxWidth = 4;
+ static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr);
+};
+struct V8_EXPORT_PRIVATE CanonicalizationRange {
+ static const int kMaxWidth = 1;
+ static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr);
+};
+#endif // !V8_INTL_SUPPORT
+
+} // namespace unibrow
+
+#endif // V8_STRINGS_UNICODE_H_