// Copyright 2014 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_UNICODE_DECODER_H_ #define V8_UNICODE_DECODER_H_ #include #include #include "src/globals.h" #include "src/unicode.h" #include "src/utils.h" #include "src/vector.h" namespace unibrow { class Utf8Iterator { public: explicit Utf8Iterator(const v8::internal::Vector& stream) : Utf8Iterator(stream, 0, false) {} Utf8Iterator(const v8::internal::Vector& stream, size_t offset, bool trailing) : stream_(stream), cursor_(offset), offset_(0), char_(0), trailing_(false) { DCHECK_LE(offset, stream.length()); // Read the first char, setting offset_ to offset in the process. ++*this; // This must be set after reading the first char, since the offset marks // the start of the octet sequence that the trailing char is part of. trailing_ = trailing; if (trailing) { DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode); } } uint16_t operator*(); Utf8Iterator& operator++(); Utf8Iterator operator++(int); bool Done(); bool Trailing() { return trailing_; } size_t Offset() { return offset_; } private: const v8::internal::Vector& stream_; size_t cursor_; size_t offset_; uint32_t char_; bool trailing_; }; class V8_EXPORT_PRIVATE Utf8DecoderBase { public: // Initialization done in subclass. inline Utf8DecoderBase(); inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, const v8::internal::Vector& stream); inline size_t Utf16Length() const { return utf16_length_; } protected: // This reads all characters and sets the utf16_length_. // The first buffer_length utf16 chars are cached in the buffer. void Reset(uint16_t* buffer, size_t buffer_length, const v8::internal::Vector& vector); static void WriteUtf16Slow(uint16_t* data, size_t length, const v8::internal::Vector& stream, size_t offset, bool trailing); size_t bytes_read_; size_t chars_written_; size_t utf16_length_; bool trailing_; private: DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); }; template class Utf8Decoder : public Utf8DecoderBase { public: inline Utf8Decoder() {} explicit inline Utf8Decoder(const v8::internal::Vector& stream); inline void Reset(const v8::internal::Vector& stream); inline size_t WriteUtf16( uint16_t* data, size_t length, const v8::internal::Vector& stream) const; private: uint16_t buffer_[kBufferSize]; }; Utf8DecoderBase::Utf8DecoderBase() : bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {} Utf8DecoderBase::Utf8DecoderBase( uint16_t* buffer, size_t buffer_length, const v8::internal::Vector& stream) { Reset(buffer, buffer_length, stream); } template Utf8Decoder::Utf8Decoder( const v8::internal::Vector& stream) : Utf8DecoderBase(buffer_, kBufferSize, stream) {} template void Utf8Decoder::Reset( const v8::internal::Vector& stream) { Utf8DecoderBase::Reset(buffer_, kBufferSize, stream); } template size_t Utf8Decoder::WriteUtf16( uint16_t* data, size_t data_length, const v8::internal::Vector& stream) const { DCHECK_GT(data_length, 0); data_length = std::min(data_length, utf16_length_); // memcpy everything in buffer. size_t memcpy_length = std::min(data_length, chars_written_); v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); if (data_length <= chars_written_) return data_length; // Copy the rest the slow way. WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream, bytes_read_, trailing_); return data_length; } class Latin1 { public: static const unsigned kMaxChar = 0xff; // Convert the character to Latin-1 case equivalent if possible. static inline uint16_t TryConvertToLatin1(uint16_t); }; uint16_t Latin1::TryConvertToLatin1(uint16_t c) { switch (c) { // This are equivalent characters in unicode. case 0x39c: case 0x3bc: return 0xb5; // This is an uppercase of a Latin-1 character // outside of Latin-1. case 0x178: return 0xff; } return c; } } // namespace unibrow #endif // V8_UNICODE_DECODER_H_