diff options
Diffstat (limited to 'deps/v8/test/cctest/parsing/test-scanner-streams.cc')
-rw-r--r-- | deps/v8/test/cctest/parsing/test-scanner-streams.cc | 448 |
1 files changed, 448 insertions, 0 deletions
diff --git a/deps/v8/test/cctest/parsing/test-scanner-streams.cc b/deps/v8/test/cctest/parsing/test-scanner-streams.cc new file mode 100644 index 0000000000..fffd1200f2 --- /dev/null +++ b/deps/v8/test/cctest/parsing/test-scanner-streams.cc @@ -0,0 +1,448 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "src/factory.h" // for i::Factory::NewExternalStringFrom*Byte +#include "src/objects-inl.h" +#include "src/parsing/scanner-character-streams.h" +#include "src/parsing/scanner.h" +#include "src/type-feedback-vector-inl.h" // for include "src/factory.h" +#include "test/cctest/cctest.h" + +namespace { + +// Implement ExternalSourceStream based on const char**. +// This will take each string as one chunk. The last chunk must be empty. +class ChunkSource : public v8::ScriptCompiler::ExternalSourceStream { + public: + explicit ChunkSource(const char** chunks) : current_(0) { + do { + chunks_.push_back( + {reinterpret_cast<const uint8_t*>(*chunks), strlen(*chunks)}); + chunks++; + } while (chunks_.back().len > 0); + } + ChunkSource(const uint8_t* data, size_t len, bool extra_chunky) + : current_(0) { + // If extra_chunky, we'll use increasingly large chunk sizes. + // If not, we'll have a single chunk of full length. + size_t chunk_size = extra_chunky ? 1 : len; + for (size_t i = 0; i < len; i += chunk_size, chunk_size *= 2) { + chunks_.push_back({data + i, i::Min(chunk_size, len - i)}); + } + chunks_.push_back({nullptr, 0}); + } + ~ChunkSource() {} + bool SetBookmark() override { return false; } + void ResetToBookmark() override {} + size_t GetMoreData(const uint8_t** src) override { + DCHECK_LT(current_, chunks_.size()); + Chunk& next = chunks_[current_++]; + uint8_t* chunk = new uint8_t[next.len]; + i::MemMove(chunk, next.ptr, next.len); + *src = chunk; + return next.len; + } + + private: + struct Chunk { + const uint8_t* ptr; + size_t len; + }; + std::vector<Chunk> chunks_; + size_t current_; +}; + +class TestExternalResource : public v8::String::ExternalStringResource { + public: + explicit TestExternalResource(uint16_t* data, int length) + : data_(data), length_(static_cast<size_t>(length)) {} + + ~TestExternalResource() {} + + const uint16_t* data() const { return data_; } + size_t length() const { return length_; } + + private: + uint16_t* data_; + size_t length_; +}; + +class TestExternalOneByteResource + : public v8::String::ExternalOneByteStringResource { + public: + TestExternalOneByteResource(const char* data, size_t length) + : data_(data), length_(length) {} + + const char* data() const { return data_; } + size_t length() const { return length_; } + + private: + const char* data_; + size_t length_; +}; + +// A test string with all lengths of utf-8 encodings. +const char unicode_utf8[] = + "abc" // 3x ascii + "\xc3\xa4" // a Umlaut, code point 228 + "\xe2\xa8\xa0" // >> (math symbol), code point 10784 + "\xf0\x9f\x92\xa9" // best character, code point 128169, + // as utf-16 surrogates: 55357 56489 + "def"; // 3x ascii again. +const uint16_t unicode_ucs2[] = {97, 98, 99, 228, 10784, 55357, + 56489, 100, 101, 102, 0}; + +} // anonymous namespace + +TEST(Utf8StreamAsciiOnly) { + const char* chunks[] = {"abc", "def", "ghi", ""}; + ChunkSource chunk_source(chunks); + std::unique_ptr<v8::internal::Utf16CharacterStream> stream( + v8::internal::ScannerStream::For( + &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8)); + + // Read the data without dying. + v8::internal::uc32 c; + do { + c = stream->Advance(); + } while (c != v8::internal::Utf16CharacterStream::kEndOfInput); +} + +TEST(Utf8StreamBOM) { + // Construct test string w/ UTF-8 BOM (byte order mark) + char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"}; + strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8)); + + const char* chunks[] = {data, "\0"}; + ChunkSource chunk_source(chunks); + std::unique_ptr<v8::internal::Utf16CharacterStream> stream( + v8::internal::ScannerStream::For( + &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8)); + + // Read the data without tripping over the BOM. + for (size_t i = 0; unicode_ucs2[i]; i++) { + CHECK_EQ(unicode_ucs2[i], stream->Advance()); + } + CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, stream->Advance()); + + // Make sure seek works. + stream->Seek(0); + CHECK_EQ(unicode_ucs2[0], stream->Advance()); + + stream->Seek(5); + CHECK_EQ(unicode_ucs2[5], stream->Advance()); +} + +TEST(Utf8SplitBOM) { + // Construct chunks with a BOM split into two chunks. + char partial_bom[] = "\xef\xbb"; + char data[1 + arraysize(unicode_utf8)] = {"\xbf"}; + strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8)); + + { + const char* chunks[] = {partial_bom, data, "\0"}; + ChunkSource chunk_source(chunks); + std::unique_ptr<v8::internal::Utf16CharacterStream> stream( + v8::internal::ScannerStream::For( + &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8)); + + // Read the data without tripping over the BOM. + for (size_t i = 0; unicode_ucs2[i]; i++) { + CHECK_EQ(unicode_ucs2[i], stream->Advance()); + } + } + + // And now with single-byte BOM chunks. + char bom_byte_1[] = "\xef"; + char bom_byte_2[] = "\xbb"; + { + const char* chunks[] = {bom_byte_1, bom_byte_2, data, "\0"}; + ChunkSource chunk_source(chunks); + std::unique_ptr<v8::internal::Utf16CharacterStream> stream( + v8::internal::ScannerStream::For( + &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8)); + + // Read the data without tripping over the BOM. + for (size_t i = 0; unicode_ucs2[i]; i++) { + CHECK_EQ(unicode_ucs2[i], stream->Advance()); + } + } +} + +TEST(Utf8ChunkBoundaries) { + // Test utf-8 parsing at chunk boundaries. + + // Split the test string at each byte and pass it to the stream. This way, + // we'll have a split at each possible boundary. + size_t len = strlen(unicode_utf8); + char buffer[arraysize(unicode_utf8) + 3]; + for (size_t i = 1; i < len; i++) { + // Copy source string into buffer, splitting it at i. + // Then add three chunks, 0..i-1, i..strlen-1, empty. + strncpy(buffer, unicode_utf8, i); + strncpy(buffer + i + 1, unicode_utf8 + i, len - i); + buffer[i] = '\0'; + buffer[len + 1] = '\0'; + buffer[len + 2] = '\0'; + const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2}; + + ChunkSource chunk_source(chunks); + std::unique_ptr<v8::internal::Utf16CharacterStream> stream( + v8::internal::ScannerStream::For( + &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8)); + + for (size_t i = 0; unicode_ucs2[i]; i++) { + CHECK_EQ(unicode_ucs2[i], stream->Advance()); + } + CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, + stream->Advance()); + } +} + +TEST(Utf8SingleByteChunks) { + // Have each byte as a single-byte chunk. + size_t len = strlen(unicode_utf8); + char buffer[arraysize(unicode_utf8) + 4]; + for (size_t i = 1; i < len - 1; i++) { + // Copy source string into buffer, make a single-byte chunk at i. + strncpy(buffer, unicode_utf8, i); + strncpy(buffer + i + 3, unicode_utf8 + i + 1, len - i - 1); + buffer[i] = '\0'; + buffer[i + 1] = unicode_utf8[i]; + buffer[i + 2] = '\0'; + buffer[len + 2] = '\0'; + buffer[len + 3] = '\0'; + const char* chunks[] = {buffer, buffer + i + 1, buffer + i + 3, + buffer + len + 3}; + + ChunkSource chunk_source(chunks); + std::unique_ptr<v8::internal::Utf16CharacterStream> stream( + v8::internal::ScannerStream::For( + &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8)); + + for (size_t j = 0; unicode_ucs2[j]; j++) { + CHECK_EQ(unicode_ucs2[j], stream->Advance()); + } + CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, + stream->Advance()); + } +} + +#define CHECK_EQU(v1, v2) CHECK_EQ(static_cast<int>(v1), static_cast<int>(v2)) + +void TestCharacterStream(const char* reference, i::Utf16CharacterStream* stream, + unsigned length, unsigned start, unsigned end) { + // Read streams one char at a time + unsigned i; + for (i = start; i < end; i++) { + CHECK_EQU(i, stream->pos()); + CHECK_EQU(reference[i], stream->Advance()); + } + CHECK_EQU(end, stream->pos()); + + // Pushback, re-read, pushback again. + while (i > end / 4) { + int32_t c0 = reference[i - 1]; + CHECK_EQU(i, stream->pos()); + stream->Back(); + i--; + CHECK_EQU(i, stream->pos()); + int32_t c1 = stream->Advance(); + i++; + CHECK_EQU(i, stream->pos()); + CHECK_EQ(c0, c1); + stream->Back(); + i--; + CHECK_EQU(i, stream->pos()); + } + + // Seek + read streams one char at a time. + unsigned halfway = end / 2; + stream->Seek(stream->pos() + halfway - i); + for (i = halfway; i < end; i++) { + CHECK_EQU(i, stream->pos()); + CHECK_EQU(reference[i], stream->Advance()); + } + CHECK_EQU(i, stream->pos()); + CHECK_LT(stream->Advance(), 0); + + // Seek back, then seek beyond end of stream. + stream->Seek(start); + if (start < length) { + CHECK_EQU(stream->Advance(), reference[start]); + } else { + CHECK_LT(stream->Advance(), 0); + } + stream->Seek(length + 5); + CHECK_LT(stream->Advance(), 0); +} + +#undef CHECK_EQU + +void TestCharacterStreams(const char* one_byte_source, unsigned length, + unsigned start = 0, unsigned end = 0) { + if (end == 0) end = length; + + i::Isolate* isolate = CcTest::i_isolate(); + i::Factory* factory = isolate->factory(); + + // 2-byte external string + std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]); + i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(), + static_cast<int>(length)); + { + for (unsigned i = 0; i < length; i++) { + uc16_buffer[i] = static_cast<i::uc16>(one_byte_source[i]); + } + TestExternalResource resource(uc16_buffer.get(), length); + i::Handle<i::String> uc16_string( + factory->NewExternalStringFromTwoByte(&resource).ToHandleChecked()); + std::unique_ptr<i::Utf16CharacterStream> uc16_stream( + i::ScannerStream::For(uc16_string, start, end)); + TestCharacterStream(one_byte_source, uc16_stream.get(), length, start, end); + } + + // 1-byte external string + i::Vector<const char> one_byte_vector(one_byte_source, + static_cast<int>(length)); + i::Handle<i::String> one_byte_string = + factory->NewStringFromAscii(one_byte_vector).ToHandleChecked(); + { + TestExternalOneByteResource one_byte_resource(one_byte_source, length); + i::Handle<i::String> ext_one_byte_string( + factory->NewExternalStringFromOneByte(&one_byte_resource) + .ToHandleChecked()); + std::unique_ptr<i::Utf16CharacterStream> one_byte_stream( + i::ScannerStream::For(ext_one_byte_string, start, end)); + TestCharacterStream(one_byte_source, one_byte_stream.get(), length, start, + end); + } + + // 1-byte generic i::String + { + std::unique_ptr<i::Utf16CharacterStream> string_stream( + i::ScannerStream::For(one_byte_string, start, end)); + TestCharacterStream(one_byte_source, string_stream.get(), length, start, + end); + } + + // 2-byte generic i::String + { + i::Handle<i::String> two_byte_string = + factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked(); + std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream( + i::ScannerStream::For(two_byte_string, start, end)); + TestCharacterStream(one_byte_source, two_byte_string_stream.get(), length, + start, end); + } + + // Streaming has no notion of start/end, so let's skip streaming tests for + // these cases. + if (start != 0 || end != length) return; + + // 1-byte streaming stream, single + many chunks. + { + const uint8_t* data = + reinterpret_cast<const uint8_t*>(one_byte_vector.begin()); + const uint8_t* data_end = + reinterpret_cast<const uint8_t*>(one_byte_vector.end()); + + ChunkSource single_chunk(data, data_end - data, false); + std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream( + i::ScannerStream::For(&single_chunk, + v8::ScriptCompiler::StreamedSource::ONE_BYTE)); + TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(), + length, start, end); + + ChunkSource many_chunks(data, data_end - data, true); + one_byte_streaming_stream.reset(i::ScannerStream::For( + &many_chunks, v8::ScriptCompiler::StreamedSource::ONE_BYTE)); + TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(), + length, start, end); + } + + // UTF-8 streaming stream, single + many chunks. + { + const uint8_t* data = + reinterpret_cast<const uint8_t*>(one_byte_vector.begin()); + const uint8_t* data_end = + reinterpret_cast<const uint8_t*>(one_byte_vector.end()); + ChunkSource chunks(data, data_end - data, false); + std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream( + i::ScannerStream::For(&chunks, + v8::ScriptCompiler::StreamedSource::UTF8)); + TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length, + start, end); + + ChunkSource many_chunks(data, data_end - data, true); + utf8_streaming_stream.reset(i::ScannerStream::For( + &many_chunks, v8::ScriptCompiler::StreamedSource::UTF8)); + TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length, + start, end); + } + + // 2-byte streaming stream, single + many chunks. + { + const uint8_t* data = + reinterpret_cast<const uint8_t*>(two_byte_vector.begin()); + const uint8_t* data_end = + reinterpret_cast<const uint8_t*>(two_byte_vector.end()); + ChunkSource chunks(data, data_end - data, false); + std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream( + i::ScannerStream::For(&chunks, + v8::ScriptCompiler::StreamedSource::TWO_BYTE)); + TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(), + length, start, end); + + ChunkSource many_chunks(data, data_end - data, true); + two_byte_streaming_stream.reset(i::ScannerStream::For( + &many_chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE)); + TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(), + length, start, end); + } +} + +TEST(CharacterStreams) { + v8::Isolate* isolate = CcTest::isolate(); + v8::HandleScope handles(isolate); + v8::Local<v8::Context> context = v8::Context::New(isolate); + v8::Context::Scope context_scope(context); + + TestCharacterStreams("abcdefghi", 9); + TestCharacterStreams("abc\0\n\r\x7f", 7); + TestCharacterStreams("\0", 1); + TestCharacterStreams("", 0); + + // 4k large buffer. + char buffer[4096 + 1]; + for (unsigned i = 0; i < arraysize(buffer); i++) { + buffer[i] = static_cast<char>(i & 0x7F); + } + buffer[arraysize(buffer) - 1] = '\0'; + TestCharacterStreams(buffer, arraysize(buffer) - 1); + TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298); +} + +// Regression test for crbug.com/651333. Read invalid utf-8. +TEST(Regress651333) { + const uint8_t bytes[] = + "A\xf1" + "ad"; // Anad, with n == n-with-tilde. + const uint16_t unicode[] = {65, 65533, 97, 100}; + + // Run the test for all sub-strings 0..N of bytes, to make sure we hit the + // error condition in and at chunk boundaries. + for (size_t len = 0; len < arraysize(bytes); len++) { + // Read len bytes from bytes, and compare against the expected unicode + // characters. Expect kBadChar ( == Unicode replacement char == code point + // 65533) instead of the incorrectly coded Latin1 char. + ChunkSource chunks(bytes, len, false); + std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For( + &chunks, v8::ScriptCompiler::StreamedSource::UTF8)); + for (size_t i = 0; i < len; i++) { + CHECK_EQ(unicode[i], stream->Advance()); + } + CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance()); + } +} |