diff options
Diffstat (limited to 'deps/v8/src/strings/uri.cc')
-rw-r--r-- | deps/v8/src/strings/uri.cc | 510 |
1 files changed, 510 insertions, 0 deletions
diff --git a/deps/v8/src/strings/uri.cc b/deps/v8/src/strings/uri.cc new file mode 100644 index 0000000000..430c8dd0eb --- /dev/null +++ b/deps/v8/src/strings/uri.cc @@ -0,0 +1,510 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "src/strings/uri.h" + +#include <vector> + +#include "src/execution/isolate-inl.h" +#include "src/strings/char-predicates-inl.h" +#include "src/strings/string-search.h" +#include "src/strings/unicode-inl.h" + +namespace v8 { +namespace internal { + +namespace { // anonymous namespace for DecodeURI helper functions +bool IsReservedPredicate(uc16 c) { + switch (c) { + case '#': + case '$': + case '&': + case '+': + case ',': + case '/': + case ':': + case ';': + case '=': + case '?': + case '@': + return true; + default: + return false; + } +} + +bool IsReplacementCharacter(const uint8_t* octets, int length) { + // The replacement character is at codepoint U+FFFD in the Unicode Specials + // table. Its UTF-8 encoding is 0xEF 0xBF 0xBD. + if (length != 3 || octets[0] != 0xEF || octets[1] != 0xBF || + octets[2] != 0xBD) { + return false; + } + return true; +} + +bool DecodeOctets(const uint8_t* octets, int length, + std::vector<uc16>* buffer) { + size_t cursor = 0; + uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor); + if (value == unibrow::Utf8::kBadChar && + !IsReplacementCharacter(octets, length)) { + return false; + } + + if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { + buffer->push_back(value); + } else { + buffer->push_back(unibrow::Utf16::LeadSurrogate(value)); + buffer->push_back(unibrow::Utf16::TrailSurrogate(value)); + } + return true; +} + +int TwoDigitHex(uc16 character1, uc16 character2) { + if (character1 > 'f') return -1; + int high = HexValue(character1); + if (high == -1) return -1; + if (character2 > 'f') return -1; + int low = HexValue(character2); + if (low == -1) return -1; + return (high << 4) + low; +} + +template <typename T> +void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index, + bool is_uri, std::vector<T>* buffer) { + if (is_uri && IsReservedPredicate(decoded)) { + buffer->push_back('%'); + uc16 first = uri_content->Get(index + 1); + uc16 second = uri_content->Get(index + 2); + DCHECK_GT(std::numeric_limits<T>::max(), first); + DCHECK_GT(std::numeric_limits<T>::max(), second); + + buffer->push_back(first); + buffer->push_back(second); + } else { + buffer->push_back(decoded); + } +} + +bool IntoTwoByte(int index, bool is_uri, int uri_length, + String::FlatContent* uri_content, std::vector<uc16>* buffer) { + for (int k = index; k < uri_length; k++) { + uc16 code = uri_content->Get(k); + if (code == '%') { + int two_digits; + if (k + 2 >= uri_length || + (two_digits = TwoDigitHex(uri_content->Get(k + 1), + uri_content->Get(k + 2))) < 0) { + return false; + } + k += 2; + uc16 decoded = static_cast<uc16>(two_digits); + if (decoded > unibrow::Utf8::kMaxOneByteChar) { + uint8_t octets[unibrow::Utf8::kMaxEncodedSize]; + octets[0] = decoded; + + int number_of_continuation_bytes = 0; + while ((decoded << ++number_of_continuation_bytes) & 0x80) { + if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) { + return false; + } + if (uri_content->Get(++k) != '%' || + (two_digits = TwoDigitHex(uri_content->Get(k + 1), + uri_content->Get(k + 2))) < 0) { + return false; + } + k += 2; + uc16 continuation_byte = static_cast<uc16>(two_digits); + octets[number_of_continuation_bytes] = continuation_byte; + } + + if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) { + return false; + } + } else { + AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer); + } + } else { + buffer->push_back(code); + } + } + return true; +} + +bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri, + std::vector<uint8_t>* one_byte_buffer, + std::vector<uc16>* two_byte_buffer) { + DisallowHeapAllocation no_gc; + String::FlatContent uri_content = uri->GetFlatContent(no_gc); + + int uri_length = uri->length(); + for (int k = 0; k < uri_length; k++) { + uc16 code = uri_content.Get(k); + if (code == '%') { + int two_digits; + if (k + 2 >= uri_length || + (two_digits = TwoDigitHex(uri_content.Get(k + 1), + uri_content.Get(k + 2))) < 0) { + return false; + } + + uc16 decoded = static_cast<uc16>(two_digits); + if (decoded > unibrow::Utf8::kMaxOneByteChar) { + return IntoTwoByte(k, is_uri, uri_length, &uri_content, + two_byte_buffer); + } + + AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer); + k += 2; + } else { + if (code > unibrow::Utf8::kMaxOneByteChar) { + return IntoTwoByte(k, is_uri, uri_length, &uri_content, + two_byte_buffer); + } + one_byte_buffer->push_back(code); + } + } + return true; +} + +} // anonymous namespace + +MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri, + bool is_uri) { + uri = String::Flatten(isolate, uri); + std::vector<uint8_t> one_byte_buffer; + std::vector<uc16> two_byte_buffer; + + if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) { + THROW_NEW_ERROR(isolate, NewURIError(), String); + } + + if (two_byte_buffer.empty()) { + return isolate->factory()->NewStringFromOneByte(Vector<const uint8_t>( + one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size()))); + } + + Handle<SeqTwoByteString> result; + int result_length = + static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size()); + ASSIGN_RETURN_ON_EXCEPTION( + isolate, result, isolate->factory()->NewRawTwoByteString(result_length), + String); + + DisallowHeapAllocation no_gc; + CopyChars(result->GetChars(no_gc), one_byte_buffer.data(), + one_byte_buffer.size()); + CopyChars(result->GetChars(no_gc) + one_byte_buffer.size(), + two_byte_buffer.data(), two_byte_buffer.size()); + + return result; +} + +namespace { // anonymous namespace for EncodeURI helper functions +bool IsUnescapePredicateInUriComponent(uc16 c) { + if (IsAlphaNumeric(c)) { + return true; + } + + switch (c) { + case '!': + case '\'': + case '(': + case ')': + case '*': + case '-': + case '.': + case '_': + case '~': + return true; + default: + return false; + } +} + +bool IsUriSeparator(uc16 c) { + switch (c) { + case '#': + case ':': + case ';': + case '/': + case '?': + case '$': + case '&': + case '+': + case ',': + case '@': + case '=': + return true; + default: + return false; + } +} + +void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) { + buffer->push_back('%'); + buffer->push_back(HexCharOfValue(octet >> 4)); + buffer->push_back(HexCharOfValue(octet & 0x0F)); +} + +void EncodeSingle(uc16 c, std::vector<uint8_t>* buffer) { + char s[4] = {}; + int number_of_bytes; + number_of_bytes = + unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false); + for (int k = 0; k < number_of_bytes; k++) { + AddEncodedOctetToBuffer(s[k], buffer); + } +} + +void EncodePair(uc16 cc1, uc16 cc2, std::vector<uint8_t>* buffer) { + char s[4] = {}; + int number_of_bytes = + unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2), + unibrow::Utf16::kNoPreviousCharacter, false); + for (int k = 0; k < number_of_bytes; k++) { + AddEncodedOctetToBuffer(s[k], buffer); + } +} + +} // anonymous namespace + +MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri, + bool is_uri) { + uri = String::Flatten(isolate, uri); + int uri_length = uri->length(); + std::vector<uint8_t> buffer; + buffer.reserve(uri_length); + + { + DisallowHeapAllocation no_gc; + String::FlatContent uri_content = uri->GetFlatContent(no_gc); + + for (int k = 0; k < uri_length; k++) { + uc16 cc1 = uri_content.Get(k); + if (unibrow::Utf16::IsLeadSurrogate(cc1)) { + k++; + if (k < uri_length) { + uc16 cc2 = uri->Get(k); + if (unibrow::Utf16::IsTrailSurrogate(cc2)) { + EncodePair(cc1, cc2, &buffer); + continue; + } + } + } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) { + if (IsUnescapePredicateInUriComponent(cc1) || + (is_uri && IsUriSeparator(cc1))) { + buffer.push_back(cc1); + } else { + EncodeSingle(cc1, &buffer); + } + continue; + } + + AllowHeapAllocation allocate_error_and_return; + THROW_NEW_ERROR(isolate, NewURIError(), String); + } + } + + return isolate->factory()->NewStringFromOneByte(VectorOf(buffer)); +} + +namespace { // Anonymous namespace for Escape and Unescape + +template <typename Char> +int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) { + uint16_t character = vector[i]; + int32_t hi = 0; + int32_t lo = 0; + if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' && + (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 && + (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) { + *step = 6; + return (hi << 8) + lo; + } else if (character == '%' && i <= length - 3 && + (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) { + *step = 3; + return lo; + } else { + *step = 1; + return character; + } +} + +template <typename Char> +MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string, + int start_index) { + bool one_byte = true; + int length = string->length(); + + int unescaped_length = 0; + { + DisallowHeapAllocation no_allocation; + Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); + for (int i = start_index; i < length; unescaped_length++) { + int step; + if (UnescapeChar(vector, i, length, &step) > + String::kMaxOneByteCharCode) { + one_byte = false; + } + i += step; + } + } + + DCHECK(start_index < length); + Handle<String> first_part = + isolate->factory()->NewProperSubString(string, 0, start_index); + + int dest_position = 0; + Handle<String> second_part; + DCHECK_LE(unescaped_length, String::kMaxLength); + if (one_byte) { + Handle<SeqOneByteString> dest = isolate->factory() + ->NewRawOneByteString(unescaped_length) + .ToHandleChecked(); + DisallowHeapAllocation no_allocation; + Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); + for (int i = start_index; i < length; dest_position++) { + int step; + dest->SeqOneByteStringSet(dest_position, + UnescapeChar(vector, i, length, &step)); + i += step; + } + second_part = dest; + } else { + Handle<SeqTwoByteString> dest = isolate->factory() + ->NewRawTwoByteString(unescaped_length) + .ToHandleChecked(); + DisallowHeapAllocation no_allocation; + Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); + for (int i = start_index; i < length; dest_position++) { + int step; + dest->SeqTwoByteStringSet(dest_position, + UnescapeChar(vector, i, length, &step)); + i += step; + } + second_part = dest; + } + return isolate->factory()->NewConsString(first_part, second_part); +} + +bool IsNotEscaped(uint16_t c) { + if (IsAlphaNumeric(c)) { + return true; + } + // @*_+-./ + switch (c) { + case '@': + case '*': + case '_': + case '+': + case '-': + case '.': + case '/': + return true; + default: + return false; + } +} + +template <typename Char> +static MaybeHandle<String> UnescapePrivate(Isolate* isolate, + Handle<String> source) { + int index; + { + DisallowHeapAllocation no_allocation; + StringSearch<uint8_t, Char> search(isolate, StaticCharVector("%")); + index = search.Search(source->GetCharVector<Char>(no_allocation), 0); + if (index < 0) return source; + } + return UnescapeSlow<Char>(isolate, source, index); +} + +template <typename Char> +static MaybeHandle<String> EscapePrivate(Isolate* isolate, + Handle<String> string) { + DCHECK(string->IsFlat()); + int escaped_length = 0; + int length = string->length(); + + { + DisallowHeapAllocation no_allocation; + Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); + for (int i = 0; i < length; i++) { + uint16_t c = vector[i]; + if (c >= 256) { + escaped_length += 6; + } else if (IsNotEscaped(c)) { + escaped_length++; + } else { + escaped_length += 3; + } + + // We don't allow strings that are longer than a maximal length. + DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6); // Cannot overflow. + if (escaped_length > String::kMaxLength) break; // Provoke exception. + } + } + + // No length change implies no change. Return original string if no change. + if (escaped_length == length) return string; + + Handle<SeqOneByteString> dest; + ASSIGN_RETURN_ON_EXCEPTION( + isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length), + String); + int dest_position = 0; + + { + DisallowHeapAllocation no_allocation; + Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); + for (int i = 0; i < length; i++) { + uint16_t c = vector[i]; + if (c >= 256) { + dest->SeqOneByteStringSet(dest_position, '%'); + dest->SeqOneByteStringSet(dest_position + 1, 'u'); + dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12)); + dest->SeqOneByteStringSet(dest_position + 3, + HexCharOfValue((c >> 8) & 0xF)); + dest->SeqOneByteStringSet(dest_position + 4, + HexCharOfValue((c >> 4) & 0xF)); + dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xF)); + dest_position += 6; + } else if (IsNotEscaped(c)) { + dest->SeqOneByteStringSet(dest_position, c); + dest_position++; + } else { + dest->SeqOneByteStringSet(dest_position, '%'); + dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4)); + dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xF)); + dest_position += 3; + } + } + } + + return dest; +} + +} // Anonymous namespace + +MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) { + Handle<String> result; + string = String::Flatten(isolate, string); + return String::IsOneByteRepresentationUnderneath(*string) + ? EscapePrivate<uint8_t>(isolate, string) + : EscapePrivate<uc16>(isolate, string); +} + +MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) { + Handle<String> result; + string = String::Flatten(isolate, string); + return String::IsOneByteRepresentationUnderneath(*string) + ? UnescapePrivate<uint8_t>(isolate, string) + : UnescapePrivate<uc16>(isolate, string); +} + +} // namespace internal +} // namespace v8 |