// Copyright 2019 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "encoding.h" #include #include #include #include #include #include namespace v8_inspector_protocol_encoding { // ============================================================================= // Status and Error codes // ============================================================================= std::string Status::ToASCIIString() const { switch (error) { case Error::OK: return "OK"; case Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS: return ToASCIIString("JSON: unprocessed input remains"); case Error::JSON_PARSER_STACK_LIMIT_EXCEEDED: return ToASCIIString("JSON: stack limit exceeded"); case Error::JSON_PARSER_NO_INPUT: return ToASCIIString("JSON: no input"); case Error::JSON_PARSER_INVALID_TOKEN: return ToASCIIString("JSON: invalid token"); case Error::JSON_PARSER_INVALID_NUMBER: return ToASCIIString("JSON: invalid number"); case Error::JSON_PARSER_INVALID_STRING: return ToASCIIString("JSON: invalid string"); case Error::JSON_PARSER_UNEXPECTED_ARRAY_END: return ToASCIIString("JSON: unexpected array end"); case Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED: return ToASCIIString("JSON: comma or array end expected"); case Error::JSON_PARSER_STRING_LITERAL_EXPECTED: return ToASCIIString("JSON: string literal expected"); case Error::JSON_PARSER_COLON_EXPECTED: return ToASCIIString("JSON: colon expected"); case Error::JSON_PARSER_UNEXPECTED_MAP_END: return ToASCIIString("JSON: unexpected map end"); case Error::JSON_PARSER_COMMA_OR_MAP_END_EXPECTED: return ToASCIIString("JSON: comma or map end expected"); case Error::JSON_PARSER_VALUE_EXPECTED: return ToASCIIString("JSON: value expected"); case Error::CBOR_INVALID_INT32: return ToASCIIString("CBOR: invalid int32"); case Error::CBOR_INVALID_DOUBLE: return ToASCIIString("CBOR: invalid double"); case Error::CBOR_INVALID_ENVELOPE: return ToASCIIString("CBOR: invalid envelope"); case Error::CBOR_INVALID_STRING8: return ToASCIIString("CBOR: invalid string8"); case Error::CBOR_INVALID_STRING16: return ToASCIIString("CBOR: invalid string16"); case Error::CBOR_INVALID_BINARY: return ToASCIIString("CBOR: invalid binary"); case Error::CBOR_UNSUPPORTED_VALUE: return ToASCIIString("CBOR: unsupported value"); case Error::CBOR_NO_INPUT: return ToASCIIString("CBOR: no input"); case Error::CBOR_INVALID_START_BYTE: return ToASCIIString("CBOR: invalid start byte"); case Error::CBOR_UNEXPECTED_EOF_EXPECTED_VALUE: return ToASCIIString("CBOR: unexpected eof expected value"); case Error::CBOR_UNEXPECTED_EOF_IN_ARRAY: return ToASCIIString("CBOR: unexpected eof in array"); case Error::CBOR_UNEXPECTED_EOF_IN_MAP: return ToASCIIString("CBOR: unexpected eof in map"); case Error::CBOR_INVALID_MAP_KEY: return ToASCIIString("CBOR: invalid map key"); case Error::CBOR_STACK_LIMIT_EXCEEDED: return ToASCIIString("CBOR: stack limit exceeded"); case Error::CBOR_TRAILING_JUNK: return ToASCIIString("CBOR: trailing junk"); case Error::CBOR_MAP_START_EXPECTED: return ToASCIIString("CBOR: map start expected"); case Error::CBOR_MAP_STOP_EXPECTED: return ToASCIIString("CBOR: map stop expected"); case Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED: return ToASCIIString("CBOR: envelope size limit exceeded"); } // Some compilers can't figure out that we can't get here. return "INVALID ERROR CODE"; } std::string Status::ToASCIIString(const char* msg) const { return std::string(msg) + " at position " + std::to_string(pos); } namespace cbor { namespace { // Indicates the number of bits the "initial byte" needs to be shifted to the // right after applying |kMajorTypeMask| to produce the major type in the // lowermost bits. static constexpr uint8_t kMajorTypeBitShift = 5u; // Mask selecting the low-order 5 bits of the "initial byte", which is where // the additional information is encoded. static constexpr uint8_t kAdditionalInformationMask = 0x1f; // Mask selecting the high-order 3 bits of the "initial byte", which indicates // the major type of the encoded value. static constexpr uint8_t kMajorTypeMask = 0xe0; // Indicates the integer is in the following byte. static constexpr uint8_t kAdditionalInformation1Byte = 24u; // Indicates the integer is in the next 2 bytes. static constexpr uint8_t kAdditionalInformation2Bytes = 25u; // Indicates the integer is in the next 4 bytes. static constexpr uint8_t kAdditionalInformation4Bytes = 26u; // Indicates the integer is in the next 8 bytes. static constexpr uint8_t kAdditionalInformation8Bytes = 27u; // Encodes the initial byte, consisting of the |type| in the first 3 bits // followed by 5 bits of |additional_info|. constexpr uint8_t EncodeInitialByte(MajorType type, uint8_t additional_info) { return (static_cast(type) << kMajorTypeBitShift) | (additional_info & kAdditionalInformationMask); } // TAG 24 indicates that what follows is a byte string which is // encoded in CBOR format. We use this as a wrapper for // maps and arrays, allowing us to skip them, because the // byte string carries its size (byte length). // https://tools.ietf.org/html/rfc7049#section-2.4.4.1 static constexpr uint8_t kInitialByteForEnvelope = EncodeInitialByte(MajorType::TAG, 24); // The initial byte for a byte string with at most 2^32 bytes // of payload. This is used for envelope encoding, even if // the byte string is shorter. static constexpr uint8_t kInitialByteFor32BitLengthByteString = EncodeInitialByte(MajorType::BYTE_STRING, 26); // See RFC 7049 Section 2.2.1, indefinite length arrays / maps have additional // info = 31. static constexpr uint8_t kInitialByteIndefiniteLengthArray = EncodeInitialByte(MajorType::ARRAY, 31); static constexpr uint8_t kInitialByteIndefiniteLengthMap = EncodeInitialByte(MajorType::MAP, 31); // See RFC 7049 Section 2.3, Table 1; this is used for finishing indefinite // length maps / arrays. static constexpr uint8_t kStopByte = EncodeInitialByte(MajorType::SIMPLE_VALUE, 31); // See RFC 7049 Section 2.3, Table 2. static constexpr uint8_t kEncodedTrue = EncodeInitialByte(MajorType::SIMPLE_VALUE, 21); static constexpr uint8_t kEncodedFalse = EncodeInitialByte(MajorType::SIMPLE_VALUE, 20); static constexpr uint8_t kEncodedNull = EncodeInitialByte(MajorType::SIMPLE_VALUE, 22); static constexpr uint8_t kInitialByteForDouble = EncodeInitialByte(MajorType::SIMPLE_VALUE, 27); // See RFC 7049 Table 3 and Section 2.4.4.2. This is used as a prefix for // arbitrary binary data encoded as BYTE_STRING. static constexpr uint8_t kExpectedConversionToBase64Tag = EncodeInitialByte(MajorType::TAG, 22); // Writes the bytes for |v| to |out|, starting with the most significant byte. // See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html template void WriteBytesMostSignificantByteFirst(T v, C* out) { for (int shift_bytes = sizeof(T) - 1; shift_bytes >= 0; --shift_bytes) out->push_back(0xff & (v >> (shift_bytes * 8))); } // Extracts sizeof(T) bytes from |in| to extract a value of type T // (e.g. uint64_t, uint32_t, ...), most significant byte first. // See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html template T ReadBytesMostSignificantByteFirst(span in) { assert(in.size() >= sizeof(T)); T result = 0; for (size_t shift_bytes = 0; shift_bytes < sizeof(T); ++shift_bytes) result |= T(in[sizeof(T) - 1 - shift_bytes]) << (shift_bytes * 8); return result; } } // namespace namespace internals { // Reads the start of a token with definitive size from |bytes|. // |type| is the major type as specified in RFC 7049 Section 2.1. // |value| is the payload (e.g. for MajorType::UNSIGNED) or is the size // (e.g. for BYTE_STRING). // If successful, returns the number of bytes read. Otherwise returns -1. // TODO(johannes): change return type to size_t and use 0 for error. int8_t ReadTokenStart(span bytes, MajorType* type, uint64_t* value) { if (bytes.empty()) return -1; uint8_t initial_byte = bytes[0]; *type = MajorType((initial_byte & kMajorTypeMask) >> kMajorTypeBitShift); uint8_t additional_information = initial_byte & kAdditionalInformationMask; if (additional_information < 24) { // Values 0-23 are encoded directly into the additional info of the // initial byte. *value = additional_information; return 1; } if (additional_information == kAdditionalInformation1Byte) { // Values 24-255 are encoded with one initial byte, followed by the value. if (bytes.size() < 2) return -1; *value = ReadBytesMostSignificantByteFirst(bytes.subspan(1)); return 2; } if (additional_information == kAdditionalInformation2Bytes) { // Values 256-65535: 1 initial byte + 2 bytes payload. if (bytes.size() < 1 + sizeof(uint16_t)) return -1; *value = ReadBytesMostSignificantByteFirst(bytes.subspan(1)); return 3; } if (additional_information == kAdditionalInformation4Bytes) { // 32 bit uint: 1 initial byte + 4 bytes payload. if (bytes.size() < 1 + sizeof(uint32_t)) return -1; *value = ReadBytesMostSignificantByteFirst(bytes.subspan(1)); return 5; } if (additional_information == kAdditionalInformation8Bytes) { // 64 bit uint: 1 initial byte + 8 bytes payload. if (bytes.size() < 1 + sizeof(uint64_t)) return -1; *value = ReadBytesMostSignificantByteFirst(bytes.subspan(1)); return 9; } return -1; } // Writes the start of a token with |type|. The |value| may indicate the size, // or it may be the payload if the value is an unsigned integer. template void WriteTokenStartTmpl(MajorType type, uint64_t value, C* encoded) { if (value < 24) { // Values 0-23 are encoded directly into the additional info of the // initial byte. encoded->push_back(EncodeInitialByte(type, /*additional_info=*/value)); return; } if (value <= std::numeric_limits::max()) { // Values 24-255 are encoded with one initial byte, followed by the value. encoded->push_back(EncodeInitialByte(type, kAdditionalInformation1Byte)); encoded->push_back(value); return; } if (value <= std::numeric_limits::max()) { // Values 256-65535: 1 initial byte + 2 bytes payload. encoded->push_back(EncodeInitialByte(type, kAdditionalInformation2Bytes)); WriteBytesMostSignificantByteFirst(value, encoded); return; } if (value <= std::numeric_limits::max()) { // 32 bit uint: 1 initial byte + 4 bytes payload. encoded->push_back(EncodeInitialByte(type, kAdditionalInformation4Bytes)); WriteBytesMostSignificantByteFirst(static_cast(value), encoded); return; } // 64 bit uint: 1 initial byte + 8 bytes payload. encoded->push_back(EncodeInitialByte(type, kAdditionalInformation8Bytes)); WriteBytesMostSignificantByteFirst(value, encoded); } void WriteTokenStart(MajorType type, uint64_t value, std::vector* encoded) { WriteTokenStartTmpl(type, value, encoded); } void WriteTokenStart(MajorType type, uint64_t value, std::string* encoded) { WriteTokenStartTmpl(type, value, encoded); } } // namespace internals // ============================================================================= // Detecting CBOR content // ============================================================================= uint8_t InitialByteForEnvelope() { return kInitialByteForEnvelope; } uint8_t InitialByteFor32BitLengthByteString() { return kInitialByteFor32BitLengthByteString; } bool IsCBORMessage(span msg) { return msg.size() >= 6 && msg[0] == InitialByteForEnvelope() && msg[1] == InitialByteFor32BitLengthByteString(); } // ============================================================================= // Encoding invidiual CBOR items // ============================================================================= uint8_t EncodeTrue() { return kEncodedTrue; } uint8_t EncodeFalse() { return kEncodedFalse; } uint8_t EncodeNull() { return kEncodedNull; } uint8_t EncodeIndefiniteLengthArrayStart() { return kInitialByteIndefiniteLengthArray; } uint8_t EncodeIndefiniteLengthMapStart() { return kInitialByteIndefiniteLengthMap; } uint8_t EncodeStop() { return kStopByte; } template void EncodeInt32Tmpl(int32_t value, C* out) { if (value >= 0) { internals::WriteTokenStart(MajorType::UNSIGNED, value, out); } else { uint64_t representation = static_cast(-(value + 1)); internals::WriteTokenStart(MajorType::NEGATIVE, representation, out); } } void EncodeInt32(int32_t value, std::vector* out) { EncodeInt32Tmpl(value, out); } void EncodeInt32(int32_t value, std::string* out) { EncodeInt32Tmpl(value, out); } template void EncodeString16Tmpl(span in, C* out) { uint64_t byte_length = static_cast(in.size_bytes()); internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out); // When emitting UTF16 characters, we always write the least significant byte // first; this is because it's the native representation for X86. // TODO(johannes): Implement a more efficient thing here later, e.g. // casting *iff* the machine has this byte order. // The wire format for UTF16 chars will probably remain the same // (least significant byte first) since this way we can have // golden files, unittests, etc. that port easily and universally. // See also: // https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html for (const uint16_t two_bytes : in) { out->push_back(two_bytes); out->push_back(two_bytes >> 8); } } void EncodeString16(span in, std::vector* out) { EncodeString16Tmpl(in, out); } void EncodeString16(span in, std::string* out) { EncodeString16Tmpl(in, out); } template void EncodeString8Tmpl(span in, C* out) { internals::WriteTokenStart(MajorType::STRING, static_cast(in.size_bytes()), out); out->insert(out->end(), in.begin(), in.end()); } void EncodeString8(span in, std::vector* out) { EncodeString8Tmpl(in, out); } void EncodeString8(span in, std::string* out) { EncodeString8Tmpl(in, out); } template void EncodeFromLatin1Tmpl(span latin1, C* out) { for (size_t ii = 0; ii < latin1.size(); ++ii) { if (latin1[ii] <= 127) continue; // If there's at least one non-ASCII char, convert to UTF8. std::vector utf8(latin1.begin(), latin1.begin() + ii); for (; ii < latin1.size(); ++ii) { if (latin1[ii] <= 127) { utf8.push_back(latin1[ii]); } else { // 0xC0 means it's a UTF8 sequence with 2 bytes. utf8.push_back((latin1[ii] >> 6) | 0xc0); utf8.push_back((latin1[ii] | 0x80) & 0xbf); } } EncodeString8(SpanFrom(utf8), out); return; } EncodeString8(latin1, out); } void EncodeFromLatin1(span latin1, std::vector* out) { EncodeFromLatin1Tmpl(latin1, out); } void EncodeFromLatin1(span latin1, std::string* out) { EncodeFromLatin1Tmpl(latin1, out); } template void EncodeFromUTF16Tmpl(span utf16, C* out) { // If there's at least one non-ASCII char, encode as STRING16 (UTF16). for (uint16_t ch : utf16) { if (ch <= 127) continue; EncodeString16(utf16, out); return; } // It's all US-ASCII, strip out every second byte and encode as UTF8. internals::WriteTokenStart(MajorType::STRING, static_cast(utf16.size()), out); out->insert(out->end(), utf16.begin(), utf16.end()); } void EncodeFromUTF16(span utf16, std::vector* out) { EncodeFromUTF16Tmpl(utf16, out); } void EncodeFromUTF16(span utf16, std::string* out) { EncodeFromUTF16Tmpl(utf16, out); } template void EncodeBinaryTmpl(span in, C* out) { out->push_back(kExpectedConversionToBase64Tag); uint64_t byte_length = static_cast(in.size_bytes()); internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out); out->insert(out->end(), in.begin(), in.end()); } void EncodeBinary(span in, std::vector* out) { EncodeBinaryTmpl(in, out); } void EncodeBinary(span in, std::string* out) { EncodeBinaryTmpl(in, out); } // A double is encoded with a specific initial byte // (kInitialByteForDouble) plus the 64 bits of payload for its value. constexpr size_t kEncodedDoubleSize = 1 + sizeof(uint64_t); // An envelope is encoded with a specific initial byte // (kInitialByteForEnvelope), plus the start byte for a BYTE_STRING with a 32 // bit wide length, plus a 32 bit length for that string. constexpr size_t kEncodedEnvelopeHeaderSize = 1 + 1 + sizeof(uint32_t); template void EncodeDoubleTmpl(double value, C* out) { // The additional_info=27 indicates 64 bits for the double follow. // See RFC 7049 Section 2.3, Table 1. out->push_back(kInitialByteForDouble); union { double from_double; uint64_t to_uint64; } reinterpret; reinterpret.from_double = value; WriteBytesMostSignificantByteFirst(reinterpret.to_uint64, out); } void EncodeDouble(double value, std::vector* out) { EncodeDoubleTmpl(value, out); } void EncodeDouble(double value, std::string* out) { EncodeDoubleTmpl(value, out); } // ============================================================================= // cbor::EnvelopeEncoder - for wrapping submessages // ============================================================================= template void EncodeStartTmpl(C* out, size_t* byte_size_pos) { assert(*byte_size_pos == 0); out->push_back(kInitialByteForEnvelope); out->push_back(kInitialByteFor32BitLengthByteString); *byte_size_pos = out->size(); out->resize(out->size() + sizeof(uint32_t)); } void EnvelopeEncoder::EncodeStart(std::vector* out) { EncodeStartTmpl>(out, &byte_size_pos_); } void EnvelopeEncoder::EncodeStart(std::string* out) { EncodeStartTmpl(out, &byte_size_pos_); } template bool EncodeStopTmpl(C* out, size_t* byte_size_pos) { assert(*byte_size_pos != 0); // The byte size is the size of the payload, that is, all the // bytes that were written past the byte size position itself. uint64_t byte_size = out->size() - (*byte_size_pos + sizeof(uint32_t)); // We store exactly 4 bytes, so at most INT32MAX, with most significant // byte first. if (byte_size > std::numeric_limits::max()) return false; for (int shift_bytes = sizeof(uint32_t) - 1; shift_bytes >= 0; --shift_bytes) { (*out)[(*byte_size_pos)++] = 0xff & (byte_size >> (shift_bytes * 8)); } return true; } bool EnvelopeEncoder::EncodeStop(std::vector* out) { return EncodeStopTmpl(out, &byte_size_pos_); } bool EnvelopeEncoder::EncodeStop(std::string* out) { return EncodeStopTmpl(out, &byte_size_pos_); } // ============================================================================= // cbor::NewCBOREncoder - for encoding from a streaming parser // ============================================================================= namespace { template class CBOREncoder : public StreamingParserHandler { public: CBOREncoder(C* out, Status* status) : out_(out), status_(status) { *status_ = Status(); } void HandleMapBegin() override { if (!status_->ok()) return; envelopes_.emplace_back(); envelopes_.back().EncodeStart(out_); out_->push_back(kInitialByteIndefiniteLengthMap); } void HandleMapEnd() override { if (!status_->ok()) return; out_->push_back(kStopByte); assert(!envelopes_.empty()); if (!envelopes_.back().EncodeStop(out_)) { HandleError( Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size())); return; } envelopes_.pop_back(); } void HandleArrayBegin() override { if (!status_->ok()) return; envelopes_.emplace_back(); envelopes_.back().EncodeStart(out_); out_->push_back(kInitialByteIndefiniteLengthArray); } void HandleArrayEnd() override { if (!status_->ok()) return; out_->push_back(kStopByte); assert(!envelopes_.empty()); if (!envelopes_.back().EncodeStop(out_)) { HandleError( Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size())); return; } envelopes_.pop_back(); } void HandleString8(span chars) override { if (!status_->ok()) return; EncodeString8(chars, out_); } void HandleString16(span chars) override { if (!status_->ok()) return; EncodeFromUTF16(chars, out_); } void HandleBinary(span bytes) override { if (!status_->ok()) return; EncodeBinary(bytes, out_); } void HandleDouble(double value) override { if (!status_->ok()) return; EncodeDouble(value, out_); } void HandleInt32(int32_t value) override { if (!status_->ok()) return; EncodeInt32(value, out_); } void HandleBool(bool value) override { if (!status_->ok()) return; // See RFC 7049 Section 2.3, Table 2. out_->push_back(value ? kEncodedTrue : kEncodedFalse); } void HandleNull() override { if (!status_->ok()) return; // See RFC 7049 Section 2.3, Table 2. out_->push_back(kEncodedNull); } void HandleError(Status error) override { if (!status_->ok()) return; *status_ = error; out_->clear(); } private: C* out_; std::vector envelopes_; Status* status_; }; } // namespace std::unique_ptr NewCBOREncoder( std::vector* out, Status* status) { return std::unique_ptr( new CBOREncoder>(out, status)); } std::unique_ptr NewCBOREncoder(std::string* out, Status* status) { return std::unique_ptr( new CBOREncoder(out, status)); } // ============================================================================= // cbor::CBORTokenizer - for parsing individual CBOR items // ============================================================================= CBORTokenizer::CBORTokenizer(span bytes) : bytes_(bytes) { ReadNextToken(/*enter_envelope=*/false); } CBORTokenizer::~CBORTokenizer() {} CBORTokenTag CBORTokenizer::TokenTag() const { return token_tag_; } void CBORTokenizer::Next() { if (token_tag_ == CBORTokenTag::ERROR_VALUE || token_tag_ == CBORTokenTag::DONE) return; ReadNextToken(/*enter_envelope=*/false); } void CBORTokenizer::EnterEnvelope() { assert(token_tag_ == CBORTokenTag::ENVELOPE); ReadNextToken(/*enter_envelope=*/true); } Status CBORTokenizer::Status() const { return status_; } // The following accessor functions ::GetInt32, ::GetDouble, // ::GetString8, ::GetString16WireRep, ::GetBinary, ::GetEnvelopeContents // assume that a particular token was recognized in ::ReadNextToken. // That's where all the error checking is done. By design, // the accessors (assuming the token was recognized) never produce // an error. int32_t CBORTokenizer::GetInt32() const { assert(token_tag_ == CBORTokenTag::INT32); // The range checks happen in ::ReadNextToken(). return static_cast( token_start_type_ == MajorType::UNSIGNED ? token_start_internal_value_ : -static_cast(token_start_internal_value_) - 1); } double CBORTokenizer::GetDouble() const { assert(token_tag_ == CBORTokenTag::DOUBLE); union { uint64_t from_uint64; double to_double; } reinterpret; reinterpret.from_uint64 = ReadBytesMostSignificantByteFirst( bytes_.subspan(status_.pos + 1)); return reinterpret.to_double; } span CBORTokenizer::GetString8() const { assert(token_tag_ == CBORTokenTag::STRING8); auto length = static_cast(token_start_internal_value_); return bytes_.subspan(status_.pos + (token_byte_length_ - length), length); } span CBORTokenizer::GetString16WireRep() const { assert(token_tag_ == CBORTokenTag::STRING16); auto length = static_cast(token_start_internal_value_); return bytes_.subspan(status_.pos + (token_byte_length_ - length), length); } span CBORTokenizer::GetBinary() const { assert(token_tag_ == CBORTokenTag::BINARY); auto length = static_cast(token_start_internal_value_); return bytes_.subspan(status_.pos + (token_byte_length_ - length), length); } span CBORTokenizer::GetEnvelopeContents() const { assert(token_tag_ == CBORTokenTag::ENVELOPE); auto length = static_cast(token_start_internal_value_); return bytes_.subspan(status_.pos + kEncodedEnvelopeHeaderSize, length); } // All error checking happens in ::ReadNextToken, so that the accessors // can avoid having to carry an error return value. // // With respect to checking the encoded lengths of strings, arrays, etc: // On the wire, CBOR uses 1,2,4, and 8 byte unsigned integers, so // we initially read them as uint64_t, usually into token_start_internal_value_. // // However, since these containers have a representation on the machine, // we need to do corresponding size computations on the input byte array, // output span (e.g. the payload for a string), etc., and size_t is // machine specific (in practice either 32 bit or 64 bit). // // Further, we must avoid overflowing size_t. Therefore, we use this // kMaxValidLength constant to: // - Reject values that are larger than the architecture specific // max size_t (differs between 32 bit and 64 bit arch). // - Reserve at least one bit so that we can check against overflows // when adding lengths (array / string length / etc.); we do this by // ensuring that the inputs to an addition are <= kMaxValidLength, // and then checking whether the sum went past it. // // See also // https://chromium.googlesource.com/chromium/src/+/master/docs/security/integer-semantics.md static const uint64_t kMaxValidLength = std::min(std::numeric_limits::max() >> 2, std::numeric_limits::max()); void CBORTokenizer::ReadNextToken(bool enter_envelope) { if (enter_envelope) { status_.pos += kEncodedEnvelopeHeaderSize; } else { status_.pos = status_.pos == Status::npos() ? 0 : status_.pos + token_byte_length_; } status_.error = Error::OK; if (status_.pos >= bytes_.size()) { token_tag_ = CBORTokenTag::DONE; return; } const size_t remaining_bytes = bytes_.size() - status_.pos; switch (bytes_[status_.pos]) { case kStopByte: SetToken(CBORTokenTag::STOP, 1); return; case kInitialByteIndefiniteLengthMap: SetToken(CBORTokenTag::MAP_START, 1); return; case kInitialByteIndefiniteLengthArray: SetToken(CBORTokenTag::ARRAY_START, 1); return; case kEncodedTrue: SetToken(CBORTokenTag::TRUE_VALUE, 1); return; case kEncodedFalse: SetToken(CBORTokenTag::FALSE_VALUE, 1); return; case kEncodedNull: SetToken(CBORTokenTag::NULL_VALUE, 1); return; case kExpectedConversionToBase64Tag: { // BINARY const int8_t bytes_read = internals::ReadTokenStart( bytes_.subspan(status_.pos + 1), &token_start_type_, &token_start_internal_value_); if (bytes_read < 0 || token_start_type_ != MajorType::BYTE_STRING || token_start_internal_value_ > kMaxValidLength) { SetError(Error::CBOR_INVALID_BINARY); return; } const uint64_t token_byte_length = token_start_internal_value_ + /* tag before token start: */ 1 + /* token start: */ bytes_read; if (token_byte_length > remaining_bytes) { SetError(Error::CBOR_INVALID_BINARY); return; } SetToken(CBORTokenTag::BINARY, static_cast(token_byte_length)); return; } case kInitialByteForDouble: { // DOUBLE if (kEncodedDoubleSize > remaining_bytes) { SetError(Error::CBOR_INVALID_DOUBLE); return; } SetToken(CBORTokenTag::DOUBLE, kEncodedDoubleSize); return; } case kInitialByteForEnvelope: { // ENVELOPE if (kEncodedEnvelopeHeaderSize > remaining_bytes) { SetError(Error::CBOR_INVALID_ENVELOPE); return; } // The envelope must be a byte string with 32 bit length. if (bytes_[status_.pos + 1] != kInitialByteFor32BitLengthByteString) { SetError(Error::CBOR_INVALID_ENVELOPE); return; } // Read the length of the byte string. token_start_internal_value_ = ReadBytesMostSignificantByteFirst( bytes_.subspan(status_.pos + 2)); if (token_start_internal_value_ > kMaxValidLength) { SetError(Error::CBOR_INVALID_ENVELOPE); return; } uint64_t token_byte_length = token_start_internal_value_ + kEncodedEnvelopeHeaderSize; if (token_byte_length > remaining_bytes) { SetError(Error::CBOR_INVALID_ENVELOPE); return; } SetToken(CBORTokenTag::ENVELOPE, static_cast(token_byte_length)); return; } default: { const int8_t token_start_length = internals::ReadTokenStart( bytes_.subspan(status_.pos), &token_start_type_, &token_start_internal_value_); const bool success = token_start_length >= 0; switch (token_start_type_) { case MajorType::UNSIGNED: // INT32. // INT32 is a signed int32 (int32 makes sense for the // inspector_protocol, it's not a CBOR limitation), so we check // against the signed max, so that the allowable values are // 0, 1, 2, ... 2^31 - 1. if (!success || std::numeric_limits::max() < token_start_internal_value_) { SetError(Error::CBOR_INVALID_INT32); return; } SetToken(CBORTokenTag::INT32, token_start_length); return; case MajorType::NEGATIVE: { // INT32. // INT32 is a signed int32 (int32 makes sense for the // inspector_protocol, it's not a CBOR limitation); in CBOR, // the negative values for INT32 are represented as NEGATIVE, // that is, -1 INT32 is represented as 1 << 5 | 0 (major type 1, // additional info value 0). So here, we compute the INT32 value // and then check it against the INT32 min. int64_t actual_value = -static_cast(token_start_internal_value_) - 1; if (!success || actual_value < std::numeric_limits::min()) { SetError(Error::CBOR_INVALID_INT32); return; } SetToken(CBORTokenTag::INT32, token_start_length); return; } case MajorType::STRING: { // STRING8. if (!success || token_start_internal_value_ > kMaxValidLength) { SetError(Error::CBOR_INVALID_STRING8); return; } uint64_t token_byte_length = token_start_internal_value_ + token_start_length; if (token_byte_length > remaining_bytes) { SetError(Error::CBOR_INVALID_STRING8); return; } SetToken(CBORTokenTag::STRING8, static_cast(token_byte_length)); return; } case MajorType::BYTE_STRING: { // STRING16. // Length must be divisible by 2 since UTF16 is 2 bytes per // character, hence the &1 check. if (!success || token_start_internal_value_ > kMaxValidLength || token_start_internal_value_ & 1) { SetError(Error::CBOR_INVALID_STRING16); return; } uint64_t token_byte_length = token_start_internal_value_ + token_start_length; if (token_byte_length > remaining_bytes) { SetError(Error::CBOR_INVALID_STRING16); return; } SetToken(CBORTokenTag::STRING16, static_cast(token_byte_length)); return; } case MajorType::ARRAY: case MajorType::MAP: case MajorType::TAG: case MajorType::SIMPLE_VALUE: SetError(Error::CBOR_UNSUPPORTED_VALUE); return; } } } } void CBORTokenizer::SetToken(CBORTokenTag token_tag, size_t token_byte_length) { token_tag_ = token_tag; token_byte_length_ = token_byte_length; } void CBORTokenizer::SetError(Error error) { token_tag_ = CBORTokenTag::ERROR_VALUE; status_.error = error; } // ============================================================================= // cbor::ParseCBOR - for receiving streaming parser events for CBOR messages // ============================================================================= namespace { // When parsing CBOR, we limit recursion depth for objects and arrays // to this constant. static constexpr int kStackLimit = 300; // Below are three parsing routines for CBOR, which cover enough // to roundtrip JSON messages. bool ParseMap(int32_t stack_depth, CBORTokenizer* tokenizer, StreamingParserHandler* out); bool ParseArray(int32_t stack_depth, CBORTokenizer* tokenizer, StreamingParserHandler* out); bool ParseValue(int32_t stack_depth, CBORTokenizer* tokenizer, StreamingParserHandler* out); void ParseUTF16String(CBORTokenizer* tokenizer, StreamingParserHandler* out) { std::vector value; span rep = tokenizer->GetString16WireRep(); for (size_t ii = 0; ii < rep.size(); ii += 2) value.push_back((rep[ii + 1] << 8) | rep[ii]); out->HandleString16(span(value.data(), value.size())); tokenizer->Next(); } bool ParseUTF8String(CBORTokenizer* tokenizer, StreamingParserHandler* out) { assert(tokenizer->TokenTag() == CBORTokenTag::STRING8); out->HandleString8(tokenizer->GetString8()); tokenizer->Next(); return true; } bool ParseValue(int32_t stack_depth, CBORTokenizer* tokenizer, StreamingParserHandler* out) { if (stack_depth > kStackLimit) { out->HandleError( Status{Error::CBOR_STACK_LIMIT_EXCEEDED, tokenizer->Status().pos}); return false; } // Skip past the envelope to get to what's inside. if (tokenizer->TokenTag() == CBORTokenTag::ENVELOPE) tokenizer->EnterEnvelope(); switch (tokenizer->TokenTag()) { case CBORTokenTag::ERROR_VALUE: out->HandleError(tokenizer->Status()); return false; case CBORTokenTag::DONE: out->HandleError(Status{Error::CBOR_UNEXPECTED_EOF_EXPECTED_VALUE, tokenizer->Status().pos}); return false; case CBORTokenTag::TRUE_VALUE: out->HandleBool(true); tokenizer->Next(); return true; case CBORTokenTag::FALSE_VALUE: out->HandleBool(false); tokenizer->Next(); return true; case CBORTokenTag::NULL_VALUE: out->HandleNull(); tokenizer->Next(); return true; case CBORTokenTag::INT32: out->HandleInt32(tokenizer->GetInt32()); tokenizer->Next(); return true; case CBORTokenTag::DOUBLE: out->HandleDouble(tokenizer->GetDouble()); tokenizer->Next(); return true; case CBORTokenTag::STRING8: return ParseUTF8String(tokenizer, out); case CBORTokenTag::STRING16: ParseUTF16String(tokenizer, out); return true; case CBORTokenTag::BINARY: { out->HandleBinary(tokenizer->GetBinary()); tokenizer->Next(); return true; } case CBORTokenTag::MAP_START: return ParseMap(stack_depth + 1, tokenizer, out); case CBORTokenTag::ARRAY_START: return ParseArray(stack_depth + 1, tokenizer, out); default: out->HandleError( Status{Error::CBOR_UNSUPPORTED_VALUE, tokenizer->Status().pos}); return false; } } // |bytes| must start with the indefinite length array byte, so basically, // ParseArray may only be called after an indefinite length array has been // detected. bool ParseArray(int32_t stack_depth, CBORTokenizer* tokenizer, StreamingParserHandler* out) { assert(tokenizer->TokenTag() == CBORTokenTag::ARRAY_START); tokenizer->Next(); out->HandleArrayBegin(); while (tokenizer->TokenTag() != CBORTokenTag::STOP) { if (tokenizer->TokenTag() == CBORTokenTag::DONE) { out->HandleError( Status{Error::CBOR_UNEXPECTED_EOF_IN_ARRAY, tokenizer->Status().pos}); return false; } if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) { out->HandleError(tokenizer->Status()); return false; } // Parse value. if (!ParseValue(stack_depth, tokenizer, out)) return false; } out->HandleArrayEnd(); tokenizer->Next(); return true; } // |bytes| must start with the indefinite length array byte, so basically, // ParseArray may only be called after an indefinite length array has been // detected. bool ParseMap(int32_t stack_depth, CBORTokenizer* tokenizer, StreamingParserHandler* out) { assert(tokenizer->TokenTag() == CBORTokenTag::MAP_START); out->HandleMapBegin(); tokenizer->Next(); while (tokenizer->TokenTag() != CBORTokenTag::STOP) { if (tokenizer->TokenTag() == CBORTokenTag::DONE) { out->HandleError( Status{Error::CBOR_UNEXPECTED_EOF_IN_MAP, tokenizer->Status().pos}); return false; } if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) { out->HandleError(tokenizer->Status()); return false; } // Parse key. if (tokenizer->TokenTag() == CBORTokenTag::STRING8) { if (!ParseUTF8String(tokenizer, out)) return false; } else if (tokenizer->TokenTag() == CBORTokenTag::STRING16) { ParseUTF16String(tokenizer, out); } else { out->HandleError( Status{Error::CBOR_INVALID_MAP_KEY, tokenizer->Status().pos}); return false; } // Parse value. if (!ParseValue(stack_depth, tokenizer, out)) return false; } out->HandleMapEnd(); tokenizer->Next(); return true; } } // namespace void ParseCBOR(span bytes, StreamingParserHandler* out) { if (bytes.empty()) { out->HandleError(Status{Error::CBOR_NO_INPUT, 0}); return; } if (bytes[0] != kInitialByteForEnvelope) { out->HandleError(Status{Error::CBOR_INVALID_START_BYTE, 0}); return; } CBORTokenizer tokenizer(bytes); if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) { out->HandleError(tokenizer.Status()); return; } // We checked for the envelope start byte above, so the tokenizer // must agree here, since it's not an error. assert(tokenizer.TokenTag() == CBORTokenTag::ENVELOPE); tokenizer.EnterEnvelope(); if (tokenizer.TokenTag() != CBORTokenTag::MAP_START) { out->HandleError( Status{Error::CBOR_MAP_START_EXPECTED, tokenizer.Status().pos}); return; } if (!ParseMap(/*stack_depth=*/1, &tokenizer, out)) return; if (tokenizer.TokenTag() == CBORTokenTag::DONE) return; if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) { out->HandleError(tokenizer.Status()); return; } out->HandleError(Status{Error::CBOR_TRAILING_JUNK, tokenizer.Status().pos}); } // ============================================================================= // cbor::AppendString8EntryToMap - for limited in-place editing of messages // ============================================================================= template Status AppendString8EntryToCBORMapTmpl(span string8_key, span string8_value, C* cbor) { // Careful below: Don't compare (*cbor)[idx] with a uint8_t, since // it could be a char (signed!). Instead, use bytes. span bytes(reinterpret_cast(cbor->data()), cbor->size()); CBORTokenizer tokenizer(bytes); if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) return tokenizer.Status(); if (tokenizer.TokenTag() != CBORTokenTag::ENVELOPE) return Status(Error::CBOR_INVALID_ENVELOPE, 0); size_t envelope_size = tokenizer.GetEnvelopeContents().size(); size_t old_size = cbor->size(); if (old_size != envelope_size + kEncodedEnvelopeHeaderSize) return Status(Error::CBOR_INVALID_ENVELOPE, 0); if (envelope_size == 0 || (tokenizer.GetEnvelopeContents()[0] != EncodeIndefiniteLengthMapStart())) return Status(Error::CBOR_MAP_START_EXPECTED, kEncodedEnvelopeHeaderSize); if (bytes[bytes.size() - 1] != EncodeStop()) return Status(Error::CBOR_MAP_STOP_EXPECTED, cbor->size() - 1); cbor->pop_back(); EncodeString8(string8_key, cbor); EncodeString8(string8_value, cbor); cbor->push_back(EncodeStop()); size_t new_envelope_size = envelope_size + (cbor->size() - old_size); if (new_envelope_size > std::numeric_limits::max()) return Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, 0); size_t size_pos = cbor->size() - new_envelope_size - sizeof(uint32_t); uint8_t* out = reinterpret_cast(&cbor->at(size_pos)); *(out++) = (new_envelope_size >> 24) & 0xff; *(out++) = (new_envelope_size >> 16) & 0xff; *(out++) = (new_envelope_size >> 8) & 0xff; *(out) = new_envelope_size & 0xff; return Status(); } Status AppendString8EntryToCBORMap(span string8_key, span string8_value, std::vector* cbor) { return AppendString8EntryToCBORMapTmpl(string8_key, string8_value, cbor); } Status AppendString8EntryToCBORMap(span string8_key, span string8_value, std::string* cbor) { return AppendString8EntryToCBORMapTmpl(string8_key, string8_value, cbor); } } // namespace cbor namespace json { // ============================================================================= // json::NewJSONEncoder - for encoding streaming parser events as JSON // ============================================================================= namespace { // Prints |value| to |out| with 4 hex digits, most significant chunk first. template void PrintHex(uint16_t value, C* out) { for (int ii = 3; ii >= 0; --ii) { int four_bits = 0xf & (value >> (4 * ii)); out->push_back(four_bits + ((four_bits <= 9) ? '0' : ('a' - 10))); } } // In the writer below, we maintain a stack of State instances. // It is just enough to emit the appropriate delimiters and brackets // in JSON. enum class Container { // Used for the top-level, initial state. NONE, // Inside a JSON object. MAP, // Inside a JSON array. ARRAY }; class State { public: explicit State(Container container) : container_(container) {} void StartElement(std::vector* out) { StartElementTmpl(out); } void StartElement(std::string* out) { StartElementTmpl(out); } Container container() const { return container_; } private: template void StartElementTmpl(C* out) { assert(container_ != Container::NONE || size_ == 0); if (size_ != 0) { char delim = (!(size_ & 1) || container_ == Container::ARRAY) ? ',' : ':'; out->push_back(delim); } ++size_; } Container container_ = Container::NONE; int size_ = 0; }; constexpr char kBase64Table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789+/"; template void Base64Encode(const span& in, C* out) { // The following three cases are based on the tables in the example // section in https://en.wikipedia.org/wiki/Base64. We process three // input bytes at a time, emitting 4 output bytes at a time. size_t ii = 0; // While possible, process three input bytes. for (; ii + 3 <= in.size(); ii += 3) { uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8) | in[ii + 2]; out->push_back(kBase64Table[(twentyfour_bits >> 18)]); out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]); out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]); out->push_back(kBase64Table[twentyfour_bits & 0x3f]); } if (ii + 2 <= in.size()) { // Process two input bytes. uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8); out->push_back(kBase64Table[(twentyfour_bits >> 18)]); out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]); out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]); out->push_back('='); // Emit padding. return; } if (ii + 1 <= in.size()) { // Process a single input byte. uint32_t twentyfour_bits = (in[ii] << 16); out->push_back(kBase64Table[(twentyfour_bits >> 18)]); out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]); out->push_back('='); // Emit padding. out->push_back('='); // Emit padding. } } // Implements a handler for JSON parser events to emit a JSON string. template class JSONEncoder : public StreamingParserHandler { public: JSONEncoder(const Platform* platform, C* out, Status* status) : platform_(platform), out_(out), status_(status) { *status_ = Status(); state_.emplace(Container::NONE); } void HandleMapBegin() override { if (!status_->ok()) return; assert(!state_.empty()); state_.top().StartElement(out_); state_.emplace(Container::MAP); Emit('{'); } void HandleMapEnd() override { if (!status_->ok()) return; assert(state_.size() >= 2 && state_.top().container() == Container::MAP); state_.pop(); Emit('}'); } void HandleArrayBegin() override { if (!status_->ok()) return; state_.top().StartElement(out_); state_.emplace(Container::ARRAY); Emit('['); } void HandleArrayEnd() override { if (!status_->ok()) return; assert(state_.size() >= 2 && state_.top().container() == Container::ARRAY); state_.pop(); Emit(']'); } void HandleString16(span chars) override { if (!status_->ok()) return; state_.top().StartElement(out_); Emit('"'); for (const uint16_t ch : chars) { if (ch == '"') { Emit("\\\""); } else if (ch == '\\') { Emit("\\\\"); } else if (ch == '\b') { Emit("\\b"); } else if (ch == '\f') { Emit("\\f"); } else if (ch == '\n') { Emit("\\n"); } else if (ch == '\r') { Emit("\\r"); } else if (ch == '\t') { Emit("\\t"); } else if (ch >= 32 && ch <= 126) { Emit(ch); } else { Emit("\\u"); PrintHex(ch, out_); } } Emit('"'); } void HandleString8(span chars) override { if (!status_->ok()) return; state_.top().StartElement(out_); Emit('"'); for (size_t ii = 0; ii < chars.size(); ++ii) { uint8_t c = chars[ii]; if (c == '"') { Emit("\\\""); } else if (c == '\\') { Emit("\\\\"); } else if (c == '\b') { Emit("\\b"); } else if (c == '\f') { Emit("\\f"); } else if (c == '\n') { Emit("\\n"); } else if (c == '\r') { Emit("\\r"); } else if (c == '\t') { Emit("\\t"); } else if (c >= 32 && c <= 126) { Emit(c); } else if (c < 32) { Emit("\\u"); PrintHex(static_cast(c), out_); } else { // Inspect the leading byte to figure out how long the utf8 // byte sequence is; while doing this initialize |codepoint| // with the first few bits. // See table in: https://en.wikipedia.org/wiki/UTF-8 // byte one is 110x xxxx -> 2 byte utf8 sequence // byte one is 1110 xxxx -> 3 byte utf8 sequence // byte one is 1111 0xxx -> 4 byte utf8 sequence uint32_t codepoint; int num_bytes_left; if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence num_bytes_left = 1; codepoint = c & 0x1f; } else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence num_bytes_left = 2; codepoint = c & 0x0f; } else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence codepoint = c & 0x07; num_bytes_left = 3; } else { continue; // invalid leading byte } // If we have enough bytes in our input, decode the remaining ones // belonging to this Unicode character into |codepoint|. if (ii + num_bytes_left > chars.size()) continue; while (num_bytes_left > 0) { c = chars[++ii]; --num_bytes_left; // Check the next byte is a continuation byte, that is 10xx xxxx. if ((c & 0xc0) != 0x80) continue; codepoint = (codepoint << 6) | (c & 0x3f); } // Disallow overlong encodings for ascii characters, as these // would include " and other characters significant to JSON // string termination / control. if (codepoint < 0x7f) continue; // Invalid in UTF8, and can't be represented in UTF16 anyway. if (codepoint > 0x10ffff) continue; // So, now we transcode to UTF16, // using the math described at https://en.wikipedia.org/wiki/UTF-16, // for either one or two 16 bit characters. if (codepoint < 0xffff) { Emit("\\u"); PrintHex(static_cast(codepoint), out_); continue; } codepoint -= 0x10000; // high surrogate Emit("\\u"); PrintHex(static_cast((codepoint >> 10) + 0xd800), out_); // low surrogate Emit("\\u"); PrintHex(static_cast((codepoint & 0x3ff) + 0xdc00), out_); } } Emit('"'); } void HandleBinary(span bytes) override { if (!status_->ok()) return; state_.top().StartElement(out_); Emit('"'); Base64Encode(bytes, out_); Emit('"'); } void HandleDouble(double value) override { if (!status_->ok()) return; state_.top().StartElement(out_); // JSON cannot represent NaN or Infinity. So, for compatibility, // we behave like the JSON object in web browsers: emit 'null'. if (!std::isfinite(value)) { Emit("null"); return; } std::unique_ptr str_value = platform_->DToStr(value); // DToStr may fail to emit a 0 before the decimal dot. E.g. this is // the case in base::NumberToString in Chromium (which is based on // dmg_fp). So, much like // https://cs.chromium.org/chromium/src/base/json/json_writer.cc // we probe for this and emit the leading 0 anyway if necessary. const char* chars = str_value.get(); if (chars[0] == '.') { Emit('0'); } else if (chars[0] == '-' && chars[1] == '.') { Emit("-0"); ++chars; } Emit(chars); } void HandleInt32(int32_t value) override { if (!status_->ok()) return; state_.top().StartElement(out_); Emit(std::to_string(value)); } void HandleBool(bool value) override { if (!status_->ok()) return; state_.top().StartElement(out_); Emit(value ? "true" : "false"); } void HandleNull() override { if (!status_->ok()) return; state_.top().StartElement(out_); Emit("null"); } void HandleError(Status error) override { assert(!error.ok()); *status_ = error; out_->clear(); } private: void Emit(char c) { out_->push_back(c); } void Emit(const char* str) { out_->insert(out_->end(), str, str + strlen(str)); } void Emit(const std::string& str) { out_->insert(out_->end(), str.begin(), str.end()); } const Platform* platform_; C* out_; Status* status_; std::stack state_; }; } // namespace std::unique_ptr NewJSONEncoder( const Platform* platform, std::vector* out, Status* status) { return std::unique_ptr( new JSONEncoder>(platform, out, status)); } std::unique_ptr NewJSONEncoder(const Platform* platform, std::string* out, Status* status) { return std::unique_ptr( new JSONEncoder(platform, out, status)); } // ============================================================================= // json::ParseJSON - for receiving streaming parser events for JSON. // ============================================================================= namespace { const int kStackLimit = 300; enum Token { ObjectBegin, ObjectEnd, ArrayBegin, ArrayEnd, StringLiteral, Number, BoolTrue, BoolFalse, NullToken, ListSeparator, ObjectPairSeparator, InvalidToken, NoInput }; const char* const kNullString = "null"; const char* const kTrueString = "true"; const char* const kFalseString = "false"; template class JsonParser { public: JsonParser(const Platform* platform, StreamingParserHandler* handler) : platform_(platform), handler_(handler) {} void Parse(const Char* start, size_t length) { start_pos_ = start; const Char* end = start + length; const Char* tokenEnd = nullptr; ParseValue(start, end, &tokenEnd, 0); if (error_) return; if (tokenEnd != end) { HandleError(Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS, tokenEnd); } } private: bool CharsToDouble(const uint16_t* chars, size_t length, double* result) { std::string buffer; buffer.reserve(length + 1); for (size_t ii = 0; ii < length; ++ii) { bool is_ascii = !(chars[ii] & ~0x7F); if (!is_ascii) return false; buffer.push_back(static_cast(chars[ii])); } return platform_->StrToD(buffer.c_str(), result); } bool CharsToDouble(const uint8_t* chars, size_t length, double* result) { std::string buffer(reinterpret_cast(chars), length); return platform_->StrToD(buffer.c_str(), result); } static bool ParseConstToken(const Char* start, const Char* end, const Char** token_end, const char* token) { // |token| is \0 terminated, it's one of the constants at top of the file. while (start < end && *token != '\0' && *start++ == *token++) { } if (*token != '\0') return false; *token_end = start; return true; } static bool ReadInt(const Char* start, const Char* end, const Char** token_end, bool allow_leading_zeros) { if (start == end) return false; bool has_leading_zero = '0' == *start; int length = 0; while (start < end && '0' <= *start && *start <= '9') { ++start; ++length; } if (!length) return false; if (!allow_leading_zeros && length > 1 && has_leading_zero) return false; *token_end = start; return true; } static bool ParseNumberToken(const Char* start, const Char* end, const Char** token_end) { // We just grab the number here. We validate the size in DecodeNumber. // According to RFC4627, a valid number is: [minus] int [frac] [exp] if (start == end) return false; Char c = *start; if ('-' == c) ++start; if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/false)) return false; if (start == end) { *token_end = start; return true; } // Optional fraction part c = *start; if ('.' == c) { ++start; if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true)) return false; if (start == end) { *token_end = start; return true; } c = *start; } // Optional exponent part if ('e' == c || 'E' == c) { ++start; if (start == end) return false; c = *start; if ('-' == c || '+' == c) { ++start; if (start == end) return false; } if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true)) return false; } *token_end = start; return true; } static bool ReadHexDigits(const Char* start, const Char* end, const Char** token_end, int digits) { if (end - start < digits) return false; for (int i = 0; i < digits; ++i) { Char c = *start++; if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'))) return false; } *token_end = start; return true; } static bool ParseStringToken(const Char* start, const Char* end, const Char** token_end) { while (start < end) { Char c = *start++; if ('\\' == c) { if (start == end) return false; c = *start++; // Make sure the escaped char is valid. switch (c) { case 'x': if (!ReadHexDigits(start, end, &start, 2)) return false; break; case 'u': if (!ReadHexDigits(start, end, &start, 4)) return false; break; case '\\': case '/': case 'b': case 'f': case 'n': case 'r': case 't': case 'v': case '"': break; default: return false; } } else if ('"' == c) { *token_end = start; return true; } } return false; } static bool SkipComment(const Char* start, const Char* end, const Char** comment_end) { if (start == end) return false; if (*start != '/' || start + 1 >= end) return false; ++start; if (*start == '/') { // Single line comment, read to newline. for (++start; start < end; ++start) { if (*start == '\n' || *start == '\r') { *comment_end = start + 1; return true; } } *comment_end = end; // Comment reaches end-of-input, which is fine. return true; } if (*start == '*') { Char previous = '\0'; // Block comment, read until end marker. for (++start; start < end; previous = *start++) { if (previous == '*' && *start == '/') { *comment_end = start + 1; return true; } } // Block comment must close before end-of-input. return false; } return false; } static bool IsSpaceOrNewLine(Char c) { // \v = vertial tab; \f = form feed page break. return c == ' ' || c == '\n' || c == '\v' || c == '\f' || c == '\r' || c == '\t'; } static void SkipWhitespaceAndComments(const Char* start, const Char* end, const Char** whitespace_end) { while (start < end) { if (IsSpaceOrNewLine(*start)) { ++start; } else if (*start == '/') { const Char* comment_end = nullptr; if (!SkipComment(start, end, &comment_end)) break; start = comment_end; } else { break; } } *whitespace_end = start; } static Token ParseToken(const Char* start, const Char* end, const Char** tokenStart, const Char** token_end) { SkipWhitespaceAndComments(start, end, tokenStart); start = *tokenStart; if (start == end) return NoInput; switch (*start) { case 'n': if (ParseConstToken(start, end, token_end, kNullString)) return NullToken; break; case 't': if (ParseConstToken(start, end, token_end, kTrueString)) return BoolTrue; break; case 'f': if (ParseConstToken(start, end, token_end, kFalseString)) return BoolFalse; break; case '[': *token_end = start + 1; return ArrayBegin; case ']': *token_end = start + 1; return ArrayEnd; case ',': *token_end = start + 1; return ListSeparator; case '{': *token_end = start + 1; return ObjectBegin; case '}': *token_end = start + 1; return ObjectEnd; case ':': *token_end = start + 1; return ObjectPairSeparator; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '-': if (ParseNumberToken(start, end, token_end)) return Number; break; case '"': if (ParseStringToken(start + 1, end, token_end)) return StringLiteral; break; } return InvalidToken; } static int HexToInt(Char c) { if ('0' <= c && c <= '9') return c - '0'; if ('A' <= c && c <= 'F') return c - 'A' + 10; if ('a' <= c && c <= 'f') return c - 'a' + 10; assert(false); // Unreachable. return 0; } static bool DecodeString(const Char* start, const Char* end, std::vector* output) { if (start == end) return true; if (start > end) return false; output->reserve(end - start); while (start < end) { uint16_t c = *start++; // If the |Char| we're dealing with is really a byte, then // we have utf8 here, and we need to check for multibyte characters // and transcode them to utf16 (either one or two utf16 chars). if (sizeof(Char) == sizeof(uint8_t) && c >= 0x7f) { // Inspect the leading byte to figure out how long the utf8 // byte sequence is; while doing this initialize |codepoint| // with the first few bits. // See table in: https://en.wikipedia.org/wiki/UTF-8 // byte one is 110x xxxx -> 2 byte utf8 sequence // byte one is 1110 xxxx -> 3 byte utf8 sequence // byte one is 1111 0xxx -> 4 byte utf8 sequence uint32_t codepoint; int num_bytes_left; if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence num_bytes_left = 1; codepoint = c & 0x1f; } else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence num_bytes_left = 2; codepoint = c & 0x0f; } else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence codepoint = c & 0x07; num_bytes_left = 3; } else { return false; // invalid leading byte } // If we have enough bytes in our inpput, decode the remaining ones // belonging to this Unicode character into |codepoint|. if (start + num_bytes_left > end) return false; while (num_bytes_left > 0) { c = *start++; --num_bytes_left; // Check the next byte is a continuation byte, that is 10xx xxxx. if ((c & 0xc0) != 0x80) return false; codepoint = (codepoint << 6) | (c & 0x3f); } // Disallow overlong encodings for ascii characters, as these // would include " and other characters significant to JSON // string termination / control. if (codepoint < 0x7f) return false; // Invalid in UTF8, and can't be represented in UTF16 anyway. if (codepoint > 0x10ffff) return false; // So, now we transcode to UTF16, // using the math described at https://en.wikipedia.org/wiki/UTF-16, // for either one or two 16 bit characters. if (codepoint < 0xffff) { output->push_back(codepoint); continue; } codepoint -= 0x10000; output->push_back((codepoint >> 10) + 0xd800); // high surrogate output->push_back((codepoint & 0x3ff) + 0xdc00); // low surrogate continue; } if ('\\' != c) { output->push_back(c); continue; } if (start == end) return false; c = *start++; if (c == 'x') { // \x is not supported. return false; } switch (c) { case '"': case '/': case '\\': break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case 'u': c = (HexToInt(*start) << 12) + (HexToInt(*(start + 1)) << 8) + (HexToInt(*(start + 2)) << 4) + HexToInt(*(start + 3)); start += 4; break; default: return false; } output->push_back(c); } return true; } void ParseValue(const Char* start, const Char* end, const Char** value_token_end, int depth) { if (depth > kStackLimit) { HandleError(Error::JSON_PARSER_STACK_LIMIT_EXCEEDED, start); return; } const Char* token_start = nullptr; const Char* token_end = nullptr; Token token = ParseToken(start, end, &token_start, &token_end); switch (token) { case NoInput: HandleError(Error::JSON_PARSER_NO_INPUT, token_start); return; case InvalidToken: HandleError(Error::JSON_PARSER_INVALID_TOKEN, token_start); return; case NullToken: handler_->HandleNull(); break; case BoolTrue: handler_->HandleBool(true); break; case BoolFalse: handler_->HandleBool(false); break; case Number: { double value; if (!CharsToDouble(token_start, token_end - token_start, &value)) { HandleError(Error::JSON_PARSER_INVALID_NUMBER, token_start); return; } if (value >= std::numeric_limits::min() && value <= std::numeric_limits::max() && static_cast(value) == value) handler_->HandleInt32(static_cast(value)); else handler_->HandleDouble(value); break; } case StringLiteral: { std::vector value; bool ok = DecodeString(token_start + 1, token_end - 1, &value); if (!ok) { HandleError(Error::JSON_PARSER_INVALID_STRING, token_start); return; } handler_->HandleString16(span(value.data(), value.size())); break; } case ArrayBegin: { handler_->HandleArrayBegin(); start = token_end; token = ParseToken(start, end, &token_start, &token_end); while (token != ArrayEnd) { ParseValue(start, end, &token_end, depth + 1); if (error_) return; // After a list value, we expect a comma or the end of the list. start = token_end; token = ParseToken(start, end, &token_start, &token_end); if (token == ListSeparator) { start = token_end; token = ParseToken(start, end, &token_start, &token_end); if (token == ArrayEnd) { HandleError(Error::JSON_PARSER_UNEXPECTED_ARRAY_END, token_start); return; } } else if (token != ArrayEnd) { // Unexpected value after list value. Bail out. HandleError(Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED, token_start); return; } } handler_->HandleArrayEnd(); break; } case ObjectBegin: { handler_->HandleMapBegin(); start = token_end; token = ParseToken(start, end, &token_start, &token_end); while (token != ObjectEnd) { if (token != StringLiteral) { HandleError(Error::JSON_PARSER_STRING_LITERAL_EXPECTED, token_start); return; } std::vector key; if (!DecodeString(token_start + 1, token_end - 1, &key)) { HandleError(Error::JSON_PARSER_INVALID_STRING, token_start); return; } handler_->HandleString16(span(key.data(), key.size())); start = token_end; token = ParseToken(start, end, &token_start, &token_end); if (token != ObjectPairSeparator) { HandleError(Error::JSON_PARSER_COLON_EXPECTED, token_start); return; } start = token_end; ParseValue(start, end, &token_end, depth + 1); if (error_) return; start = token_end; // After a key/value pair, we expect a comma or the end of the // object. token = ParseToken(start, end, &token_start, &token_end); if (token == ListSeparator) { start = token_end; token = ParseToken(start, end, &token_start, &token_end); if (token == ObjectEnd) { HandleError(Error::JSON_PARSER_UNEXPECTED_MAP_END, token_start); return; } } else if (token != ObjectEnd) { // Unexpected value after last object value. Bail out. HandleError(Error::JSON_PARSER_COMMA_OR_MAP_END_EXPECTED, token_start); return; } } handler_->HandleMapEnd(); break; } default: // We got a token that's not a value. HandleError(Error::JSON_PARSER_VALUE_EXPECTED, token_start); return; } SkipWhitespaceAndComments(token_end, end, value_token_end); } void HandleError(Error error, const Char* pos) { assert(error != Error::OK); if (!error_) { handler_->HandleError( Status{error, static_cast(pos - start_pos_)}); error_ = true; } } const Char* start_pos_ = nullptr; bool error_ = false; const Platform* platform_; StreamingParserHandler* handler_; }; } // namespace void ParseJSON(const Platform& platform, span chars, StreamingParserHandler* handler) { JsonParser parser(&platform, handler); parser.Parse(chars.data(), chars.size()); } void ParseJSON(const Platform& platform, span chars, StreamingParserHandler* handler) { JsonParser parser(&platform, handler); parser.Parse(chars.data(), chars.size()); } // ============================================================================= // json::ConvertCBORToJSON, json::ConvertJSONToCBOR - for transcoding // ============================================================================= template Status ConvertCBORToJSONTmpl(const Platform& platform, span cbor, C* json) { Status status; std::unique_ptr json_writer = NewJSONEncoder(&platform, json, &status); cbor::ParseCBOR(cbor, json_writer.get()); return status; } Status ConvertCBORToJSON(const Platform& platform, span cbor, std::vector* json) { return ConvertCBORToJSONTmpl(platform, cbor, json); } Status ConvertCBORToJSON(const Platform& platform, span cbor, std::string* json) { return ConvertCBORToJSONTmpl(platform, cbor, json); } template Status ConvertJSONToCBORTmpl(const Platform& platform, span json, C* cbor) { Status status; std::unique_ptr encoder = cbor::NewCBOREncoder(cbor, &status); ParseJSON(platform, json, encoder.get()); return status; } Status ConvertJSONToCBOR(const Platform& platform, span json, std::string* cbor) { return ConvertJSONToCBORTmpl(platform, json, cbor); } Status ConvertJSONToCBOR(const Platform& platform, span json, std::string* cbor) { return ConvertJSONToCBORTmpl(platform, json, cbor); } Status ConvertJSONToCBOR(const Platform& platform, span json, std::vector* cbor) { return ConvertJSONToCBORTmpl(platform, json, cbor); } Status ConvertJSONToCBOR(const Platform& platform, span json, std::vector* cbor) { return ConvertJSONToCBORTmpl(platform, json, cbor); } } // namespace json } // namespace v8_inspector_protocol_encoding