#include "string_decoder.h"  // NOLINT(build/include_inline)
#include "string_decoder-inl.h"

#include "env-inl.h"
#include "node_buffer.h"
#include "string_bytes.h"
#include "util.h"

using v8::Array;
using v8::ArrayBufferView;
using v8::Context;
using v8::FunctionCallbackInfo;
using v8::Integer;
using v8::Isolate;
using v8::Local;
using v8::MaybeLocal;
using v8::Object;
using v8::String;
using v8::Value;

namespace node {

namespace {

// Creates a JavaScript string from the `length` bytes at `data`, interpreted
// according to `encoding`.
//
// UTF-8 input is handed directly to String::NewFromUtf8(); every other
// encoding goes through StringBytes::Encode(). If StringBytes::Encode()
// yields an empty result, the error value it reported is thrown on `isolate`
// and an empty MaybeLocal is returned.
MaybeLocal<String> MakeString(Isolate* isolate,
                              const char* data,
                              size_t length,
                              enum encoding encoding) {
  Local<Value> error;
  MaybeLocal<Value> ret;
  if (encoding == UTF8) {
    return String::NewFromUtf8(
        isolate,
        data,
        v8::NewStringType::kNormal,
        length);
  } else {
    ret = StringBytes::Encode(
        isolate,
        data,
        length,
        encoding,
        &error);
  }

  if (ret.IsEmpty()) {
    // Encode() failed: it must have filled in `error`; surface it to JS.
    CHECK(!error.IsEmpty());
    isolate->ThrowException(error);
  }

  DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
  return ret.FromMaybe(Local<Value>()).As<String>();
}

}  // anonymous namespace


// Decodes the `*nread_ptr` bytes at `data` using this decoder's encoding and
// returns the resulting string (empty MaybeLocal if string creation failed).
//
// For UTF8, UCS2 and BASE64 the decoder keeps state between calls:
//   * bytes needed to complete a character left over from the previous call
//     are taken from the front of `data`, and the completed character is
//     prepended to the returned string;
//   * bytes at the end of `data` that belong to a not-yet-complete character
//     are copied into the incomplete character buffer and withheld from the
//     returned string.
// `*nread_ptr` is increased by the number of buffered bytes turned into the
// prepended string and decreased by the number of bytes newly held back.
//
// ASCII, HEX and LATIN1 are decoded statelessly; any other encoding aborts
// via CHECK.
MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
                                             const char* data,
                                             size_t* nread_ptr) {
  Local<String> prepend, body;

  size_t nread = *nread_ptr;

  if (Encoding() == UTF8 || Encoding() == UCS2 || Encoding() == BASE64) {
    // See if we want bytes to finish a character from the previous
    // chunk; if so, copy the new bytes to the missing bytes buffer
    // and create a small string from it that is to be prepended to the
    // main body.
    if (MissingBytes() > 0) {
      // There are never more bytes missing than the pre-calculated maximum.
      CHECK_LE(MissingBytes() + BufferedBytes(),
               kIncompleteCharactersEnd);
      if (Encoding() == UTF8) {
        // For UTF-8, we need special treatment to align with the V8 decoder:
        // If an incomplete character is found at a chunk boundary, we use
        // its remainder and pass it to V8 as-is.
        for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
          if ((data[i] & 0xC0) != 0x80) {
            // This byte is not a continuation byte even though it should have
            // been one. We stop decoding of the incomplete character at this
            // point (but still use the rest of the incomplete bytes from this
            // chunk) and assume that the new, unexpected byte starts a new one.
            state_[kMissingBytes] = 0;
            memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
            state_[kBufferedBytes] += i;
            data += i;
            nread -= i;
            break;
          }
        }
      }

      size_t found_bytes =
          std::min(nread, static_cast<size_t>(MissingBytes()));
      memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
             data,
             found_bytes);
      // Adjust the two buffers.
      data += found_bytes;
      nread -= found_bytes;

      state_[kMissingBytes] -= found_bytes;
      state_[kBufferedBytes] += found_bytes;

      if (LIKELY(MissingBytes() == 0)) {
        // If no more bytes are missing, create a small string that we
        // will later prepend.
        if (!MakeString(isolate,
                        IncompleteCharacterBuffer(),
                        BufferedBytes(),
                        Encoding()).ToLocal(&prepend)) {
          return MaybeLocal<String>();
        }

        *nread_ptr += BufferedBytes();
        // No more buffered bytes.
        state_[kBufferedBytes] = 0;
      }
    }

    // It could be that trying to finish the previous chunk already
    // consumed all data that we received in this chunk.
    if (UNLIKELY(nread == 0)) {
      body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
      prepend = Local<String>();
    } else {
      // If not, that means there is no character left to finish at this point.
      DCHECK_EQ(MissingBytes(), 0);
      DCHECK_EQ(BufferedBytes(), 0);

      // See whether there is a character that we may have to cut off and
      // finish when receiving the next chunk.
      if (Encoding() == UTF8 && (data[nread - 1] & 0x80)) {
        // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
        // This means we'll need to figure out where the character to which
        // the byte belongs begins.
        for (size_t i = nread - 1; ; --i) {
          DCHECK_LT(i, nread);
          state_[kBufferedBytes]++;
          if ((data[i] & 0xC0) == 0x80) {
            // This byte does not start a character (a "trailing" byte).
            if (state_[kBufferedBytes] >= 4 || i == 0) {
              // We either have more than 4 trailing bytes (which means
              // the current character would not be inside the range for
              // valid Unicode, and in particular cannot be represented
              // through JavaScript's UTF-16-based approach to strings), or the
              // current buffer does not contain the start of an UTF-8 character
              // at all. Either way, this is invalid UTF8 and we can just
              // let the engine's decoder handle it.
              state_[kBufferedBytes] = 0;
              break;
            }
          } else {
            // Found the first byte of a UTF-8 character. By looking at the
            // upper bits we can tell how long the character *should* be.
            if ((data[i] & 0xE0) == 0xC0) {
              state_[kMissingBytes] = 2;
            } else if ((data[i] & 0xF0) == 0xE0) {
              state_[kMissingBytes] = 3;
            } else if ((data[i] & 0xF8) == 0xF0) {
              state_[kMissingBytes] = 4;
            } else {
              // This lead byte would indicate a character outside of the
              // representable range.
              state_[kBufferedBytes] = 0;
              break;
            }

            if (BufferedBytes() >= MissingBytes()) {
              // Received more or exactly as many trailing bytes than the lead
              // character would indicate. In the "==" case, we have valid
              // data and don't need to slice anything off;
              // in the ">" case, this is invalid UTF-8 anyway.
              state_[kMissingBytes] = 0;
              state_[kBufferedBytes] = 0;
            }

            // Up to here kMissingBytes held the total character length;
            // turn it into the number of bytes still outstanding.
            state_[kMissingBytes] -= state_[kBufferedBytes];
            break;
          }
        }
      } else if (Encoding() == UCS2) {
        if ((nread % 2) == 1) {
          // We got half a codepoint, and need the second byte of it.
          state_[kBufferedBytes] = 1;
          state_[kMissingBytes] = 1;
        } else if ((data[nread - 1] & 0xFC) == 0xD8) {
          // Half a split UTF-16 character.
          state_[kBufferedBytes] = 2;
          state_[kMissingBytes] = 2;
        }
      } else if (Encoding() == BASE64) {
        // Base64 is decoded in groups of 3 input bytes; hold back the rest.
        state_[kBufferedBytes] = nread % 3;
        if (state_[kBufferedBytes] > 0)
          state_[kMissingBytes] = 3 - BufferedBytes();
      }

      if (BufferedBytes() > 0) {
        // Copy the requested number of buffered bytes from the end of the
        // input into the incomplete character buffer.
        nread -= BufferedBytes();
        *nread_ptr -= BufferedBytes();
        memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
      }

      if (nread > 0) {
        if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
          return MaybeLocal<String>();
      } else {
        body = String::Empty(isolate);
      }
    }

    if (prepend.IsEmpty()) {
      return body;
    } else {
      return String::Concat(isolate, prepend, body);
    }
  } else {
    CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
    return MakeString(isolate, data, nread, Encoding());
  }
}

// Returns whatever is still held in the incomplete character buffer as a
// string and resets the missing/buffered byte counters to zero.
//
// Returns the empty string when nothing is buffered. For UCS2, a single
// dangling byte (odd number of buffered bytes) is dropped first. For ASCII,
// HEX and LATIN1 it is CHECKed that no state was ever accumulated.
MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
  if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) {
    CHECK_EQ(MissingBytes(), 0);
    CHECK_EQ(BufferedBytes(), 0);
  }

  if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) {
    // Ignore a single trailing byte, like the JS decoder does.
    state_[kMissingBytes]--;
    state_[kBufferedBytes]--;
  }

  if (BufferedBytes() == 0)
    return String::Empty(isolate);

  MaybeLocal<String> ret =
      MakeString(isolate,
                 IncompleteCharacterBuffer(),
                 BufferedBytes(),
                 Encoding());

  state_[kMissingBytes] = 0;
  state_[kBufferedBytes] = 0;

  return ret;
}

namespace {

// JS binding for `decode(state, chunk)`.
// args[0]: buffer whose backing memory is reinterpreted as the StringDecoder.
// args[1]: ArrayBufferView holding the bytes to decode (CHECKed).
// Sets the return value to the decoded string unless decoding produced an
// empty MaybeLocal.
void DecodeData(const FunctionCallbackInfo<Value>& args) {
  StringDecoder* decoder =
      reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
  CHECK_NOT_NULL(decoder);

  CHECK(args[1]->IsArrayBufferView());
  ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
  size_t length = content.length();

  MaybeLocal<String> ret =
      decoder->DecodeData(args.GetIsolate(), content.data(), &length);
  if (!ret.IsEmpty())
    args.GetReturnValue().Set(ret.ToLocalChecked());
}

// JS binding for `flush(state)`.
// args[0]: buffer whose backing memory is reinterpreted as the StringDecoder.
// Sets the return value to the flushed string unless flushing produced an
// empty MaybeLocal.
void FlushData(const FunctionCallbackInfo<Value>& args) {
  StringDecoder* decoder =
      reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
  CHECK_NOT_NULL(decoder);
  MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
  if (!ret.IsEmpty())
    args.GetReturnValue().Set(ret.ToLocalChecked());
}

// Populates the binding object `target` with:
//   * the StringDecoder layout constants (kIncompleteCharactersStart, ...),
//   * `encodings`: an array mapping encoding enum values to their JS names,
//   * `kSize`: sizeof(StringDecoder),
//   * the `decode` and `flush` methods.
void InitializeStringDecoder(Local<Object> target,
                             Local<Value> unused,
                             Local<Context> context,
                             void* priv) {
  Environment* env = Environment::GetCurrent(context);
  Isolate* isolate = env->isolate();

  // Check() rather than FromJust(): the boolean result is not used, and this
  // matches the other Set() calls in this function.
#define SET_DECODER_CONSTANT(name)                                            \
  target->Set(context,                                                        \
              FIXED_ONE_BYTE_STRING(isolate, #name),                          \
              Integer::New(isolate, StringDecoder::name)).Check()

  SET_DECODER_CONSTANT(kIncompleteCharactersStart);
  SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
  SET_DECODER_CONSTANT(kMissingBytes);
  SET_DECODER_CONSTANT(kBufferedBytes);
  SET_DECODER_CONSTANT(kEncodingField);
  SET_DECODER_CONSTANT(kNumFields);

#undef SET_DECODER_CONSTANT

  Local<Array> encodings = Array::New(isolate);
#define ADD_TO_ENCODINGS_ARRAY(cname, jsname)                                 \
  encodings->Set(context,                                                     \
                 static_cast<int32_t>(cname),                                 \
                 FIXED_ONE_BYTE_STRING(isolate, jsname)).Check()
  ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
  ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
  ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
  ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
  ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
  ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
  ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");

#undef ADD_TO_ENCODINGS_ARRAY

  target->Set(context,
              FIXED_ONE_BYTE_STRING(isolate, "encodings"),
              encodings).Check();

  target->Set(context,
              FIXED_ONE_BYTE_STRING(isolate, "kSize"),
              Integer::New(isolate, sizeof(StringDecoder))).Check();

  env->SetMethod(target, "decode", DecodeData);
  env->SetMethod(target, "flush", FlushData);
}

}  // anonymous namespace

}  // namespace node

NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder,
                                   node::InitializeStringDecoder)