From 180af17b522f531eb15b917f4fde9570b6aa95ae Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Thu, 1 Feb 2018 02:28:39 +0100 Subject: string_decoder: reimplement in C++ Implement string decoder in C++. The perks are a decent speed boost (for decoding, whereas creation show some performance degradation), that this can now be used more easily to add native decoding support to C++ streams and (arguably) more readable variable names. PR-URL: https://github.com/nodejs/node/pull/18537 Reviewed-By: James M Snell Reviewed-By: Ben Noordhuis --- lib/string_decoder.js | 285 +++++++++----------------------------------------- 1 file changed, 47 insertions(+), 238 deletions(-) (limited to 'lib') diff --git a/lib/string_decoder.js b/lib/string_decoder.js index 1e569ba6b2..d955a66330 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -22,10 +22,23 @@ 'use strict'; const { Buffer } = require('buffer'); +const { + kIncompleteCharactersStart, + kIncompleteCharactersEnd, + kMissingBytes, + kBufferedBytes, + kEncodingField, + kSize, + decode, + flush, + encodings +} = internalBinding('string_decoder'); const internalUtil = require('internal/util'); const errors = require('internal/errors'); const isEncoding = Buffer[internalUtil.kIsEncodingSymbol]; +const kNativeDecoder = Symbol('kNativeDecoder'); + // Do not cache `Buffer.isEncoding` when checking encoding names as some // modules monkey-patch it to support additional encodings function normalizeEncoding(enc) { @@ -36,258 +49,54 @@ function normalizeEncoding(enc) { return nenc || enc; } +const encodingsMap = {}; +for (var i = 0; i < encodings.length; ++i) + encodingsMap[encodings[i]] = i; + // StringDecoder provides an interface for efficiently splitting a series of // buffers into a series of JS strings without breaking apart multi-byte // characters. -exports.StringDecoder = StringDecoder; -function StringDecoder(encoding) { - this.encoding = normalizeEncoding(encoding); - var nb; - switch (this.encoding) { - case 'utf16le': - this.text = utf16Text; - this.end = utf16End; - nb = 4; - break; - case 'utf8': - this.fillLast = utf8FillLast; - nb = 4; - break; - case 'base64': - this.text = base64Text; - this.end = base64End; - nb = 3; - break; - default: - this.write = simpleWrite; - this.end = simpleEnd; - return; - } - this.lastNeed = 0; - this.lastTotal = 0; - this.lastChar = Buffer.allocUnsafe(nb); -} - -StringDecoder.prototype.write = function(buf) { - if (buf.length === 0) - return ''; - var r; - var i; - if (this.lastNeed) { - r = this.fillLast(buf); - if (r === undefined) - return ''; - i = this.lastNeed; - this.lastNeed = 0; - } else { - i = 0; - } - if (i < buf.length) - return (r ? r + this.text(buf, i) : this.text(buf, i)); - return r || ''; -}; - -StringDecoder.prototype.end = utf8End; - -// Returns only complete characters in a Buffer -StringDecoder.prototype.text = utf8Text; - -// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer -StringDecoder.prototype.fillLast = function(buf) { - if (this.lastNeed <= buf.length) { - buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); - return this.lastChar.toString(this.encoding, 0, this.lastTotal); - } - buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length); - this.lastNeed -= buf.length; -}; - -// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a -// continuation byte. If an invalid byte is detected, -2 is returned. -function utf8CheckByte(byte) { - if (byte <= 0x7F) - return 0; - else if (byte >> 5 === 0x06) - return 2; - else if (byte >> 4 === 0x0E) - return 3; - else if (byte >> 3 === 0x1E) - return 4; - return (byte >> 6 === 0x02 ? -1 : -2); -} - -// Checks at most 3 bytes at the end of a Buffer in order to detect an -// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4) -// needed to complete the UTF-8 character (if applicable) are returned. -function utf8CheckIncomplete(self, buf, i) { - var j = buf.length - 1; - if (j < i) - return 0; - var nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) - self.lastNeed = nb - 1; - return nb; - } - if (--j < i || nb === -2) - return 0; - nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) - self.lastNeed = nb - 2; - return nb; - } - if (--j < i || nb === -2) - return 0; - nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) { - if (nb === 2) - nb = 0; - else - self.lastNeed = nb - 3; - } - return nb; - } - return 0; -} - -// Validates as many continuation bytes for a multi-byte UTF-8 character as -// needed or are available. If we see a non-continuation byte where we expect -// one, we "replace" the validated continuation bytes we've seen so far with -// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding -// behavior. The continuation byte check is included three times in the case -// where all of the continuation bytes for a character exist in the same buffer. -// It is also done this way as a slight performance increase instead of using a -// loop. -function utf8CheckExtraBytes(self, buf, p) { - if ((buf[0] & 0xC0) !== 0x80) { - self.lastNeed = 0; - return '\ufffd'; - } - if (self.lastNeed > 1 && buf.length > 1) { - if ((buf[1] & 0xC0) !== 0x80) { - self.lastNeed = 1; - return '\ufffd'; - } - if (self.lastNeed > 2 && buf.length > 2) { - if ((buf[2] & 0xC0) !== 0x80) { - self.lastNeed = 2; - return '\ufffd'; - } - } +class StringDecoder { + constructor(encoding) { + this.encoding = normalizeEncoding(encoding); + this[kNativeDecoder] = Buffer.alloc(kSize); + this[kNativeDecoder][kEncodingField] = encodingsMap[this.encoding]; } -} -// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer. -function utf8FillLast(buf) { - const p = this.lastTotal - this.lastNeed; - var r = utf8CheckExtraBytes(this, buf, p); - if (r !== undefined) - return r; - if (this.lastNeed <= buf.length) { - buf.copy(this.lastChar, p, 0, this.lastNeed); - return this.lastChar.toString(this.encoding, 0, this.lastTotal); + write(buf) { + if (typeof buf === 'string') + return buf; + if (!ArrayBuffer.isView(buf)) + throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'buf', + ['Buffer', 'Uint8Array', 'ArrayBufferView']); + return decode(this[kNativeDecoder], buf); } - buf.copy(this.lastChar, p, 0, buf.length); - this.lastNeed -= buf.length; -} -// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a -// partial character, the character's bytes are buffered until the required -// number of bytes are available. -function utf8Text(buf, i) { - const total = utf8CheckIncomplete(this, buf, i); - if (!this.lastNeed) - return buf.toString('utf8', i); - this.lastTotal = total; - const end = buf.length - (total - this.lastNeed); - buf.copy(this.lastChar, 0, end); - return buf.toString('utf8', i, end); -} - -// For UTF-8, a replacement character is added when ending on a partial -// character. -function utf8End(buf) { - const r = (buf && buf.length ? this.write(buf) : ''); - if (this.lastNeed) { - this.lastNeed = 0; - this.lastTotal = 0; - return r + '\ufffd'; + end(buf) { + let ret = ''; + if (buf !== undefined) + ret = this.write(buf); + if (this[kNativeDecoder][kBufferedBytes] > 0) + ret += flush(this[kNativeDecoder]); + return ret; } - return r; -} -// UTF-16LE typically needs two bytes per character, but even if we have an even -// number of bytes available, we need to check if we end on a leading/high -// surrogate. In that case, we need to wait for the next two bytes in order to -// decode the last character properly. -function utf16Text(buf, i) { - if ((buf.length - i) % 2 === 0) { - const r = buf.toString('utf16le', i); - if (r) { - const c = r.charCodeAt(r.length - 1); - if (c >= 0xD800 && c <= 0xDBFF) { - this.lastNeed = 2; - this.lastTotal = 4; - this.lastChar[0] = buf[buf.length - 2]; - this.lastChar[1] = buf[buf.length - 1]; - return r.slice(0, -1); - } - } - return r; - } - this.lastNeed = 1; - this.lastTotal = 2; - this.lastChar[0] = buf[buf.length - 1]; - return buf.toString('utf16le', i, buf.length - 1); -} + /* Everything below this line is undocumented legacy stuff. */ -// For UTF-16LE we do not explicitly append special replacement characters if we -// end on a partial character, we simply let v8 handle that. -function utf16End(buf) { - const r = (buf && buf.length ? this.write(buf) : ''); - if (this.lastNeed) { - const end = this.lastTotal - this.lastNeed; - this.lastNeed = 0; - this.lastTotal = 0; - return r + this.lastChar.toString('utf16le', 0, end); + text(buf, offset) { + this[kNativeDecoder][kMissingBytes] = 0; + this[kNativeDecoder][kBufferedBytes] = 0; + return this.write(buf.slice(offset)); } - return r; -} -function base64Text(buf, i) { - const n = (buf.length - i) % 3; - if (n === 0) - return buf.toString('base64', i); - this.lastNeed = 3 - n; - this.lastTotal = 3; - if (n === 1) { - this.lastChar[0] = buf[buf.length - 1]; - } else { - this.lastChar[0] = buf[buf.length - 2]; - this.lastChar[1] = buf[buf.length - 1]; + get lastTotal() { + return this[kNativeDecoder][kBufferedBytes] + this.lastNeed; } - return buf.toString('base64', i, buf.length - n); -} - -function base64End(buf) { - const r = (buf && buf.length ? this.write(buf) : ''); - if (this.lastNeed) { - const end = 3 - this.lastNeed; - this.lastNeed = 0; - this.lastTotal = 0; - return r + this.lastChar.toString('base64', 0, end); + get lastChar() { + return this[kNativeDecoder].subarray(kIncompleteCharactersStart, + kIncompleteCharactersEnd); } - return r; } -// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex) -function simpleWrite(buf) { - return buf.toString(this.encoding); -} - -function simpleEnd(buf) { - return (buf && buf.length ? this.write(buf) : ''); -} +exports.StringDecoder = StringDecoder; -- cgit v1.2.3