From 40c4beeb57adebb8495124ecd6c21ddef712c00e Mon Sep 17 00:00:00 2001 From: koichik Date: Sat, 5 May 2012 22:47:24 +0900 Subject: string_decoder: added support for UTF-16LE Fixes #3223. --- lib/string_decoder.js | 103 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 40 deletions(-) (limited to 'lib/string_decoder.js') diff --git a/lib/string_decoder.js b/lib/string_decoder.js index 6e730c2fbd..879e590647 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -21,22 +21,32 @@ var StringDecoder = exports.StringDecoder = function(encoding) { this.encoding = (encoding || 'utf8').toLowerCase().replace(/[-_]/, ''); - if (this.encoding === 'utf8') { - this.charBuffer = new Buffer(6); - this.charReceived = 0; - this.charLength = 0; + switch (this.encoding) { + case 'utf8': + // CESU-8 represents each of Surrogate Pair by 3-bytes + this.surrogateSize = 3; + break; + case 'ucs2': + case 'utf16le': + // UTF-16 represents each of Surrogate Pair by 2-bytes + this.surrogateSize = 2; + this.detectIncompleteChar = utf16DetectIncompleteChar; + break; + default: + this.write = passThroughWrite; + return; } + + this.charBuffer = new Buffer(6); + this.charReceived = 0; + this.charLength = 0; }; StringDecoder.prototype.write = function(buffer) { - // If not utf8... - if (this.encoding !== 'utf8') { - return buffer.toString(this.encoding); - } - var charStr = ''; var offset = 0; + // if our last write ended with an incomplete multibyte character while (this.charLength) { // determine how many remaining bytes this buffer has to offer for this char @@ -55,16 +65,14 @@ StringDecoder.prototype.write = function(buffer) { } // get the character that was split - charStr = this.charBuffer.slice(0, this.charLength).toString(); + charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding); // lead surrogate (D800-DBFF) is also the incomplete character - if (this.charLength === 3) { - var charCode = charStr.charCodeAt(0); - if (charCode >= 0xD800 && charCode <= 0xDBFF) { - charStr = ''; - this.charLength += 3; // size of trail surrogate (DC00-DFFF) - continue; - } + var charCode = charStr.charCodeAt(charStr.length - 1); + if (charCode >= 0xD800 && charCode <= 0xDBFF) { + this.charLength += this.surrogateSize; + charStr = ''; + continue; } this.charReceived = this.charLength = 0; @@ -76,7 +84,35 @@ StringDecoder.prototype.write = function(buffer) { break; } + var lenIncomplete = this.detectIncompleteChar(buffer); + + var end = buffer.length; + if (this.charLength) { + // buffer the incomplete character bytes we got + buffer.copy(this.charBuffer, 0, buffer.length - lenIncomplete, end); + this.charReceived = lenIncomplete; + end -= lenIncomplete; + } + + charStr += buffer.toString(this.encoding, 0, end); + + var end = charStr.length - 1; + var charCode = charStr.charCodeAt(end); + // lead surrogate (D800-DBFF) is also the incomplete character + if (charCode >= 0xD800 && charCode <= 0xDBFF) { + var size = this.surrogateSize; + this.charLength += size; + this.charReceived += size; + this.charBuffer.copy(this.charBuffer, size, 0, size); + this.charBuffer.write(charStr.charAt(charStr.length - 1), this.encoding); + return charStr.substring(0, end); + } + + // or just emit the charStr + return charStr; +}; +StringDecoder.prototype.detectIncompleteChar = function(buffer) { // determine how many bytes we have to check at the end of this buffer var i = (buffer.length >= 3) ? 3 : buffer.length; @@ -106,28 +142,15 @@ StringDecoder.prototype.write = function(buffer) { } } - var end = buffer.length; - if (this.charLength) { - // buffer the incomplete character bytes we got - buffer.copy(this.charBuffer, 0, buffer.length - i, buffer.length); - this.charReceived = i; - end -= i; - } - - charStr += buffer.toString('utf8', 0, end); + return i; +}; - // lead surrogate (D800-DBFF) is also the incomplete character - end = charStr.length - 1; - var charCode = charStr.charCodeAt(end); - if (charCode >= 0xD800 && charCode <= 0xDBFF) { - // CESU-8 represents each of Surrogate Pair by 3-bytes - this.charLength += 3 - this.charReceived += 3 - this.charBuffer.copy(this.charBuffer, 3, 0, 3); - this.charBuffer.write(charStr.charAt(end)); - return charStr.substring(0, end); - } +function passThroughWrite(buffer) { + return buffer.toString(this.encoding); +} - // or just emit the charStr - return charStr; -}; +function utf16DetectIncompleteChar(buffer) { + var incomplete = this.charReceived = buffer.length % 2; + this.charLength = incomplete ? 2 : 0; + return incomplete; +} -- cgit v1.2.3