summary | refs | log | tree | commit | diff
path: root/lib/string_decoder.js
diff options
context:
space:
mode:
author: Brian White <mscdex@mscdex.net> 2016-06-15 12:42:52 -0400
committer: Brian White <mscdex@mscdex.net> 2016-06-23 23:18:10 -0400
commit: 79ef3b6b5df0a0c262d2472298afe023a470da68 (patch)
tree: 56531788217809312aae22b69e3c1388acedfe28 /lib/string_decoder.js
parent: 5267f29b3445d608e2458087a5724820aa996930 (diff)
download: android-node-v8-79ef3b6b5df0a0c262d2472298afe023a470da68.tar.gz
android-node-v8-79ef3b6b5df0a0c262d2472298afe023a470da68.tar.bz2
android-node-v8-79ef3b6b5df0a0c262d2472298afe023a470da68.zip
string_decoder: fix bad utf8 character handling
This commit fixes an issue when extra utf8 continuation bytes appear at the end of a chunk of data, causing miscalculations to be made when checking how many bytes are needed to decode a complete character.

Fixes: https://github.com/nodejs/node/issues/7308
PR-URL: https://github.com/nodejs/node/pull/7310
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Fedor Indutny <fedor.indutny@gmail.com>
Diffstat (limited to 'lib/string_decoder.js')
-rw-r--r-- lib/string_decoder.js 75
1 files changed, 61 insertions, 14 deletions
diff --git a/lib/string_decoder.js b/lib/string_decoder.js
index 005c07ed9e..cb9fbe409f 100644
--- a/lib/string_decoder.js
+++ b/lib/string_decoder.js
@@ -25,8 +25,10 @@ function StringDecoder(encoding) {
case 'utf16le':
this.text = utf16Text;
this.end = utf16End;
- // fall through
+ nb = 4;
+ break;
case 'utf8':
+ this.fillLast = utf8FillLast;
nb = 4;
break;
case 'base64':
@@ -68,7 +70,7 @@ StringDecoder.prototype.end = utf8End;
// Returns only complete characters in a Buffer
StringDecoder.prototype.text = utf8Text;
-// Attempts to complete a partial character using bytes from a Buffer
+// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
StringDecoder.prototype.fillLast = function(buf) {
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
@@ -92,38 +94,83 @@ function utf8CheckByte(byte) {
return -1;
}
-// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
-// character, returning the total number of bytes needed to complete the partial
-// character (if applicable).
+// Checks at most 3 bytes at the end of a Buffer in order to detect an
+// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
+// needed to complete the UTF-8 character (if applicable) are returned.
function utf8CheckIncomplete(self, buf, i) {
var j = buf.length - 1;
if (j < i)
return 0;
- var nb = utf8CheckByte(buf[j--]);
+ var nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
- self.lastNeed = nb + 1 - (buf.length - j);
+ self.lastNeed = nb - 1;
return nb;
}
- if (j < i)
+ if (--j < i)
return 0;
- nb = utf8CheckByte(buf[j--]);
+ nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
- self.lastNeed = nb + 1 - (buf.length - j);
+ self.lastNeed = nb - 2;
return nb;
}
- if (j < i)
+ if (--j < i)
return 0;
- nb = utf8CheckByte(buf[j--]);
+ nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
- if (nb > 0)
- self.lastNeed = nb + 1 - (buf.length - j);
+ if (nb > 0) {
+ if (nb === 2)
+ nb = 0;
+ else
+ self.lastNeed = nb - 3;
+ }
return nb;
}
return 0;
}
+// Validates as many continuation bytes for a multi-byte UTF-8 character as
+// needed or are available. If we see a non-continuation byte where we expect
+// one, we "replace" the validated continuation bytes we've seen so far with
+// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
+// behavior. The continuation byte check is included three times in the case
+// where all of the continuation bytes for a character exist in the same buffer.
+// It is also done this way as a slight performance increase instead of using a
+// loop.
+function utf8CheckExtraBytes(self, buf, p) {
+ if ((buf[0] & 0xC0) !== 0x80) {
+ self.lastNeed = 0;
+ return '\ufffd'.repeat(p);
+ }
+ if (self.lastNeed > 1 && buf.length > 1) {
+ if ((buf[1] & 0xC0) !== 0x80) {
+ self.lastNeed = 1;
+ return '\ufffd'.repeat(p + 1);
+ }
+ if (self.lastNeed > 2 && buf.length > 2) {
+ if ((buf[2] & 0xC0) !== 0x80) {
+ self.lastNeed = 2;
+ return '\ufffd'.repeat(p + 2);
+ }
+ }
+ }
+}
+
+// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
+function utf8FillLast(buf) {
+ const p = this.lastTotal - this.lastNeed;
+ var r = utf8CheckExtraBytes(this, buf, p);
+ if (r !== undefined)
+ return r;
+ if (this.lastNeed <= buf.length) {
+ buf.copy(this.lastChar, p, 0, this.lastNeed);
+ return this.lastChar.toString(this.encoding, 0, this.lastTotal);
+ }
+ buf.copy(this.lastChar, p, 0, buf.length);
+ this.lastNeed -= buf.length;
+}
+
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
// partial character, the character's bytes are buffered until the required
// number of bytes are available.