1 files changed, 122 insertions, 66 deletions
diff --git a/deps/v8/src/unicode.cc b/deps/v8/src/unicode.cc
index db98be8675..015f8a27f2 100644
--- a/deps/v8/src/unicode.cc
+++ b/deps/v8/src/unicode.cc
@@ -190,8 +190,7 @@ static int LookupMapping(const int32_t* table,
   }
 }
 
-
-static inline size_t NonASCIISequenceLength(byte first) {
+static inline uint8_t NonASCIISequenceLength(byte first) {
   // clang-format off
   static const uint8_t lengths[256] = {
       // The first 128 entries correspond to ASCII characters.
@@ -229,80 +228,137 @@ static inline bool IsContinuationCharacter(byte chr) {
 // This method decodes an UTF-8 value according to RFC 3629.
 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
   size_t length = NonASCIISequenceLength(str[0]);
-  if (length == 0 || max_length < length) {
-    *cursor += 1;
-    return kBadChar;
-  }
-  if (length == 2) {
-    if (!IsContinuationCharacter(str[1])) {
-      *cursor += 1;
-      return kBadChar;
-    }
-    *cursor += 2;
-    return ((str[0] << 6) + str[1]) - 0x00003080;
+
+  // Check continuation characters.
+  size_t max_count = std::min(length, max_length);
+  size_t count = 1;
+  while (count < max_count && IsContinuationCharacter(str[count])) {
+    count++;
   }
+  *cursor += count;
+
+  // There must be enough continuation characters.
+  if (count != length) return kBadChar;
+
+  // Check for overlong sequences and other invalid encodings.
   if (length == 3) {
-    switch (str[0]) {
-      case 0xE0:
-        // Overlong three-byte sequence.
-        if (str[1] < 0xA0 || str[1] > 0xBF) {
-          *cursor += 1;
-          return kBadChar;
-        }
-        break;
-      case 0xED:
-        // High and low surrogate halves.
-        if (str[1] < 0x80 || str[1] > 0x9F) {
-          *cursor += 1;
-          return kBadChar;
-        }
-        break;
-      default:
-        if (!IsContinuationCharacter(str[1])) {
-          *cursor += 1;
-          return kBadChar;
-        }
-    }
-    if (!IsContinuationCharacter(str[2])) {
-      *cursor += 1;
+    if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
+      // Overlong three-byte sequence?
+      return kBadChar;
+    } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
+      // High and low surrogate halves?
       return kBadChar;
     }
-    *cursor += 3;
-    return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
-  }
-  DCHECK(length == 4);
-  switch (str[0]) {
-    case 0xF0:
+  } else if (length == 4) {
+    if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
       // Overlong four-byte sequence.
-      if (str[1] < 0x90 || str[1] > 0xBF) {
-        *cursor += 1;
-        return kBadChar;
-      }
-      break;
-    case 0xF4:
+      return kBadChar;
+    } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
       // Code points outside of the unicode range.
-      if (str[1] < 0x80 || str[1] > 0x8F) {
-        *cursor += 1;
-        return kBadChar;
-      }
-      break;
-    default:
-      if (!IsContinuationCharacter(str[1])) {
-        *cursor += 1;
-        return kBadChar;
-      }
+      return kBadChar;
+    }
   }
-  if (!IsContinuationCharacter(str[2])) {
-    *cursor += 1;
-    return kBadChar;
+
+  // All errors have been handled, so we only have to assemble the result.
+  switch (length) {
+    case 1:
+      return str[0];
+    case 2:
+      return ((str[0] << 6) + str[1]) - 0x00003080;
+    case 3:
+      return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
+    case 4:
+      return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
+             0x03C82080;
   }
-  if (!IsContinuationCharacter(str[3])) {
-    *cursor += 1;
+
+  UNREACHABLE();
+  return kBadChar;
+}
+
+uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
+  DCHECK_NOT_NULL(buffer);
+
+  // The common case: 1-byte Utf8 (and no incomplete char in the buffer)
+  if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {
+    return static_cast<uchar>(next);
+  }
+
+  if (*buffer == 0) {
+    // We're at the start of a new character.
+    uint32_t kind = NonASCIISequenceLength(next);
+    if (kind >= 2 && kind <= 4) {
+      // Start of 2..4 byte character, and no buffer.
+
+      // The mask for the lower bits depends on the kind, and is
+      // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that
+      // with one shift.
+      uint8_t mask = 0x7f >> kind;
+
+      // Store the kind in the top nibble, kind - 1 (i.e., remaining bytes) in
+      // the 2nd nibble, and the value in the bottom three bytes. The 2nd nibble
+      // counts how many bytes are still needed.
+      *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
+      return kIncomplete;
+    } else {
+      // No buffer, and not the start of a 1-byte char (handled at the
+      // beginning), and not the start of a 2..4 byte char? Bad char.
+      *buffer = 0;
+      return kBadChar;
+    }
+  } else if (*buffer <= 0xff) {
+    // We have one unprocessed byte left (saved either by this branch or by the
+    // last else case in this if statement).
+    uchar previous = *buffer;
+    *buffer = 0;
+    uchar t = ValueOfIncremental(previous, buffer);
+    if (t == kIncomplete) {
+      // If we have an incomplete character, process both the previous and the
+      // next byte at once.
+      return ValueOfIncremental(next, buffer);
+    } else {
+      // Otherwise, process the previous byte and save the next byte for next
+      // time.
+      DCHECK_EQ(0, *buffer);
+      *buffer = next;
+      return t;
+    }
+  } else if (IsContinuationCharacter(next)) {
+    // We're inside of a character, as described by buffer.
+
+    // How many bytes (excluding this one) do we still expect?
+    uint8_t bytes_expected = *buffer >> 28;
+    uint8_t bytes_left = (*buffer >> 24) & 0x0f;
+    bytes_left--;
+    // Update the value.
+    uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
+    if (bytes_left) {
+      *buffer = (bytes_expected << 28 | bytes_left << 24 | value);
+      return kIncomplete;
+    } else {
+      *buffer = 0;
+      bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
+                                   (bytes_expected == 3 && value < 0x800);
+      return sequence_was_too_long ? kBadChar : value;
+    }
+  } else {
+    // Within a character, but not a continuation character? Then the
+    // previous char was a bad char. But we need to save the current
+    // one.
+    *buffer = next;
     return kBadChar;
   }
-  *cursor += 4;
-  return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
-         0x03C82080;
+}
+
+uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
+  DCHECK_NOT_NULL(buffer);
+  if (*buffer == 0) {
+    return kBufferEmpty;
+  } else {
+    // Process left-over chars. An incomplete char at the end maps to kBadChar.
+    uchar t = ValueOfIncremental(0, buffer);
+    return (t == kIncomplete) ? kBadChar : t;
+  }
 }
 
 bool Utf8::Validate(const byte* bytes, size_t length) {