4 files changed, 228 insertions, 73 deletions
diff --git a/lib/internal/readline.js b/lib/internal/readline.js
index dbe8775dba..60fe946560 100644
--- a/lib/internal/readline.js
+++ b/lib/internal/readline.js
@@ -1,103 +1,117 @@
 'use strict';
 
-// Regexes used for ansi escape code splitting
+// Regex used for ansi escape code splitting
 // eslint-disable-next-line no-control-regex
-const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/;
-const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [
-  '(\\d+)(?:;(\\d+))?([~^$])',
-  '(?:M([@ #!a`])(.)(.))', // mouse
-  '(?:1;)?(\\d+)?([a-zA-Z])'
-].join('|') + ')');
+// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
+// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta
+// Matches all ansi escape code sequences in a string
+const ansi =
+  /[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g;
 
 
 module.exports = {
   emitKeys,
-  getStringWidth,
-  isFullWidthCodePoint,
   stripVTControlCharacters
 };
 
+if (process.binding('config').hasIntl) {
+  const icu = process.binding('icu');
+  module.exports.getStringWidth = function getStringWidth(str, options) {
+    options = options || {};  // both option flags default to false
+    if (!Number.isInteger(str))  // integers pass through as code points
+      str = stripVTControlCharacters(String(str));
+    return icu.getStringWidth(str,
+                              Boolean(options.ambiguousAsFullWidth),
+                              Boolean(options.expandEmojiSequence));
+  };
+  // Width 2 means full width; getStringWidth coerces `options` for ICU.
+  module.exports.isFullWidthCodePoint =
+    function isFullWidthCodePoint(code, options) {
+      return Number.isInteger(code) &&
+             module.exports.getStringWidth(code, options) === 2;
+    };
+} else {
+  /**
+   * Returns the number of columns required to display the given string.
+   */
+  module.exports.getStringWidth = function getStringWidth(str) {
+    if (Number.isInteger(str))
+      return module.exports.isFullWidthCodePoint(str) ? 2 : 1;
 
-/**
- * Returns the number of columns required to display the given string.
- */
-function getStringWidth(str) {
-  let width = 0;
+    let width = 0;
 
-  str = stripVTControlCharacters(str);
+    str = stripVTControlCharacters(String(str));
 
-  for (var i = 0; i < str.length; i++) {
-    const code = str.codePointAt(i);
+    for (var i = 0; i < str.length; i++) {
+      const code = str.codePointAt(i);
 
-    if (code >= 0x10000) { // surrogates
-      i++;
-    }
+      if (code >= 0x10000) { // surrogates
+        i++;
+      }
 
-    if (isFullWidthCodePoint(code)) {
-      width += 2;
-    } else {
-      width++;
+      if (module.exports.isFullWidthCodePoint(code)) {
+        width += 2;
+      } else {
+        width++;
+      }
     }
-  }
-
-  return width;
-}
 
+    return width;
+  };
 
-/**
- * Returns true if the character represented by a given
- * Unicode code point is full-width. Otherwise returns false.
- */
-function isFullWidthCodePoint(code) {
-  if (isNaN(code)) {
-    return false;
-  }
+  /**
+   * Returns true if the character represented by a given
+   * Unicode code point is full-width. Otherwise returns false.
+   */
+  module.exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) {
+    if (!Number.isInteger(code)) {
+      return false;
+    }
 
-  // Code points are derived from:
-  // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
-  if (code >= 0x1100 && (
-      code <= 0x115f ||  // Hangul Jamo
-      0x2329 === code || // LEFT-POINTING ANGLE BRACKET
-      0x232a === code || // RIGHT-POINTING ANGLE BRACKET
-      // CJK Radicals Supplement .. Enclosed CJK Letters and Months
-      (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
-      // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
-      0x3250 <= code && code <= 0x4dbf ||
-      // CJK Unified Ideographs .. Yi Radicals
-      0x4e00 <= code && code <= 0xa4c6 ||
-      // Hangul Jamo Extended-A
-      0xa960 <= code && code <= 0xa97c ||
-      // Hangul Syllables
-      0xac00 <= code && code <= 0xd7a3 ||
-      // CJK Compatibility Ideographs
-      0xf900 <= code && code <= 0xfaff ||
-      // Vertical Forms
-      0xfe10 <= code && code <= 0xfe19 ||
-      // CJK Compatibility Forms .. Small Form Variants
-      0xfe30 <= code && code <= 0xfe6b ||
-      // Halfwidth and Fullwidth Forms
-      0xff01 <= code && code <= 0xff60 ||
-      0xffe0 <= code && code <= 0xffe6 ||
-      // Kana Supplement
-      0x1b000 <= code && code <= 0x1b001 ||
-      // Enclosed Ideographic Supplement
-      0x1f200 <= code && code <= 0x1f251 ||
-      // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
-      0x20000 <= code && code <= 0x3fffd)) {
-    return true;
-  }
+    // Code points are derived from:
+    // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
+    if (code >= 0x1100 && (
+        code <= 0x115f ||  // Hangul Jamo
+        0x2329 === code || // LEFT-POINTING ANGLE BRACKET
+        0x232a === code || // RIGHT-POINTING ANGLE BRACKET
+        // CJK Radicals Supplement .. Enclosed CJK Letters and Months
+        (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
+        // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
+        0x3250 <= code && code <= 0x4dbf ||
+        // CJK Unified Ideographs .. Yi Radicals
+        0x4e00 <= code && code <= 0xa4c6 ||
+        // Hangul Jamo Extended-A
+        0xa960 <= code && code <= 0xa97c ||
+        // Hangul Syllables
+        0xac00 <= code && code <= 0xd7a3 ||
+        // CJK Compatibility Ideographs
+        0xf900 <= code && code <= 0xfaff ||
+        // Vertical Forms
+        0xfe10 <= code && code <= 0xfe19 ||
+        // CJK Compatibility Forms .. Small Form Variants
+        0xfe30 <= code && code <= 0xfe6b ||
+        // Halfwidth and Fullwidth Forms
+        0xff01 <= code && code <= 0xff60 ||
+        0xffe0 <= code && code <= 0xffe6 ||
+        // Kana Supplement
+        0x1b000 <= code && code <= 0x1b001 ||
+        // Enclosed Ideographic Supplement
+        0x1f200 <= code && code <= 0x1f251 ||
+        // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
+        0x20000 <= code && code <= 0x3fffd)) {
+      return true;
+    }
 
-  return false;
+    return false;
+  };
 }
 
-
 /**
  * Tries to remove all VT control characters. Use to estimate displayed
  * string width. May be buggy due to not running a real state machine
  */
 function stripVTControlCharacters(str) {
-  str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), '');
-  return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), '');
+  return str.replace(ansi, '');
 }
 
 
diff --git a/lib/readline.js b/lib/readline.js
index 3927402f63..9b925a6d99 100644
--- a/lib/readline.js
+++ b/lib/readline.js
@@ -124,6 +124,14 @@ function Interface(input, output, completer, terminal) {
 
   function onkeypress(s, key) {
     self._ttyWrite(s, key);
+    if (key && key.sequence) {
+      // codePointAt(0) lands in 0xd800..0xdfff only when key.sequence
+      // starts with an unpaired half of a surrogate pair; refresh the
+      // line in that case so the character is displayed appropriately.
+      const ch = key.sequence.codePointAt(0);
+      if (ch >= 0xd800 && ch <= 0xdfff)
+        self._refreshLine();
+    }
   }
 
   function onresize() {
diff --git a/src/node_i18n.cc b/src/node_i18n.cc
index f89ae40a55..e77591babf 100644
--- a/src/node_i18n.cc
+++ b/src/node_i18n.cc
@@ -31,6 +31,7 @@
 #include "v8.h"
 
 #include <unicode/putil.h>
+#include <unicode/uchar.h>
 #include <unicode/udata.h>
 #include <unicode/uidna.h>
 
@@ -185,6 +186,94 @@ static void ToASCII(const FunctionCallbackInfo<Value>& args) {
                           len).ToLocalChecked());
 }
 
+// Returns the terminal column width (0, 1 or 2) of a single codepoint.
+// Unlike wcwidth, which uses a fixed algorithm, this consults the current
+// Unicode character properties, so emoji and newer wide characters are
+// measured correctly. Undefined, control and combining codepoints and
+// emoji modifiers are zero width. When ambiguous_as_full_width is set,
+// East Asian "ambiguous" codepoints always count as 2 columns.
+static int GetColumnWidth(UChar32 codepoint,
+                          bool ambiguous_as_full_width = false) {
+  if (!u_isdefined(codepoint) ||
+      u_iscntrl(codepoint) ||
+      u_getCombiningClass(codepoint) > 0 ||
+      u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER)) {
+    return 0;
+  }
+  // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
+  // codepoint as being full width, wide, ambiguous, neutral, narrow,
+  // or halfwidth.
+  const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
+  switch (eaw) {
+    case U_EA_FULLWIDTH:
+    case U_EA_WIDE:
+      return 2;
+    case U_EA_AMBIGUOUS:
+      // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
+      if (ambiguous_as_full_width) {
+        return 2;
+      }
+      // Fall through if ambiguous_as_full_width is false.
+    case U_EA_NEUTRAL:
+      if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
+        return 2;
+      }
+      // Fall through
+    case U_EA_HALFWIDTH:
+    case U_EA_NARROW:
+    default:
+      return 1;
+  }
+}
+
+// Returns the column width of args[0] (a codepoint number or a string).
+static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
+  Environment* env = Environment::GetCurrent(args);
+  if (args.Length() < 1)
+    return;
+
+  bool ambiguous_as_full_width = args[1]->BooleanValue();
+  bool expand_emoji_sequence = args[2]->BooleanValue();
+
+  if (args[0]->IsNumber()) {
+    args.GetReturnValue().Set(
+        GetColumnWidth(args[0]->Uint32Value(),
+                       ambiguous_as_full_width));
+    return;
+  }
+
+  TwoByteValue value(env->isolate(), args[0]);
+  // reinterpret_cast is required by windows to compile
+  UChar* str = reinterpret_cast<UChar*>(*value);
+  UChar32 c = 0;  // read into p before the first U16_NEXT assigns it
+  UChar32 p;
+  size_t n = 0;
+  uint32_t width = 0;
+
+  while (n < value.length()) {
+    p = c;
+    U16_NEXT(str, n, value.length(), c);
+    // Don't count individual emoji codepoints that occur within an
+    // emoji sequence. This is not necessarily foolproof. Some
+    // environments display emoji sequences in the appropriate
+    // condensed form (as a single emoji glyph), other environments
+    // may not understand an emoji sequence and will display each
+    // individual emoji separately. When this happens, the width
+    // calculated will be off, and there's no reliable way of knowing
+    // in advance if a particular sequence is going to be supported.
+    // The expand_emoji_sequence option allows the caller to skip this
+    // check and count each code within an emoji sequence separately.
+    if (!expand_emoji_sequence &&
+        n > 0 && p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
+        (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
+         u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
+      continue;
+    }
+    width += GetColumnWidth(c, ambiguous_as_full_width);
+  }
+  args.GetReturnValue().Set(width);
+}
+
 void Init(Local<Object> target,
           Local<Value> unused,
           Local<Context> context,
@@ -192,6 +281,7 @@ void Init(Local<Object> target,
   Environment* env = Environment::GetCurrent(context);
   env->SetMethod(target, "toUnicode", ToUnicode);
   env->SetMethod(target, "toASCII", ToASCII);
+  env->SetMethod(target, "getStringWidth", GetStringWidth);
 }
 
 }  // namespace i18n
diff --git a/test/parallel/test-icu-stringwidth.js b/test/parallel/test-icu-stringwidth.js
new file mode 100644
index 0000000000..5b66f00c32
--- /dev/null
+++ b/test/parallel/test-icu-stringwidth.js
@@ -0,0 +1,43 @@
+// Flags: --expose_internals
+'use strict';
+
+const common = require('../common');
+const assert = require('assert');
+const readline = require('internal/readline');
+
+if (!process.binding('config').hasIntl) {
+  common.skip('missing intl... skipping test');
+  return;
+}
+
+// Test column width
+assert.strictEqual(readline.getStringWidth('a'), 1);
+assert.strictEqual(readline.getStringWidth('丁'), 2);
+assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 2);
+assert.strictEqual(readline.getStringWidth('👅'), 2);
+assert.strictEqual(readline.getStringWidth('\n'), 0);
+assert.strictEqual(readline.getStringWidth('\u200Ef\u200F'), 1);
+assert.strictEqual(readline.getStringWidth(97), 1);
+
+// The following is an emoji sequence. In some implementations, it is
+// represented as a single glyph, in other implementations as a sequence
+// of individual glyphs. By default, the algorithm will assume the single
+// glyph interpretation and return a value of 2. By passing the
+// expandEmojiSequence: true option, each component will be counted
+// individually.
+assert.strictEqual(readline.getStringWidth('👩‍👩‍👧‍👧'), 2);
+assert.strictEqual(
+    readline.getStringWidth('👩‍👩‍👧‍👧', {expandEmojiSequence: true}), 8);
+
+// By default, Unicode characters whose width is considered ambiguous are
+// treated as half-width, so getStringWidth returns 1 for each of them.
+// In some contexts, however, it is more appropriate to consider them
+// full-width. Passing the ambiguousAsFullWidth: true option makes
+// getStringWidth count every ambiguous character as 2 columns, as the
+// second assertion below demonstrates.
+assert.strictEqual(readline.getStringWidth('\u01d4'), 1);
+assert.strictEqual(
+    readline.getStringWidth('\u01d4', {ambiguousAsFullWidth: true}), 2);
+
+// Control chars and combining chars are zero
+assert.strictEqual(readline.getStringWidth('\u200E\n\u220A\u20D2'), 1);