summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- lib/internal/readline.js 160
-rw-r--r-- lib/readline.js 8
-rw-r--r-- src/node_i18n.cc 90
-rw-r--r-- test/parallel/test-icu-stringwidth.js 43
4 files changed, 228 insertions, 73 deletions
diff --git a/lib/internal/readline.js b/lib/internal/readline.js
index dbe8775dba..60fe946560 100644
--- a/lib/internal/readline.js
+++ b/lib/internal/readline.js
@@ -1,103 +1,117 @@
'use strict';
-// Regexes used for ansi escape code splitting
+// Regex used for ansi escape code splitting
// eslint-disable-next-line no-control-regex
-const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/;
-const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [
- '(\\d+)(?:;(\\d+))?([~^$])',
- '(?:M([@ #!a`])(.)(.))', // mouse
- '(?:1;)?(\\d+)?([a-zA-Z])'
-].join('|') + ')');
+// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
+// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta
+// Matches all ansi escape code sequences in a string
+const ansi =
+ /[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g;
module.exports = {
emitKeys,
- getStringWidth,
- isFullWidthCodePoint,
stripVTControlCharacters
};
+if (process.binding('config').hasIntl) {
+ const icu = process.binding('icu');
+ /**
+ * Returns the number of columns required to display `str`, as computed
+ * by the ICU binding.
+ *
+ * An integer `str` is handed to the binding unchanged, which measures it
+ * as a single code point. Any other value is converted with String() and
+ * has its ANSI escape sequences stripped before being measured.
+ *
+ * `options.ambiguousAsFullWidth` and `options.expandEmojiSequence` are
+ * coerced to booleans and forwarded to the binding; both default to false.
+ */
+ module.exports.getStringWidth = function getStringWidth(str, options) {
+ options = options || {};
+ if (!Number.isInteger(str))
+ str = stripVTControlCharacters(String(str));
+ return icu.getStringWidth(str,
+ Boolean(options.ambiguousAsFullWidth),
+ Boolean(options.expandEmojiSequence));
+ };
+  /**
+   * Returns true if the character represented by a given Unicode code
+   * point occupies two columns according to ICU. Otherwise returns false.
+   * Non-number input always yields false.
+   *
+   * Set `options.ambiguousAsFullWidth` to treat East Asian "ambiguous"
+   * code points as full-width; it defaults to false.
+   */
+  module.exports.isFullWidthCodePoint =
+    function isFullWidthCodePoint(code, options) {
+      if (typeof code !== 'number')
+        return false;
+      // The binding coerces its second argument to a boolean, so pass the
+      // flag itself. Handing over the options object would switch on
+      // ambiguous-as-full-width for any object, even an empty one.
+      const ambiguousAsFullWidth =
+        Boolean(options && options.ambiguousAsFullWidth);
+      return icu.getStringWidth(code, ambiguousAsFullWidth) === 2;
+    };
+} else {
+ /**
+ * Returns the number of columns required to display the given string.
+ */
+ module.exports.getStringWidth = function getStringWidth(str) {
+ if (Number.isInteger(str))
+ return module.exports.isFullWidthCodePoint(str) ? 2 : 1;
-/**
- * Returns the number of columns required to display the given string.
- */
-function getStringWidth(str) {
- let width = 0;
+ let width = 0;
- str = stripVTControlCharacters(str);
+ str = stripVTControlCharacters(String(str));
- for (var i = 0; i < str.length; i++) {
- const code = str.codePointAt(i);
+ for (var i = 0; i < str.length; i++) {
+ const code = str.codePointAt(i);
- if (code >= 0x10000) { // surrogates
- i++;
- }
+ if (code >= 0x10000) { // surrogates
+ i++;
+ }
- if (isFullWidthCodePoint(code)) {
- width += 2;
- } else {
- width++;
+ if (module.exports.isFullWidthCodePoint(code)) {
+ width += 2;
+ } else {
+ width++;
+ }
}
- }
-
- return width;
-}
+ return width;
+ };
-/**
- * Returns true if the character represented by a given
- * Unicode code point is full-width. Otherwise returns false.
- */
-function isFullWidthCodePoint(code) {
- if (isNaN(code)) {
- return false;
- }
+ /**
+ * Returns true if the character represented by a given
+ * Unicode code point is full-width. Otherwise returns false.
+ */
+ module.exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) {
+ if (!Number.isInteger(code)) {
+ return false;
+ }
- // Code points are derived from:
- // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
- if (code >= 0x1100 && (
- code <= 0x115f || // Hangul Jamo
- 0x2329 === code || // LEFT-POINTING ANGLE BRACKET
- 0x232a === code || // RIGHT-POINTING ANGLE BRACKET
- // CJK Radicals Supplement .. Enclosed CJK Letters and Months
- (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
- // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
- 0x3250 <= code && code <= 0x4dbf ||
- // CJK Unified Ideographs .. Yi Radicals
- 0x4e00 <= code && code <= 0xa4c6 ||
- // Hangul Jamo Extended-A
- 0xa960 <= code && code <= 0xa97c ||
- // Hangul Syllables
- 0xac00 <= code && code <= 0xd7a3 ||
- // CJK Compatibility Ideographs
- 0xf900 <= code && code <= 0xfaff ||
- // Vertical Forms
- 0xfe10 <= code && code <= 0xfe19 ||
- // CJK Compatibility Forms .. Small Form Variants
- 0xfe30 <= code && code <= 0xfe6b ||
- // Halfwidth and Fullwidth Forms
- 0xff01 <= code && code <= 0xff60 ||
- 0xffe0 <= code && code <= 0xffe6 ||
- // Kana Supplement
- 0x1b000 <= code && code <= 0x1b001 ||
- // Enclosed Ideographic Supplement
- 0x1f200 <= code && code <= 0x1f251 ||
- // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
- 0x20000 <= code && code <= 0x3fffd)) {
- return true;
- }
+ // Code points are derived from:
+ // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
+ if (code >= 0x1100 && (
+ code <= 0x115f || // Hangul Jamo
+ 0x2329 === code || // LEFT-POINTING ANGLE BRACKET
+ 0x232a === code || // RIGHT-POINTING ANGLE BRACKET
+ // CJK Radicals Supplement .. Enclosed CJK Letters and Months
+ (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
+ // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
+ 0x3250 <= code && code <= 0x4dbf ||
+ // CJK Unified Ideographs .. Yi Radicals
+ 0x4e00 <= code && code <= 0xa4c6 ||
+ // Hangul Jamo Extended-A
+ 0xa960 <= code && code <= 0xa97c ||
+ // Hangul Syllables
+ 0xac00 <= code && code <= 0xd7a3 ||
+ // CJK Compatibility Ideographs
+ 0xf900 <= code && code <= 0xfaff ||
+ // Vertical Forms
+ 0xfe10 <= code && code <= 0xfe19 ||
+ // CJK Compatibility Forms .. Small Form Variants
+ 0xfe30 <= code && code <= 0xfe6b ||
+ // Halfwidth and Fullwidth Forms
+ 0xff01 <= code && code <= 0xff60 ||
+ 0xffe0 <= code && code <= 0xffe6 ||
+ // Kana Supplement
+ 0x1b000 <= code && code <= 0x1b001 ||
+ // Enclosed Ideographic Supplement
+ 0x1f200 <= code && code <= 0x1f251 ||
+ // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
+ 0x20000 <= code && code <= 0x3fffd)) {
+ return true;
+ }
- return false;
+ return false;
+ };
}
-
/**
* Tries to remove all VT control characters. Use to estimate displayed
* string width. May be buggy due to not running a real state machine
*/
function stripVTControlCharacters(str) {
- str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), '');
- return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), '');
+ return str.replace(ansi, '');
}
diff --git a/lib/readline.js b/lib/readline.js
index 3927402f63..9b925a6d99 100644
--- a/lib/readline.js
+++ b/lib/readline.js
@@ -124,6 +124,14 @@ function Interface(input, output, completer, terminal) {
function onkeypress(s, key) {
self._ttyWrite(s, key);
+ if (key && key.sequence) {
+ // If key.sequence starts with one half of a surrogate pair
+ // (>= 0xd800 and <= 0xdfff), refresh the line so the character
+ // is displayed appropriately. codePointAt(0) only yields a value
+ // in that range when the leading surrogate is unpaired.
+ const ch = key.sequence.codePointAt(0);
+ if (ch >= 0xd800 && ch <= 0xdfff)
+ self._refreshLine();
+ }
}
function onresize() {
diff --git a/src/node_i18n.cc b/src/node_i18n.cc
index f89ae40a55..e77591babf 100644
--- a/src/node_i18n.cc
+++ b/src/node_i18n.cc
@@ -31,6 +31,7 @@
#include "v8.h"
#include <unicode/putil.h>
+#include <unicode/uchar.h>
#include <unicode/udata.h>
#include <unicode/uidna.h>
@@ -185,6 +186,94 @@ static void ToASCII(const FunctionCallbackInfo<Value>& args) {
len).ToLocalChecked());
}
+// This is similar to wcwidth except that it takes the current unicode
+// character properties database into consideration, allowing it to
+// correctly calculate the column widths of things like emoji and
+// newer wide characters. wcwidth, on the other hand, uses a fixed
+// algorithm that does not take things like emoji into proper
+// consideration.
+//
+// Returns 0 for codepoints that are undefined, control characters, have
+// a non-zero combining class, or are emoji modifiers. Returns 2 for
+// fullwidth and wide codepoints, for ambiguous codepoints when
+// ambiguous_as_full_width is true, and for ambiguous or neutral
+// codepoints that have the emoji presentation property. Returns 1 for
+// everything else.
+static int GetColumnWidth(UChar32 codepoint,
+ bool ambiguous_as_full_width = false) {
+ if (!u_isdefined(codepoint) ||
+ u_iscntrl(codepoint) ||
+ u_getCombiningClass(codepoint) > 0 ||
+ u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER)) {
+ return 0;
+ }
+ // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
+ // codepoint as being full width, wide, ambiguous, neutral, narrow,
+ // or halfwidth.
+ const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
+ switch (eaw) {
+ case U_EA_FULLWIDTH:
+ case U_EA_WIDE:
+ return 2;
+ case U_EA_AMBIGUOUS:
+ // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
+ if (ambiguous_as_full_width) {
+ return 2;
+ }
+ // Fall through if ambiguous_as_full_width is false.
+ case U_EA_NEUTRAL:
+ if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
+ return 2;
+ }
+ // Fall through
+ case U_EA_HALFWIDTH:
+ case U_EA_NARROW:
+ default:
+ return 1;
+ }
+}
+
+// Returns the column width for the given String.
+//
+//   args[0]  the value to measure. A number is measured as a single
+//            codepoint; anything else is converted to a UTF-16 string
+//            and the widths of its codepoints are summed.
+//   args[1]  ambiguous_as_full_width: count East Asian "ambiguous"
+//            codepoints as two columns instead of one.
+//   args[2]  expand_emoji_sequence: count every codepoint of an emoji
+//            ZWJ sequence instead of only the first glyph.
+//
+// Leaves the return value unset when called without arguments.
+static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
+  Environment* env = Environment::GetCurrent(args);
+  if (args.Length() < 1)
+    return;
+
+  bool ambiguous_as_full_width = args[1]->BooleanValue();
+  bool expand_emoji_sequence = args[2]->BooleanValue();
+
+  if (args[0]->IsNumber()) {
+    args.GetReturnValue().Set(
+        GetColumnWidth(args[0]->Uint32Value(),
+                       ambiguous_as_full_width));
+    return;
+  }
+
+  TwoByteValue value(env->isolate(), args[0]);
+  // reinterpret_cast is required by windows to compile
+  UChar* str = reinterpret_cast<UChar*>(*value);
+  // c has to start out defined: the loop copies it into p before the
+  // first U16_NEXT has stored anything in it. Zero can never be mistaken
+  // for a zero width joiner.
+  UChar32 c = 0;
+  UChar32 p;
+  size_t n = 0;
+  uint32_t width = 0;
+
+  while (n < value.length()) {
+    p = c;  // Remember the previous codepoint.
+    U16_NEXT(str, n, value.length(), c);
+    // Don't count individual emoji codepoints that occur within an
+    // emoji sequence. This is not necessarily foolproof. Some
+    // environments display emoji sequences in the appropriate
+    // condensed form (as a single emoji glyph), other environments
+    // may not understand an emoji sequence and will display each
+    // individual emoji separately. When this happens, the width
+    // calculated will be off, and there's no reliable way of knowing
+    // in advance if a particular sequence is going to be supported.
+    // The expand_emoji_sequence option allows the caller to skip this
+    // check and count each code within an emoji sequence separately.
+    if (!expand_emoji_sequence &&
+        p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
+        (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
+         u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
+      continue;
+    }
+    width += GetColumnWidth(c, ambiguous_as_full_width);
+  }
+  args.GetReturnValue().Set(width);
+}
+
void Init(Local<Object> target,
Local<Value> unused,
Local<Context> context,
@@ -192,6 +281,7 @@ void Init(Local<Object> target,
Environment* env = Environment::GetCurrent(context);
env->SetMethod(target, "toUnicode", ToUnicode);
env->SetMethod(target, "toASCII", ToASCII);
+ env->SetMethod(target, "getStringWidth", GetStringWidth);
}
} // namespace i18n
diff --git a/test/parallel/test-icu-stringwidth.js b/test/parallel/test-icu-stringwidth.js
new file mode 100644
index 0000000000..5b66f00c32
--- /dev/null
+++ b/test/parallel/test-icu-stringwidth.js
@@ -0,0 +1,43 @@
+// Flags: --expose_internals
+'use strict';
+
+// Checks the ICU-backed readline.getStringWidth(): plain, wide, emoji,
+// control and combining characters, plus the expandEmojiSequence and
+// ambiguousAsFullWidth options. Skipped when Node.js is built without Intl.
+
+const common = require('../common');
+const assert = require('assert');
+const readline = require('internal/readline');
+
+if (!process.binding('config').hasIntl) {
+  common.skip('missing intl... skipping test');
+  return;
+}
+
+// Emoji are written as escape sequences so the code points survive any
+// re-encoding of this file.
+// U+1F445 TONGUE
+const tongue = '\ud83d\udc45';
+// U+1F469 ZWJ U+1F469 ZWJ U+1F467 ZWJ U+1F467
+// (family: woman, woman, girl, girl)
+const family =
+  '\ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d\udc67';
+
+// Test column width
+assert.strictEqual(readline.getStringWidth('a'), 1);
+assert.strictEqual(readline.getStringWidth('丁'), 2);
+assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 2);
+assert.strictEqual(readline.getStringWidth(tongue), 2);
+assert.strictEqual(readline.getStringWidth('\n'), 0);
+assert.strictEqual(readline.getStringWidth('\u200Ef\u200F'), 1);
+assert.strictEqual(readline.getStringWidth(97), 1);
+
+// The following is an emoji sequence. In some implementations, it is
+// represented as a single glyph, in other implementations as a sequence
+// of individual glyphs. By default, the algorithm will assume the single
+// glyph interpretation and return a value of 2. By passing the
+// expandEmojiSequence: true option, each component will be counted
+// individually.
+assert.strictEqual(readline.getStringWidth(family), 2);
+assert.strictEqual(
+  readline.getStringWidth(family, {expandEmojiSequence: true}), 8);
+
+// By default, unicode characters whose width is considered ambiguous will
+// be considered half-width. For these characters, getStringWidth will return
+// 1. In some contexts, however, it is more appropriate to consider them full
+// width. By default, the algorithm will assume half width. By passing
+// the ambiguousAsFullWidth: true option, ambiguous characters will be counted
+// as 2 columns.
+assert.strictEqual(readline.getStringWidth('\u01d4'), 1);
+assert.strictEqual(
+  readline.getStringWidth('\u01d4', {ambiguousAsFullWidth: true}), 2);
+
+// Control chars and combining chars are zero
+assert.strictEqual(readline.getStringWidth('\u200E\n\u220A\u20D2'), 1);