// Copyright Joyent, Inc. and other Node contributors. // // Permission is hereby granted, free of charge, to any person obtaining a // copy of this software and associated documentation files (the // "Software"), to deal in the Software without restriction, including // without limitation the rights to use, copy, modify, merge, publish, // distribute, sublicense, and/or sell copies of the Software, and to permit // persons to whom the Software is furnished to do so, subject to the // following conditions: // // The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE // USE OR OTHER DEALINGS IN THE SOFTWARE. /* * notes: by srl295 * - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data * ( stubdata/libicudata.a ) containing nothing, no data, and it's also * linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT * macro names. That's the "english+root" data. * * If icu_data_path is non-null, the user has provided a path and we assume * it goes somewhere useful. We set that path in ICU, and exit. * If icu_data_path is null, they haven't set a path and we want the * "english+root" data. We call * udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...) * to load up the english+root data. * * - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full * data. All of the variables and command line options for changing data at * runtime are disabled, as they wouldn't fully override the internal data. * See: http://bugs.icu-project.org/trac/ticket/10924 */ #include "node_i18n.h" #if defined(NODE_HAVE_I18N_SUPPORT) #include "node.h" #include "node_buffer.h" #include "env.h" #include "env-inl.h" #include "util.h" #include "util-inl.h" #include "v8.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NODE_HAVE_SMALL_ICU /* if this is defined, we have a 'secondary' entry point. compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */ #define SMALL_ICUDATA_ENTRY_POINT \ SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME) #define SMALL_DEF2(major, suff) SMALL_DEF(major, suff) #ifndef U_LIB_SUFFIX_C_NAME #define SMALL_DEF(major, suff) icusmdt##major##_dat #else #define SMALL_DEF(major, suff) icusmdt##suff##major##_dat #endif extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[]; #endif namespace node { using v8::Context; using v8::FunctionCallbackInfo; using v8::Isolate; using v8::Local; using v8::MaybeLocal; using v8::Object; using v8::String; using v8::Value; namespace i18n { namespace { template MaybeLocal ToBufferEndian(Environment* env, MaybeStackBuffer* buf) { MaybeLocal ret = Buffer::New(env, buf); if (ret.IsEmpty()) return ret; static_assert(sizeof(T) == 1 || sizeof(T) == 2, "Currently only one- or two-byte buffers are supported"); if (sizeof(T) > 1 && IsBigEndian()) { SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf); SwapBytes16(retbuf_data, retbuf_length); } return ret; } struct Converter { explicit Converter(const char* name, const char* sub = NULL) : conv(nullptr) { UErrorCode status = U_ZERO_ERROR; conv = ucnv_open(name, &status); CHECK(U_SUCCESS(status)); if (sub != NULL) { ucnv_setSubstChars(conv, sub, strlen(sub), &status); } } ~Converter() { ucnv_close(conv); } UConverter* conv; }; // One-Shot Converters void CopySourceBuffer(MaybeStackBuffer* dest, const char* data, const size_t length, const size_t length_in_chars) { dest->AllocateSufficientStorage(length_in_chars); char* dst = reinterpret_cast(**dest); memcpy(dst, data, length); if (IsBigEndian()) { SwapBytes16(dst, length); } } typedef MaybeLocal (*TranscodeFunc)(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status); MaybeLocal Transcode(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeLocal ret; MaybeStackBuffer result; Converter to(toEncoding, "?"); Converter from(fromEncoding); const uint32_t limit = source_length * ucnv_getMaxCharSize(to.conv); result.AllocateSufficientStorage(limit); char* target = *result; ucnv_convertEx(to.conv, from.conv, &target, target + limit, &source, source + source_length, nullptr, nullptr, nullptr, nullptr, true, true, status); if (U_SUCCESS(*status)) { result.SetLength(target - &result[0]); ret = ToBufferEndian(env, &result); } return ret; } MaybeLocal TranscodeToUcs2(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeLocal ret; MaybeStackBuffer destbuf(source_length); Converter from(fromEncoding); const size_t length_in_chars = source_length * sizeof(UChar); ucnv_toUChars(from.conv, *destbuf, length_in_chars, source, source_length, status); if (U_SUCCESS(*status)) ret = ToBufferEndian(env, &destbuf); return ret; } MaybeLocal TranscodeFromUcs2(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeStackBuffer sourcebuf; MaybeLocal ret; Converter to(toEncoding, "?"); const size_t length_in_chars = source_length / sizeof(UChar); CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); MaybeStackBuffer destbuf(length_in_chars); const uint32_t len = ucnv_fromUChars(to.conv, *destbuf, length_in_chars, *sourcebuf, length_in_chars, status); if (U_SUCCESS(*status)) { destbuf.SetLength(len); ret = ToBufferEndian(env, &destbuf); } return ret; } MaybeLocal TranscodeUcs2FromUtf8(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeStackBuffer destbuf; int32_t result_length; u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length, source, source_length, status); MaybeLocal ret; if (U_SUCCESS(*status)) { destbuf.SetLength(result_length); ret = ToBufferEndian(env, &destbuf); } else if (*status == U_BUFFER_OVERFLOW_ERROR) { *status = U_ZERO_ERROR; destbuf.AllocateSufficientStorage(result_length); u_strFromUTF8(*destbuf, result_length, &result_length, source, source_length, status); if (U_SUCCESS(*status)) { destbuf.SetLength(result_length); ret = ToBufferEndian(env, &destbuf); } } return ret; } MaybeLocal TranscodeUtf8FromUcs2(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeLocal ret; const size_t length_in_chars = source_length / sizeof(UChar); int32_t result_length; MaybeStackBuffer sourcebuf; MaybeStackBuffer destbuf; CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); u_strToUTF8(*destbuf, destbuf.capacity(), &result_length, *sourcebuf, length_in_chars, status); if (U_SUCCESS(*status)) { destbuf.SetLength(result_length); ret = ToBufferEndian(env, &destbuf); } else if (*status == U_BUFFER_OVERFLOW_ERROR) { *status = U_ZERO_ERROR; destbuf.AllocateSufficientStorage(result_length); u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf, length_in_chars, status); if (U_SUCCESS(*status)) { destbuf.SetLength(result_length); ret = ToBufferEndian(env, &destbuf); } } return ret; } const char* EncodingName(const enum encoding encoding) { switch (encoding) { case ASCII: return "us-ascii"; case LATIN1: return "iso8859-1"; case UCS2: return "utf16le"; case UTF8: return "utf-8"; default: return NULL; } } bool SupportedEncoding(const enum encoding encoding) { switch (encoding) { case ASCII: case LATIN1: case UCS2: case UTF8: return true; default: return false; } } void Transcode(const FunctionCallbackInfo&args) { Environment* env = Environment::GetCurrent(args); Isolate* isolate = env->isolate(); UErrorCode status = U_ZERO_ERROR; MaybeLocal result; THROW_AND_RETURN_UNLESS_BUFFER(env, args[0]); SPREAD_BUFFER_ARG(args[0], ts_obj); const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER); const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER); if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) { TranscodeFunc tfn = &Transcode; switch (fromEncoding) { case ASCII: case LATIN1: if (toEncoding == UCS2) tfn = &TranscodeToUcs2; break; case UTF8: if (toEncoding == UCS2) tfn = &TranscodeUcs2FromUtf8; break; case UCS2: switch (toEncoding) { case UCS2: tfn = &Transcode; break; case UTF8: tfn = &TranscodeUtf8FromUcs2; break; default: tfn = TranscodeFromUcs2; } break; default: // This should not happen because of the SupportedEncoding checks ABORT(); } result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding), ts_obj_data, ts_obj_length, &status); } else { status = U_ILLEGAL_ARGUMENT_ERROR; } if (result.IsEmpty()) return args.GetReturnValue().Set(status); return args.GetReturnValue().Set(result.ToLocalChecked()); } void ICUErrorName(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); UErrorCode status = static_cast(args[0]->Int32Value()); args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), u_errorName(status), v8::NewStringType::kNormal).ToLocalChecked()); } #define TYPE_ICU "icu" #define TYPE_UNICODE "unicode" #define TYPE_CLDR "cldr" #define TYPE_TZ "tz" /** * This is the workhorse function that deals with the actual version info. * Get an ICU version. * @param type the type of version to get. One of VERSION_TYPES * @param buf optional buffer for result * @param status ICU error status. If failure, assume result is undefined. * @return version number, or NULL. May or may not be buf. */ const char* GetVersion(const char* type, char buf[U_MAX_VERSION_STRING_LENGTH], UErrorCode* status) { if (!strcmp(type, TYPE_ICU)) { return U_ICU_VERSION; } else if (!strcmp(type, TYPE_UNICODE)) { return U_UNICODE_VERSION; } else if (!strcmp(type, TYPE_TZ)) { return TimeZone::getTZDataVersion(*status); } else if (!strcmp(type, TYPE_CLDR)) { UVersionInfo versionArray; ulocdata_getCLDRVersion(versionArray, status); if (U_SUCCESS(*status)) { u_versionToString(versionArray, buf); return buf; } } // Fall through - unknown type or error case return nullptr; } void GetVersion(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); if ( args.Length() == 0 ) { // With no args - return a comma-separated list of allowed values args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), TYPE_ICU "," TYPE_UNICODE "," TYPE_CLDR "," TYPE_TZ)); } else { CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); UErrorCode status = U_ZERO_ERROR; char buf[U_MAX_VERSION_STRING_LENGTH] = ""; // Possible output buffer. const char* versionString = GetVersion(*val, buf, &status); if (U_SUCCESS(status) && versionString) { // Success. args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), versionString)); } } } } // anonymous namespace bool InitializeICUDirectory(const std::string& path) { if (path.empty()) { UErrorCode status = U_ZERO_ERROR; #ifdef NODE_HAVE_SMALL_ICU // install the 'small' data. udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status); #else // !NODE_HAVE_SMALL_ICU // no small data, so nothing to do. #endif // !NODE_HAVE_SMALL_ICU return (status == U_ZERO_ERROR); } else { u_setDataDirectory(path.c_str()); return true; // No error. } } int32_t ToUnicode(MaybeStackBuffer* buf, const char* input, size_t length, bool lenient) { UErrorCode status = U_ZERO_ERROR; uint32_t options = UIDNA_DEFAULT; options |= UIDNA_NONTRANSITIONAL_TO_UNICODE; UIDNA* uidna = uidna_openUTS46(options, &status); if (U_FAILURE(status)) return -1; UIDNAInfo info = UIDNA_INFO_INITIALIZER; int32_t len = uidna_nameToUnicodeUTF8(uidna, input, length, **buf, buf->capacity(), &info, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { status = U_ZERO_ERROR; buf->AllocateSufficientStorage(len); len = uidna_nameToUnicodeUTF8(uidna, input, length, **buf, buf->capacity(), &info, &status); } // UTS #46's ToUnicode operation applies no validation of domain name length // (nor a flag requesting it to do so, like VerifyDnsLength for ToASCII). For // that reason, unlike ToASCII below, ICU4C correctly accepts long domain // names. However, ICU4C still sets the EMPTY_LABEL error in contrary to UTS // #46. Therefore, explicitly filters out that error here. info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; if (U_FAILURE(status) || (!lenient && info.errors != 0)) { len = -1; buf->SetLength(0); } else { buf->SetLength(len); } uidna_close(uidna); return len; } int32_t ToASCII(MaybeStackBuffer* buf, const char* input, size_t length, bool lenient) { UErrorCode status = U_ZERO_ERROR; uint32_t options = UIDNA_DEFAULT; options |= UIDNA_NONTRANSITIONAL_TO_ASCII; UIDNA* uidna = uidna_openUTS46(options, &status); if (U_FAILURE(status)) return -1; UIDNAInfo info = UIDNA_INFO_INITIALIZER; int32_t len = uidna_nameToASCII_UTF8(uidna, input, length, **buf, buf->capacity(), &info, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { status = U_ZERO_ERROR; buf->AllocateSufficientStorage(len); len = uidna_nameToASCII_UTF8(uidna, input, length, **buf, buf->capacity(), &info, &status); } // The WHATWG URL "domain to ASCII" algorithm explicitly sets the // VerifyDnsLength flag to false, which disables the domain name length // verification step in ToASCII (as specified by UTS #46). Unfortunately, // ICU4C's IDNA module does not support disabling this flag through `options`, // so just filter out the errors that may be caused by the verification step // afterwards. info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; if (U_FAILURE(status) || (!lenient && info.errors != 0)) { len = -1; buf->SetLength(0); } else { buf->SetLength(len); } uidna_close(uidna); return len; } static void ToUnicode(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); // optional arg bool lenient = args[1]->BooleanValue(env->context()).FromJust(); MaybeStackBuffer buf; int32_t len = ToUnicode(&buf, *val, val.length(), lenient); if (len < 0) { return env->ThrowError("Cannot convert name to Unicode"); } args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), *buf, v8::NewStringType::kNormal, len).ToLocalChecked()); } static void ToASCII(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); // optional arg bool lenient = args[1]->BooleanValue(env->context()).FromJust(); MaybeStackBuffer buf; int32_t len = ToASCII(&buf, *val, val.length(), lenient); if (len < 0) { return env->ThrowError("Cannot convert name to ASCII"); } args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), *buf, v8::NewStringType::kNormal, len).ToLocalChecked()); } // This is similar to wcwidth except that it takes the current unicode // character properties database into consideration, allowing it to // correctly calculate the column widths of things like emoji's and // newer wide characters. wcwidth, on the other hand, uses a fixed // algorithm that does not take things like emoji into proper // consideration. static int GetColumnWidth(UChar32 codepoint, bool ambiguous_as_full_width = false) { if (!u_isdefined(codepoint) || u_iscntrl(codepoint) || u_getCombiningClass(codepoint) > 0 || u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER)) { return 0; } // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a // codepoint as being full width, wide, ambiguous, neutral, narrow, // or halfwidth. const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH); switch (eaw) { case U_EA_FULLWIDTH: case U_EA_WIDE: return 2; case U_EA_AMBIGUOUS: // See: http://www.unicode.org/reports/tr11/#Ambiguous for details if (ambiguous_as_full_width) { return 2; } // Fall through if ambiguous_as_full_width if false. case U_EA_NEUTRAL: if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) { return 2; } // Fall through case U_EA_HALFWIDTH: case U_EA_NARROW: default: return 1; } } // Returns the column width for the given String. static void GetStringWidth(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); if (args.Length() < 1) return; bool ambiguous_as_full_width = args[1]->BooleanValue(); bool expand_emoji_sequence = args[2]->BooleanValue(); if (args[0]->IsNumber()) { args.GetReturnValue().Set( GetColumnWidth(args[0]->Uint32Value(), ambiguous_as_full_width)); return; } TwoByteValue value(env->isolate(), args[0]); // reinterpret_cast is required by windows to compile UChar* str = reinterpret_cast(*value); static_assert(sizeof(*str) == sizeof(**value), "sizeof(*str) == sizeof(**value)"); UChar32 c = 0; UChar32 p; size_t n = 0; uint32_t width = 0; while (n < value.length()) { p = c; U16_NEXT(str, n, value.length(), c); // Don't count individual emoji codepoints that occur within an // emoji sequence. This is not necessarily foolproof. Some // environments display emoji sequences in the appropriate // condensed form (as a single emoji glyph), other environments // may not understand an emoji sequence and will display each // individual emoji separately. When this happens, the width // calculated will be off, and there's no reliable way of knowing // in advance if a particular sequence is going to be supported. // The expand_emoji_sequence option allows the caller to skip this // check and count each code within an emoji sequence separately. if (!expand_emoji_sequence && n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner) (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) || u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) { continue; } width += GetColumnWidth(c, ambiguous_as_full_width); } args.GetReturnValue().Set(width); } void Init(Local target, Local unused, Local context, void* priv) { Environment* env = Environment::GetCurrent(context); env->SetMethod(target, "toUnicode", ToUnicode); env->SetMethod(target, "toASCII", ToASCII); env->SetMethod(target, "getStringWidth", GetStringWidth); env->SetMethod(target, "getVersion", GetVersion); // One-shot converters env->SetMethod(target, "icuErrName", ICUErrorName); env->SetMethod(target, "transcode", Transcode); } } // namespace i18n } // namespace node NODE_MODULE_CONTEXT_AWARE_BUILTIN(icu, node::i18n::Init) #endif // NODE_HAVE_I18N_SUPPORT