// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2009-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#include "unicode/bytestream.h"
#include "unicode/utypes.h"
#include "unicode/ures.h"
#include "unicode/localpointer.h"
#include "unicode/putil.h"
#include "unicode/uenum.h"
#include "unicode/uloc.h"

#include "ustr_imp.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "putilimp.h"
#include "uinvchar.h"
#include "ulocimp.h"
#include "uassert.h"


/* struct holding a single variant (node of a singly linked list) */
typedef struct VariantListEntry {
    const char              *variant;
    struct VariantListEntry *next;
} VariantListEntry;

/* struct holding a single attribute value (node of a singly linked list) */
struct AttributeListEntry : public icu::UMemory {
    const char              *attribute;
    struct AttributeListEntry *next;
};

/* struct holding a single extension as a key/value pair (node of a singly linked list) */
struct ExtensionListEntry : public icu::UMemory {
    const char                  *key;
    const char                  *value;
    struct ExtensionListEntry   *next;
};

/* number of slots in ULanguageTag::extlang */
#define MAXEXTLANG 3

/* parsed representation of a language tag; see _initializeULanguageTag() for the initial field values */
typedef struct ULanguageTag {
    char                *buf;   /* holding parsed subtags */
    const char          *language;
    const char          *extlang[MAXEXTLANG];
    const char          *script;
    const char          *region;
    VariantListEntry    *variants;
    ExtensionListEntry  *extensions;
    const char          *privateuse;
    const char          *grandfathered;
} ULanguageTag;

#define MINLEN 2
#define SEP '-'                 /* subtag separator tested by the *SepListOf helpers */
#define PRIVATEUSE 'x'          /* private use singleton */
#define LDMLEXT 'u'             /* singleton used for LDML (Unicode locale) extensions */

#define LOCALE_SEP '_'
#define LOCALE_EXT_SEP '@'
#define LOCALE_KEYWORD_SEP ';'
#define LOCALE_KEY_TYPE_SEP '='

#define ISALPHA(c) uprv_isASCIILetter(c)
#define ISNUMERIC(c) ((c)>='0' && (c)<='9')

static const char EMPTY[] = "";
static const char LANG_UND[] = "und";
static const char PRIVATEUSE_KEY[] = "x";
static const char _POSIX[] = "_POSIX";
static const char POSIX_KEY[] = "va";
static const char POSIX_VALUE[] = "posix";
static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";
static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
static const char LOCALE_TYPE_YES[] = "yes";

/* length of LANG_UND, excluding the terminating NUL */
#define LANG_UND_LEN 3

/*
 Updated on 2018-09-12 from
 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

 This table has 2 parts. The part for grandfathered tags is generated by the
 following scripts from the IANA language tag registry.

 curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
 egrep -A 7 'Type: grandfathered' | \
 egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
 tr 'A-Z' 'a-z'

 The 2nd part is made of five ICU-specific entries. They're kept for
 the backward compatibility for now, even though there are no preferred
 values. They may have to be removed for the strict BCP 47 compliance.

 Entries are stored as consecutive (tag, preferred value) pairs.
*/
static const char* const GRANDFATHERED[] = {
/*  grandfathered   preferred */
    "art-lojban",   "jbo",
    "en-gb-oed",    "en-gb-oxendict",
    "i-ami",        "ami",
    "i-bnn",        "bnn",
    "i-hak",        "hak",
    "i-klingon",    "tlh",
    "i-lux",        "lb",
    "i-navajo",     "nv",
    "i-pwn",        "pwn",
    "i-tao",        "tao",
    "i-tay",        "tay",
    "i-tsu",        "tsu",
    "no-bok",       "nb",
    "no-nyn",       "nn",
    "sgn-be-fr",    "sfb",
    "sgn-be-nl",    "vgt",
    "sgn-ch-de",    "sgg",
    "zh-guoyu",     "cmn",
    "zh-hakka",     "hak",
    "zh-min-nan",   "nan",
    "zh-xiang",     "hsn",

    // Grandfathered tags with no preferred value in the IANA
    // registry. Kept for now for the backward compatibility
    // because ICU has mapped them this way.
    "cel-gaulish",  "xtg-x-cel-gaulish",
    "i-default",    "en-x-i-default",
    "i-enochian",   "und-x-i-enochian",
    "i-mingo",      "see-x-i-mingo",
    "zh-min",       "nan-x-zh-min",
};

/*
 Updated on 2018-09-12 from
 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

 The table lists redundant tags with preferred value in the IANA language tag registry.
 It's generated with the following command:

 curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
 grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
 tr 'A-Z' 'a-z'

 In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
 a variant tag 'hepburn-heploc' has the preferred subtag, 'alalc97'.

 Entries are stored as consecutive (tag, preferred value) pairs.
*/
static const char* const REDUNDANT[] = {
//  redundant       preferred
    "sgn-br",       "bzs",
    "sgn-co",       "csn",
    "sgn-de",       "gsg",
    "sgn-dk",       "dsl",
    "sgn-es",       "ssp",
    "sgn-fr",       "fsl",
    "sgn-gb",       "bfi",
    "sgn-gr",       "gss",
    "sgn-ie",       "isg",
    "sgn-it",       "ise",
    "sgn-jp",       "jsl",
    "sgn-mx",       "mfs",
    "sgn-ni",       "ncs",
    "sgn-nl",       "dse",
    "sgn-no",       "nsl",
    "sgn-pt",       "psr",
    "sgn-se",       "swl",
    "sgn-us",       "ase",
    "sgn-za",       "sfs",
    "zh-cmn",       "cmn",
    "zh-cmn-hans",  "cmn-hans",
    "zh-cmn-hant",  "cmn-hant",
    "zh-gan",       "gan",
    "zh-wuu",       "wuu",
    "zh-yue",       "yue",

    // variant tag with preferred value
    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
};

/*
  Updated on 2018-09-12 from
  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

  grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
  grep -B1 'Preferred' | grep -v '^--' | \
  awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'

  Make sure that 2-letter language subtags come before 3-letter subtags.
  (_appendLanguageToLanguageTag() stops scanning at the first entry that is
  longer than its input, so it relies on this ordering.)
*/
static const char DEPRECATEDLANGS[][4] = {
/*  deprecated  new */
    "in",       "id",
    "iw",       "he",
    "ji",       "yi",
    "jw",       "jv",
    "mo",       "ro",
    "aam",      "aas",
    "adp",      "dz",
    "aue",      "ktz",
    "ayx",      "nun",
    "bgm",      "bcg",
    "bjd",      "drl",
    "ccq",      "rki",
    "cjr",      "mom",
    "cka",      "cmr",
    "cmk",      "xch",
    "coy",      "pij",
    "cqu",      "quh",
    "drh",      "khk",
    "drw",      "prs",
    "gav",      "dev",
    "gfx",      "vaj",
    "ggn",      "gvr",
    "gti",      "nyc",
    "guv",      "duz",
    "hrr",      "jal",
    "ibi",      "opa",
    "ilw",      "gal",
    "jeg",      "oyb",
    "kgc",      "tdf",
    "kgh",      "kml",
    "koj",      "kwv",
    "krm",      "bmf",
    "ktr",      "dtp",
    "kvs",      "gdj",
    "kwq",      "yam",
    "kxe",      "tvd",
    "kzj",      "dtp",
    "kzt",      "dtp",
    "lii",      "raq",
    "lmm",      "rmx",
    "meg",      "cir",
    "mst",      "mry",
    "mwj",      "vaj",
    "myt",      "mry",
    "nad",      "xny",
    "ncp",      "kdz",
    "nnx",      "ngv",
    "nts",      "pij",
    "oun",      "vaj",
    "pcr",      "adx",
    "pmc",      "huw",
    "pmu",      "phr",
    "ppa",      "bfy",
    "ppr",      "lcq",
    "pry",      "prt",
    "puz",      "pub",
    "sca",      "hle",
    "skk",      "oyb",
    "tdu",      "dtp",
    "thc",      "tpo",
    "thx",      "oyb",
    "tie",      "ras",
    "tkk",      "twm",
    "tlw",      "weo",
    "tmp",      "tyj",
    "tne",      "kak",
    "tnf",      "prs",
    "tsf",      "taj",
    "uok",      "ema",
    "xba",      "cax",
    "xia",      "acn",
    "xkh",      "waw",
    "xsj",      "suj",
    "ybd",      "rki",
    "yma",      "lrr",
    "ymt",      "mtm",
    "yos",      "zom",
    "yuu",      "yug",
};

/*
  Updated on 2018-04-24 from

  curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
  grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
  grep -B1 'Preferred' | \
  awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'

  Entries are stored as consecutive (deprecated, replacement) pairs.
*/
static const char DEPRECATEDREGIONS[][3] = {
/*  deprecated  new */
    "BU",       "MM",
    "DD",       "DE",
    "FX",       "FR",
    "TP",       "TL",
    "YD",       "YE",
    "ZR",       "CD",
};

/*
* -------------------------------------------------
*
* These ultag_ functions may be exposed as APIs later
*
* -------------------------------------------------
*/

static ULanguageTag*
ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);

static void
ultag_close(ULanguageTag* langtag);

static const char*
ultag_getLanguage(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag);
#endif

static const char*
ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtlangSize(const ULanguageTag* langtag);

static const char*
ultag_getScript(const ULanguageTag* langtag);

static const char*
ultag_getRegion(const ULanguageTag* langtag);

static const char*
ultag_getVariant(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getVariantsSize(const ULanguageTag* langtag);

static const char*
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);

static const char*
ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtensionsSize(const ULanguageTag* langtag);

static const char*
ultag_getPrivateUse(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag);
#endif

U_NAMESPACE_BEGIN

/**
 * \class LocalULanguageTagPointer
 * "Smart pointer" class, closes a ULanguageTag via ultag_close().
 * For most methods see the LocalPointerBase base class.
* * @see LocalPointerBase * @see LocalPointer * @internal */ U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close); U_NAMESPACE_END /* * ------------------------------------------------- * * Language subtag syntax validation functions * * ------------------------------------------------- */ static UBool _isAlphaString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISALPHA(*(s + i))) { return FALSE; } } return TRUE; } static UBool _isNumericString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISNUMERIC(*(s + i))) { return FALSE; } } return TRUE; } static UBool _isAlphaNumericString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) { return FALSE; } } return TRUE; } static UBool _isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) { if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len >= min && len <= max && _isAlphaNumericString(s, len)) { return TRUE; } return FALSE; } U_CFUNC UBool ultag_isLanguageSubtag(const char* s, int32_t len) { /* * unicode_language_subtag = alpha{2,3} | alpha{5,8}; * NOTE: Per ICUTC 2019/01/23- accepting alpha 4 * See ICU-20372 */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len >= 2 && len <= 8 && _isAlphaString(s, len)) { return TRUE; } return FALSE; } static UBool _isExtlangSubtag(const char* s, int32_t len) { /* * extlang = 3ALPHA ; selected ISO 639 codes * *2("-" 3ALPHA) ; permanently reserved */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len == 3 && _isAlphaString(s, len)) { return TRUE; } return FALSE; } U_CFUNC UBool ultag_isScriptSubtag(const char* s, int32_t len) { /* * script = 4ALPHA ; ISO 15924 code */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len == 4 && _isAlphaString(s, len)) { return TRUE; } return FALSE; } U_CFUNC UBool ultag_isRegionSubtag(const char* s, int32_t len) { /* * region = 2ALPHA ; ISO 3166-1 code * / 
3DIGIT ; UN M.49 code */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len == 2 && _isAlphaString(s, len)) { return TRUE; } if (len == 3 && _isNumericString(s, len)) { return TRUE; } return FALSE; } static UBool _isVariantSubtag(const char* s, int32_t len) { /* * variant = 5*8alphanum ; registered variants * / (DIGIT 3alphanum) */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) { return TRUE; } if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) { return TRUE; } return FALSE; } static UBool _isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) { const char *p = s; const char *pSubtag = NULL; if (len < 0) { len = (int32_t)uprv_strlen(s); } while ((p - s) < len) { if (*p == SEP) { if (pSubtag == NULL) { return FALSE; } if (!test(pSubtag, (int32_t)(p - pSubtag))) { return FALSE; } pSubtag = NULL; } else if (pSubtag == NULL) { pSubtag = p; } p++; } if (pSubtag == NULL) { return FALSE; } return test(pSubtag, (int32_t)(p - pSubtag)); } U_CFUNC UBool ultag_isVariantSubtags(const char* s, int32_t len) { return _isSepListOf(&_isVariantSubtag, s, len); } // This is for the ICU-specific "lvariant" handling. 
static UBool
_isPrivateuseVariantSubtag(const char* s, int32_t len) {
    /*
     * Accepts 1*8alphanum, i.e. any alphanumeric subtag of 1 to 8 characters.
     */
    const int32_t kMinLen = 1;
    const int32_t kMaxLen = 8;
    return _isAlphaNumericStringLimitedLength(s, len, kMinLen, kMaxLen);
}

static UBool
_isExtensionSingleton(const char* s, int32_t len) {
    /*
     * extension     = singleton 1*("-" (2*8alphanum))
     *
     * singleton     = DIGIT               ; 0 - 9
     *               / %x41-57             ; A - W
     *               / %x59-5A             ; Y - Z
     *               / %x61-77             ; a - w
     *               / %x79-7A             ; y - z
     *
     * i.e. one alphanumeric character other than the private use 'x'/'X'.
     */
    if (len < 0) {
        len = (int32_t)uprv_strlen(s);
    }
    if (len != 1) {
        return FALSE;
    }
    if (!(ISALPHA(*s) || ISNUMERIC(*s))) {
        return FALSE;
    }
    return (uprv_tolower(*s) != PRIVATEUSE) ? TRUE : FALSE;
}

static UBool
_isExtensionSubtag(const char* s, int32_t len) {
    /*
     * extension     = singleton 1*("-" (2*8alphanum))
     */
    const int32_t kMinLen = 2;
    const int32_t kMaxLen = 8;
    return _isAlphaNumericStringLimitedLength(s, len, kMinLen, kMaxLen);
}

U_CFUNC UBool
ultag_isExtensionSubtags(const char* s, int32_t len) {
    return _isSepListOf(&_isExtensionSubtag, s, len);
}

static UBool
_isPrivateuseValueSubtag(const char* s, int32_t len) {
    /*
     * privateuse    = "x" 1*("-" (1*8alphanum))
     */
    const int32_t kMinLen = 1;
    const int32_t kMaxLen = 8;
    return _isAlphaNumericStringLimitedLength(s, len, kMinLen, kMaxLen);
}

U_CFUNC UBool
ultag_isPrivateuseValueSubtags(const char* s, int32_t len) {
    return _isSepListOf(&_isPrivateuseValueSubtag, s, len);
}

U_CFUNC UBool
ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) {
    /*
     * attribute = alphanum{3,8} ;
     */
    const int32_t kMinLen = 3;
    const int32_t kMaxLen = 8;
    return _isAlphaNumericStringLimitedLength(s, len, kMinLen, kMaxLen);
}

U_CFUNC UBool
ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) {
    return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len);
}

U_CFUNC UBool
ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
    /*
     * key = alphanum alpha ;
     */
    if (len < 0) {
        len = (int32_t)uprv_strlen(s);
    }
    if (len != 2) {
        return FALSE;
    }
    const UBool firstOk = ISALPHA(s[0]) || ISNUMERIC(s[0]);
    return (firstOk && ISALPHA(s[1])) ? TRUE : FALSE;
}

U_CFUNC UBool
_isUnicodeLocaleTypeSubtag(const char*s, int32_t len) {
    /*
     * alphanum{3,8}
     */
    const int32_t kMinLen = 3;
    const int32_t kMaxLen = 8;
    return _isAlphaNumericStringLimitedLength(s, len, kMinLen, kMaxLen);
}

U_CFUNC UBool
ultag_isUnicodeLocaleType(const char*s, int32_t len) {
    /*
     * type = alphanum{3,8} (sep alphanum{3,8})* ;
     */
    return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len);
}

static UBool
_isTKey(const char* s, int32_t len) {
    /*
     * tkey = alpha digit ;
     */
    if (len < 0) {
        len = (int32_t)uprv_strlen(s);
    }
    if (len != 2) {
        return FALSE;
    }
    return (ISALPHA(s[0]) && ISNUMERIC(s[1])) ? TRUE : FALSE;
}

static UBool
_isTValue(const char* s, int32_t len) {
    /*
     * tvalue = (sep alphanum{3,8})+ ;
     *
     * Checks one alphanum{3,8} subtag of a tvalue.
     */
    const int32_t kMinLen = 3;
    const int32_t kMaxLen = 8;
    return _isAlphaNumericStringLimitedLength(s, len, kMinLen, kMaxLen);
}

/*
 * One step of the state machine that validates the subtags of a transformed
 * ("t") extension. state is the state before the subtag s and is updated to
 * the state after it. Returns FALSE when s is not acceptable in that state.
 * A negative state (kGotTKey) marks a position where the list must not end.
 */
static UBool
_isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len) {
    const int32_t kStart = 0;       // Start, wait for unicode_language_subtag, tkey or end
    const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
                                    // unicode_region_subtag, unicode_variant_subtag, tkey or end
    const int32_t kGotScript = 2;   // Got unicode_script_subtag, wait for unicode_region_subtag,
                                    // unicode_variant_subtag, tkey, or end
    const int32_t kGotRegion = 3;   // Got unicode_region_subtag, wait for unicode_variant_subtag,
                                    // tkey, or end.
    const int32_t kGotVariant = 4;  // Got unicode_variant_subtag, wait for unicode_variant_subtag
                                    // tkey or end.
    const int32_t kGotTKey = -1;    // Got tkey, wait for tvalue. ERROR if stop here.
    const int32_t kGotTValue = 6;   // Got tvalue, wait for tkey, tvalue or end

    if (state == kStart) {
        if (ultag_isLanguageSubtag(s, len)) {
            state = kGotLanguage;
            return TRUE;
        }
        if (_isTKey(s, len)) {
            state = kGotTKey;
            return TRUE;
        }
        return FALSE;
    }

    if (state == kGotTKey) {
        if (_isTValue(s, len)) {
            state = kGotTValue;
            return TRUE;
        }
        return FALSE;
    }

    if (state == kGotTValue) {
        if (_isTKey(s, len)) {
            state = kGotTKey;
            return TRUE;
        }
        /* another subtag of the same tvalue keeps the state */
        return _isTValue(s, len);
    }

    if (state < kGotLanguage || state > kGotVariant) {
        /* not a state of this machine */
        return FALSE;
    }

    /* Inside the language part: each earlier state also accepts what the later ones accept. */
    if (state == kGotLanguage && ultag_isScriptSubtag(s, len)) {
        state = kGotScript;
        return TRUE;
    }
    if ((state == kGotLanguage || state == kGotScript) && ultag_isRegionSubtag(s, len)) {
        state = kGotRegion;
        return TRUE;
    }
    if (_isVariantSubtag(s, len)) {
        state = kGotVariant;
        return TRUE;
    }
    if (_isTKey(s, len)) {
        state = kGotTKey;
        return TRUE;
    }
    return FALSE;
}

/*
 * One step of the state machine that validates the subtags of a Unicode
 * ("u") extension. state is updated in place; returns FALSE when s is not
 * acceptable in the current state.
 */
static UBool
_isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len) {
    const int32_t kStart = 0;         // Start, wait for a key or attribute or end
    const int32_t kGotKey = 1;        // Got a key, wait for type or key or end
    const int32_t kGotType = 2;       // Got a type, wait for key or end

    if (state != kStart && state != kGotKey && state != kGotType) {
        return FALSE;
    }

    /* A key is acceptable in every state and always leads to kGotKey. */
    if (ultag_isUnicodeLocaleKey(s, len)) {
        state = kGotKey;
        return TRUE;
    }

    if (state == kStart) {
        /* before the first key only attributes may appear; the state is kept */
        return ultag_isUnicodeLocaleAttribute(s, len);
    }

    /* after a key or a type: (another) type subtag */
    if (_isUnicodeLocaleTypeSubtag(s, len)) {
        state = kGotType;
        return TRUE;
    }
    return FALSE;
}

/*
 * Feeds the SEP-separated subtags of s (len chars; negative len =
 * NUL-terminated) to test one after another, threading a state that starts
 * at 0. Returns TRUE when every subtag is accepted and the final state is
 * not negative. Empty subtags are passed to test with length 0.
 */
static UBool
_isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len)
{
    int32_t state = 0;
    int32_t subtagBegin = 0;

    if (len < 0) {
        len = (int32_t)uprv_strlen(s);
    }

    for (int32_t pos = 0; pos < len; ++pos) {
        if (s[pos] != SEP) {
            continue;
        }
        if (!test(state, s + subtagBegin, pos - subtagBegin)) {
            return FALSE;
        }
        subtagBegin = pos + 1;
    }

    /* the last subtag runs up to the end of the input */
    if (!test(state, s + subtagBegin, len - subtagBegin)) {
        return FALSE;
    }
    return (state >= 0) ? TRUE : FALSE;
}

U_CFUNC UBool
ultag_isTransformedExtensionSubtags(const char* s, int32_t len) {
    return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len);
}

U_CFUNC UBool
ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) {
    return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len);
}


/*
* -------------------------------------------------
*
* Helper functions
*
* -------------------------------------------------
*/

/*
 * Appends var at the end of the list headed by *first, preserving the
 * existing order. Returns FALSE, leaving the list untouched, when an entry
 * with the same variant string is already present.
 */
static UBool
_addVariantToList(VariantListEntry **first, VariantListEntry *var) {
    VariantListEntry **link = first;    /* the pointer that will receive var */

    while (*link != NULL) {
        if (uprv_compareInvCharsAsAscii(var->variant, (*link)->variant) == 0) {
            /* duplicated variant */
            return FALSE;
        }
        link = &(*link)->next;
    }

    var->next = NULL;
    *link = var;
    return TRUE;
}

/*
 * Inserts attr into the list headed by *first, keeping the list in ascending
 * order of the attribute strings. Returns FALSE, leaving the list untouched,
 * when an equal attribute is already present.
 */
static UBool
_addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) {
    AttributeListEntry **link = first;  /* the pointer that will receive attr */

    for (; *link != NULL; link = &(*link)->next) {
        const int32_t order = uprv_compareInvCharsAsAscii(attr->attribute, (*link)->attribute);
        if (order == 0) {
            /* duplicated attribute */
            return FALSE;
        }
        if (order < 0) {
            /* attr sorts before this entry: insert here */
            break;
        }
    }

    attr->next = *link;
    *link = attr;
    return TRUE;
}


static UBool
_addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) {
    UBool bAdded = TRUE;

    if (*first == NULL) {
        ext->next = NULL;
*first = ext; } else { ExtensionListEntry *prev, *cur; int32_t cmp; /* reorder variants in alphabetical order */ prev = NULL; cur = *first; while (TRUE) { if (cur == NULL) { prev->next = ext; ext->next = NULL; break; } if (localeToBCP) { /* special handling for locale to bcp conversion */ int32_t len, curlen; len = (int32_t)uprv_strlen(ext->key); curlen = (int32_t)uprv_strlen(cur->key); if (len == 1 && curlen == 1) { if (*(ext->key) == *(cur->key)) { cmp = 0; } else if (*(ext->key) == PRIVATEUSE) { cmp = 1; } else if (*(cur->key) == PRIVATEUSE) { cmp = -1; } else { cmp = *(ext->key) - *(cur->key); } } else if (len == 1) { cmp = *(ext->key) - LDMLEXT; } else if (curlen == 1) { cmp = LDMLEXT - *(cur->key); } else { cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key); /* Both are u extension keys - we need special handling for 'attribute' */ if (cmp != 0) { if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) { cmp = 1; } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) { cmp = -1; } } } } else { cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key); } if (cmp < 0) { if (prev == NULL) { *first = ext; } else { prev->next = ext; } ext->next = cur; break; } if (cmp == 0) { /* duplicated extension key */ bAdded = FALSE; break; } prev = cur; cur = cur->next; } } return bAdded; } static void _initializeULanguageTag(ULanguageTag* langtag) { int32_t i; langtag->buf = NULL; langtag->language = EMPTY; for (i = 0; i < MAXEXTLANG; i++) { langtag->extlang[i] = NULL; } langtag->script = EMPTY; langtag->region = EMPTY; langtag->variants = NULL; langtag->extensions = NULL; langtag->grandfathered = EMPTY; langtag->privateuse = EMPTY; } static void _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { char buf[ULOC_LANG_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; if (U_FAILURE(*status)) { return; } len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus 
== U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } len = 0; } /* Note: returned language code is in lower case letters */ if (len == 0) { sink.Append(LANG_UND, LANG_UND_LEN); } else if (!ultag_isLanguageSubtag(buf, len)) { /* invalid language code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } sink.Append(LANG_UND, LANG_UND_LEN); } else { /* resolve deprecated */ for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) { // 2-letter deprecated subtags are listede before 3-letter // ones in DEPRECATEDLANGS[]. Get out of loop on coming // across the 1st 3-letter subtag, if the input is a 2-letter code. // to avoid continuing to try when there's no match. if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break; if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) { uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]); len = (int32_t)uprv_strlen(buf); break; } } sink.Append(buf, len); } } static void _appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { char buf[ULOC_SCRIPT_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len; if (U_FAILURE(*status)) { return; } len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return; } if (len > 0) { if (!ultag_isScriptSubtag(buf, len)) { /* invalid script code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return; } else { sink.Append("-", 1); sink.Append(buf, len); } } } static void _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { char buf[ULOC_COUNTRY_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len; if (U_FAILURE(*status)) { return; } len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = 
U_ILLEGAL_ARGUMENT_ERROR; } return; } if (len > 0) { if (!ultag_isRegionSubtag(buf, len)) { /* invalid region code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return; } else { sink.Append("-", 1); /* resolve deprecated */ for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) { if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) { uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]); len = (int32_t)uprv_strlen(buf); break; } } sink.Append(buf, len); } } } static void _appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) { char buf[ULOC_FULLNAME_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; if (U_FAILURE(*status)) { return; } len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return; } if (len > 0) { char *p, *pVar; UBool bNext = TRUE; VariantListEntry *var; VariantListEntry *varFirst = NULL; pVar = NULL; p = buf; while (bNext) { if (*p == SEP || *p == LOCALE_SEP || *p == 0) { if (*p == 0) { bNext = FALSE; } else { *p = 0; /* terminate */ } if (pVar == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } /* ignore empty variant */ } else { /* ICU uses upper case letters for variants, but the canonical format is lowercase in BCP47 */ for (i = 0; *(pVar + i) != 0; i++) { *(pVar + i) = uprv_tolower(*(pVar + i)); } /* validate */ if (_isVariantSubtag(pVar, -1)) { if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) { /* emit the variant to the list */ var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry)); if (var == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } var->variant = pVar; if (!_addVariantToList(&varFirst, var)) { /* duplicated variant */ uprv_free(var); if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } else { /* Special handling for POSIX variant, 
need to remember that we had it and then */ /* treat it like an extension later. */ *hadPosix = TRUE; } } else if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } else if (_isPrivateuseValueSubtag(pVar, -1)) { /* Handle private use subtags separately */ break; } } /* reset variant starting position */ pVar = NULL; } else if (pVar == NULL) { pVar = p; } p++; } if (U_SUCCESS(*status)) { if (varFirst != NULL) { int32_t varLen; /* write out validated/normalized variants to the target */ var = varFirst; while (var != NULL) { sink.Append("-", 1); varLen = (int32_t)uprv_strlen(var->variant); sink.Append(var->variant, varLen); var = var->next; } } } /* clean up */ var = varFirst; while (var != NULL) { VariantListEntry *tmpVar = var->next; uprv_free(var); var = tmpVar; } if (U_FAILURE(*status)) { return; } } } static void _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) { char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 }; int32_t attrBufLength = 0; icu::MemoryPool attrPool; icu::MemoryPool extPool; icu::MemoryPool strPool; icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, status)); if (U_FAILURE(*status) && !hadPosix) { return; } if (keywordEnum.isValid() || hadPosix) { /* reorder extensions */ int32_t len; const char *key; ExtensionListEntry *firstExt = NULL; ExtensionListEntry *ext; AttributeListEntry *firstAttr = NULL; AttributeListEntry *attr; icu::MemoryPool extBufPool; const char *bcpKey=nullptr, *bcpValue=nullptr; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t keylen; UBool isBcpUExt; while (TRUE) { icu::CharString buf; key = uenum_next(keywordEnum.getAlias(), NULL, status); if (key == NULL) { break; } char* buffer; int32_t resultCapacity = ULOC_KEYWORD_AND_VALUES_CAPACITY; for (;;) { buffer = buf.getAppendBuffer( /*minCapacity=*/resultCapacity, /*desiredCapacityHint=*/resultCapacity, resultCapacity, tmpStatus); if (U_FAILURE(tmpStatus)) { break; } len = 
uloc_getKeywordValue( localeID, key, buffer, resultCapacity, &tmpStatus); if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) { break; } resultCapacity = len; tmpStatus = U_ZERO_ERROR; } if (U_FAILURE(tmpStatus)) { if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) { *status = U_MEMORY_ALLOCATION_ERROR; break; } if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } /* ignore this keyword */ tmpStatus = U_ZERO_ERROR; continue; } buf.append(buffer, len, tmpStatus); if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString. } keylen = (int32_t)uprv_strlen(key); isBcpUExt = (keylen > 1); /* special keyword used for representing Unicode locale attributes */ if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) { if (len > 0) { int32_t i = 0; while (TRUE) { attrBufLength = 0; for (; i < len; i++) { if (buf[i] != '-') { attrBuf[attrBufLength++] = buf[i]; } else { i++; break; } } if (attrBufLength > 0) { attrBuf[attrBufLength] = 0; } else if (i >= len){ break; } /* create AttributeListEntry */ attr = attrPool.create(); if (attr == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } icu::CharString* attrValue = strPool.create(attrBuf, attrBufLength, *status); if (attrValue == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } if (U_FAILURE(*status)) { break; } attr->attribute = attrValue->data(); if (!_addAttributeToList(&firstAttr, attr)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } /* for a place holder ExtensionListEntry */ bcpKey = LOCALE_ATTRIBUTE_KEY; bcpValue = NULL; } } else if (isBcpUExt) { bcpKey = uloc_toUnicodeLocaleKey(key); if (bcpKey == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } /* we've checked buf is null-terminated above */ bcpValue = uloc_toUnicodeLocaleType(key, buf.data()); if (bcpValue == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } if (bcpValue == buf.data()) { /* When uloc_toUnicodeLocaleType(key, buf) returns the 
input value as is, the value is well-formed, but has no known mapping. This implementation normalizes the value to lower case */ icu::CharString* extBuf = extBufPool.create(); if (extBuf == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; break; } int32_t bcpValueLen = static_cast(uprv_strlen(bcpValue)); int32_t resultCapacity; char* pExtBuf = extBuf->getAppendBuffer( /*minCapacity=*/bcpValueLen, /*desiredCapacityHint=*/bcpValueLen, resultCapacity, tmpStatus); if (U_FAILURE(tmpStatus)) { *status = tmpStatus; break; } uprv_strcpy(pExtBuf, bcpValue); T_CString_toLowerCase(pExtBuf); extBuf->append(pExtBuf, bcpValueLen, tmpStatus); if (U_FAILURE(tmpStatus)) { *status = tmpStatus; break; } bcpValue = extBuf->data(); } } else { if (*key == PRIVATEUSE) { if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } } else { if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } } bcpKey = key; icu::CharString* extBuf = extBufPool.create(buf.data(), len, tmpStatus); if (extBuf == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; break; } if (U_FAILURE(tmpStatus)) { *status = tmpStatus; break; } bcpValue = extBuf->data(); } /* create ExtensionListEntry */ ext = extPool.create(); if (ext == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } ext->key = bcpKey; ext->value = bcpValue; if (!_addExtensionToList(&firstExt, ext, TRUE)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } /* Special handling for POSIX variant - add the keywords for POSIX */ if (hadPosix) { /* create ExtensionListEntry for POSIX */ ext = extPool.create(); if (ext == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return; } ext->key = POSIX_KEY; ext->value = POSIX_VALUE; if (!_addExtensionToList(&firstExt, ext, TRUE)) { // Silently ignore errors. 
} } if (U_SUCCESS(*status) && (firstExt != NULL || firstAttr != NULL)) { UBool startLDMLExtension = FALSE; for (ext = firstExt; ext; ext = ext->next) { if (!startLDMLExtension && uprv_strlen(ext->key) > 1) { /* first LDML u singlton extension */ sink.Append("-u", 2); startLDMLExtension = TRUE; } /* write out the sorted BCP47 attributes, extensions and private use */ if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) { /* write the value for the attributes */ for (attr = firstAttr; attr; attr = attr->next) { sink.Append("-", 1); sink.Append( attr->attribute, static_cast(uprv_strlen(attr->attribute))); } } else { sink.Append("-", 1); sink.Append(ext->key, static_cast(uprv_strlen(ext->key))); sink.Append("-", 1); sink.Append(ext->value, static_cast(uprv_strlen(ext->value))); } } } } } /** * Append keywords parsed from LDML extension value * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional} * Note: char* buf is used for storing keywords */ static void _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool& extPool, icu::MemoryPool& kwdBuf, UBool *posixVariant, UErrorCode *status) { const char *pTag; /* beginning of current subtag */ const char *pKwds; /* beginning of key-type pairs */ UBool variantExists = *posixVariant; ExtensionListEntry *kwdFirst = NULL; /* first LDML keyword */ ExtensionListEntry *kwd, *nextKwd; int32_t len; /* Reset the posixVariant value */ *posixVariant = FALSE; pTag = ldmlext; pKwds = NULL; { AttributeListEntry *attrFirst = NULL; /* first attribute */ AttributeListEntry *attr, *nextAttr; char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; int32_t attrBufIdx = 0; icu::MemoryPool attrPool; /* Iterate through u extension attributes */ while (*pTag) { /* locate next separator char */ for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); if (ultag_isUnicodeLocaleKey(pTag, len)) { pKwds = pTag; break; } /* add this attribute to the list */ attr = attrPool.create(); if 
(attr == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return; } if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) { uprv_memcpy(&attrBuf[attrBufIdx], pTag, len); attrBuf[attrBufIdx + len] = 0; attr->attribute = &attrBuf[attrBufIdx]; attrBufIdx += (len + 1); } else { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } // duplicate attribute is ignored, causes no error. _addAttributeToList(&attrFirst, attr); /* next tag */ pTag += len; if (*pTag) { /* next to the separator */ pTag++; } } if (attrFirst) { /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */ kwd = extPool.create(); if (kwd == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return; } icu::CharString* value = kwdBuf.create(); if (value == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return; } /* attribute subtags sorted in alphabetical order as type */ attr = attrFirst; while (attr != NULL) { nextAttr = attr->next; if (attr != attrFirst) { value->append('-', *status); } value->append(attr->attribute, *status); attr = nextAttr; } if (U_FAILURE(*status)) { return; } kwd->key = LOCALE_ATTRIBUTE_KEY; kwd->value = value->data(); if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } } } if (pKwds) { const char *pBcpKey = NULL; /* u extenstion key subtag */ const char *pBcpType = NULL; /* beginning of u extension type subtag(s) */ int32_t bcpKeyLen = 0; int32_t bcpTypeLen = 0; UBool isDone = FALSE; pTag = pKwds; /* BCP47 representation of LDML key/type pairs */ while (!isDone) { const char *pNextBcpKey = NULL; int32_t nextBcpKeyLen = 0; UBool emitKeyword = FALSE; if (*pTag) { /* locate next separator char */ for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); if (ultag_isUnicodeLocaleKey(pTag, len)) { if (pBcpKey) { emitKeyword = TRUE; pNextBcpKey = pTag; nextBcpKeyLen = len; } else { pBcpKey = pTag; bcpKeyLen = len; } } else { U_ASSERT(pBcpKey != NULL); /* within LDML type subtags */ if (pBcpType) { bcpTypeLen += (len + 1); } else { pBcpType = pTag; 
bcpTypeLen = len; } } /* next tag */ pTag += len; if (*pTag) { /* next to the separator */ pTag++; } } else { /* processing last one */ emitKeyword = TRUE; isDone = TRUE; } if (emitKeyword) { const char *pKey = NULL; /* LDML key */ const char *pType = NULL; /* LDML type */ char bcpKeyBuf[9]; /* BCP key length is always 2 for now */ U_ASSERT(pBcpKey != NULL); if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) { /* the BCP key is invalid */ *status = U_ILLEGAL_ARGUMENT_ERROR; return; } uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen); bcpKeyBuf[bcpKeyLen] = 0; /* u extension key to LDML key */ pKey = uloc_toLegacyKey(bcpKeyBuf); if (pKey == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (pKey == bcpKeyBuf) { /* The key returned by toLegacyKey points to the input buffer. We normalize the result key to lower case. */ T_CString_toLowerCase(bcpKeyBuf); icu::CharString* key = kwdBuf.create(bcpKeyBuf, bcpKeyLen, *status); if (key == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return; } if (U_FAILURE(*status)) { return; } pKey = key->data(); } if (pBcpType) { char bcpTypeBuf[128]; /* practically long enough even considering multiple subtag type */ if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) { /* the BCP type is too long */ *status = U_ILLEGAL_ARGUMENT_ERROR; return; } uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen); bcpTypeBuf[bcpTypeLen] = 0; /* BCP type to locale type */ pType = uloc_toLegacyType(pKey, bcpTypeBuf); if (pType == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (pType == bcpTypeBuf) { /* The type returned by toLegacyType points to the input buffer. We normalize the result type to lower case. 
*/ /* normalize to lower case */ T_CString_toLowerCase(bcpTypeBuf); icu::CharString* type = kwdBuf.create(bcpTypeBuf, bcpTypeLen, *status); if (type == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return; } if (U_FAILURE(*status)) { return; } pType = type->data(); } } else { /* typeless - default type value is "yes" */ pType = LOCALE_TYPE_YES; } /* Special handling for u-va-posix, since we want to treat this as a variant, not as a keyword */ if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) { *posixVariant = TRUE; } else { /* create an ExtensionListEntry for this keyword */ kwd = extPool.create(); if (kwd == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return; } kwd->key = pKey; kwd->value = pType; if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { // duplicate keyword is allowed, Only the first // is honored. } } pBcpKey = pNextBcpKey; bcpKeyLen = pNextBcpKey != NULL ? nextBcpKeyLen : 0; pBcpType = NULL; bcpTypeLen = 0; } } } kwd = kwdFirst; while (kwd != NULL) { nextKwd = kwd->next; _addExtensionToList(appendTo, kwd, FALSE); kwd = nextKwd; } } static void _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) { int32_t i, n; int32_t len; ExtensionListEntry *kwdFirst = NULL; ExtensionListEntry *kwd; const char *key, *type; icu::MemoryPool extPool; icu::MemoryPool kwdBuf; UBool posixVariant = FALSE; if (U_FAILURE(*status)) { return; } /* Determine if variants already exists */ if (ultag_getVariantsSize(langtag)) { posixVariant = TRUE; } n = ultag_getExtensionsSize(langtag); /* resolve locale keywords and reordering keys */ for (i = 0; i < n; i++) { key = ultag_getExtensionKey(langtag, i); type = ultag_getExtensionValue(langtag, i); if (*key == LDMLEXT) { _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status); if (U_FAILURE(*status)) { break; } } else { kwd = extPool.create(); if (kwd == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } kwd->key = key; kwd->value = 
type; if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } if (U_SUCCESS(*status)) { type = ultag_getPrivateUse(langtag); if ((int32_t)uprv_strlen(type) > 0) { /* add private use as a keyword */ kwd = extPool.create(); if (kwd == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; } else { kwd->key = PRIVATEUSE_KEY; kwd->value = type; if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { *status = U_ILLEGAL_ARGUMENT_ERROR; } } } } /* If a POSIX variant was in the extensions, write it out before writing the keywords. */ if (U_SUCCESS(*status) && posixVariant) { len = (int32_t) uprv_strlen(_POSIX); sink.Append(_POSIX, len); } if (U_SUCCESS(*status) && kwdFirst != NULL) { /* write out the sorted keywords */ UBool firstValue = TRUE; kwd = kwdFirst; do { if (firstValue) { sink.Append("@", 1); firstValue = FALSE; } else { sink.Append(";", 1); } /* key */ len = (int32_t)uprv_strlen(kwd->key); sink.Append(kwd->key, len); sink.Append("=", 1); /* type */ len = (int32_t)uprv_strlen(kwd->value); sink.Append(kwd->value, len); kwd = kwd->next; } while (kwd); } } static void _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) { (void)hadPosix; char buf[ULOC_FULLNAME_CAPACITY]; char tmpAppend[ULOC_FULLNAME_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; int32_t reslen = 0; int32_t capacity = sizeof tmpAppend; if (U_FAILURE(*status)) { return; } len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return; } if (len > 0) { char *p, *pPriv; UBool bNext = TRUE; UBool firstValue = TRUE; UBool writeValue; pPriv = NULL; p = buf; while (bNext) { writeValue = FALSE; if (*p == SEP || *p == LOCALE_SEP || *p == 0) { if (*p == 0) { bNext = FALSE; } else { *p = 0; /* terminate */ } if (pPriv != NULL) { /* Private use in the canonical 
format is lowercase in BCP47 */ for (i = 0; *(pPriv + i) != 0; i++) { *(pPriv + i) = uprv_tolower(*(pPriv + i)); } /* validate */ if (_isPrivateuseValueSubtag(pPriv, -1)) { if (firstValue) { if (!_isVariantSubtag(pPriv, -1)) { writeValue = TRUE; } } else { writeValue = TRUE; } } else if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } else { break; } if (writeValue) { if (reslen < capacity) { tmpAppend[reslen++] = SEP; } if (firstValue) { if (reslen < capacity) { tmpAppend[reslen++] = *PRIVATEUSE_KEY; } if (reslen < capacity) { tmpAppend[reslen++] = SEP; } len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX); if (reslen < capacity) { uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen)); } reslen += len; if (reslen < capacity) { tmpAppend[reslen++] = SEP; } firstValue = FALSE; } len = (int32_t)uprv_strlen(pPriv); if (reslen < capacity) { uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen)); } reslen += len; } } /* reset private use starting position */ pPriv = NULL; } else if (pPriv == NULL) { pPriv = p; } p++; } if (U_FAILURE(*status)) { return; } } if (U_SUCCESS(*status)) { len = reslen; sink.Append(tmpAppend, len); } } /* * ------------------------------------------------- * * ultag_ functions * * ------------------------------------------------- */ /* Bit flags used by the parser */ #define LANG 0x0001 #define EXTL 0x0002 #define SCRT 0x0004 #define REGN 0x0008 #define VART 0x0010 #define EXTS 0x0020 #define EXTV 0x0040 #define PRIV 0x0080 /** * Ticket #12705 - Visual Studio 2015 Update 3 contains a new code optimizer which has problems optimizing * this function. (See https://blogs.msdn.microsoft.com/vcblog/2016/05/04/new-code-optimizer/ ) * As a workaround, we will turn off optimization just for this function on VS2015 Update 3 and above. 
*/ #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) #pragma optimize( "", off ) #endif static ULanguageTag* ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) { char *tagBuf; int16_t next; char *pSubtag, *pNext, *pLastGoodPosition; int32_t subtagLen; int32_t extlangIdx; ExtensionListEntry *pExtension; char *pExtValueSubtag, *pExtValueSubtagEnd; int32_t i; UBool privateuseVar = FALSE; int32_t grandfatheredLen = 0; if (parsedLen != NULL) { *parsedLen = 0; } if (U_FAILURE(*status)) { return NULL; } if (tagLen < 0) { tagLen = (int32_t)uprv_strlen(tag); } /* copy the entire string */ tagBuf = (char*)uprv_malloc(tagLen + 1); if (tagBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } uprv_memcpy(tagBuf, tag, tagLen); *(tagBuf + tagLen) = 0; /* create a ULanguageTag */ icu::LocalULanguageTagPointer t( (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag))); if (t.isNull()) { uprv_free(tagBuf); *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } _initializeULanguageTag(t.getAlias()); t->buf = tagBuf; if (tagLen < MINLEN) { /* the input tag is too short - return empty ULanguageTag */ return t.orphan(); } size_t parsedLenDelta = 0; // Grandfathered tag will be consider together. Grandfathered tag with intervening // script and region such as art-DE-lojban or art-Latn-lojban won't be // matched. /* check if the tag is grandfathered */ for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) { int32_t checkGrandfatheredLen = static_cast(uprv_strlen(GRANDFATHERED[i])); if (tagLen < checkGrandfatheredLen) { continue; } if (tagLen > checkGrandfatheredLen && tagBuf[checkGrandfatheredLen] != '-') { // make sure next char is '-'. 
continue; } if (uprv_strnicmp(GRANDFATHERED[i], tagBuf, checkGrandfatheredLen) == 0) { int32_t newTagLength; grandfatheredLen = checkGrandfatheredLen; /* back up for output parsedLen */ int32_t replacementLen = static_cast(uprv_strlen(GRANDFATHERED[i+1])); newTagLength = replacementLen + tagLen - checkGrandfatheredLen; if (tagLen < newTagLength) { uprv_free(tagBuf); tagBuf = (char*)uprv_malloc(newTagLength + 1); if (tagBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } t->buf = tagBuf; tagLen = newTagLength; } parsedLenDelta = checkGrandfatheredLen - replacementLen; uprv_strcpy(t->buf, GRANDFATHERED[i + 1]); if (checkGrandfatheredLen != tagLen) { uprv_strcpy(t->buf + replacementLen, tag + checkGrandfatheredLen); } break; } } if (grandfatheredLen == 0) { for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) { const char* redundantTag = REDUNDANT[i]; size_t redundantTagLen = uprv_strlen(redundantTag); // The preferred tag for a redundant tag is always shorter than redundant // tag. A redundant tag may or may not be followed by other subtags. // (i.e. "zh-yue" or "zh-yue-u-co-pinyin"). if (uprv_strnicmp(redundantTag, tagBuf, static_cast(redundantTagLen)) == 0) { const char* redundantTagEnd = tagBuf + redundantTagLen; if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) { const char* preferredTag = REDUNDANT[i + 1]; size_t preferredTagLen = uprv_strlen(preferredTag); uprv_strncpy(t->buf, preferredTag, preferredTagLen); if (*redundantTagEnd == SEP) { uprv_memmove(tagBuf + preferredTagLen, redundantTagEnd, tagLen - redundantTagLen + 1); } else { tagBuf[preferredTagLen] = '\0'; } // parsedLen should be the length of the input // before redundantTag is replaced by preferredTag. // Save the delta to add it back later. 
parsedLenDelta = redundantTagLen - preferredTagLen; break; } } } } /* * langtag = language * ["-" script] * ["-" region] * *("-" variant) * *("-" extension) * ["-" privateuse] */ next = LANG | PRIV; pNext = pLastGoodPosition = tagBuf; extlangIdx = 0; pExtension = NULL; pExtValueSubtag = NULL; pExtValueSubtagEnd = NULL; while (pNext) { char *pSep; pSubtag = pNext; /* locate next separator char */ pSep = pSubtag; while (*pSep) { if (*pSep == SEP) { break; } pSep++; } if (*pSep == 0) { /* last subtag */ pNext = NULL; } else { pNext = pSep + 1; } subtagLen = (int32_t)(pSep - pSubtag); if (next & LANG) { if (ultag_isLanguageSubtag(pSubtag, subtagLen)) { *pSep = 0; /* terminate */ // TODO: move deprecated language code handling here. t->language = T_CString_toLowerCase(pSubtag); pLastGoodPosition = pSep; next = SCRT | REGN | VART | EXTS | PRIV; if (subtagLen <= 3) next |= EXTL; continue; } } if (next & EXTL) { if (_isExtlangSubtag(pSubtag, subtagLen)) { *pSep = 0; t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag); pLastGoodPosition = pSep; if (extlangIdx < 3) { next = EXTL | SCRT | REGN | VART | EXTS | PRIV; } else { next = SCRT | REGN | VART | EXTS | PRIV; } continue; } } if (next & SCRT) { if (ultag_isScriptSubtag(pSubtag, subtagLen)) { char *p = pSubtag; *pSep = 0; /* to title case */ *p = uprv_toupper(*p); p++; for (; *p; p++) { *p = uprv_tolower(*p); } t->script = pSubtag; pLastGoodPosition = pSep; next = REGN | VART | EXTS | PRIV; continue; } } if (next & REGN) { if (ultag_isRegionSubtag(pSubtag, subtagLen)) { *pSep = 0; // TODO: move deprecated region code handling here. 
t->region = T_CString_toUpperCase(pSubtag); pLastGoodPosition = pSep; next = VART | EXTS | PRIV; continue; } } if (next & VART) { if (_isVariantSubtag(pSubtag, subtagLen) || (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) { VariantListEntry *var; UBool isAdded; var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry)); if (var == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } *pSep = 0; var->variant = T_CString_toUpperCase(pSubtag); isAdded = _addVariantToList(&(t->variants), var); if (!isAdded) { /* duplicated variant entry */ uprv_free(var); break; } pLastGoodPosition = pSep; next = VART | EXTS | PRIV; continue; } } if (next & EXTS) { if (_isExtensionSingleton(pSubtag, subtagLen)) { if (pExtension != NULL) { if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) { /* the previous extension is incomplete */ uprv_free(pExtension); pExtension = NULL; break; } /* terminate the previous extension value */ *pExtValueSubtagEnd = 0; pExtension->value = T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) { pLastGoodPosition = pExtValueSubtagEnd; } else { /* stop parsing here */ uprv_free(pExtension); pExtension = NULL; break; } } /* create a new extension */ pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); if (pExtension == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } *pSep = 0; pExtension->key = T_CString_toLowerCase(pSubtag); pExtension->value = NULL; /* will be set later */ /* * reset the start and the end location of extension value * subtags for this extension */ pExtValueSubtag = NULL; pExtValueSubtagEnd = NULL; next = EXTV; continue; } } if (next & EXTV) { if (_isExtensionSubtag(pSubtag, subtagLen)) { if (pExtValueSubtag == NULL) { /* if the start postion of this extension's value is not yet, this one is the first value subtag */ pExtValueSubtag = pSubtag; } /* Mark the end of this subtag */ 
pExtValueSubtagEnd = pSep; next = EXTS | EXTV | PRIV; continue; } } if (next & PRIV) { if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) { char *pPrivuseVal; if (pExtension != NULL) { /* Process the last extension */ if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) { /* the previous extension is incomplete */ uprv_free(pExtension); pExtension = NULL; break; } else { /* terminate the previous extension value */ *pExtValueSubtagEnd = 0; pExtension->value = T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) { pLastGoodPosition = pExtValueSubtagEnd; pExtension = NULL; } else { /* stop parsing here */ uprv_free(pExtension); pExtension = NULL; break; } } } /* The rest of part will be private use value subtags */ if (pNext == NULL) { /* empty private use subtag */ break; } /* back up the private use value start position */ pPrivuseVal = pNext; /* validate private use value subtags */ while (pNext) { pSubtag = pNext; pSep = pSubtag; while (*pSep) { if (*pSep == SEP) { break; } pSep++; } if (*pSep == 0) { /* last subtag */ pNext = NULL; } else { pNext = pSep + 1; } subtagLen = (int32_t)(pSep - pSubtag); if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) { *pSep = 0; next = VART; privateuseVar = TRUE; break; } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) { pLastGoodPosition = pSep; } else { break; } } if (next == VART) { continue; } if (pLastGoodPosition - pPrivuseVal > 0) { *pLastGoodPosition = 0; t->privateuse = T_CString_toLowerCase(pPrivuseVal); } /* No more subtags, exiting the parse loop */ break; } break; } /* If we fell through here, it means this subtag is illegal - quit parsing */ break; } if (pExtension != NULL) { /* Process the last extension */ if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) { /* the previous extension is incomplete */ uprv_free(pExtension); } else { /* terminate the previous 
extension value */ *pExtValueSubtagEnd = 0; pExtension->value = T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) { pLastGoodPosition = pExtValueSubtagEnd; } else { uprv_free(pExtension); } } } if (parsedLen != NULL) { *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta); } return t.orphan(); } /** * Ticket #12705 - Turn optimization back on. */ #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) #pragma optimize( "", on ) #endif static void ultag_close(ULanguageTag* langtag) { if (langtag == NULL) { return; } uprv_free(langtag->buf); if (langtag->variants) { VariantListEntry *curVar = langtag->variants; while (curVar) { VariantListEntry *nextVar = curVar->next; uprv_free(curVar); curVar = nextVar; } } if (langtag->extensions) { ExtensionListEntry *curExt = langtag->extensions; while (curExt) { ExtensionListEntry *nextExt = curExt->next; uprv_free(curExt); curExt = nextExt; } } uprv_free(langtag); } static const char* ultag_getLanguage(const ULanguageTag* langtag) { return langtag->language; } #if 0 static const char* ultag_getJDKLanguage(const ULanguageTag* langtag) { int32_t i; for (i = 0; DEPRECATEDLANGS[i] != NULL; i += 2) { if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[i], langtag->language) == 0) { return DEPRECATEDLANGS[i + 1]; } } return langtag->language; } #endif static const char* ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) { if (idx >= 0 && idx < MAXEXTLANG) { return langtag->extlang[idx]; } return NULL; } static int32_t ultag_getExtlangSize(const ULanguageTag* langtag) { int32_t size = 0; int32_t i; for (i = 0; i < MAXEXTLANG; i++) { if (langtag->extlang[i]) { size++; } } return size; } static const char* ultag_getScript(const ULanguageTag* langtag) { return langtag->script; } static const char* ultag_getRegion(const ULanguageTag* langtag) { return langtag->region; } static const 
char* ultag_getVariant(const ULanguageTag* langtag, int32_t idx) { const char *var = NULL; VariantListEntry *cur = langtag->variants; int32_t i = 0; while (cur) { if (i == idx) { var = cur->variant; break; } cur = cur->next; i++; } return var; } static int32_t ultag_getVariantsSize(const ULanguageTag* langtag) { int32_t size = 0; VariantListEntry *cur = langtag->variants; while (TRUE) { if (cur == NULL) { break; } size++; cur = cur->next; } return size; } static const char* ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) { const char *key = NULL; ExtensionListEntry *cur = langtag->extensions; int32_t i = 0; while (cur) { if (i == idx) { key = cur->key; break; } cur = cur->next; i++; } return key; } static const char* ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) { const char *val = NULL; ExtensionListEntry *cur = langtag->extensions; int32_t i = 0; while (cur) { if (i == idx) { val = cur->value; break; } cur = cur->next; i++; } return val; } static int32_t ultag_getExtensionsSize(const ULanguageTag* langtag) { int32_t size = 0; ExtensionListEntry *cur = langtag->extensions; while (TRUE) { if (cur == NULL) { break; } size++; cur = cur->next; } return size; } static const char* ultag_getPrivateUse(const ULanguageTag* langtag) { return langtag->privateuse; } #if 0 static const char* ultag_getGrandfathered(const ULanguageTag* langtag) { return langtag->grandfathered; } #endif /* * ------------------------------------------------- * * Locale/BCP47 conversion APIs, exposed as uloc_* * * ------------------------------------------------- */ U_CAPI int32_t U_EXPORT2 uloc_toLanguageTag(const char* localeID, char* langtag, int32_t langtagCapacity, UBool strict, UErrorCode* status) { if (U_FAILURE(*status)) { return 0; } icu::CheckedArrayByteSink sink(langtag, langtagCapacity); ulocimp_toLanguageTag(localeID, sink, strict, status); int32_t reslen = sink.NumberOfBytesAppended(); if (U_FAILURE(*status)) { return reslen; } if 
(sink.Overflowed()) { *status = U_BUFFER_OVERFLOW_ERROR; } else { u_terminateChars(langtag, langtagCapacity, reslen, status); } return reslen; } U_CAPI void U_EXPORT2 ulocimp_toLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { icu::CharString canonical; int32_t reslen; UErrorCode tmpStatus = U_ZERO_ERROR; UBool hadPosix = FALSE; const char* pKeywordStart; /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */ int32_t resultCapacity = static_cast(uprv_strlen(localeID)); if (resultCapacity > 0) { char* buffer; for (;;) { buffer = canonical.getAppendBuffer( /*minCapacity=*/resultCapacity, /*desiredCapacityHint=*/resultCapacity, resultCapacity, tmpStatus); if (U_FAILURE(tmpStatus)) { *status = tmpStatus; return; } reslen = uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus); if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) { break; } resultCapacity = reslen; tmpStatus = U_ZERO_ERROR; } if (U_FAILURE(tmpStatus)) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } canonical.append(buffer, reslen, tmpStatus); if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString. 
} if (U_FAILURE(tmpStatus)) { *status = tmpStatus; return; } } /* For handling special case - private use only tag */ pKeywordStart = locale_getKeywordsStart(canonical.data()); if (pKeywordStart == canonical.data()) { int kwdCnt = 0; UBool done = FALSE; icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus)); if (U_SUCCESS(tmpStatus)) { kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus); if (kwdCnt == 1) { const char *key; int32_t len = 0; key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus); if (len == 1 && *key == PRIVATEUSE) { char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; buf[0] = PRIVATEUSE; buf[1] = SEP; len = uloc_getKeywordValue(localeID, key, &buf[2], sizeof(buf) - 2, &tmpStatus); if (U_SUCCESS(tmpStatus)) { if (ultag_isPrivateuseValueSubtags(&buf[2], len)) { /* return private use only tag */ sink.Append(buf, len + 2); done = TRUE; } else if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; done = TRUE; } /* if not strict mode, then "und" will be returned */ } else { *status = U_ILLEGAL_ARGUMENT_ERROR; done = TRUE; } } } if (done) { return; } } } _appendLanguageToLanguageTag(canonical.data(), sink, strict, status); _appendScriptToLanguageTag(canonical.data(), sink, strict, status); _appendRegionToLanguageTag(canonical.data(), sink, strict, status); _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status); _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status); _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status); } U_CAPI int32_t U_EXPORT2 uloc_forLanguageTag(const char* langtag, char* localeID, int32_t localeIDCapacity, int32_t* parsedLength, UErrorCode* status) { if (U_FAILURE(*status)) { return 0; } icu::CheckedArrayByteSink sink(localeID, localeIDCapacity); ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status); int32_t reslen = sink.NumberOfBytesAppended(); if (U_FAILURE(*status)) { return reslen; } if (sink.Overflowed()) { *status = 
U_BUFFER_OVERFLOW_ERROR; } else { u_terminateChars(localeID, localeIDCapacity, reslen, status); } return reslen; } U_CAPI void U_EXPORT2 ulocimp_forLanguageTag(const char* langtag, int32_t tagLen, icu::ByteSink& sink, int32_t* parsedLength, UErrorCode* status) { UBool isEmpty = TRUE; const char *subtag, *p; int32_t len; int32_t i, n; UBool noRegion = TRUE; icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status)); if (U_FAILURE(*status)) { return; } /* language */ subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias()); if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) { len = (int32_t)uprv_strlen(subtag); if (len > 0) { sink.Append(subtag, len); isEmpty = FALSE; } } /* script */ subtag = ultag_getScript(lt.getAlias()); len = (int32_t)uprv_strlen(subtag); if (len > 0) { sink.Append("_", 1); isEmpty = FALSE; /* write out the script in title case */ char c = uprv_toupper(*subtag); sink.Append(&c, 1); sink.Append(subtag + 1, len - 1); } /* region */ subtag = ultag_getRegion(lt.getAlias()); len = (int32_t)uprv_strlen(subtag); if (len > 0) { sink.Append("_", 1); isEmpty = FALSE; /* write out the region in upper case */ p = subtag; while (*p) { char c = uprv_toupper(*p); sink.Append(&c, 1); p++; } noRegion = FALSE; } /* variants */ n = ultag_getVariantsSize(lt.getAlias()); if (n > 0) { if (noRegion) { sink.Append("_", 1); isEmpty = FALSE; } for (i = 0; i < n; i++) { subtag = ultag_getVariant(lt.getAlias(), i); sink.Append("_", 1); /* write out the variant in upper case */ p = subtag; while (*p) { char c = uprv_toupper(*p); sink.Append(&c, 1); p++; } } } /* keywords */ n = ultag_getExtensionsSize(lt.getAlias()); subtag = ultag_getPrivateUse(lt.getAlias()); if (n > 0 || uprv_strlen(subtag) > 0) { if (isEmpty && n > 0) { /* need a language */ sink.Append(LANG_UND, LANG_UND_LEN); } _appendKeywords(lt.getAlias(), sink, status); } }