/* ********************************************************************** * Copyright (C) 2009-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #include "unicode/utypes.h" #include "unicode/ures.h" #include "unicode/putil.h" #include "unicode/uloc.h" #include "ustr_imp.h" #include "cmemory.h" #include "cstring.h" #include "putilimp.h" #include "uinvchar.h" #include "ulocimp.h" #include "uassert.h" /* struct holding a single variant */ typedef struct VariantListEntry { const char *variant; struct VariantListEntry *next; } VariantListEntry; /* struct holding a single attribute value */ typedef struct AttributeListEntry { const char *attribute; struct AttributeListEntry *next; } AttributeListEntry; /* struct holding a single extension */ typedef struct ExtensionListEntry { const char *key; const char *value; struct ExtensionListEntry *next; } ExtensionListEntry; #define MAXEXTLANG 3 typedef struct ULanguageTag { char *buf; /* holding parsed subtags */ const char *language; const char *extlang[MAXEXTLANG]; const char *script; const char *region; VariantListEntry *variants; ExtensionListEntry *extensions; const char *privateuse; const char *grandfathered; } ULanguageTag; #define MINLEN 2 #define SEP '-' #define PRIVATEUSE 'x' #define LDMLEXT 'u' #define LOCALE_SEP '_' #define LOCALE_EXT_SEP '@' #define LOCALE_KEYWORD_SEP ';' #define LOCALE_KEY_TYPE_SEP '=' #define ISALPHA(c) uprv_isASCIILetter(c) #define ISNUMERIC(c) ((c)>='0' && (c)<='9') static const char EMPTY[] = ""; static const char LANG_UND[] = "und"; static const char PRIVATEUSE_KEY[] = "x"; static const char _POSIX[] = "_POSIX"; static const char POSIX_KEY[] = "va"; static const char POSIX_VALUE[] = "posix"; static const char LOCALE_ATTRIBUTE_KEY[] = "attribute"; static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant"; static const char LOCALE_TYPE_YES[] = "yes"; #define LANG_UND_LEN 3 static const 
char* const GRANDFATHERED[] = { /* grandfathered preferred */ "art-lojban", "jbo", "cel-gaulish", "xtg-x-cel-gaulish", "en-GB-oed", "en-GB-x-oed", "i-ami", "ami", "i-bnn", "bnn", "i-default", "en-x-i-default", "i-enochian", "und-x-i-enochian", "i-hak", "hak", "i-klingon", "tlh", "i-lux", "lb", "i-mingo", "see-x-i-mingo", "i-navajo", "nv", "i-pwn", "pwn", "i-tao", "tao", "i-tay", "tay", "i-tsu", "tsu", "no-bok", "nb", "no-nyn", "nn", "sgn-be-fr", "sfb", "sgn-be-nl", "vgt", "sgn-ch-de", "sgg", "zh-guoyu", "cmn", "zh-hakka", "hak", "zh-min", "nan-x-zh-min", "zh-min-nan", "nan", "zh-xiang", "hsn", NULL, NULL }; static const char DEPRECATEDLANGS[][4] = { /* deprecated new */ "iw", "he", "ji", "yi", "in", "id" }; /* * ------------------------------------------------- * * These ultag_ functions may be exposed as APIs later * * ------------------------------------------------- */ static ULanguageTag* ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status); static void ultag_close(ULanguageTag* langtag); static const char* ultag_getLanguage(const ULanguageTag* langtag); #if 0 static const char* ultag_getJDKLanguage(const ULanguageTag* langtag); #endif static const char* ultag_getExtlang(const ULanguageTag* langtag, int32_t idx); static int32_t ultag_getExtlangSize(const ULanguageTag* langtag); static const char* ultag_getScript(const ULanguageTag* langtag); static const char* ultag_getRegion(const ULanguageTag* langtag); static const char* ultag_getVariant(const ULanguageTag* langtag, int32_t idx); static int32_t ultag_getVariantsSize(const ULanguageTag* langtag); static const char* ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx); static const char* ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx); static int32_t ultag_getExtensionsSize(const ULanguageTag* langtag); static const char* ultag_getPrivateUse(const ULanguageTag* langtag); #if 0 static const char* ultag_getGrandfathered(const ULanguageTag* langtag); 
#endif /* * ------------------------------------------------- * * Language subtag syntax validation functions * * ------------------------------------------------- */ static UBool _isAlphaString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISALPHA(*(s + i))) { return FALSE; } } return TRUE; } static UBool _isNumericString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISNUMERIC(*(s + i))) { return FALSE; } } return TRUE; } static UBool _isAlphaNumericString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) { return FALSE; } } return TRUE; } static UBool _isLanguageSubtag(const char* s, int32_t len) { /* * language = 2*3ALPHA ; shortest ISO 639 code * ["-" extlang] ; sometimes followed by * ; extended language subtags * / 4ALPHA ; or reserved for future use * / 5*8ALPHA ; or registered language subtag */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len >= 2 && len <= 8 && _isAlphaString(s, len)) { return TRUE; } return FALSE; } static UBool _isExtlangSubtag(const char* s, int32_t len) { /* * extlang = 3ALPHA ; selected ISO 639 codes * *2("-" 3ALPHA) ; permanently reserved */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len == 3 && _isAlphaString(s, len)) { return TRUE; } return FALSE; } static UBool _isScriptSubtag(const char* s, int32_t len) { /* * script = 4ALPHA ; ISO 15924 code */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len == 4 && _isAlphaString(s, len)) { return TRUE; } return FALSE; } static UBool _isRegionSubtag(const char* s, int32_t len) { /* * region = 2ALPHA ; ISO 3166-1 code * / 3DIGIT ; UN M.49 code */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len == 2 && _isAlphaString(s, len)) { return TRUE; } if (len == 3 && _isNumericString(s, len)) { return TRUE; } return FALSE; } static UBool _isVariantSubtag(const char* s, int32_t len) { /* * variant = 5*8alphanum ; registered variants * / (DIGIT 
3alphanum) */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len >= 5 && len <= 8 && _isAlphaNumericString(s, len)) { return TRUE; } if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) { return TRUE; } return FALSE; } static UBool _isPrivateuseVariantSubtag(const char* s, int32_t len) { /* * variant = 1*8alphanum ; registered variants * / (DIGIT 3alphanum) */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len >= 1 && len <= 8 && _isAlphaNumericString(s, len)) { return TRUE; } return FALSE; } static UBool _isExtensionSingleton(const char* s, int32_t len) { /* * extension = singleton 1*("-" (2*8alphanum)) */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len == 1 && ISALPHA(*s) && (uprv_tolower(*s) != PRIVATEUSE)) { return TRUE; } return FALSE; } static UBool _isExtensionSubtag(const char* s, int32_t len) { /* * extension = singleton 1*("-" (2*8alphanum)) */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len >= 2 && len <= 8 && _isAlphaNumericString(s, len)) { return TRUE; } return FALSE; } static UBool _isExtensionSubtags(const char* s, int32_t len) { const char *p = s; const char *pSubtag = NULL; if (len < 0) { len = (int32_t)uprv_strlen(s); } while ((p - s) < len) { if (*p == SEP) { if (pSubtag == NULL) { return FALSE; } if (!_isExtensionSubtag(pSubtag, (int32_t)(p - pSubtag))) { return FALSE; } pSubtag = NULL; } else if (pSubtag == NULL) { pSubtag = p; } p++; } if (pSubtag == NULL) { return FALSE; } return _isExtensionSubtag(pSubtag, (int32_t)(p - pSubtag)); } static UBool _isPrivateuseValueSubtag(const char* s, int32_t len) { /* * privateuse = "x" 1*("-" (1*8alphanum)) */ if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len >= 1 && len <= 8 && _isAlphaNumericString(s, len)) { return TRUE; } return FALSE; } static UBool _isPrivateuseValueSubtags(const char* s, int32_t len) { const char *p = s; const char *pSubtag = NULL; if (len < 0) { len = (int32_t)uprv_strlen(s); } while ((p - s) < len) { if (*p == SEP) { if (pSubtag == 
NULL) { return FALSE; } if (!_isPrivateuseValueSubtag(pSubtag, (int32_t)(p - pSubtag))) { return FALSE; } pSubtag = NULL; } else if (pSubtag == NULL) { pSubtag = p; } p++; } if (pSubtag == NULL) { return FALSE; } return _isPrivateuseValueSubtag(pSubtag, (int32_t)(p - pSubtag)); } U_CFUNC UBool ultag_isUnicodeLocaleKey(const char* s, int32_t len) { if (len < 0) { len = (int32_t)uprv_strlen(s); } if (len == 2 && _isAlphaNumericString(s, len)) { return TRUE; } return FALSE; } U_CFUNC UBool ultag_isUnicodeLocaleType(const char*s, int32_t len) { const char* p; int32_t subtagLen = 0; if (len < 0) { len = (int32_t)uprv_strlen(s); } for (p = s; len > 0; p++, len--) { if (*p == SEP) { if (subtagLen < 3) { return FALSE; } subtagLen = 0; } else if (ISALPHA(*p) || ISNUMERIC(*p)) { subtagLen++; if (subtagLen > 8) { return FALSE; } } else { return FALSE; } } return (subtagLen >= 3); } /* * ------------------------------------------------- * * Helper functions * * ------------------------------------------------- */ static UBool _addVariantToList(VariantListEntry **first, VariantListEntry *var) { UBool bAdded = TRUE; if (*first == NULL) { var->next = NULL; *first = var; } else { VariantListEntry *prev, *cur; int32_t cmp; /* variants order should be preserved */ prev = NULL; cur = *first; while (TRUE) { if (cur == NULL) { prev->next = var; var->next = NULL; break; } /* Checking for duplicate variant */ cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant); if (cmp == 0) { /* duplicated variant */ bAdded = FALSE; break; } prev = cur; cur = cur->next; } } return bAdded; } static UBool _addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) { UBool bAdded = TRUE; if (*first == NULL) { attr->next = NULL; *first = attr; } else { AttributeListEntry *prev, *cur; int32_t cmp; /* reorder variants in alphabetical order */ prev = NULL; cur = *first; while (TRUE) { if (cur == NULL) { prev->next = attr; attr->next = NULL; break; } cmp = 
uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute); if (cmp < 0) { if (prev == NULL) { *first = attr; } else { prev->next = attr; } attr->next = cur; break; } if (cmp == 0) { /* duplicated variant */ bAdded = FALSE; break; } prev = cur; cur = cur->next; } } return bAdded; } static UBool _addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) { UBool bAdded = TRUE; if (*first == NULL) { ext->next = NULL; *first = ext; } else { ExtensionListEntry *prev, *cur; int32_t cmp; /* reorder variants in alphabetical order */ prev = NULL; cur = *first; while (TRUE) { if (cur == NULL) { prev->next = ext; ext->next = NULL; break; } if (localeToBCP) { /* special handling for locale to bcp conversion */ int32_t len, curlen; len = (int32_t)uprv_strlen(ext->key); curlen = (int32_t)uprv_strlen(cur->key); if (len == 1 && curlen == 1) { if (*(ext->key) == *(cur->key)) { cmp = 0; } else if (*(ext->key) == PRIVATEUSE) { cmp = 1; } else if (*(cur->key) == PRIVATEUSE) { cmp = -1; } else { cmp = *(ext->key) - *(cur->key); } } else if (len == 1) { cmp = *(ext->key) - LDMLEXT; } else if (curlen == 1) { cmp = LDMLEXT - *(cur->key); } else { cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key); } } else { cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key); } if (cmp < 0) { if (prev == NULL) { *first = ext; } else { prev->next = ext; } ext->next = cur; break; } if (cmp == 0) { /* duplicated extension key */ bAdded = FALSE; break; } prev = cur; cur = cur->next; } } return bAdded; } static void _initializeULanguageTag(ULanguageTag* langtag) { int32_t i; langtag->buf = NULL; langtag->language = EMPTY; for (i = 0; i < MAXEXTLANG; i++) { langtag->extlang[i] = NULL; } langtag->script = EMPTY; langtag->region = EMPTY; langtag->variants = NULL; langtag->extensions = NULL; langtag->grandfathered = EMPTY; langtag->privateuse = EMPTY; } static int32_t _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, 
UErrorCode* status) { char buf[ULOC_LANG_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; int32_t reslen = 0; if (U_FAILURE(*status)) { return 0; } len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } len = 0; } /* Note: returned language code is in lower case letters */ if (len == 0) { if (reslen < capacity) { uprv_memcpy(appendAt + reslen, LANG_UND, uprv_min(LANG_UND_LEN, capacity - reslen)); } reslen += LANG_UND_LEN; } else if (!_isLanguageSubtag(buf, len)) { /* invalid language code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } if (reslen < capacity) { uprv_memcpy(appendAt + reslen, LANG_UND, uprv_min(LANG_UND_LEN, capacity - reslen)); } reslen += LANG_UND_LEN; } else { /* resolve deprecated */ for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) { if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) { uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]); len = (int32_t)uprv_strlen(buf); break; } } if (reslen < capacity) { uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); } reslen += len; } u_terminateChars(appendAt, capacity, reslen, status); return reslen; } static int32_t _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { char buf[ULOC_SCRIPT_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len; int32_t reslen = 0; if (U_FAILURE(*status)) { return 0; } len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return 0; } if (len > 0) { if (!_isScriptSubtag(buf, len)) { /* invalid script code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return 0; } else { if (reslen < capacity) { *(appendAt + reslen) = SEP; } reslen++; if (reslen < capacity) { 
uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); } reslen += len; } } u_terminateChars(appendAt, capacity, reslen, status); return reslen; } static int32_t _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { char buf[ULOC_COUNTRY_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len; int32_t reslen = 0; if (U_FAILURE(*status)) { return 0; } len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return 0; } if (len > 0) { if (!_isRegionSubtag(buf, len)) { /* invalid region code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return 0; } else { if (reslen < capacity) { *(appendAt + reslen) = SEP; } reslen++; if (reslen < capacity) { uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); } reslen += len; } } u_terminateChars(appendAt, capacity, reslen, status); return reslen; } static int32_t _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool *hadPosix, UErrorCode* status) { char buf[ULOC_FULLNAME_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; int32_t reslen = 0; if (U_FAILURE(*status)) { return 0; } len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return 0; } if (len > 0) { char *p, *pVar; UBool bNext = TRUE; VariantListEntry *var; VariantListEntry *varFirst = NULL; pVar = NULL; p = buf; while (bNext) { if (*p == SEP || *p == LOCALE_SEP || *p == 0) { if (*p == 0) { bNext = FALSE; } else { *p = 0; /* terminate */ } if (pVar == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } /* ignore empty variant */ } else { /* ICU uses upper case letters for variants, but the canonical format is lowercase 
in BCP47 */ for (i = 0; *(pVar + i) != 0; i++) { *(pVar + i) = uprv_tolower(*(pVar + i)); } /* validate */ if (_isVariantSubtag(pVar, -1)) { if (uprv_strcmp(pVar,POSIX_VALUE) || len != uprv_strlen(POSIX_VALUE)) { /* emit the variant to the list */ var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry)); if (var == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } var->variant = pVar; if (!_addVariantToList(&varFirst, var)) { /* duplicated variant */ uprv_free(var); if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } else { /* Special handling for POSIX variant, need to remember that we had it and then */ /* treat it like an extension later. */ *hadPosix = TRUE; } } else if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } else if (_isPrivateuseValueSubtag(pVar, -1)) { /* Handle private use subtags separately */ break; } } /* reset variant starting position */ pVar = NULL; } else if (pVar == NULL) { pVar = p; } p++; } if (U_SUCCESS(*status)) { if (varFirst != NULL) { int32_t varLen; /* write out validated/normalized variants to the target */ var = varFirst; while (var != NULL) { if (reslen < capacity) { *(appendAt + reslen) = SEP; } reslen++; varLen = (int32_t)uprv_strlen(var->variant); if (reslen < capacity) { uprv_memcpy(appendAt + reslen, var->variant, uprv_min(varLen, capacity - reslen)); } reslen += varLen; var = var->next; } } } /* clean up */ var = varFirst; while (var != NULL) { VariantListEntry *tmpVar = var->next; uprv_free(var); var = tmpVar; } if (U_FAILURE(*status)) { return 0; } } u_terminateChars(appendAt, capacity, reslen, status); return reslen; } static int32_t _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) { char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 }; int32_t attrBufLength = 0; UBool isAttribute = FALSE; UEnumeration *keywordEnum = NULL; int32_t reslen = 0; keywordEnum = 
uloc_openKeywords(localeID, status); if (U_FAILURE(*status) && !hadPosix) { uenum_close(keywordEnum); return 0; } if (keywordEnum != NULL || hadPosix) { /* reorder extensions */ int32_t len; const char *key; ExtensionListEntry *firstExt = NULL; ExtensionListEntry *ext; AttributeListEntry *firstAttr = NULL; AttributeListEntry *attr; char *attrValue; char extBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; char *pExtBuf = extBuf; int32_t extBufCapacity = sizeof(extBuf); const char *bcpKey, *bcpValue; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t keylen; UBool isBcpUExt; while (TRUE) { isAttribute = FALSE; key = uenum_next(keywordEnum, NULL, status); if (key == NULL) { break; } len = uloc_getKeywordValue(localeID, key, buf, sizeof(buf), &tmpStatus); /* buf must be null-terminated */ if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } /* ignore this keyword */ tmpStatus = U_ZERO_ERROR; continue; } keylen = (int32_t)uprv_strlen(key); isBcpUExt = (keylen > 1); /* special keyword used for representing Unicode locale attributes */ if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) { isAttribute = TRUE; if (len > 0) { int32_t i = 0; while (TRUE) { attrBufLength = 0; for (; i < len; i++) { if (buf[i] != '-') { attrBuf[attrBufLength++] = buf[i]; } else { i++; break; } } if (attrBufLength > 0) { attrBuf[attrBufLength] = 0; } else if (i >= len){ break; } /* create AttributeListEntry */ attr = (AttributeListEntry*)uprv_malloc(sizeof(AttributeListEntry)); if (attr == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } attrValue = (char*)uprv_malloc(attrBufLength + 1); if (attrValue == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } uprv_strcpy(attrValue, attrBuf); attr->attribute = attrValue; if (!_addAttributeToList(&firstAttr, attr)) { uprv_free(attr); uprv_free(attrValue); if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } } } else if (isBcpUExt) { bcpKey = uloc_toUnicodeLocaleKey(key); 
if (bcpKey == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } /* we've checked buf is null-terminated above */ bcpValue = uloc_toUnicodeLocaleType(key, buf); if (bcpValue == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } if (bcpValue == buf) { /* When uloc_toUnicodeLocaleType(key, buf) returns the input value as is, the value is well-formed, but has no known mapping. This implementation normalizes the the value to lower case */ int32_t bcpValueLen = uprv_strlen(bcpValue); if (bcpValueLen < extBufCapacity) { uprv_strcpy(pExtBuf, bcpValue); T_CString_toLowerCase(pExtBuf); bcpValue = pExtBuf; pExtBuf += (bcpValueLen + 1); extBufCapacity -= (bcpValueLen + 1); } else { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } } } else { if (*key == PRIVATEUSE) { if (!_isPrivateuseValueSubtags(buf, len)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } } else { if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf, len)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } } bcpKey = key; if ((len + 1) < extBufCapacity) { uprv_memcpy(pExtBuf, buf, len); bcpValue = pExtBuf; pExtBuf += len; *pExtBuf = 0; pExtBuf++; extBufCapacity -= (len + 1); } else { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } if (!isAttribute) { /* create ExtensionListEntry */ ext = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); if (ext == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } ext->key = bcpKey; ext->value = bcpValue; if (!_addExtensionToList(&firstExt, ext, TRUE)) { uprv_free(ext); if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } } /* Special handling for POSIX variant - add the keywords for POSIX */ if (hadPosix) { /* create ExtensionListEntry for POSIX */ ext = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); if (ext == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } ext->key = POSIX_KEY; 
ext->value = POSIX_VALUE; if (!_addExtensionToList(&firstExt, ext, TRUE)) { uprv_free(ext); } } if (U_SUCCESS(*status) && (firstExt != NULL || firstAttr != NULL)) { UBool startLDMLExtension = FALSE; attr = firstAttr; ext = firstExt; do { if (!startLDMLExtension && (ext && uprv_strlen(ext->key) > 1)) { /* write LDML singleton extension */ if (reslen < capacity) { *(appendAt + reslen) = SEP; } reslen++; if (reslen < capacity) { *(appendAt + reslen) = LDMLEXT; } reslen++; startLDMLExtension = TRUE; } /* write out the sorted BCP47 attributes, extensions and private use */ if (ext && (uprv_strlen(ext->key) == 1 || attr == NULL)) { if (reslen < capacity) { *(appendAt + reslen) = SEP; } reslen++; len = (int32_t)uprv_strlen(ext->key); if (reslen < capacity) { uprv_memcpy(appendAt + reslen, ext->key, uprv_min(len, capacity - reslen)); } reslen += len; if (reslen < capacity) { *(appendAt + reslen) = SEP; } reslen++; len = (int32_t)uprv_strlen(ext->value); if (reslen < capacity) { uprv_memcpy(appendAt + reslen, ext->value, uprv_min(len, capacity - reslen)); } reslen += len; ext = ext->next; } else if (attr) { /* write the value for the attributes */ if (reslen < capacity) { *(appendAt + reslen) = SEP; } reslen++; len = (int32_t)uprv_strlen(attr->attribute); if (reslen < capacity) { uprv_memcpy(appendAt + reslen, attr->attribute, uprv_min(len, capacity - reslen)); } reslen += len; attr = attr->next; } } while (attr != NULL || ext != NULL); } cleanup: /* clean up */ ext = firstExt; while (ext != NULL) { ExtensionListEntry *tmpExt = ext->next; uprv_free(ext); ext = tmpExt; } attr = firstAttr; while (attr != NULL) { AttributeListEntry *tmpAttr = attr->next; char *pValue = (char *)attr->attribute; uprv_free(pValue); uprv_free(attr); attr = tmpAttr; } uenum_close(keywordEnum); if (U_FAILURE(*status)) { return 0; } } return u_terminateChars(appendAt, capacity, reslen, status); } /** * Append keywords parsed from LDML extension value * e.g. 
"u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional} * Note: char* buf is used for storing keywords */ static void _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, char* buf, int32_t bufSize, UBool *posixVariant, UErrorCode *status) { const char *pTag; /* beginning of current subtag */ const char *pKwds; /* beginning of key-type pairs */ UBool variantExists = *posixVariant; ExtensionListEntry *kwdFirst = NULL; /* first LDML keyword */ ExtensionListEntry *kwd, *nextKwd; AttributeListEntry *attrFirst = NULL; /* first attribute */ AttributeListEntry *attr, *nextAttr; int32_t len; int32_t bufIdx = 0; char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; int32_t attrBufIdx = 0; /* Reset the posixVariant value */ *posixVariant = FALSE; pTag = ldmlext; pKwds = NULL; /* Iterate through u extension attributes */ while (*pTag) { /* locate next separator char */ for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); if (ultag_isUnicodeLocaleKey(pTag, len)) { pKwds = pTag; break; } /* add this attribute to the list */ attr = (AttributeListEntry*)uprv_malloc(sizeof(AttributeListEntry)); if (attr == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) { uprv_memcpy(&attrBuf[attrBufIdx], pTag, len); attrBuf[attrBufIdx + len] = 0; attr->attribute = &attrBuf[attrBufIdx]; attrBufIdx += (len + 1); } else { *status = U_ILLEGAL_ARGUMENT_ERROR; goto cleanup; } if (!_addAttributeToList(&attrFirst, attr)) { *status = U_ILLEGAL_ARGUMENT_ERROR; uprv_free(attr); goto cleanup; } /* next tag */ pTag += len; if (*pTag) { /* next to the separator */ pTag++; } } if (attrFirst) { /* emit attributes as an LDML keyword, e.g. 
attribute=attr1-attr2 */ if (attrBufIdx > bufSize) { /* attrBufIdx == + 1 */ *status = U_ILLEGAL_ARGUMENT_ERROR; goto cleanup; } kwd = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); if (kwd == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } kwd->key = LOCALE_ATTRIBUTE_KEY; kwd->value = buf; /* attribute subtags sorted in alphabetical order as type */ attr = attrFirst; while (attr != NULL) { nextAttr = attr->next; /* buffer size check is done above */ if (attr != attrFirst) { *(buf + bufIdx) = SEP; bufIdx++; } len = uprv_strlen(attr->attribute); uprv_memcpy(buf + bufIdx, attr->attribute, len); bufIdx += len; attr = nextAttr; } *(buf + bufIdx) = 0; bufIdx++; if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { *status = U_ILLEGAL_ARGUMENT_ERROR; uprv_free(kwd); goto cleanup; } /* once keyword entry is created, delete the attribute list */ attr = attrFirst; while (attr != NULL) { nextAttr = attr->next; uprv_free(attr); attr = nextAttr; } attrFirst = NULL; } if (pKwds) { const char *pBcpKey = NULL; /* u extenstion key subtag */ const char *pBcpType = NULL; /* beginning of u extension type subtag(s) */ int32_t bcpKeyLen = 0; int32_t bcpTypeLen = 0; UBool isDone = FALSE; pTag = pKwds; /* BCP47 representation of LDML key/type pairs */ while (!isDone) { const char *pNextBcpKey = NULL; int32_t nextBcpKeyLen = 0; UBool emitKeyword = FALSE; if (*pTag) { /* locate next separator char */ for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); if (ultag_isUnicodeLocaleKey(pTag, len)) { if (pBcpKey) { emitKeyword = TRUE; pNextBcpKey = pTag; nextBcpKeyLen = len; } else { pBcpKey = pTag; bcpKeyLen = len; } } else { U_ASSERT(pBcpKey != NULL); /* within LDML type subtags */ if (pBcpType) { bcpTypeLen += (len + 1); } else { pBcpType = pTag; bcpTypeLen = len; } } /* next tag */ pTag += len; if (*pTag) { /* next to the separator */ pTag++; } } else { /* processing last one */ emitKeyword = TRUE; isDone = TRUE; } if (emitKeyword) { const char *pKey = NULL; 
/* LDML key */ const char *pType = NULL; /* LDML type */ char bcpKeyBuf[9]; /* BCP key length is always 2 for now */ U_ASSERT(pBcpKey != NULL); if (bcpKeyLen >= sizeof(bcpKeyBuf)) { /* the BCP key is invalid */ *status = U_ILLEGAL_ARGUMENT_ERROR; goto cleanup; } uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen); bcpKeyBuf[bcpKeyLen] = 0; /* u extension key to LDML key */ pKey = uloc_toLegacyKey(bcpKeyBuf); if (pKey == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; goto cleanup; } if (pKey == bcpKeyBuf) { /* The key returned by toLegacyKey points to the input buffer. We normalize the result key to lower case. */ T_CString_toLowerCase(bcpKeyBuf); if (bufSize - bufIdx - 1 >= bcpKeyLen) { uprv_memcpy(buf + bufIdx, bcpKeyBuf, bcpKeyLen); pKey = buf + bufIdx; bufIdx += bcpKeyLen; *(buf + bufIdx) = 0; bufIdx++; } else { *status = U_BUFFER_OVERFLOW_ERROR; goto cleanup; } } if (pBcpType) { char bcpTypeBuf[128]; /* practically long enough even considering multiple subtag type */ if (bcpTypeLen >= sizeof(bcpTypeBuf)) { /* the BCP type is too long */ *status = U_ILLEGAL_ARGUMENT_ERROR; goto cleanup; } uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen); bcpTypeBuf[bcpTypeLen] = 0; /* BCP type to locale type */ pType = uloc_toLegacyType(pKey, bcpTypeBuf); if (pType == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; goto cleanup; } if (pType == bcpTypeBuf) { /* The type returned by toLegacyType points to the input buffer. We normalize the result type to lower case. 
*/ /* normalize to lower case */ T_CString_toLowerCase(bcpTypeBuf); if (bufSize - bufIdx - 1 >= bcpTypeLen) { uprv_memcpy(buf + bufIdx, bcpTypeBuf, bcpTypeLen); pType = buf + bufIdx; bufIdx += bcpTypeLen; *(buf + bufIdx) = 0; bufIdx++; } else { *status = U_BUFFER_OVERFLOW_ERROR; goto cleanup; } } } else { /* typeless - default type value is "yes" */ pType = LOCALE_TYPE_YES; } /* Special handling for u-va-posix, since we want to treat this as a variant, not as a keyword */ if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) { *posixVariant = TRUE; } else { /* create an ExtensionListEntry for this keyword */ kwd = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); if (kwd == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } kwd->key = pKey; kwd->value = pType; if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { *status = U_ILLEGAL_ARGUMENT_ERROR; uprv_free(kwd); goto cleanup; } } pBcpKey = pNextBcpKey; bcpKeyLen = pNextBcpKey != NULL ? 
nextBcpKeyLen : 0; pBcpType = NULL; bcpTypeLen = 0; } } } kwd = kwdFirst; while (kwd != NULL) { nextKwd = kwd->next; _addExtensionToList(appendTo, kwd, FALSE); kwd = nextKwd; } return; cleanup: attr = attrFirst; while (attr != NULL) { nextAttr = attr->next; uprv_free(attr); attr = nextAttr; } kwd = kwdFirst; while (kwd != NULL) { nextKwd = kwd->next; uprv_free(kwd); kwd = nextKwd; } } static int32_t _appendKeywords(ULanguageTag* langtag, char* appendAt, int32_t capacity, UErrorCode* status) { int32_t reslen = 0; int32_t i, n; int32_t len; ExtensionListEntry *kwdFirst = NULL; ExtensionListEntry *kwd; const char *key, *type; char *kwdBuf = NULL; int32_t kwdBufLength = capacity; UBool posixVariant = FALSE; if (U_FAILURE(*status)) { return 0; } kwdBuf = (char*)uprv_malloc(kwdBufLength); if (kwdBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return 0; } /* Determine if variants already exists */ if (ultag_getVariantsSize(langtag)) { posixVariant = TRUE; } n = ultag_getExtensionsSize(langtag); /* resolve locale keywords and reordering keys */ for (i = 0; i < n; i++) { key = ultag_getExtensionKey(langtag, i); type = ultag_getExtensionValue(langtag, i); if (*key == LDMLEXT) { _appendLDMLExtensionAsKeywords(type, &kwdFirst, kwdBuf, kwdBufLength, &posixVariant, status); if (U_FAILURE(*status)) { break; } } else { kwd = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); if (kwd == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; break; } kwd->key = key; kwd->value = type; if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { uprv_free(kwd); *status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } if (U_SUCCESS(*status)) { type = ultag_getPrivateUse(langtag); if ((int32_t)uprv_strlen(type) > 0) { /* add private use as a keyword */ kwd = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); if (kwd == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; } else { kwd->key = PRIVATEUSE_KEY; kwd->value = type; if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { uprv_free(kwd); 
*status = U_ILLEGAL_ARGUMENT_ERROR; } } } } /* If a POSIX variant was in the extensions, write it out before writing the keywords. */ if (U_SUCCESS(*status) && posixVariant) { len = (int32_t) uprv_strlen(_POSIX); if (reslen < capacity) { uprv_memcpy(appendAt + reslen, _POSIX, uprv_min(len, capacity - reslen)); } reslen += len; } if (U_SUCCESS(*status) && kwdFirst != NULL) { /* write out the sorted keywords */ UBool firstValue = TRUE; kwd = kwdFirst; do { if (reslen < capacity) { if (firstValue) { /* '@' */ *(appendAt + reslen) = LOCALE_EXT_SEP; firstValue = FALSE; } else { /* ';' */ *(appendAt + reslen) = LOCALE_KEYWORD_SEP; } } reslen++; /* key */ len = (int32_t)uprv_strlen(kwd->key); if (reslen < capacity) { uprv_memcpy(appendAt + reslen, kwd->key, uprv_min(len, capacity - reslen)); } reslen += len; /* '=' */ if (reslen < capacity) { *(appendAt + reslen) = LOCALE_KEY_TYPE_SEP; } reslen++; /* type */ len = (int32_t)uprv_strlen(kwd->value); if (reslen < capacity) { uprv_memcpy(appendAt + reslen, kwd->value, uprv_min(len, capacity - reslen)); } reslen += len; kwd = kwd->next; } while (kwd); } /* clean up */ kwd = kwdFirst; while (kwd != NULL) { ExtensionListEntry *tmpKwd = kwd->next; uprv_free(kwd); kwd = tmpKwd; } uprv_free(kwdBuf); if (U_FAILURE(*status)) { return 0; } return u_terminateChars(appendAt, capacity, reslen, status); } static int32_t _appendPrivateuseToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) { char buf[ULOC_FULLNAME_CAPACITY]; char tmpAppend[ULOC_FULLNAME_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; int32_t reslen = 0; if (U_FAILURE(*status)) { return 0; } len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return 0; } if (len > 0) { char *p, *pPriv; UBool bNext = TRUE; UBool firstValue = TRUE; UBool writeValue; pPriv = 
NULL; p = buf; while (bNext) { writeValue = FALSE; if (*p == SEP || *p == LOCALE_SEP || *p == 0) { if (*p == 0) { bNext = FALSE; } else { *p = 0; /* terminate */ } if (pPriv != NULL) { /* Private use in the canonical format is lowercase in BCP47 */ for (i = 0; *(pPriv + i) != 0; i++) { *(pPriv + i) = uprv_tolower(*(pPriv + i)); } /* validate */ if (_isPrivateuseValueSubtag(pPriv, -1)) { if (firstValue) { if (!_isVariantSubtag(pPriv, -1)) { writeValue = TRUE; } } else { writeValue = TRUE; } } else if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } else { break; } if (writeValue) { if (reslen < capacity) { tmpAppend[reslen++] = SEP; } if (firstValue) { if (reslen < capacity) { tmpAppend[reslen++] = *PRIVATEUSE_KEY; } if (reslen < capacity) { tmpAppend[reslen++] = SEP; } len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX); if (reslen < capacity) { uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen)); } reslen += len; if (reslen < capacity) { tmpAppend[reslen++] = SEP; } firstValue = FALSE; } len = (int32_t)uprv_strlen(pPriv); if (reslen < capacity) { uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen)); } reslen += len; } } /* reset private use starting position */ pPriv = NULL; } else if (pPriv == NULL) { pPriv = p; } p++; } if (U_FAILURE(*status)) { return 0; } } if (U_SUCCESS(*status)) { len = reslen; if (reslen < capacity) { uprv_memcpy(appendAt, tmpAppend, uprv_min(len, capacity - reslen)); } } u_terminateChars(appendAt, capacity, reslen, status); return reslen; } /* * ------------------------------------------------- * * ultag_ functions * * ------------------------------------------------- */ /* Bit flags used by the parser */ #define LANG 0x0001 #define EXTL 0x0002 #define SCRT 0x0004 #define REGN 0x0008 #define VART 0x0010 #define EXTS 0x0020 #define EXTV 0x0040 #define PRIV 0x0080 static ULanguageTag* ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) { 
ULanguageTag *t; char *tagBuf; int16_t next; char *pSubtag, *pNext, *pLastGoodPosition; int32_t subtagLen; int32_t extlangIdx; ExtensionListEntry *pExtension; char *pExtValueSubtag, *pExtValueSubtagEnd; int32_t i; UBool privateuseVar = FALSE; int32_t grandfatheredLen = 0; if (parsedLen != NULL) { *parsedLen = 0; } if (U_FAILURE(*status)) { return NULL; } if (tagLen < 0) { tagLen = (int32_t)uprv_strlen(tag); } /* copy the entire string */ tagBuf = (char*)uprv_malloc(tagLen + 1); if (tagBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } uprv_memcpy(tagBuf, tag, tagLen); *(tagBuf + tagLen) = 0; /* create a ULanguageTag */ t = (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag)); if (t == NULL) { uprv_free(tagBuf); *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } _initializeULanguageTag(t); t->buf = tagBuf; if (tagLen < MINLEN) { /* the input tag is too short - return empty ULanguageTag */ return t; } /* check if the tag is grandfathered */ for (i = 0; GRANDFATHERED[i] != NULL; i += 2) { if (uprv_stricmp(GRANDFATHERED[i], tagBuf) == 0) { int32_t newTagLength; grandfatheredLen = tagLen; /* back up for output parsedLen */ newTagLength = uprv_strlen(GRANDFATHERED[i+1]); if (tagLen < newTagLength) { uprv_free(tagBuf); tagBuf = (char*)uprv_malloc(newTagLength + 1); if (tagBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; ultag_close(t); return NULL; } t->buf = tagBuf; tagLen = newTagLength; } uprv_strcpy(t->buf, GRANDFATHERED[i + 1]); break; } } /* * langtag = language * ["-" script] * ["-" region] * *("-" variant) * *("-" extension) * ["-" privateuse] */ next = LANG | PRIV; pNext = pLastGoodPosition = tagBuf; extlangIdx = 0; pExtension = NULL; pExtValueSubtag = NULL; pExtValueSubtagEnd = NULL; while (pNext) { char *pSep; pSubtag = pNext; /* locate next separator char */ pSep = pSubtag; while (*pSep) { if (*pSep == SEP) { break; } pSep++; } if (*pSep == 0) { /* last subtag */ pNext = NULL; } else { pNext = pSep + 1; } subtagLen = (int32_t)(pSep - pSubtag); 
if (next & LANG) { if (_isLanguageSubtag(pSubtag, subtagLen)) { *pSep = 0; /* terminate */ t->language = T_CString_toLowerCase(pSubtag); pLastGoodPosition = pSep; next = EXTL | SCRT | REGN | VART | EXTS | PRIV; continue; } } if (next & EXTL) { if (_isExtlangSubtag(pSubtag, subtagLen)) { *pSep = 0; t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag); pLastGoodPosition = pSep; if (extlangIdx < 3) { next = EXTL | SCRT | REGN | VART | EXTS | PRIV; } else { next = SCRT | REGN | VART | EXTS | PRIV; } continue; } } if (next & SCRT) { if (_isScriptSubtag(pSubtag, subtagLen)) { char *p = pSubtag; *pSep = 0; /* to title case */ *p = uprv_toupper(*p); p++; for (; *p; p++) { *p = uprv_tolower(*p); } t->script = pSubtag; pLastGoodPosition = pSep; next = REGN | VART | EXTS | PRIV; continue; } } if (next & REGN) { if (_isRegionSubtag(pSubtag, subtagLen)) { *pSep = 0; t->region = T_CString_toUpperCase(pSubtag); pLastGoodPosition = pSep; next = VART | EXTS | PRIV; continue; } } if (next & VART) { if (_isVariantSubtag(pSubtag, subtagLen) || (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) { VariantListEntry *var; UBool isAdded; var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry)); if (var == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; goto error; } *pSep = 0; var->variant = T_CString_toUpperCase(pSubtag); isAdded = _addVariantToList(&(t->variants), var); if (!isAdded) { /* duplicated variant entry */ uprv_free(var); break; } pLastGoodPosition = pSep; next = VART | EXTS | PRIV; continue; } } if (next & EXTS) { if (_isExtensionSingleton(pSubtag, subtagLen)) { if (pExtension != NULL) { if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) { /* the previous extension is incomplete */ uprv_free(pExtension); pExtension = NULL; break; } /* terminate the previous extension value */ *pExtValueSubtagEnd = 0; pExtension->value = T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), 
pExtension, FALSE)) { pLastGoodPosition = pExtValueSubtagEnd; } else { /* stop parsing here */ uprv_free(pExtension); pExtension = NULL; break; } } /* create a new extension */ pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); if (pExtension == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; goto error; } *pSep = 0; pExtension->key = T_CString_toLowerCase(pSubtag); pExtension->value = NULL; /* will be set later */ /* * reset the start and the end location of extension value * subtags for this extension */ pExtValueSubtag = NULL; pExtValueSubtagEnd = NULL; next = EXTV; continue; } } if (next & EXTV) { if (_isExtensionSubtag(pSubtag, subtagLen)) { if (pExtValueSubtag == NULL) { /* if the start postion of this extension's value is not yet, this one is the first value subtag */ pExtValueSubtag = pSubtag; } /* Mark the end of this subtag */ pExtValueSubtagEnd = pSep; next = EXTS | EXTV | PRIV; continue; } } if (next & PRIV) { if (uprv_tolower(*pSubtag) == PRIVATEUSE) { char *pPrivuseVal; if (pExtension != NULL) { /* Process the last extension */ if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) { /* the previous extension is incomplete */ uprv_free(pExtension); pExtension = NULL; break; } else { /* terminate the previous extension value */ *pExtValueSubtagEnd = 0; pExtension->value = T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) { pLastGoodPosition = pExtValueSubtagEnd; pExtension = NULL; } else { /* stop parsing here */ uprv_free(pExtension); pExtension = NULL; break; } } } /* The rest of part will be private use value subtags */ if (pNext == NULL) { /* empty private use subtag */ break; } /* back up the private use value start position */ pPrivuseVal = pNext; /* validate private use value subtags */ while (pNext) { pSubtag = pNext; pSep = pSubtag; while (*pSep) { if (*pSep == SEP) { break; } pSep++; } if (*pSep == 0) { /* last subtag */ pNext = 
NULL; } else { pNext = pSep + 1; } subtagLen = (int32_t)(pSep - pSubtag); if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) { *pSep = 0; next = VART; privateuseVar = TRUE; break; } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) { pLastGoodPosition = pSep; } else { break; } } if (next == VART) { continue; } if (pLastGoodPosition - pPrivuseVal > 0) { *pLastGoodPosition = 0; t->privateuse = T_CString_toLowerCase(pPrivuseVal); } /* No more subtags, exiting the parse loop */ break; } break; } /* If we fell through here, it means this subtag is illegal - quit parsing */ break; } if (pExtension != NULL) { /* Process the last extension */ if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) { /* the previous extension is incomplete */ uprv_free(pExtension); } else { /* terminate the previous extension value */ *pExtValueSubtagEnd = 0; pExtension->value = T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) { pLastGoodPosition = pExtValueSubtagEnd; } else { uprv_free(pExtension); } } } if (parsedLen != NULL) { *parsedLen = (grandfatheredLen > 0) ? 
grandfatheredLen : (int32_t)(pLastGoodPosition - t->buf); } return t; error: ultag_close(t); return NULL; } static void ultag_close(ULanguageTag* langtag) { if (langtag == NULL) { return; } uprv_free(langtag->buf); if (langtag->variants) { VariantListEntry *curVar = langtag->variants; while (curVar) { VariantListEntry *nextVar = curVar->next; uprv_free(curVar); curVar = nextVar; } } if (langtag->extensions) { ExtensionListEntry *curExt = langtag->extensions; while (curExt) { ExtensionListEntry *nextExt = curExt->next; uprv_free(curExt); curExt = nextExt; } } uprv_free(langtag); } static const char* ultag_getLanguage(const ULanguageTag* langtag) { return langtag->language; } #if 0 static const char* ultag_getJDKLanguage(const ULanguageTag* langtag) { int32_t i; for (i = 0; DEPRECATEDLANGS[i] != NULL; i += 2) { if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[i], langtag->language) == 0) { return DEPRECATEDLANGS[i + 1]; } } return langtag->language; } #endif static const char* ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) { if (idx >= 0 && idx < MAXEXTLANG) { return langtag->extlang[idx]; } return NULL; } static int32_t ultag_getExtlangSize(const ULanguageTag* langtag) { int32_t size = 0; int32_t i; for (i = 0; i < MAXEXTLANG; i++) { if (langtag->extlang[i]) { size++; } } return size; } static const char* ultag_getScript(const ULanguageTag* langtag) { return langtag->script; } static const char* ultag_getRegion(const ULanguageTag* langtag) { return langtag->region; } static const char* ultag_getVariant(const ULanguageTag* langtag, int32_t idx) { const char *var = NULL; VariantListEntry *cur = langtag->variants; int32_t i = 0; while (cur) { if (i == idx) { var = cur->variant; break; } cur = cur->next; i++; } return var; } static int32_t ultag_getVariantsSize(const ULanguageTag* langtag) { int32_t size = 0; VariantListEntry *cur = langtag->variants; while (TRUE) { if (cur == NULL) { break; } size++; cur = cur->next; } return size; } static const char* 
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) { const char *key = NULL; ExtensionListEntry *cur = langtag->extensions; int32_t i = 0; while (cur) { if (i == idx) { key = cur->key; break; } cur = cur->next; i++; } return key; } static const char* ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) { const char *val = NULL; ExtensionListEntry *cur = langtag->extensions; int32_t i = 0; while (cur) { if (i == idx) { val = cur->value; break; } cur = cur->next; i++; } return val; } static int32_t ultag_getExtensionsSize(const ULanguageTag* langtag) { int32_t size = 0; ExtensionListEntry *cur = langtag->extensions; while (TRUE) { if (cur == NULL) { break; } size++; cur = cur->next; } return size; } static const char* ultag_getPrivateUse(const ULanguageTag* langtag) { return langtag->privateuse; } #if 0 static const char* ultag_getGrandfathered(const ULanguageTag* langtag) { return langtag->grandfathered; } #endif /* * ------------------------------------------------- * * Locale/BCP47 conversion APIs, exposed as uloc_* * * ------------------------------------------------- */ U_CAPI int32_t U_EXPORT2 uloc_toLanguageTag(const char* localeID, char* langtag, int32_t langtagCapacity, UBool strict, UErrorCode* status) { /* char canonical[ULOC_FULLNAME_CAPACITY]; */ /* See #6822 */ char canonical[256]; int32_t reslen = 0; UErrorCode tmpStatus = U_ZERO_ERROR; UBool hadPosix = FALSE; const char* pKeywordStart; /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". 
See #6835 */
    canonical[0] = 0;
    if (uprv_strlen(localeID) > 0) {
        uloc_canonicalize(localeID, canonical, sizeof(canonical), &tmpStatus);
        /* anything but a clean U_ZERO_ERROR - warnings included - rejects the ID */
        if (tmpStatus != U_ZERO_ERROR) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
    }

    /* For handling special case - private use only tag */
    pKeywordStart = locale_getKeywordsStart(canonical);
    /* the keyword section begins at the very start of the canonical ID */
    if (pKeywordStart == canonical) {
        UEnumeration *kwdEnum;
        int kwdCnt = 0;
        UBool done = FALSE;

        kwdEnum = uloc_openKeywords((const char*)canonical, &tmpStatus);
        if (kwdEnum != NULL) {
            kwdCnt = uenum_count(kwdEnum, &tmpStatus);
            /* only a lone keyword can make a "private use only" tag */
            if (kwdCnt == 1) {
                const char *key;
                int32_t len = 0;

                key = uenum_next(kwdEnum, &len, &tmpStatus);
                if (len == 1 && *key == PRIVATEUSE) {
                    char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
                    /* assemble "x-" followed by the keyword value */
                    buf[0] = PRIVATEUSE;
                    buf[1] = SEP;
                    len = uloc_getKeywordValue(localeID, key, &buf[2], sizeof(buf) - 2, &tmpStatus);
                    if (U_SUCCESS(tmpStatus)) {
                        if (_isPrivateuseValueSubtags(&buf[2], len)) {
                            /* return private use only tag */
                            reslen = len + 2;
                            uprv_memcpy(langtag, buf, uprv_min(reslen, langtagCapacity));
                            u_terminateChars(langtag, langtagCapacity, reslen, status);
                            done = TRUE;
                        } else if (strict) {
                            *status = U_ILLEGAL_ARGUMENT_ERROR;
                            done = TRUE;
                        }
                        /* if not strict mode, then "und" will be returned */
                    } else {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        done = TRUE;
                    }
                }
            }
            uenum_close(kwdEnum);
            if (done) {
                return reslen;
            }
        }
    }

    /*
     * General case: each helper appends its part after what is already
     * there and returns the number of characters it contributed.
     */
    reslen += _appendLanguageToLanguageTag(canonical, langtag, langtagCapacity, strict, status);
    reslen += _appendScriptToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
    reslen += _appendRegionToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
    reslen += _appendVariantsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status);
    reslen += _appendKeywordsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
    reslen += _appendPrivateuseToLanguageTag(canonical, langtag + reslen, langtagCapacity -
reslen, strict, hadPosix, status); return reslen; } U_CAPI int32_t U_EXPORT2 uloc_forLanguageTag(const char* langtag, char* localeID, int32_t localeIDCapacity, int32_t* parsedLength, UErrorCode* status) { ULanguageTag *lt; int32_t reslen = 0; const char *subtag, *p; int32_t len; int32_t i, n; UBool noRegion = TRUE; lt = ultag_parse(langtag, -1, parsedLength, status); if (U_FAILURE(*status)) { return 0; } /* language */ subtag = ultag_getExtlangSize(lt) > 0 ? ultag_getExtlang(lt, 0) : ultag_getLanguage(lt); if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) { len = (int32_t)uprv_strlen(subtag); if (len > 0) { if (reslen < localeIDCapacity) { uprv_memcpy(localeID, subtag, uprv_min(len, localeIDCapacity - reslen)); } reslen += len; } } /* script */ subtag = ultag_getScript(lt); len = (int32_t)uprv_strlen(subtag); if (len > 0) { if (reslen < localeIDCapacity) { *(localeID + reslen) = LOCALE_SEP; } reslen++; /* write out the script in title case */ p = subtag; while (*p) { if (reslen < localeIDCapacity) { if (p == subtag) { *(localeID + reslen) = uprv_toupper(*p); } else { *(localeID + reslen) = *p; } } reslen++; p++; } } /* region */ subtag = ultag_getRegion(lt); len = (int32_t)uprv_strlen(subtag); if (len > 0) { if (reslen < localeIDCapacity) { *(localeID + reslen) = LOCALE_SEP; } reslen++; /* write out the retion in upper case */ p = subtag; while (*p) { if (reslen < localeIDCapacity) { *(localeID + reslen) = uprv_toupper(*p); } reslen++; p++; } noRegion = FALSE; } /* variants */ n = ultag_getVariantsSize(lt); if (n > 0) { if (noRegion) { if (reslen < localeIDCapacity) { *(localeID + reslen) = LOCALE_SEP; } reslen++; } for (i = 0; i < n; i++) { subtag = ultag_getVariant(lt, i); if (reslen < localeIDCapacity) { *(localeID + reslen) = LOCALE_SEP; } reslen++; /* write out the variant in upper case */ p = subtag; while (*p) { if (reslen < localeIDCapacity) { *(localeID + reslen) = uprv_toupper(*p); } reslen++; p++; } } } /* keywords */ n = 
ultag_getExtensionsSize(lt); subtag = ultag_getPrivateUse(lt); if (n > 0 || uprv_strlen(subtag) > 0) { if (reslen == 0 && n > 0) { /* need a language */ if (reslen < localeIDCapacity) { uprv_memcpy(localeID + reslen, LANG_UND, uprv_min(LANG_UND_LEN, localeIDCapacity - reslen)); } reslen += LANG_UND_LEN; } len = _appendKeywords(lt, localeID + reslen, localeIDCapacity - reslen, status); reslen += len; } ultag_close(lt); return u_terminateChars(localeID, localeIDCapacity, reslen, status); }