Bug Summary

File:root/firefox-clang/intl/icu/source/common/loclikelysubtags.cpp
Warning:line 829, column 13
Value stored to 'state' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name loclikelysubtags.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -ffp-contract=off -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -fcoverage-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -resource-dir /usr/lib/llvm-21/lib/clang/21 -include /root/firefox-clang/config/gcc_hidden.h -include /root/firefox-clang/obj-x86_64-pc-linux-gnu/mozilla-config.h -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/system_wrappers -U _FORTIFY_SOURCE -D _FORTIFY_SOURCE=2 -D _GLIBCXX_ASSERTIONS -D DEBUG=1 -D U_COMMON_IMPLEMENTATION -D _LIBCPP_DISABLE_DEPRECATION_WARNINGS -D U_USING_ICU_NAMESPACE=0 -D U_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -D U_HIDE_OBSOLETE_UTF_OLD_H=1 -D UCONFIG_NO_LEGACY_CONVERSION -D UCONFIG_NO_TRANSLITERATION -D UCONFIG_NO_REGULAR_EXPRESSIONS -D UCONFIG_NO_BREAK_ITERATION -D UCONFIG_NO_IDNA -D UCONFIG_NO_MF2 -D U_CHARSET_IS_UTF8 -D UNISTR_FROM_CHAR_EXPLICIT=explicit -D UNISTR_FROM_STRING_EXPLICIT=explicit -D U_ENABLE_DYLOAD=0 -D U_DEBUG=1 -I 
/root/firefox-clang/config/external/icu/common -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -I /root/firefox-clang/intl/icu/source/i18n -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nspr -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nss -D MOZILLA_CLIENT -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/x86_64-linux-gnu/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14/backward -internal-isystem /usr/lib/llvm-21/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-error=pessimizing-move -Wno-error=large-by-value-copy=128 -Wno-error=implicit-int-float-conversion -Wno-error=thread-safety-analysis -Wno-error=tautological-type-limit-compare -Wno-invalid-offsetof -Wno-range-loop-analysis -Wno-deprecated-anon-enum-enum-conversion -Wno-deprecated-enum-enum-conversion -Wno-deprecated-this-capture -Wno-inline-new-delete -Wno-error=deprecated-declarations -Wno-error=array-bounds -Wno-error=free-nonheap-object -Wno-error=atomic-alignment -Wno-error=deprecated-builtins -Wno-psabi -Wno-error=builtin-macro-redefined -Wno-vla-cxx-extension -Wno-unknown-warning-option -Wno-comma -Wno-implicit-const-int-float-conversion -Wno-macro-redefined -Wno-microsoft-include -Wno-tautological-unsigned-enum-zero-compare -Wno-unreachable-code-loop-increment -Wno-unreachable-code-return -fdeprecated-macro -ferror-limit 19 -fstrict-flex-arrays=1 -stack-protector 2 -fstack-clash-protection -ftrivial-auto-var-init=pattern -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fno-sized-deallocation -fno-aligned-allocation 
-vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2025-06-27-100320-3286336-1 -x c++ /root/firefox-clang/intl/icu/source/common/loclikelysubtags.cpp
1// © 2019 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4// loclikelysubtags.cpp
5// created: 2019may08 Markus W. Scherer
6
7#include <utility>
8#include "unicode/utypes.h"
9#include "unicode/bytestrie.h"
10#include "unicode/localpointer.h"
11#include "unicode/locid.h"
12#include "unicode/uobject.h"
13#include "unicode/ures.h"
14#include "unicode/uscript.h"
15#include "charstr.h"
16#include "cstring.h"
17#include "loclikelysubtags.h"
18#include "lsr.h"
19#include "uassert.h"
20#include "ucln_cmn.h"
21#include "uhash.h"
22#include "uinvchar.h"
23#include "umutex.h"
24#include "uniquecharstr.h"
25#include "uresdata.h"
26#include "uresimp.h"
27#include "uvector.h"
28
29U_NAMESPACE_BEGINnamespace icu_77 {
30
31namespace {
32
33constexpr char PSEUDO_ACCENTS_PREFIX = '\''; // -XA, -PSACCENT
34constexpr char PSEUDO_BIDI_PREFIX = '+'; // -XB, -PSBIDI
35constexpr char PSEUDO_CRACKED_PREFIX = ','; // -XC, -PSCRACK
36
37} // namespace
38
39LocaleDistanceData::LocaleDistanceData(LocaleDistanceData &&data) :
40 distanceTrieBytes(data.distanceTrieBytes),
41 regionToPartitions(data.regionToPartitions),
42 partitions(data.partitions),
43 paradigms(data.paradigms), paradigmsLength(data.paradigmsLength),
44 distances(data.distances) {
45 data.partitions = nullptr;
46 data.paradigms = nullptr;
47}
48
49LocaleDistanceData::~LocaleDistanceData() {
50 uprv_freeuprv_free_77(partitions);
51 delete[] paradigms;
52}
53
54struct LikelySubtagsData {
55 UResourceBundle *langInfoBundle = nullptr;
56 UniqueCharStrings strings;
57 CharStringMap languageAliases;
58 CharStringMap regionAliases;
59 const uint8_t *trieBytes = nullptr;
60 LSR *lsrs = nullptr;
61 int32_t lsrsLength = 0;
62
63 LocaleDistanceData distanceData;
64
65 LikelySubtagsData(UErrorCode &errorCode) : strings(errorCode) {}
66
67 ~LikelySubtagsData() {
68 ures_closeures_close_77(langInfoBundle);
69 delete[] lsrs;
70 }
71
72 void load(UErrorCode &errorCode) {
73 if (U_FAILURE(errorCode)) { return; }
74 langInfoBundle = ures_openDirectures_openDirect_77(nullptr, "langInfo", &errorCode);
75 if (U_FAILURE(errorCode)) { return; }
76 StackUResourceBundle stackTempBundle;
77 ResourceDataValue value;
78 ures_getValueWithFallbackures_getValueWithFallback_77(langInfoBundle, "likely", stackTempBundle.getAlias(),
79 value, errorCode);
80 ResourceTable likelyTable = value.getTable(errorCode);
81 if (U_FAILURE(errorCode)) { return; }
82
83 // Read all strings in the resource bundle and convert them to invariant char *.
84 LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes;
85 int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0;
86 ResourceArray m49Array;
87 if (likelyTable.findValue("m49", value)) {
88 m49Array = value.getArray(errorCode);
89 } else {
90 errorCode = U_MISSING_RESOURCE_ERROR;
91 return;
92 }
93 if (!readStrings(likelyTable, "languageAliases", value,
94 languageIndexes, languagesLength, errorCode) ||
95 !readStrings(likelyTable, "regionAliases", value,
96 regionIndexes, regionsLength, errorCode) ||
97 !readLSREncodedStrings(likelyTable, "lsrnum", value, m49Array,
98 lsrSubtagIndexes,lsrSubtagsLength, errorCode)) {
99 return;
100 }
101 if ((languagesLength & 1) != 0 ||
102 (regionsLength & 1) != 0 ||
103 (lsrSubtagsLength % 3) != 0) {
104 errorCode = U_INVALID_FORMAT_ERROR;
105 return;
106 }
107 if (lsrSubtagsLength == 0) {
108 errorCode = U_MISSING_RESOURCE_ERROR;
109 return;
110 }
111
112 if (!likelyTable.findValue("trie", value)) {
113 errorCode = U_MISSING_RESOURCE_ERROR;
114 return;
115 }
116 int32_t length;
117 trieBytes = value.getBinary(length, errorCode);
118 if (U_FAILURE(errorCode)) { return; }
119
120 // Also read distance/matcher data if available,
121 // to open & keep only one resource bundle pointer
122 // and to use one single UniqueCharStrings.
123 UErrorCode matchErrorCode = U_ZERO_ERROR;
124 ures_getValueWithFallbackures_getValueWithFallback_77(langInfoBundle, "match", stackTempBundle.getAlias(),
125 value, matchErrorCode);
126 LocalMemory<int32_t> partitionIndexes, paradigmSubtagIndexes;
127 int32_t partitionsLength = 0, paradigmSubtagsLength = 0;
128 if (U_SUCCESS(matchErrorCode)) {
129 ResourceTable matchTable = value.getTable(errorCode);
130 if (U_FAILURE(errorCode)) { return; }
131
132 if (matchTable.findValue("trie", value)) {
133 distanceData.distanceTrieBytes = value.getBinary(length, errorCode);
134 if (U_FAILURE(errorCode)) { return; }
135 }
136
137 if (matchTable.findValue("regionToPartitions", value)) {
138 distanceData.regionToPartitions = value.getBinary(length, errorCode);
139 if (U_FAILURE(errorCode)) { return; }
140 if (length < LSR::REGION_INDEX_LIMIT) {
141 errorCode = U_INVALID_FORMAT_ERROR;
142 return;
143 }
144 }
145
146 if (!readStrings(matchTable, "partitions", value,
147 partitionIndexes, partitionsLength, errorCode) ||
148 !readLSREncodedStrings(matchTable, "paradigmnum", value, m49Array,
149 paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) {
150 return;
151 }
152 if ((paradigmSubtagsLength % 3) != 0) {
153 errorCode = U_INVALID_FORMAT_ERROR;
154 return;
155 }
156
157 if (matchTable.findValue("distances", value)) {
158 distanceData.distances = value.getIntVector(length, errorCode);
159 if (U_FAILURE(errorCode)) { return; }
160 if (length < 4) { // LocaleDistance IX_LIMIT
161 errorCode = U_INVALID_FORMAT_ERROR;
162 return;
163 }
164 }
165 } else if (matchErrorCode == U_MISSING_RESOURCE_ERROR) {
166 // ok for likely subtags
167 } else { // error other than missing resource
168 errorCode = matchErrorCode;
169 return;
170 }
171
172 // Fetch & store invariant-character versions of strings
173 // only after we have collected and de-duplicated all of them.
174 strings.freeze();
175
176 languageAliases = CharStringMap(languagesLength / 2, errorCode);
177 for (int32_t i = 0; i < languagesLength; i += 2) {
178 languageAliases.put(strings.get(languageIndexes[i]),
179 strings.get(languageIndexes[i + 1]), errorCode);
180 }
181
182 regionAliases = CharStringMap(regionsLength / 2, errorCode);
183 for (int32_t i = 0; i < regionsLength; i += 2) {
184 regionAliases.put(strings.get(regionIndexes[i]),
185 strings.get(regionIndexes[i + 1]), errorCode);
186 }
187 if (U_FAILURE(errorCode)) { return; }
188
189 lsrsLength = lsrSubtagsLength / 3;
190 lsrs = new LSR[lsrsLength];
191 if (lsrs == nullptr) {
192 errorCode = U_MEMORY_ALLOCATION_ERROR;
193 return;
194 }
195 for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) {
196 lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]),
197 strings.get(lsrSubtagIndexes[i + 1]),
198 strings.get(lsrSubtagIndexes[i + 2]),
199 LSR::IMPLICIT_LSR);
200 }
201
202 if (partitionsLength > 0) {
203 distanceData.partitions = static_cast<const char **>(
204 uprv_mallocuprv_malloc_77(partitionsLength * sizeof(const char *)));
205 if (distanceData.partitions == nullptr) {
206 errorCode = U_MEMORY_ALLOCATION_ERROR;
207 return;
208 }
209 for (int32_t i = 0; i < partitionsLength; ++i) {
210 distanceData.partitions[i] = strings.get(partitionIndexes[i]);
211 }
212 }
213
214 if (paradigmSubtagsLength > 0) {
215 distanceData.paradigmsLength = paradigmSubtagsLength / 3;
216 LSR *paradigms = new LSR[distanceData.paradigmsLength];
217 if (paradigms == nullptr) {
218 errorCode = U_MEMORY_ALLOCATION_ERROR;
219 return;
220 }
221 for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) {
222 paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]),
223 strings.get(paradigmSubtagIndexes[i + 1]),
224 strings.get(paradigmSubtagIndexes[i + 2]),
225 LSR::DONT_CARE_FLAGS);
226 }
227 distanceData.paradigms = paradigms;
228 }
229 }
230
231private:
232 bool readStrings(const ResourceTable &table, const char *key, ResourceValue &value,
233 LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
234 if (U_FAILURE(errorCode)) { return false; }
235 if (table.findValue(key, value)) {
236 ResourceArray stringArray = value.getArray(errorCode);
237 if (U_FAILURE(errorCode)) { return false; }
238 length = stringArray.getSize();
239 if (length == 0) { return true; }
240 int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length);
241 if (rawIndexes == nullptr) {
242 errorCode = U_MEMORY_ALLOCATION_ERROR;
243 return false;
244 }
245 for (int i = 0; i < length; ++i) {
246 if (stringArray.getValue(i, value)) { // returns true because i < length
247 int32_t strLength = 0;
248 rawIndexes[i] = strings.add(value.getString(strLength, errorCode), errorCode);
249 if (U_FAILURE(errorCode)) { return false; }
250 }
251 }
252 }
253 return true;
254 }
255 UnicodeString toLanguage(int encoded) {
256 if (encoded == 0) {
257 return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1);
258 }
259 if (encoded == 1) {
260 return UNICODE_STRING_SIMPLE("skip")icu::UnicodeString(true, u"skip", -1);
261 }
262 encoded &= 0x00ffffff;
263 encoded %= 27*27*27;
264 char lang[3];
265 lang[0] = 'a' + ((encoded % 27) - 1);
266 lang[1] = 'a' + (((encoded / 27 ) % 27) - 1);
267 if (encoded / (27 * 27) == 0) {
268 return UnicodeString(lang, 2, US_INVicu::UnicodeString::kInvariant);
269 }
270 lang[2] = 'a' + ((encoded / (27 * 27)) - 1);
271 return UnicodeString(lang, 3, US_INVicu::UnicodeString::kInvariant);
272 }
273 UnicodeString toScript(int encoded) {
274 if (encoded == 0) {
275 return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1);
276 }
277 if (encoded == 1) {
278 return UNICODE_STRING_SIMPLE("script")icu::UnicodeString(true, u"script", -1);
279 }
280 encoded = (encoded >> 24) & 0x000000ff;
281 const char* script = uscript_getShortNameuscript_getShortName_77(static_cast<UScriptCode>(encoded));
282 if (script == nullptr) {
283 return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1);
284 }
285 U_ASSERT(uprv_strlen(script) == 4)(static_cast <bool> (:: strlen(script) == 4) ? void (0)
: __assert_fail (":: strlen(script) == 4", __builtin_FILE ()
, __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
286 return UnicodeString(script, 4, US_INVicu::UnicodeString::kInvariant);
287 }
288 UnicodeString m49IndexToCode(const ResourceArray &m49Array, ResourceValue &value, int index, UErrorCode &errorCode) {
289 if (U_FAILURE(errorCode)) {
290 return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1);
291 }
292 if (m49Array.getValue(index, value)) {
293 return value.getUnicodeString(errorCode);
294 }
295 // "m49" does not include the index.
296 errorCode = U_MISSING_RESOURCE_ERROR;
297 return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1);
298 }
299
300 UnicodeString toRegion(const ResourceArray& m49Array, ResourceValue &value, int encoded, UErrorCode &errorCode) {
301 if (U_FAILURE(errorCode) || encoded == 0 || encoded == 1) {
302 return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1);
303 }
304 encoded &= 0x00ffffff;
305 encoded /= 27 * 27 * 27;
306 encoded %= 27 * 27;
307 if (encoded < 27) {
308 // Selected M49 code index, find the code from "m49" resource.
309 return m49IndexToCode(m49Array, value, encoded, errorCode);
310 }
311 char region[2];
312 region[0] = 'A' + ((encoded % 27) - 1);
313 region[1] = 'A' + (((encoded / 27) % 27) - 1);
314 return UnicodeString(region, 2, US_INVicu::UnicodeString::kInvariant);
315 }
316
317 bool readLSREncodedStrings(const ResourceTable &table, const char* key, ResourceValue &value, const ResourceArray& m49Array,
318 LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
319 if (U_FAILURE(errorCode)) { return false; }
320 if (table.findValue(key, value)) {
321 const int32_t* vectors = value.getIntVector(length, errorCode);
322 if (U_FAILURE(errorCode)) { return false; }
323 if (length == 0) { return true; }
324 int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length * 3);
325 if (rawIndexes == nullptr) {
326 errorCode = U_MEMORY_ALLOCATION_ERROR;
327 return false;
328 }
329 for (int i = 0; i < length; ++i) {
330 rawIndexes[i*3] = strings.addByValue(toLanguage(vectors[i]), errorCode);
331 rawIndexes[i*3+1] = strings.addByValue(toScript(vectors[i]), errorCode);
332 rawIndexes[i*3+2] = strings.addByValue(
333 toRegion(m49Array, value, vectors[i], errorCode), errorCode);
334 if (U_FAILURE(errorCode)) { return false; }
335 }
336 length *= 3;
337 }
338 return true;
339 }
340};
341
342namespace {
343
344LikelySubtags *gLikelySubtags = nullptr;
345UVector *gMacroregions = nullptr;
346UInitOnce gInitOnce {};
347
348UBool U_CALLCONV cleanup() {
349 delete gLikelySubtags;
350 gLikelySubtags = nullptr;
351 delete gMacroregions;
352 gMacroregions = nullptr;
353 gInitOnce.reset();
354 return true;
355}
356
// Hard-coded macroregion codes. An entry of the form "NNN~D" is a range:
// processMacroregionRange() expands it by counting the character before
// the '~' up to D (for example "001~3" stands for 001, 002 and 003).
constexpr const char16_t* MACROREGION_HARDCODE[] = {
    // numeric codes
    u"001~3", u"005", u"009", u"011",
    u"013~5", u"017~9", u"021", u"029",
    u"030", u"034~5", u"039", u"053~4",
    u"057", u"061", u"142~3", u"145",
    u"150~1", u"154~5", u"202", u"419",
    // two-letter codes
    u"EU", u"EZ", u"QO", u"UN",
};
383
384constexpr char16_t RANGE_MARKER = 0x7E; /* '~' */
385void processMacroregionRange(const UnicodeString& regionName, UVector* newMacroRegions, UErrorCode& status) {
386 if (U_FAILURE(status)) { return; }
387 int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER);
388 char16_t buf[6];
389 regionName.extract(buf,6,status);
390 if ( rangeMarkerLocation > 0 ) {
391 char16_t endRange = regionName.charAt(rangeMarkerLocation+1);
392 buf[rangeMarkerLocation] = 0;
393 while ( buf[rangeMarkerLocation-1] <= endRange && U_SUCCESS(status)) {
394 LocalPointer<UnicodeString> newRegion(new UnicodeString(buf), status);
395 newMacroRegions->adoptElement(newRegion.orphan(),status);
396 buf[rangeMarkerLocation-1]++;
397 }
398 } else {
399 LocalPointer<UnicodeString> newRegion(new UnicodeString(regionName), status);
400 newMacroRegions->adoptElement(newRegion.orphan(),status);
401 }
402}
403
404#if U_DEBUG1
405UVector* loadMacroregions(UErrorCode &status) {
406 if (U_FAILURE(status)) { return nullptr; }
407 LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObjectuprv_deleteUObject_77, uhash_compareUnicodeStringuhash_compareUnicodeString_77, status), status);
408
409 LocalUResourceBundlePointer supplementalData(ures_openDirectures_openDirect_77(nullptr,"supplementalData",&status));
410 LocalUResourceBundlePointer idValidity(ures_getByKeyures_getByKey_77(supplementalData.getAlias(),"idValidity",nullptr,&status));
411 LocalUResourceBundlePointer regionList(ures_getByKeyures_getByKey_77(idValidity.getAlias(),"region",nullptr,&status));
412 LocalUResourceBundlePointer regionMacro(ures_getByKeyures_getByKey_77(regionList.getAlias(),"macroregion",nullptr,&status));
413
414 if (U_FAILURE(status)) {
415 return nullptr;
416 }
417
418 while (ures_hasNextures_hasNext_77(regionMacro.getAlias())) {
419 UnicodeString regionName = ures_getNextUnicodeString(regionMacro.getAlias(),nullptr,&status);
420 processMacroregionRange(regionName, newMacroRegions.getAlias(), status);
421 if (U_FAILURE(status)) {
422 return nullptr;
423 }
424 }
425
426 return newMacroRegions.orphan();
427}
428#endif // U_DEBUG
429
430UVector* getStaticMacroregions(UErrorCode &status) {
431 if (U_FAILURE(status)) { return nullptr; }
432 LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObjectuprv_deleteUObject_77, uhash_compareUnicodeStringuhash_compareUnicodeString_77, status), status);
433
434 if (U_FAILURE(status)) {
435 return nullptr;
436 }
437
438 for (const auto *region : MACROREGION_HARDCODE) {
439 UnicodeString regionName(region);
440 processMacroregionRange(regionName, newMacroRegions.getAlias(), status);
441 if (U_FAILURE(status)) {
442 return nullptr;
443 }
444 }
445
446 return newMacroRegions.orphan();
447}
448
449} // namespace
450
451void U_CALLCONV LikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
452 // This function is invoked only via umtx_initOnce().
453 U_ASSERT(gLikelySubtags == nullptr)(static_cast <bool> (gLikelySubtags == nullptr) ? void (
0) : __assert_fail ("gLikelySubtags == nullptr", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
454 LikelySubtagsData data(errorCode);
455 data.load(errorCode);
456 if (U_FAILURE(errorCode)) { return; }
457 gLikelySubtags = new LikelySubtags(data);
458 gMacroregions = getStaticMacroregions(errorCode);
459#if U_DEBUG1
460 auto macroregionsFromData = loadMacroregions(errorCode);
461 U_ASSERT((*gMacroregions) == (*macroregionsFromData))(static_cast <bool> ((*gMacroregions) == (*macroregionsFromData
)) ? void (0) : __assert_fail ("(*gMacroregions) == (*macroregionsFromData)"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
462 delete macroregionsFromData;
463#endif
464 if (U_FAILURE(errorCode) || gLikelySubtags == nullptr || gMacroregions == nullptr) {
465 delete gLikelySubtags;
466 delete gMacroregions;
467 errorCode = U_MEMORY_ALLOCATION_ERROR;
468 return;
469 }
470
471 ucln_common_registerCleanupucln_common_registerCleanup_77(UCLN_COMMON_LIKELY_SUBTAGS, cleanup);
472}
473
474const LikelySubtags *LikelySubtags::getSingleton(UErrorCode &errorCode) {
475 if (U_FAILURE(errorCode)) { return nullptr; }
476 umtx_initOnce(gInitOnce, &LikelySubtags::initLikelySubtags, errorCode);
477 return gLikelySubtags;
478}
479
480LikelySubtags::LikelySubtags(LikelySubtagsData &data) :
481 langInfoBundle(data.langInfoBundle),
482 strings(data.strings.orphanCharStrings()),
483 languageAliases(std::move(data.languageAliases)),
484 regionAliases(std::move(data.regionAliases)),
485 trie(data.trieBytes),
486 lsrs(data.lsrs),
487#if U_DEBUG1
488 lsrsLength(data.lsrsLength),
489#endif // U_DEBUG
490 distanceData(std::move(data.distanceData)) {
491 data.langInfoBundle = nullptr;
492 data.lsrs = nullptr;
493
494 // Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**").
495 UStringTrieResult result = trie.next(u'*');
496 U_ASSERT(USTRINGTRIE_HAS_NEXT(result))(static_cast <bool> (((result)&1)) ? void (0) : __assert_fail
("((result)&1)", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
497 trieUndState = trie.getState64();
498 result = trie.next(u'*');
499 U_ASSERT(USTRINGTRIE_HAS_NEXT(result))(static_cast <bool> (((result)&1)) ? void (0) : __assert_fail
("((result)&1)", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
500 trieUndZzzzState = trie.getState64();
501 result = trie.next(u'*');
502 U_ASSERT(USTRINGTRIE_HAS_VALUE(result))(static_cast <bool> (((result)>=USTRINGTRIE_FINAL_VALUE
)) ? void (0) : __assert_fail ("((result)>=USTRINGTRIE_FINAL_VALUE)"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
503 defaultLsrIndex = trie.getValue();
504 trie.reset();
505
506 for (char16_t c = u'a'; c <= u'z'; ++c) {
507 result = trie.next(c);
508 if (result == USTRINGTRIE_NO_VALUE) {
509 trieFirstLetterStates[c - u'a'] = trie.getState64();
510 }
511 trie.reset();
512 }
513}
514
515LikelySubtags::~LikelySubtags() {
516 ures_closeures_close_77(langInfoBundle);
517 delete strings;
518 delete[] lsrs;
519}
520
521LSR LikelySubtags::makeMaximizedLsrFrom(const Locale &locale,
522 bool returnInputIfUnmatch,
523 UErrorCode &errorCode) const {
524 if (U_FAILURE(errorCode)) { return {}; }
525 if (locale.isBogus()) {
526 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
527 return {};
528 }
529 const char *name = locale.getName();
530 if (!returnInputIfUnmatch && uprv_isAtSign(name[0])((name[0])=='@') && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=")
531 // Private use language tag x-subtag-subtag... which CLDR changes to
532 // und-x-subtag-subtag...
533 return LSR(name, "", "", LSR::EXPLICIT_LSR);
534 }
535 LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
536 locale.getVariant(), returnInputIfUnmatch, errorCode);
537
538 if (uprv_strlen(max.language):: strlen(max.language) == 0 &&
539 uprv_strlen(max.script):: strlen(max.script) == 0 &&
540 uprv_strlen(max.region):: strlen(max.region) == 0) {
541 // No match. ICU API mandate us to
542 // If the provided ULocale instance is already in the maximal form, or
543 // there is no data available available for maximization, it will be
544 // returned.
545 return LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR::EXPLICIT_LSR, errorCode);
546 }
547 return max;
548}
549
550namespace {
551
552const char *getCanonical(const CharStringMap &aliases, const char *alias) {
553 const char *canonical = aliases.get(alias);
554 return canonical == nullptr ? alias : canonical;
555}
556
557} // namespace
558
559LSR LikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region,
560 const char *variant,
561 bool returnInputIfUnmatch,
562 UErrorCode &errorCode) const {
563 if (U_FAILURE(errorCode)) { return {}; }
564 // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
565 // They should match only themselves,
566 // not other locales with what looks like the same language and script subtags.
567 if (!returnInputIfUnmatch) {
568 char c1;
569 if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
570 switch (c1) {
571 case 'A':
572 return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
573 LSR::EXPLICIT_LSR, errorCode);
574 case 'B':
575 return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
576 LSR::EXPLICIT_LSR, errorCode);
577 case 'C':
578 return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
579 LSR::EXPLICIT_LSR, errorCode);
580 default: // normal locale
581 break;
582 }
583 }
584
585 if (variant[0] == 'P' && variant[1] == 'S') {
586 int32_t lsrFlags = *region == 0 ?
587 LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
588 if (uprv_strcmp(variant, "PSACCENT"):: strcmp(variant, "PSACCENT") == 0) {
589 return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
590 *region == 0 ? "XA" : region, lsrFlags, errorCode);
591 } else if (uprv_strcmp(variant, "PSBIDI"):: strcmp(variant, "PSBIDI") == 0) {
592 return LSR(PSEUDO_BIDI_PREFIX, language, script,
593 *region == 0 ? "XB" : region, lsrFlags, errorCode);
594 } else if (uprv_strcmp(variant, "PSCRACK"):: strcmp(variant, "PSCRACK") == 0) {
595 return LSR(PSEUDO_CRACKED_PREFIX, language, script,
596 *region == 0 ? "XC" : region, lsrFlags, errorCode);
597 }
598 // else normal locale
599 }
600 } // end of if (!returnInputIfUnmatch)
601
602 language = getCanonical(languageAliases, language);
603 // (We have no script mappings.)
604 region = getCanonical(regionAliases, region);
605 return maximize(language, script, region, returnInputIfUnmatch, errorCode);
606}
607
608LSR LikelySubtags::maximize(const char *language, const char *script, const char *region,
609 bool returnInputIfUnmatch,
610 UErrorCode &errorCode) const {
611 if (U_FAILURE(errorCode)) { return {}; }
612 return maximize({language, static_cast<int32_t>(uprv_strlen(language):: strlen(language))},
613 {script, static_cast<int32_t>(uprv_strlen(script):: strlen(script))},
614 {region, static_cast<int32_t>(uprv_strlen(region):: strlen(region))},
615 returnInputIfUnmatch,
616 errorCode);
617}
618
619bool LikelySubtags::isMacroregion(StringPiece& region, UErrorCode& errorCode) const {
620 if (U_FAILURE(errorCode)) { return false; }
621 // In Java, we use Region class. In C++, since Region is under i18n,
622 // we read the same data used by Region into gMacroregions avoid dependency
623 // from common to i18n/region.cpp
624 umtx_initOnce(gInitOnce, &LikelySubtags::initLikelySubtags, errorCode);
625 if (U_FAILURE(errorCode)) { return false; }
626 UnicodeString str(UnicodeString::fromUTF8(region));
627 return gMacroregions->contains((void *)&str);
628}
629
630LSR LikelySubtags::maximize(StringPiece language, StringPiece script, StringPiece region,
631 bool returnInputIfUnmatch,
632 UErrorCode &errorCode) const {
633 if (U_FAILURE(errorCode)) { return {}; }
634 if (language.compare("und") == 0) {
635 language = "";
636 }
637 if (script.compare("Zzzz") == 0) {
638 script = "";
639 }
640 if (region.compare("ZZ") == 0) {
641 region = "";
642 }
643 if (!script.empty() && !region.empty() && !language.empty()) {
644 return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode); // already maximized
645 }
646 bool retainLanguage = false;
647 bool retainScript = false;
648 bool retainRegion = false;
649
650 BytesTrie iter(trie);
651 uint64_t state;
652 int32_t value;
653 // Small optimization: Array lookup for first language letter.
654 int32_t c0;
655 if (0 <= (c0 = uprv_lowerOrdinal(language.data()[0])) && c0 <= 25 &&
656 language.length() >= 2 &&
657 (state = trieFirstLetterStates[c0]) != 0) {
658 value = trieNext(iter.resetToState64(state), language, 1);
659 } else {
660 value = trieNext(iter, language, 0);
661 }
662 bool matchLanguage = (value >= 0);
663 bool matchScript = false;
664 if (value >= 0) {
665 retainLanguage = !language.empty();
666 state = iter.getState64();
667 } else {
668 retainLanguage = true;
669 iter.resetToState64(trieUndState); // "und" ("*")
670 state = 0;
671 }
672
673 if (value >= 0 && !script.empty()) {
674 matchScript = true;
675 }
676 if (value > 0) {
677 // Intermediate or final value from just language.
678 if (value == SKIP_SCRIPT) {
679 value = 0;
680 }
681 retainScript = !script.empty();
682 } else {
683 value = trieNext(iter, script, 0);
684 if (value >= 0) {
685 retainScript = !script.empty();
686 state = iter.getState64();
687 } else {
688 retainScript = true;
689 if (state == 0) {
690 iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
691 } else {
692 iter.resetToState64(state);
693 value = trieNext(iter, "", 0);
694 U_ASSERT(value >= 0)(static_cast <bool> (value >= 0) ? void (0) : __assert_fail
("value >= 0", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
695 state = iter.getState64();
696 }
697 }
698 }
699
700 bool matchRegion = false;
701 if (value > 0) {
702 // Final value from just language or language+script.
703 retainRegion = !region.empty();
704 } else {
705 value = trieNext(iter, region, 0);
706 if (value >= 0) {
707 if (!region.empty() && !isMacroregion(region, errorCode)) {
708 retainRegion = true;
709 matchRegion = true;
710 }
711 } else {
712 retainRegion = true;
713 if (state == 0) {
714 value = defaultLsrIndex;
715 } else {
716 iter.resetToState64(state);
717 value = trieNext(iter, "", 0);
718 U_ASSERT(value > 0)(static_cast <bool> (value > 0) ? void (0) : __assert_fail
("value > 0", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
719 }
720 }
721 }
722 U_ASSERT(value < lsrsLength)(static_cast <bool> (value < lsrsLength) ? void (0) :
__assert_fail ("value < lsrsLength", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__))
;
723 const LSR &matched = lsrs[value];
724
725 if (returnInputIfUnmatch &&
726 (!(matchLanguage || matchScript || (matchRegion && language.empty())))) {
727 return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode); // no matching.
728 }
729 if (language.empty()) {
730 language = StringPiece("und");
731 }
732
733 if (!(retainLanguage || retainScript || retainRegion)) {
734 // Quickly return a copy of the lookup-result LSR
735 // without new allocation of the subtags.
736 return LSR(matched.language, matched.script, matched.region, matched.flags);
737 }
738 if (!retainLanguage) {
739 language = matched.language;
740 }
741 if (!retainScript) {
742 script = matched.script;
743 }
744 if (!retainRegion) {
745 region = matched.region;
746 }
747 int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
748 // retainOldMask flags = LSR explicit-subtag flags
749 return LSR(language, script, region, retainMask, errorCode);
750}
751
752int32_t LikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
753 // If likelyInfo >= 0:
754 // likelyInfo bit 1 is set if the previous comparison with lsr
755 // was for equal language and script.
756 // Otherwise the scripts differed.
757 if (uprv_strcmp(lsr.language, other.language):: strcmp(lsr.language, other.language) != 0) {
758 return 0xfffffffc; // negative, lsr not better than other
759 }
760 if (uprv_strcmp(lsr.script, other.script):: strcmp(lsr.script, other.script) != 0) {
761 int32_t index;
762 if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
763 index = likelyInfo >> 2;
764 } else {
765 index = getLikelyIndex(lsr.language, "");
766 likelyInfo = index << 2;
767 }
768 const LSR &likely = lsrs[index];
769 if (uprv_strcmp(lsr.script, likely.script):: strcmp(lsr.script, likely.script) == 0) {
770 return likelyInfo | 1;
771 } else {
772 return likelyInfo & ~1;
773 }
774 }
775 if (uprv_strcmp(lsr.region, other.region):: strcmp(lsr.region, other.region) != 0) {
776 int32_t index;
777 if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
778 index = likelyInfo >> 2;
779 } else {
780 index = getLikelyIndex(lsr.language, lsr.region);
781 likelyInfo = (index << 2) | 2;
782 }
783 const LSR &likely = lsrs[index];
784 if (uprv_strcmp(lsr.region, likely.region):: strcmp(lsr.region, likely.region) == 0) {
785 return likelyInfo | 1;
786 } else {
787 return likelyInfo & ~1;
788 }
789 }
790 return likelyInfo & ~1; // lsr not better than other
791}
792
793// Subset of maximize().
794int32_t LikelySubtags::getLikelyIndex(const char *language, const char *script) const {
795 if (uprv_strcmp(language, "und"):: strcmp(language, "und") == 0) {
796 language = "";
797 }
798 if (uprv_strcmp(script, "Zzzz"):: strcmp(script, "Zzzz") == 0) {
799 script = "";
800 }
801
802 BytesTrie iter(trie);
803 uint64_t state;
804 int32_t value;
805 // Small optimization: Array lookup for first language letter.
806 int32_t c0;
807 if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
808 language[1] != 0 && // language.length() >= 2
809 (state = trieFirstLetterStates[c0]) != 0) {
810 value = trieNext(iter.resetToState64(state), language, 1);
811 } else {
812 value = trieNext(iter, language, 0);
813 }
814 if (value >= 0) {
815 state = iter.getState64();
816 } else {
817 iter.resetToState64(trieUndState); // "und" ("*")
818 state = 0;
819 }
820
821 if (value > 0) {
822 // Intermediate or final value from just language.
823 if (value == SKIP_SCRIPT) {
824 value = 0;
825 }
826 } else {
827 value = trieNext(iter, script, 0);
828 if (value >= 0) {
829 state = iter.getState64();
Value stored to 'state' is never read
830 } else {
831 if (state == 0) {
832 iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
833 } else {
834 iter.resetToState64(state);
835 value = trieNext(iter, "", 0);
836 U_ASSERT(value >= 0)(static_cast <bool> (value >= 0) ? void (0) : __assert_fail
("value >= 0", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
837 state = iter.getState64();
838 }
839 }
840 }
841
842 if (value > 0) {
843 // Final value from just language or language+script.
844 } else {
845 value = trieNext(iter, "", 0);
846 U_ASSERT(value > 0)(static_cast <bool> (value > 0) ? void (0) : __assert_fail
("value > 0", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
847 }
848 U_ASSERT(value < lsrsLength)(static_cast <bool> (value < lsrsLength) ? void (0) :
__assert_fail ("value < lsrsLength", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__))
;
849 return value;
850}
851
852int32_t LikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
853 UStringTrieResult result;
854 uint8_t c;
855 if ((c = s[i]) == 0) {
856 result = iter.next(u'*');
857 } else {
858 for (;;) {
859 c = uprv_invCharToAscii(c)(c);
860 // EBCDIC: If s[i] is not an invariant character,
861 // then c is now 0 and will simply not match anything, which is harmless.
862 uint8_t next = s[++i];
863 if (next != 0) {
864 if (!USTRINGTRIE_HAS_NEXT(iter.next(c))((iter.next(c))&1)) {
865 return -1;
866 }
867 } else {
868 // last character of this subtag
869 result = iter.next(c | 0x80);
870 break;
871 }
872 c = next;
873 }
874 }
875 switch (result) {
876 case USTRINGTRIE_NO_MATCH: return -1;
877 case USTRINGTRIE_NO_VALUE: return 0;
878 case USTRINGTRIE_INTERMEDIATE_VALUE:
879 U_ASSERT(iter.getValue() == SKIP_SCRIPT)(static_cast <bool> (iter.getValue() == SKIP_SCRIPT) ? void
(0) : __assert_fail ("iter.getValue() == SKIP_SCRIPT", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
880 return SKIP_SCRIPT;
881 case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
882 default: return -1;
883 }
884}
885int32_t LikelySubtags::trieNext(BytesTrie &iter, StringPiece s, int32_t i) {
886 UStringTrieResult result;
887 uint8_t c;
888 if (s.length() == i) {
889 result = iter.next(u'*');
890 } else {
891 c = s.data()[i];
892 for (;;) {
893 c = uprv_invCharToAscii(c)(c);
894 // EBCDIC: If s[i] is not an invariant character,
895 // then c is now 0 and will simply not match anything, which is harmless.
896 if (i+1 != s.length()) {
897 if (!USTRINGTRIE_HAS_NEXT(iter.next(c))((iter.next(c))&1)) {
898 return -1;
899 }
900 c = s.data()[++i];
901 } else {
902 // last character of this subtag
903 result = iter.next(c | 0x80);
904 break;
905 }
906 }
907 }
908 switch (result) {
909 case USTRINGTRIE_NO_MATCH: return -1;
910 case USTRINGTRIE_NO_VALUE: return 0;
911 case USTRINGTRIE_INTERMEDIATE_VALUE:
912 U_ASSERT(iter.getValue() == SKIP_SCRIPT)(static_cast <bool> (iter.getValue() == SKIP_SCRIPT) ? void
(0) : __assert_fail ("iter.getValue() == SKIP_SCRIPT", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
913 return SKIP_SCRIPT;
914 case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
915 default: return -1;
916 }
917}
918
919LSR LikelySubtags::minimizeSubtags(StringPiece language, StringPiece script,
920 StringPiece region,
921 bool favorScript,
922 UErrorCode &errorCode) const {
923 if (U_FAILURE(errorCode)) { return {}; }
924 LSR max = maximize(language, script, region, true, errorCode);
925 if (U_FAILURE(errorCode)) { return {}; }
926 // If no match, return it.
927 if (uprv_strlen(max.language):: strlen(max.language) == 0 &&
928 uprv_strlen(max.script):: strlen(max.script) == 0 &&
929 uprv_strlen(max.region):: strlen(max.region) == 0) {
930 // No match. ICU API mandate us to
931 // "If this Locale is already in the minimal form, or not valid, or
932 // there is no data available for minimization, the Locale will be
933 // unchanged."
934 return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
935 }
936 // try language
937 LSR test = maximize(max.language, "", "", true, errorCode);
938 if (U_FAILURE(errorCode)) { return {}; }
939 if (test.isEquivalentTo(max)) {
940 return LSR(max.language, "", "", LSR::DONT_CARE_FLAGS, errorCode);
941 }
942
943 if (!favorScript) {
944 // favor Region
945 // try language and region
946 test = maximize(max.language, "", max.region, true, errorCode);
947 if (U_FAILURE(errorCode)) { return {}; }
948 if (test.isEquivalentTo(max)) {
949 return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
950 }
951 }
952 // try language and script
953 test = maximize(max.language, max.script, "", true, errorCode);
954 if (U_FAILURE(errorCode)) { return {}; }
955 if (test.isEquivalentTo(max)) {
956 return LSR(max.language, max.script, "", LSR::DONT_CARE_FLAGS, errorCode);
957 }
958 if (favorScript) {
959 // try language and region
960 test = maximize(max.language, "", max.region, true, errorCode);
961 if (U_FAILURE(errorCode)) { return {}; }
962 if (test.isEquivalentTo(max)) {
963 return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
964 }
965 }
966 return LSR(max.language, max.script, max.region, LSR::DONT_CARE_FLAGS, errorCode);
967}
968
969U_NAMESPACE_END}