File: | root/firefox-clang/intl/icu/source/common/loclikelysubtags.cpp |
Warning: | line 829, column 13 Value stored to 'state' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | // © 2019 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | // loclikelysubtags.cpp |
5 | // created: 2019may08 Markus W. Scherer |
6 | |
7 | #include <utility> |
8 | #include "unicode/utypes.h" |
9 | #include "unicode/bytestrie.h" |
10 | #include "unicode/localpointer.h" |
11 | #include "unicode/locid.h" |
12 | #include "unicode/uobject.h" |
13 | #include "unicode/ures.h" |
14 | #include "unicode/uscript.h" |
15 | #include "charstr.h" |
16 | #include "cstring.h" |
17 | #include "loclikelysubtags.h" |
18 | #include "lsr.h" |
19 | #include "uassert.h" |
20 | #include "ucln_cmn.h" |
21 | #include "uhash.h" |
22 | #include "uinvchar.h" |
23 | #include "umutex.h" |
24 | #include "uniquecharstr.h" |
25 | #include "uresdata.h" |
26 | #include "uresimp.h" |
27 | #include "uvector.h" |
28 | |
29 | U_NAMESPACE_BEGINnamespace icu_77 { |
30 | |
31 | namespace { |
32 | |
33 | constexpr char PSEUDO_ACCENTS_PREFIX = '\''; // -XA, -PSACCENT |
34 | constexpr char PSEUDO_BIDI_PREFIX = '+'; // -XB, -PSBIDI |
35 | constexpr char PSEUDO_CRACKED_PREFIX = ','; // -XC, -PSCRACK |
36 | |
37 | } // namespace |
38 | |
39 | LocaleDistanceData::LocaleDistanceData(LocaleDistanceData &&data) : |
40 | distanceTrieBytes(data.distanceTrieBytes), |
41 | regionToPartitions(data.regionToPartitions), |
42 | partitions(data.partitions), |
43 | paradigms(data.paradigms), paradigmsLength(data.paradigmsLength), |
44 | distances(data.distances) { |
45 | data.partitions = nullptr; |
46 | data.paradigms = nullptr; |
47 | } |
48 | |
49 | LocaleDistanceData::~LocaleDistanceData() { |
50 | uprv_freeuprv_free_77(partitions); |
51 | delete[] paradigms; |
52 | } |
53 | |
54 | struct LikelySubtagsData { |
55 | UResourceBundle *langInfoBundle = nullptr; |
56 | UniqueCharStrings strings; |
57 | CharStringMap languageAliases; |
58 | CharStringMap regionAliases; |
59 | const uint8_t *trieBytes = nullptr; |
60 | LSR *lsrs = nullptr; |
61 | int32_t lsrsLength = 0; |
62 | |
63 | LocaleDistanceData distanceData; |
64 | |
65 | LikelySubtagsData(UErrorCode &errorCode) : strings(errorCode) {} |
66 | |
67 | ~LikelySubtagsData() { |
68 | ures_closeures_close_77(langInfoBundle); |
69 | delete[] lsrs; |
70 | } |
71 | |
72 | void load(UErrorCode &errorCode) { |
73 | if (U_FAILURE(errorCode)) { return; } |
74 | langInfoBundle = ures_openDirectures_openDirect_77(nullptr, "langInfo", &errorCode); |
75 | if (U_FAILURE(errorCode)) { return; } |
76 | StackUResourceBundle stackTempBundle; |
77 | ResourceDataValue value; |
78 | ures_getValueWithFallbackures_getValueWithFallback_77(langInfoBundle, "likely", stackTempBundle.getAlias(), |
79 | value, errorCode); |
80 | ResourceTable likelyTable = value.getTable(errorCode); |
81 | if (U_FAILURE(errorCode)) { return; } |
82 | |
83 | // Read all strings in the resource bundle and convert them to invariant char *. |
84 | LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes; |
85 | int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0; |
86 | ResourceArray m49Array; |
87 | if (likelyTable.findValue("m49", value)) { |
88 | m49Array = value.getArray(errorCode); |
89 | } else { |
90 | errorCode = U_MISSING_RESOURCE_ERROR; |
91 | return; |
92 | } |
93 | if (!readStrings(likelyTable, "languageAliases", value, |
94 | languageIndexes, languagesLength, errorCode) || |
95 | !readStrings(likelyTable, "regionAliases", value, |
96 | regionIndexes, regionsLength, errorCode) || |
97 | !readLSREncodedStrings(likelyTable, "lsrnum", value, m49Array, |
98 | lsrSubtagIndexes,lsrSubtagsLength, errorCode)) { |
99 | return; |
100 | } |
101 | if ((languagesLength & 1) != 0 || |
102 | (regionsLength & 1) != 0 || |
103 | (lsrSubtagsLength % 3) != 0) { |
104 | errorCode = U_INVALID_FORMAT_ERROR; |
105 | return; |
106 | } |
107 | if (lsrSubtagsLength == 0) { |
108 | errorCode = U_MISSING_RESOURCE_ERROR; |
109 | return; |
110 | } |
111 | |
112 | if (!likelyTable.findValue("trie", value)) { |
113 | errorCode = U_MISSING_RESOURCE_ERROR; |
114 | return; |
115 | } |
116 | int32_t length; |
117 | trieBytes = value.getBinary(length, errorCode); |
118 | if (U_FAILURE(errorCode)) { return; } |
119 | |
120 | // Also read distance/matcher data if available, |
121 | // to open & keep only one resource bundle pointer |
122 | // and to use one single UniqueCharStrings. |
123 | UErrorCode matchErrorCode = U_ZERO_ERROR; |
124 | ures_getValueWithFallbackures_getValueWithFallback_77(langInfoBundle, "match", stackTempBundle.getAlias(), |
125 | value, matchErrorCode); |
126 | LocalMemory<int32_t> partitionIndexes, paradigmSubtagIndexes; |
127 | int32_t partitionsLength = 0, paradigmSubtagsLength = 0; |
128 | if (U_SUCCESS(matchErrorCode)) { |
129 | ResourceTable matchTable = value.getTable(errorCode); |
130 | if (U_FAILURE(errorCode)) { return; } |
131 | |
132 | if (matchTable.findValue("trie", value)) { |
133 | distanceData.distanceTrieBytes = value.getBinary(length, errorCode); |
134 | if (U_FAILURE(errorCode)) { return; } |
135 | } |
136 | |
137 | if (matchTable.findValue("regionToPartitions", value)) { |
138 | distanceData.regionToPartitions = value.getBinary(length, errorCode); |
139 | if (U_FAILURE(errorCode)) { return; } |
140 | if (length < LSR::REGION_INDEX_LIMIT) { |
141 | errorCode = U_INVALID_FORMAT_ERROR; |
142 | return; |
143 | } |
144 | } |
145 | |
146 | if (!readStrings(matchTable, "partitions", value, |
147 | partitionIndexes, partitionsLength, errorCode) || |
148 | !readLSREncodedStrings(matchTable, "paradigmnum", value, m49Array, |
149 | paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) { |
150 | return; |
151 | } |
152 | if ((paradigmSubtagsLength % 3) != 0) { |
153 | errorCode = U_INVALID_FORMAT_ERROR; |
154 | return; |
155 | } |
156 | |
157 | if (matchTable.findValue("distances", value)) { |
158 | distanceData.distances = value.getIntVector(length, errorCode); |
159 | if (U_FAILURE(errorCode)) { return; } |
160 | if (length < 4) { // LocaleDistance IX_LIMIT |
161 | errorCode = U_INVALID_FORMAT_ERROR; |
162 | return; |
163 | } |
164 | } |
165 | } else if (matchErrorCode == U_MISSING_RESOURCE_ERROR) { |
166 | // ok for likely subtags |
167 | } else { // error other than missing resource |
168 | errorCode = matchErrorCode; |
169 | return; |
170 | } |
171 | |
172 | // Fetch & store invariant-character versions of strings |
173 | // only after we have collected and de-duplicated all of them. |
174 | strings.freeze(); |
175 | |
176 | languageAliases = CharStringMap(languagesLength / 2, errorCode); |
177 | for (int32_t i = 0; i < languagesLength; i += 2) { |
178 | languageAliases.put(strings.get(languageIndexes[i]), |
179 | strings.get(languageIndexes[i + 1]), errorCode); |
180 | } |
181 | |
182 | regionAliases = CharStringMap(regionsLength / 2, errorCode); |
183 | for (int32_t i = 0; i < regionsLength; i += 2) { |
184 | regionAliases.put(strings.get(regionIndexes[i]), |
185 | strings.get(regionIndexes[i + 1]), errorCode); |
186 | } |
187 | if (U_FAILURE(errorCode)) { return; } |
188 | |
189 | lsrsLength = lsrSubtagsLength / 3; |
190 | lsrs = new LSR[lsrsLength]; |
191 | if (lsrs == nullptr) { |
192 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
193 | return; |
194 | } |
195 | for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) { |
196 | lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]), |
197 | strings.get(lsrSubtagIndexes[i + 1]), |
198 | strings.get(lsrSubtagIndexes[i + 2]), |
199 | LSR::IMPLICIT_LSR); |
200 | } |
201 | |
202 | if (partitionsLength > 0) { |
203 | distanceData.partitions = static_cast<const char **>( |
204 | uprv_mallocuprv_malloc_77(partitionsLength * sizeof(const char *))); |
205 | if (distanceData.partitions == nullptr) { |
206 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
207 | return; |
208 | } |
209 | for (int32_t i = 0; i < partitionsLength; ++i) { |
210 | distanceData.partitions[i] = strings.get(partitionIndexes[i]); |
211 | } |
212 | } |
213 | |
214 | if (paradigmSubtagsLength > 0) { |
215 | distanceData.paradigmsLength = paradigmSubtagsLength / 3; |
216 | LSR *paradigms = new LSR[distanceData.paradigmsLength]; |
217 | if (paradigms == nullptr) { |
218 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
219 | return; |
220 | } |
221 | for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) { |
222 | paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]), |
223 | strings.get(paradigmSubtagIndexes[i + 1]), |
224 | strings.get(paradigmSubtagIndexes[i + 2]), |
225 | LSR::DONT_CARE_FLAGS); |
226 | } |
227 | distanceData.paradigms = paradigms; |
228 | } |
229 | } |
230 | |
231 | private: |
232 | bool readStrings(const ResourceTable &table, const char *key, ResourceValue &value, |
233 | LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) { |
234 | if (U_FAILURE(errorCode)) { return false; } |
235 | if (table.findValue(key, value)) { |
236 | ResourceArray stringArray = value.getArray(errorCode); |
237 | if (U_FAILURE(errorCode)) { return false; } |
238 | length = stringArray.getSize(); |
239 | if (length == 0) { return true; } |
240 | int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length); |
241 | if (rawIndexes == nullptr) { |
242 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
243 | return false; |
244 | } |
245 | for (int i = 0; i < length; ++i) { |
246 | if (stringArray.getValue(i, value)) { // returns true because i < length |
247 | int32_t strLength = 0; |
248 | rawIndexes[i] = strings.add(value.getString(strLength, errorCode), errorCode); |
249 | if (U_FAILURE(errorCode)) { return false; } |
250 | } |
251 | } |
252 | } |
253 | return true; |
254 | } |
255 | UnicodeString toLanguage(int encoded) { |
256 | if (encoded == 0) { |
257 | return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1); |
258 | } |
259 | if (encoded == 1) { |
260 | return UNICODE_STRING_SIMPLE("skip")icu::UnicodeString(true, u"skip", -1); |
261 | } |
262 | encoded &= 0x00ffffff; |
263 | encoded %= 27*27*27; |
264 | char lang[3]; |
265 | lang[0] = 'a' + ((encoded % 27) - 1); |
266 | lang[1] = 'a' + (((encoded / 27 ) % 27) - 1); |
267 | if (encoded / (27 * 27) == 0) { |
268 | return UnicodeString(lang, 2, US_INVicu::UnicodeString::kInvariant); |
269 | } |
270 | lang[2] = 'a' + ((encoded / (27 * 27)) - 1); |
271 | return UnicodeString(lang, 3, US_INVicu::UnicodeString::kInvariant); |
272 | } |
273 | UnicodeString toScript(int encoded) { |
274 | if (encoded == 0) { |
275 | return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1); |
276 | } |
277 | if (encoded == 1) { |
278 | return UNICODE_STRING_SIMPLE("script")icu::UnicodeString(true, u"script", -1); |
279 | } |
280 | encoded = (encoded >> 24) & 0x000000ff; |
281 | const char* script = uscript_getShortNameuscript_getShortName_77(static_cast<UScriptCode>(encoded)); |
282 | if (script == nullptr) { |
283 | return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1); |
284 | } |
285 | U_ASSERT(uprv_strlen(script) == 4)(static_cast <bool> (:: strlen(script) == 4) ? void (0) : __assert_fail (":: strlen(script) == 4", __builtin_FILE () , __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
286 | return UnicodeString(script, 4, US_INVicu::UnicodeString::kInvariant); |
287 | } |
288 | UnicodeString m49IndexToCode(const ResourceArray &m49Array, ResourceValue &value, int index, UErrorCode &errorCode) { |
289 | if (U_FAILURE(errorCode)) { |
290 | return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1); |
291 | } |
292 | if (m49Array.getValue(index, value)) { |
293 | return value.getUnicodeString(errorCode); |
294 | } |
295 | // "m49" does not include the index. |
296 | errorCode = U_MISSING_RESOURCE_ERROR; |
297 | return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1); |
298 | } |
299 | |
300 | UnicodeString toRegion(const ResourceArray& m49Array, ResourceValue &value, int encoded, UErrorCode &errorCode) { |
301 | if (U_FAILURE(errorCode) || encoded == 0 || encoded == 1) { |
302 | return UNICODE_STRING_SIMPLE("")icu::UnicodeString(true, u"", -1); |
303 | } |
304 | encoded &= 0x00ffffff; |
305 | encoded /= 27 * 27 * 27; |
306 | encoded %= 27 * 27; |
307 | if (encoded < 27) { |
308 | // Selected M49 code index, find the code from "m49" resource. |
309 | return m49IndexToCode(m49Array, value, encoded, errorCode); |
310 | } |
311 | char region[2]; |
312 | region[0] = 'A' + ((encoded % 27) - 1); |
313 | region[1] = 'A' + (((encoded / 27) % 27) - 1); |
314 | return UnicodeString(region, 2, US_INVicu::UnicodeString::kInvariant); |
315 | } |
316 | |
317 | bool readLSREncodedStrings(const ResourceTable &table, const char* key, ResourceValue &value, const ResourceArray& m49Array, |
318 | LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) { |
319 | if (U_FAILURE(errorCode)) { return false; } |
320 | if (table.findValue(key, value)) { |
321 | const int32_t* vectors = value.getIntVector(length, errorCode); |
322 | if (U_FAILURE(errorCode)) { return false; } |
323 | if (length == 0) { return true; } |
324 | int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length * 3); |
325 | if (rawIndexes == nullptr) { |
326 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
327 | return false; |
328 | } |
329 | for (int i = 0; i < length; ++i) { |
330 | rawIndexes[i*3] = strings.addByValue(toLanguage(vectors[i]), errorCode); |
331 | rawIndexes[i*3+1] = strings.addByValue(toScript(vectors[i]), errorCode); |
332 | rawIndexes[i*3+2] = strings.addByValue( |
333 | toRegion(m49Array, value, vectors[i], errorCode), errorCode); |
334 | if (U_FAILURE(errorCode)) { return false; } |
335 | } |
336 | length *= 3; |
337 | } |
338 | return true; |
339 | } |
340 | }; |
341 | |
342 | namespace { |
343 | |
344 | LikelySubtags *gLikelySubtags = nullptr; |
345 | UVector *gMacroregions = nullptr; |
346 | UInitOnce gInitOnce {}; |
347 | |
348 | UBool U_CALLCONV cleanup() { |
349 | delete gLikelySubtags; |
350 | gLikelySubtags = nullptr; |
351 | delete gMacroregions; |
352 | gMacroregions = nullptr; |
353 | gInitOnce.reset(); |
354 | return true; |
355 | } |
356 | |
357 | constexpr const char16_t* MACROREGION_HARDCODE[] = { |
358 | u"001~3", |
359 | u"005", |
360 | u"009", |
361 | u"011", |
362 | u"013~5", |
363 | u"017~9", |
364 | u"021", |
365 | u"029", |
366 | u"030", |
367 | u"034~5", |
368 | u"039", |
369 | u"053~4", |
370 | u"057", |
371 | u"061", |
372 | u"142~3", |
373 | u"145", |
374 | u"150~1", |
375 | u"154~5", |
376 | u"202", |
377 | u"419", |
378 | u"EU", |
379 | u"EZ", |
380 | u"QO", |
381 | u"UN", |
382 | }; |
383 | |
384 | constexpr char16_t RANGE_MARKER = 0x7E; /* '~' */ |
385 | void processMacroregionRange(const UnicodeString& regionName, UVector* newMacroRegions, UErrorCode& status) { |
386 | if (U_FAILURE(status)) { return; } |
387 | int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER); |
388 | char16_t buf[6]; |
389 | regionName.extract(buf,6,status); |
390 | if ( rangeMarkerLocation > 0 ) { |
391 | char16_t endRange = regionName.charAt(rangeMarkerLocation+1); |
392 | buf[rangeMarkerLocation] = 0; |
393 | while ( buf[rangeMarkerLocation-1] <= endRange && U_SUCCESS(status)) { |
394 | LocalPointer<UnicodeString> newRegion(new UnicodeString(buf), status); |
395 | newMacroRegions->adoptElement(newRegion.orphan(),status); |
396 | buf[rangeMarkerLocation-1]++; |
397 | } |
398 | } else { |
399 | LocalPointer<UnicodeString> newRegion(new UnicodeString(regionName), status); |
400 | newMacroRegions->adoptElement(newRegion.orphan(),status); |
401 | } |
402 | } |
403 | |
404 | #if U_DEBUG1 |
405 | UVector* loadMacroregions(UErrorCode &status) { |
406 | if (U_FAILURE(status)) { return nullptr; } |
407 | LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObjectuprv_deleteUObject_77, uhash_compareUnicodeStringuhash_compareUnicodeString_77, status), status); |
408 | |
409 | LocalUResourceBundlePointer supplementalData(ures_openDirectures_openDirect_77(nullptr,"supplementalData",&status)); |
410 | LocalUResourceBundlePointer idValidity(ures_getByKeyures_getByKey_77(supplementalData.getAlias(),"idValidity",nullptr,&status)); |
411 | LocalUResourceBundlePointer regionList(ures_getByKeyures_getByKey_77(idValidity.getAlias(),"region",nullptr,&status)); |
412 | LocalUResourceBundlePointer regionMacro(ures_getByKeyures_getByKey_77(regionList.getAlias(),"macroregion",nullptr,&status)); |
413 | |
414 | if (U_FAILURE(status)) { |
415 | return nullptr; |
416 | } |
417 | |
418 | while (ures_hasNextures_hasNext_77(regionMacro.getAlias())) { |
419 | UnicodeString regionName = ures_getNextUnicodeString(regionMacro.getAlias(),nullptr,&status); |
420 | processMacroregionRange(regionName, newMacroRegions.getAlias(), status); |
421 | if (U_FAILURE(status)) { |
422 | return nullptr; |
423 | } |
424 | } |
425 | |
426 | return newMacroRegions.orphan(); |
427 | } |
428 | #endif // U_DEBUG |
429 | |
430 | UVector* getStaticMacroregions(UErrorCode &status) { |
431 | if (U_FAILURE(status)) { return nullptr; } |
432 | LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObjectuprv_deleteUObject_77, uhash_compareUnicodeStringuhash_compareUnicodeString_77, status), status); |
433 | |
434 | if (U_FAILURE(status)) { |
435 | return nullptr; |
436 | } |
437 | |
438 | for (const auto *region : MACROREGION_HARDCODE) { |
439 | UnicodeString regionName(region); |
440 | processMacroregionRange(regionName, newMacroRegions.getAlias(), status); |
441 | if (U_FAILURE(status)) { |
442 | return nullptr; |
443 | } |
444 | } |
445 | |
446 | return newMacroRegions.orphan(); |
447 | } |
448 | |
449 | } // namespace |
450 | |
451 | void U_CALLCONV LikelySubtags::initLikelySubtags(UErrorCode &errorCode) { |
452 | // This function is invoked only via umtx_initOnce(). |
453 | U_ASSERT(gLikelySubtags == nullptr)(static_cast <bool> (gLikelySubtags == nullptr) ? void ( 0) : __assert_fail ("gLikelySubtags == nullptr", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
454 | LikelySubtagsData data(errorCode); |
455 | data.load(errorCode); |
456 | if (U_FAILURE(errorCode)) { return; } |
457 | gLikelySubtags = new LikelySubtags(data); |
458 | gMacroregions = getStaticMacroregions(errorCode); |
459 | #if U_DEBUG1 |
460 | auto macroregionsFromData = loadMacroregions(errorCode); |
461 | U_ASSERT((*gMacroregions) == (*macroregionsFromData))(static_cast <bool> ((*gMacroregions) == (*macroregionsFromData )) ? void (0) : __assert_fail ("(*gMacroregions) == (*macroregionsFromData)" , __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__ )); |
462 | delete macroregionsFromData; |
463 | #endif |
464 | if (U_FAILURE(errorCode) || gLikelySubtags == nullptr || gMacroregions == nullptr) { |
465 | delete gLikelySubtags; |
466 | delete gMacroregions; |
467 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
468 | return; |
469 | } |
470 | |
471 | ucln_common_registerCleanupucln_common_registerCleanup_77(UCLN_COMMON_LIKELY_SUBTAGS, cleanup); |
472 | } |
473 | |
474 | const LikelySubtags *LikelySubtags::getSingleton(UErrorCode &errorCode) { |
475 | if (U_FAILURE(errorCode)) { return nullptr; } |
476 | umtx_initOnce(gInitOnce, &LikelySubtags::initLikelySubtags, errorCode); |
477 | return gLikelySubtags; |
478 | } |
479 | |
480 | LikelySubtags::LikelySubtags(LikelySubtagsData &data) : |
481 | langInfoBundle(data.langInfoBundle), |
482 | strings(data.strings.orphanCharStrings()), |
483 | languageAliases(std::move(data.languageAliases)), |
484 | regionAliases(std::move(data.regionAliases)), |
485 | trie(data.trieBytes), |
486 | lsrs(data.lsrs), |
487 | #if U_DEBUG1 |
488 | lsrsLength(data.lsrsLength), |
489 | #endif // U_DEBUG |
490 | distanceData(std::move(data.distanceData)) { |
491 | data.langInfoBundle = nullptr; |
492 | data.lsrs = nullptr; |
493 | |
494 | // Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**"). |
495 | UStringTrieResult result = trie.next(u'*'); |
496 | U_ASSERT(USTRINGTRIE_HAS_NEXT(result))(static_cast <bool> (((result)&1)) ? void (0) : __assert_fail ("((result)&1)", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
497 | trieUndState = trie.getState64(); |
498 | result = trie.next(u'*'); |
499 | U_ASSERT(USTRINGTRIE_HAS_NEXT(result))(static_cast <bool> (((result)&1)) ? void (0) : __assert_fail ("((result)&1)", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
500 | trieUndZzzzState = trie.getState64(); |
501 | result = trie.next(u'*'); |
502 | U_ASSERT(USTRINGTRIE_HAS_VALUE(result))(static_cast <bool> (((result)>=USTRINGTRIE_FINAL_VALUE )) ? void (0) : __assert_fail ("((result)>=USTRINGTRIE_FINAL_VALUE)" , __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__ )); |
503 | defaultLsrIndex = trie.getValue(); |
504 | trie.reset(); |
505 | |
506 | for (char16_t c = u'a'; c <= u'z'; ++c) { |
507 | result = trie.next(c); |
508 | if (result == USTRINGTRIE_NO_VALUE) { |
509 | trieFirstLetterStates[c - u'a'] = trie.getState64(); |
510 | } |
511 | trie.reset(); |
512 | } |
513 | } |
514 | |
515 | LikelySubtags::~LikelySubtags() { |
516 | ures_closeures_close_77(langInfoBundle); |
517 | delete strings; |
518 | delete[] lsrs; |
519 | } |
520 | |
521 | LSR LikelySubtags::makeMaximizedLsrFrom(const Locale &locale, |
522 | bool returnInputIfUnmatch, |
523 | UErrorCode &errorCode) const { |
524 | if (U_FAILURE(errorCode)) { return {}; } |
525 | if (locale.isBogus()) { |
526 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
527 | return {}; |
528 | } |
529 | const char *name = locale.getName(); |
530 | if (!returnInputIfUnmatch && uprv_isAtSign(name[0])((name[0])=='@') && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=") |
531 | // Private use language tag x-subtag-subtag... which CLDR changes to |
532 | // und-x-subtag-subtag... |
533 | return LSR(name, "", "", LSR::EXPLICIT_LSR); |
534 | } |
535 | LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(), |
536 | locale.getVariant(), returnInputIfUnmatch, errorCode); |
537 | |
538 | if (uprv_strlen(max.language):: strlen(max.language) == 0 && |
539 | uprv_strlen(max.script):: strlen(max.script) == 0 && |
540 | uprv_strlen(max.region):: strlen(max.region) == 0) { |
541 | // No match. ICU API mandate us to |
542 | // If the provided ULocale instance is already in the maximal form, or |
543 | // there is no data available available for maximization, it will be |
544 | // returned. |
545 | return LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR::EXPLICIT_LSR, errorCode); |
546 | } |
547 | return max; |
548 | } |
549 | |
550 | namespace { |
551 | |
552 | const char *getCanonical(const CharStringMap &aliases, const char *alias) { |
553 | const char *canonical = aliases.get(alias); |
554 | return canonical == nullptr ? alias : canonical; |
555 | } |
556 | |
557 | } // namespace |
558 | |
559 | LSR LikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region, |
560 | const char *variant, |
561 | bool returnInputIfUnmatch, |
562 | UErrorCode &errorCode) const { |
563 | if (U_FAILURE(errorCode)) { return {}; } |
564 | // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK. |
565 | // They should match only themselves, |
566 | // not other locales with what looks like the same language and script subtags. |
567 | if (!returnInputIfUnmatch) { |
568 | char c1; |
569 | if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) { |
570 | switch (c1) { |
571 | case 'A': |
572 | return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, |
573 | LSR::EXPLICIT_LSR, errorCode); |
574 | case 'B': |
575 | return LSR(PSEUDO_BIDI_PREFIX, language, script, region, |
576 | LSR::EXPLICIT_LSR, errorCode); |
577 | case 'C': |
578 | return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, |
579 | LSR::EXPLICIT_LSR, errorCode); |
580 | default: // normal locale |
581 | break; |
582 | } |
583 | } |
584 | |
585 | if (variant[0] == 'P' && variant[1] == 'S') { |
586 | int32_t lsrFlags = *region == 0 ? |
587 | LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR; |
588 | if (uprv_strcmp(variant, "PSACCENT"):: strcmp(variant, "PSACCENT") == 0) { |
589 | return LSR(PSEUDO_ACCENTS_PREFIX, language, script, |
590 | *region == 0 ? "XA" : region, lsrFlags, errorCode); |
591 | } else if (uprv_strcmp(variant, "PSBIDI"):: strcmp(variant, "PSBIDI") == 0) { |
592 | return LSR(PSEUDO_BIDI_PREFIX, language, script, |
593 | *region == 0 ? "XB" : region, lsrFlags, errorCode); |
594 | } else if (uprv_strcmp(variant, "PSCRACK"):: strcmp(variant, "PSCRACK") == 0) { |
595 | return LSR(PSEUDO_CRACKED_PREFIX, language, script, |
596 | *region == 0 ? "XC" : region, lsrFlags, errorCode); |
597 | } |
598 | // else normal locale |
599 | } |
600 | } // end of if (!returnInputIfUnmatch) |
601 | |
602 | language = getCanonical(languageAliases, language); |
603 | // (We have no script mappings.) |
604 | region = getCanonical(regionAliases, region); |
605 | return maximize(language, script, region, returnInputIfUnmatch, errorCode); |
606 | } |
607 | |
608 | LSR LikelySubtags::maximize(const char *language, const char *script, const char *region, |
609 | bool returnInputIfUnmatch, |
610 | UErrorCode &errorCode) const { |
611 | if (U_FAILURE(errorCode)) { return {}; } |
612 | return maximize({language, static_cast<int32_t>(uprv_strlen(language):: strlen(language))}, |
613 | {script, static_cast<int32_t>(uprv_strlen(script):: strlen(script))}, |
614 | {region, static_cast<int32_t>(uprv_strlen(region):: strlen(region))}, |
615 | returnInputIfUnmatch, |
616 | errorCode); |
617 | } |
618 | |
619 | bool LikelySubtags::isMacroregion(StringPiece& region, UErrorCode& errorCode) const { |
620 | if (U_FAILURE(errorCode)) { return false; } |
621 | // In Java, we use Region class. In C++, since Region is under i18n, |
622 | // we read the same data used by Region into gMacroregions avoid dependency |
623 | // from common to i18n/region.cpp |
624 | umtx_initOnce(gInitOnce, &LikelySubtags::initLikelySubtags, errorCode); |
625 | if (U_FAILURE(errorCode)) { return false; } |
626 | UnicodeString str(UnicodeString::fromUTF8(region)); |
627 | return gMacroregions->contains((void *)&str); |
628 | } |
629 | |
630 | LSR LikelySubtags::maximize(StringPiece language, StringPiece script, StringPiece region, |
631 | bool returnInputIfUnmatch, |
632 | UErrorCode &errorCode) const { |
633 | if (U_FAILURE(errorCode)) { return {}; } |
634 | if (language.compare("und") == 0) { |
635 | language = ""; |
636 | } |
637 | if (script.compare("Zzzz") == 0) { |
638 | script = ""; |
639 | } |
640 | if (region.compare("ZZ") == 0) { |
641 | region = ""; |
642 | } |
643 | if (!script.empty() && !region.empty() && !language.empty()) { |
644 | return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode); // already maximized |
645 | } |
646 | bool retainLanguage = false; |
647 | bool retainScript = false; |
648 | bool retainRegion = false; |
649 | |
650 | BytesTrie iter(trie); |
651 | uint64_t state; |
652 | int32_t value; |
653 | // Small optimization: Array lookup for first language letter. |
654 | int32_t c0; |
655 | if (0 <= (c0 = uprv_lowerOrdinal(language.data()[0])) && c0 <= 25 && |
656 | language.length() >= 2 && |
657 | (state = trieFirstLetterStates[c0]) != 0) { |
658 | value = trieNext(iter.resetToState64(state), language, 1); |
659 | } else { |
660 | value = trieNext(iter, language, 0); |
661 | } |
662 | bool matchLanguage = (value >= 0); |
663 | bool matchScript = false; |
664 | if (value >= 0) { |
665 | retainLanguage = !language.empty(); |
666 | state = iter.getState64(); |
667 | } else { |
668 | retainLanguage = true; |
669 | iter.resetToState64(trieUndState); // "und" ("*") |
670 | state = 0; |
671 | } |
672 | |
673 | if (value >= 0 && !script.empty()) { |
674 | matchScript = true; |
675 | } |
676 | if (value > 0) { |
677 | // Intermediate or final value from just language. |
678 | if (value == SKIP_SCRIPT) { |
679 | value = 0; |
680 | } |
681 | retainScript = !script.empty(); |
682 | } else { |
683 | value = trieNext(iter, script, 0); |
684 | if (value >= 0) { |
685 | retainScript = !script.empty(); |
686 | state = iter.getState64(); |
687 | } else { |
688 | retainScript = true; |
689 | if (state == 0) { |
690 | iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**") |
691 | } else { |
692 | iter.resetToState64(state); |
693 | value = trieNext(iter, "", 0); |
694 | U_ASSERT(value >= 0)(static_cast <bool> (value >= 0) ? void (0) : __assert_fail ("value >= 0", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
695 | state = iter.getState64(); |
696 | } |
697 | } |
698 | } |
699 | |
700 | bool matchRegion = false; |
701 | if (value > 0) { |
702 | // Final value from just language or language+script. |
703 | retainRegion = !region.empty(); |
704 | } else { |
705 | value = trieNext(iter, region, 0); |
706 | if (value >= 0) { |
707 | if (!region.empty() && !isMacroregion(region, errorCode)) { |
708 | retainRegion = true; |
709 | matchRegion = true; |
710 | } |
711 | } else { |
712 | retainRegion = true; |
713 | if (state == 0) { |
714 | value = defaultLsrIndex; |
715 | } else { |
716 | iter.resetToState64(state); |
717 | value = trieNext(iter, "", 0); |
718 | U_ASSERT(value > 0)(static_cast <bool> (value > 0) ? void (0) : __assert_fail ("value > 0", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
719 | } |
720 | } |
721 | } |
722 | U_ASSERT(value < lsrsLength)(static_cast <bool> (value < lsrsLength) ? void (0) : __assert_fail ("value < lsrsLength", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
723 | const LSR &matched = lsrs[value]; |
724 | |
725 | if (returnInputIfUnmatch && |
726 | (!(matchLanguage || matchScript || (matchRegion && language.empty())))) { |
727 | return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode); // no matching. |
728 | } |
729 | if (language.empty()) { |
730 | language = StringPiece("und"); |
731 | } |
732 | |
733 | if (!(retainLanguage || retainScript || retainRegion)) { |
734 | // Quickly return a copy of the lookup-result LSR |
735 | // without new allocation of the subtags. |
736 | return LSR(matched.language, matched.script, matched.region, matched.flags); |
737 | } |
738 | if (!retainLanguage) { |
739 | language = matched.language; |
740 | } |
741 | if (!retainScript) { |
742 | script = matched.script; |
743 | } |
744 | if (!retainRegion) { |
745 | region = matched.region; |
746 | } |
747 | int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0); |
748 | // retainOldMask flags = LSR explicit-subtag flags |
749 | return LSR(language, script, region, retainMask, errorCode); |
750 | } |
751 | |
752 | int32_t LikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const { |
753 | // If likelyInfo >= 0: |
754 | // likelyInfo bit 1 is set if the previous comparison with lsr |
755 | // was for equal language and script. |
756 | // Otherwise the scripts differed. |
757 | if (uprv_strcmp(lsr.language, other.language):: strcmp(lsr.language, other.language) != 0) { |
758 | return 0xfffffffc; // negative, lsr not better than other |
759 | } |
760 | if (uprv_strcmp(lsr.script, other.script):: strcmp(lsr.script, other.script) != 0) { |
761 | int32_t index; |
762 | if (likelyInfo >= 0 && (likelyInfo & 2) == 0) { |
763 | index = likelyInfo >> 2; |
764 | } else { |
765 | index = getLikelyIndex(lsr.language, ""); |
766 | likelyInfo = index << 2; |
767 | } |
768 | const LSR &likely = lsrs[index]; |
769 | if (uprv_strcmp(lsr.script, likely.script):: strcmp(lsr.script, likely.script) == 0) { |
770 | return likelyInfo | 1; |
771 | } else { |
772 | return likelyInfo & ~1; |
773 | } |
774 | } |
775 | if (uprv_strcmp(lsr.region, other.region):: strcmp(lsr.region, other.region) != 0) { |
776 | int32_t index; |
777 | if (likelyInfo >= 0 && (likelyInfo & 2) != 0) { |
778 | index = likelyInfo >> 2; |
779 | } else { |
780 | index = getLikelyIndex(lsr.language, lsr.region); |
781 | likelyInfo = (index << 2) | 2; |
782 | } |
783 | const LSR &likely = lsrs[index]; |
784 | if (uprv_strcmp(lsr.region, likely.region):: strcmp(lsr.region, likely.region) == 0) { |
785 | return likelyInfo | 1; |
786 | } else { |
787 | return likelyInfo & ~1; |
788 | } |
789 | } |
790 | return likelyInfo & ~1; // lsr not better than other |
791 | } |
792 | |
793 | // Subset of maximize(). |
794 | int32_t LikelySubtags::getLikelyIndex(const char *language, const char *script) const { |
795 | if (uprv_strcmp(language, "und"):: strcmp(language, "und") == 0) { |
796 | language = ""; |
797 | } |
798 | if (uprv_strcmp(script, "Zzzz"):: strcmp(script, "Zzzz") == 0) { |
799 | script = ""; |
800 | } |
801 | |
802 | BytesTrie iter(trie); |
803 | uint64_t state; |
804 | int32_t value; |
805 | // Small optimization: Array lookup for first language letter. |
806 | int32_t c0; |
807 | if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 && |
808 | language[1] != 0 && // language.length() >= 2 |
809 | (state = trieFirstLetterStates[c0]) != 0) { |
810 | value = trieNext(iter.resetToState64(state), language, 1); |
811 | } else { |
812 | value = trieNext(iter, language, 0); |
813 | } |
814 | if (value >= 0) { |
815 | state = iter.getState64(); |
816 | } else { |
817 | iter.resetToState64(trieUndState); // "und" ("*") |
818 | state = 0; |
819 | } |
820 | |
821 | if (value > 0) { |
822 | // Intermediate or final value from just language. |
823 | if (value == SKIP_SCRIPT) { |
824 | value = 0; |
825 | } |
826 | } else { |
827 | value = trieNext(iter, script, 0); |
828 | if (value >= 0) { |
829 | state = iter.getState64(); |
Value stored to 'state' is never read | |
830 | } else { |
831 | if (state == 0) { |
832 | iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**") |
833 | } else { |
834 | iter.resetToState64(state); |
835 | value = trieNext(iter, "", 0); |
836 | U_ASSERT(value >= 0)(static_cast <bool> (value >= 0) ? void (0) : __assert_fail ("value >= 0", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
837 | state = iter.getState64(); |
838 | } |
839 | } |
840 | } |
841 | |
842 | if (value > 0) { |
843 | // Final value from just language or language+script. |
844 | } else { |
845 | value = trieNext(iter, "", 0); |
846 | U_ASSERT(value > 0)(static_cast <bool> (value > 0) ? void (0) : __assert_fail ("value > 0", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
847 | } |
848 | U_ASSERT(value < lsrsLength)(static_cast <bool> (value < lsrsLength) ? void (0) : __assert_fail ("value < lsrsLength", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
849 | return value; |
850 | } |
851 | |
852 | int32_t LikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) { |
853 | UStringTrieResult result; |
854 | uint8_t c; |
855 | if ((c = s[i]) == 0) { |
856 | result = iter.next(u'*'); |
857 | } else { |
858 | for (;;) { |
859 | c = uprv_invCharToAscii(c)(c); |
860 | // EBCDIC: If s[i] is not an invariant character, |
861 | // then c is now 0 and will simply not match anything, which is harmless. |
862 | uint8_t next = s[++i]; |
863 | if (next != 0) { |
864 | if (!USTRINGTRIE_HAS_NEXT(iter.next(c))((iter.next(c))&1)) { |
865 | return -1; |
866 | } |
867 | } else { |
868 | // last character of this subtag |
869 | result = iter.next(c | 0x80); |
870 | break; |
871 | } |
872 | c = next; |
873 | } |
874 | } |
875 | switch (result) { |
876 | case USTRINGTRIE_NO_MATCH: return -1; |
877 | case USTRINGTRIE_NO_VALUE: return 0; |
878 | case USTRINGTRIE_INTERMEDIATE_VALUE: |
879 | U_ASSERT(iter.getValue() == SKIP_SCRIPT)(static_cast <bool> (iter.getValue() == SKIP_SCRIPT) ? void (0) : __assert_fail ("iter.getValue() == SKIP_SCRIPT", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
880 | return SKIP_SCRIPT; |
881 | case USTRINGTRIE_FINAL_VALUE: return iter.getValue(); |
882 | default: return -1; |
883 | } |
884 | } |
885 | int32_t LikelySubtags::trieNext(BytesTrie &iter, StringPiece s, int32_t i) { |
886 | UStringTrieResult result; |
887 | uint8_t c; |
888 | if (s.length() == i) { |
889 | result = iter.next(u'*'); |
890 | } else { |
891 | c = s.data()[i]; |
892 | for (;;) { |
893 | c = uprv_invCharToAscii(c)(c); |
894 | // EBCDIC: If s[i] is not an invariant character, |
895 | // then c is now 0 and will simply not match anything, which is harmless. |
896 | if (i+1 != s.length()) { |
897 | if (!USTRINGTRIE_HAS_NEXT(iter.next(c))((iter.next(c))&1)) { |
898 | return -1; |
899 | } |
900 | c = s.data()[++i]; |
901 | } else { |
902 | // last character of this subtag |
903 | result = iter.next(c | 0x80); |
904 | break; |
905 | } |
906 | } |
907 | } |
908 | switch (result) { |
909 | case USTRINGTRIE_NO_MATCH: return -1; |
910 | case USTRINGTRIE_NO_VALUE: return 0; |
911 | case USTRINGTRIE_INTERMEDIATE_VALUE: |
912 | U_ASSERT(iter.getValue() == SKIP_SCRIPT)(static_cast <bool> (iter.getValue() == SKIP_SCRIPT) ? void (0) : __assert_fail ("iter.getValue() == SKIP_SCRIPT", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); |
913 | return SKIP_SCRIPT; |
914 | case USTRINGTRIE_FINAL_VALUE: return iter.getValue(); |
915 | default: return -1; |
916 | } |
917 | } |
918 | |
919 | LSR LikelySubtags::minimizeSubtags(StringPiece language, StringPiece script, |
920 | StringPiece region, |
921 | bool favorScript, |
922 | UErrorCode &errorCode) const { |
923 | if (U_FAILURE(errorCode)) { return {}; } |
924 | LSR max = maximize(language, script, region, true, errorCode); |
925 | if (U_FAILURE(errorCode)) { return {}; } |
926 | // If no match, return it. |
927 | if (uprv_strlen(max.language):: strlen(max.language) == 0 && |
928 | uprv_strlen(max.script):: strlen(max.script) == 0 && |
929 | uprv_strlen(max.region):: strlen(max.region) == 0) { |
930 | // No match. ICU API mandate us to |
931 | // "If this Locale is already in the minimal form, or not valid, or |
932 | // there is no data available for minimization, the Locale will be |
933 | // unchanged." |
934 | return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode); |
935 | } |
936 | // try language |
937 | LSR test = maximize(max.language, "", "", true, errorCode); |
938 | if (U_FAILURE(errorCode)) { return {}; } |
939 | if (test.isEquivalentTo(max)) { |
940 | return LSR(max.language, "", "", LSR::DONT_CARE_FLAGS, errorCode); |
941 | } |
942 | |
943 | if (!favorScript) { |
944 | // favor Region |
945 | // try language and region |
946 | test = maximize(max.language, "", max.region, true, errorCode); |
947 | if (U_FAILURE(errorCode)) { return {}; } |
948 | if (test.isEquivalentTo(max)) { |
949 | return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode); |
950 | } |
951 | } |
952 | // try language and script |
953 | test = maximize(max.language, max.script, "", true, errorCode); |
954 | if (U_FAILURE(errorCode)) { return {}; } |
955 | if (test.isEquivalentTo(max)) { |
956 | return LSR(max.language, max.script, "", LSR::DONT_CARE_FLAGS, errorCode); |
957 | } |
958 | if (favorScript) { |
959 | // try language and region |
960 | test = maximize(max.language, "", max.region, true, errorCode); |
961 | if (U_FAILURE(errorCode)) { return {}; } |
962 | if (test.isEquivalentTo(max)) { |
963 | return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode); |
964 | } |
965 | } |
966 | return LSR(max.language, max.script, max.region, LSR::DONT_CARE_FLAGS, errorCode); |
967 | } |
968 | |
969 | U_NAMESPACE_END} |