Bug Summary

File: /root/firefox-clang/intl/icu/source/i18n/collationiterator.h
Warning: line 103, column 20
Assigned value is uninitialized

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name collationdatabuilder.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -ffp-contract=off -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/i18n -fcoverage-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/i18n -resource-dir /usr/lib/llvm-21/lib/clang/21 -include /root/firefox-clang/config/gcc_hidden.h -include /root/firefox-clang/obj-x86_64-pc-linux-gnu/mozilla-config.h -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/system_wrappers -U _FORTIFY_SOURCE -D _FORTIFY_SOURCE=2 -D _GLIBCXX_ASSERTIONS -D DEBUG=1 -D U_I18N_IMPLEMENTATION -D _LIBCPP_DISABLE_DEPRECATION_WARNINGS -D U_USING_ICU_NAMESPACE=0 -D U_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -D U_HIDE_OBSOLETE_UTF_OLD_H=1 -D UCONFIG_NO_LEGACY_CONVERSION -D UCONFIG_NO_TRANSLITERATION -D UCONFIG_NO_REGULAR_EXPRESSIONS -D UCONFIG_NO_BREAK_ITERATION -D UCONFIG_NO_IDNA -D UCONFIG_NO_MF2 -D U_CHARSET_IS_UTF8 -D UNISTR_FROM_CHAR_EXPLICIT=explicit -D UNISTR_FROM_STRING_EXPLICIT=explicit -D U_ENABLE_DYLOAD=0 -D U_DEBUG=1 -I 
/root/firefox-clang/config/external/icu/i18n -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/i18n -I /root/firefox-clang/intl/icu/source/common -I /root/firefox-clang/mfbt/double-conversion -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nspr -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nss -D MOZILLA_CLIENT -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/x86_64-linux-gnu/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14/backward -internal-isystem /usr/lib/llvm-21/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-error=pessimizing-move -Wno-error=large-by-value-copy=128 -Wno-error=implicit-int-float-conversion -Wno-error=thread-safety-analysis -Wno-error=tautological-type-limit-compare -Wno-invalid-offsetof -Wno-range-loop-analysis -Wno-deprecated-anon-enum-enum-conversion -Wno-deprecated-enum-enum-conversion -Wno-deprecated-this-capture -Wno-inline-new-delete -Wno-error=deprecated-declarations -Wno-error=array-bounds -Wno-error=free-nonheap-object -Wno-error=atomic-alignment -Wno-error=deprecated-builtins -Wno-psabi -Wno-error=builtin-macro-redefined -Wno-vla-cxx-extension -Wno-unknown-warning-option -Wno-comma -Wno-implicit-const-int-float-conversion -Wno-macro-redefined -Wno-microsoft-include -Wno-tautological-unsigned-enum-zero-compare -Wno-unreachable-code-loop-increment -Wno-unreachable-code-return -fdeprecated-macro -ferror-limit 19 -fstrict-flex-arrays=1 -stack-protector 2 -fstack-clash-protection -ftrivial-auto-var-init=pattern -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf 
-fno-sized-deallocation -fno-aligned-allocation -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2025-06-27-100320-3286336-1 -x c++ /root/firefox-clang/intl/icu/source/i18n/collationdatabuilder.cpp

/root/firefox-clang/intl/icu/source/i18n/collationdatabuilder.cpp

1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2012-2015, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* collationdatabuilder.cpp
9*
10* (replaced the former ucol_elm.cpp)
11*
12* created on: 2012apr01
13* created by: Markus W. Scherer
14*/
15
16#include "unicode/utypes.h"
17
18#if !UCONFIG_NO_COLLATION /* expands to: 0 */
19
20#include "unicode/localpointer.h"
21#include "unicode/uchar.h"
22#include "unicode/ucharstrie.h"
23#include "unicode/ucharstriebuilder.h"
24#include "unicode/uniset.h"
25#include "unicode/unistr.h"
26#include "unicode/usetiter.h"
27#include "unicode/utf16.h"
28#include "cmemory.h"
29#include "collation.h"
30#include "collationdata.h"
31#include "collationdatabuilder.h"
32#include "collationfastlatinbuilder.h"
33#include "collationiterator.h"
34#include "normalizer2impl.h"
35#include "utrie2.h"
36#include "uvectr32.h"
37#include "uvectr64.h"
38#include "uvector.h"
39
40U_NAMESPACE_BEGIN /* expands to: namespace icu_77 { */
41
42CollationDataBuilder::CEModifier::~CEModifier() {}
43
44/**
45 * Build-time context and CE32 for a code point.
46 * If a code point has contextual mappings, then the default (no-context) mapping
47 * and all conditional mappings are stored in a singly-linked list
48 * of ConditionalCE32, sorted by context strings.
49 *
50 * Context strings sort by prefix length, then by prefix, then by contraction suffix.
51 * Context strings must be unique and in ascending order.
52 */
53struct ConditionalCE32 : public UMemory {
54 ConditionalCE32()
55 : context(),
56 ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
57 next(-1) {}
58 ConditionalCE32(const UnicodeString &ct, uint32_t ce)
59 : context(ct),
60 ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
61 next(-1) {}
62
63 inline UBool hasContext() const { return context.length() > 1; }
64 inline int32_t prefixLength() const { return context.charAt(0); }
65
66 /**
67 * "\0" for the first entry for any code point, with its default CE32.
68 *
69 * Otherwise one unit with the length of the prefix string,
70 * then the prefix string, then the contraction suffix.
71 */
72 UnicodeString context;
73 /**
74 * CE32 for the code point and its context.
75 * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
76 */
77 uint32_t ce32;
78 /**
79 * Default CE32 for all contexts with this same prefix.
80 * Initially NO_CE32. Set only while building runtime data structures,
81 * and only on one of the nodes of a sub-list with the same prefix.
82 */
83 uint32_t defaultCE32;
84 /**
85 * CE32 for the built contexts.
86 * When fetching CEs from the builder, the contexts are built into their runtime form
87 * so that the normal collation implementation can process them.
88 * The result is cached in the list head. It is reset when the contexts are modified.
89 * All of these builtCE32 are invalidated by clearContexts(),
90 * via incrementing the contextsEra.
91 */
92 uint32_t builtCE32;
93 /**
94 * The "era" of building intermediate contexts when the above builtCE32 was set.
95 * When the array of cached, temporary contexts overflows, then clearContexts()
96 * removes them all and invalidates the builtCE32 that used to point to built tries.
97 */
98 int32_t era = -1;
99 /**
100 * Index of the next ConditionalCE32.
101 * Negative for the end of the list.
102 */
103 int32_t next;
104 // Note: We could create a separate class for all of the contextual mappings for
105 // a code point, with the builtCE32, the era, and a list of the actual mappings.
106 // The class that represents one mapping would then not need to
107 // store those fields in each element.
108};
109
110U_CDECL_BEGIN /* expands to: extern "C" { */
111
112void U_CALLCONV
113uprv_deleteConditionalCE32 /* expands to: uprv_deleteConditionalCE32_77 */ (void *obj) {
114 delete static_cast<ConditionalCE32 *>(obj);
115}
116
117U_CDECL_END /* expands to: } */
118
119/**
120 * Build-time collation element and character iterator.
121 * Uses the runtime CollationIterator for fetching CEs for a string
122 * but reads from the builder's unfinished data structures.
123 * In particular, this class reads from the unfinished trie
124 * and has to avoid CollationIterator::nextCE() and redirect other
125 * calls to data->getCE32() and data->getCE32FromSupplementary().
126 *
127 * We do this so that we need not implement the collation algorithm
128 * again for the builder and make it behave exactly like the runtime code.
129 * That would be more difficult to test and maintain than this indirection.
130 *
131 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
132 * so the data accesses from those code paths need not be modified.
133 *
134 * This class iterates directly over whole code points
135 * so that the CollationIterator does not need the finished trie
136 * for handling the LEAD_SURROGATE_TAG.
137 */
138class DataBuilderCollationIterator : public CollationIterator {
139public:
140 DataBuilderCollationIterator(CollationDataBuilder &b);
141
142 virtual ~DataBuilderCollationIterator();
143
144 int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength);
145
146 virtual void resetToOffset(int32_t newOffset) override;
147 virtual int32_t getOffset() const override;
148
149 virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
150 virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
151
152protected:
153 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
154 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
155
156 virtual uint32_t getDataCE32(UChar32 c) const override;
157 virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) override;
158
159 CollationDataBuilder &builder;
160 CollationData builderData;
161 uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
162 const UnicodeString *s;
163 int32_t pos;
164};
165
166DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b)
167 : CollationIterator(&builderData, /*numeric=*/ false),
[analyzer path note 5] Calling constructor for 'CollationIterator'
168 builder(b), builderData(b.nfcImpl),
169 s(nullptr), pos(0) {
170 builderData.base = builder.base;
171 // Set all of the jamoCE32s[] to indirection CE32s.
172 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types.
173 UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j);
174 jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) |
175 CollationDataBuilder::IS_BUILDER_JAMO_CE32;
176 }
177 builderData.jamoCE32s = jamoCE32s;
178}
179
180DataBuilderCollationIterator::~DataBuilderCollationIterator() {}
181
182int32_t
183DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start,
184 int64_t ces[], int32_t cesLength) {
185 // Set the pointers each time, in case they changed due to reallocation.
186 builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer());
187 builderData.ces = builder.ce64s.getBuffer();
188 builderData.contexts = builder.contexts.getBuffer();
189 // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
190 reset();
191 s = &str;
192 pos = start;
193 UErrorCode errorCode = U_ZERO_ERROR;
194 while(U_SUCCESS(errorCode) && pos < s->length()) {
195 // No need to keep all CEs in the iterator buffer.
196 clearCEs();
197 UChar32 c = s->char32At(pos);
198 pos += U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
199 uint32_t ce32 = utrie2_get32utrie2_get32_77(builder.trie, c);
200 const CollationData *d;
201 if(ce32 == Collation::FALLBACK_CE32) {
202 d = builder.base;
203 ce32 = builder.base->getCE32(c);
204 } else {
205 d = &builderData;
206 }
207 appendCEsFromCE32(d, c, ce32, /*forward=*/ true, errorCode);
208 U_ASSERT(U_SUCCESS(errorCode)) /* expands to: (static_cast <bool> (U_SUCCESS(errorCode)) ? void (0) : __assert_fail ("U_SUCCESS(errorCode)", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)) */;
209 for(int32_t i = 0; i < getCEsLength(); ++i) {
210 int64_t ce = getCE(i);
211 if(ce != 0) {
212 if(cesLength < Collation::MAX_EXPANSION_LENGTH) {
213 ces[cesLength] = ce;
214 }
215 ++cesLength;
216 }
217 }
218 }
219 return cesLength;
220}
221
222void
223DataBuilderCollationIterator::resetToOffset(int32_t newOffset) {
224 reset();
225 pos = newOffset;
226}
227
228int32_t
229DataBuilderCollationIterator::getOffset() const {
230 return pos;
231}
232
233UChar32
234DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
235 if(pos == s->length()) {
236 return U_SENTINEL(-1);
237 }
238 UChar32 c = s->char32At(pos);
239 pos += U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
240 return c;
241}
242
243UChar32
244DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
245 if(pos == 0) {
246 return U_SENTINEL(-1);
247 }
248 UChar32 c = s->char32At(pos - 1);
249 pos -= U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
250 return c;
251}
252
253void
254DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
255 pos = s->moveIndex32(pos, num);
256}
257
258void
259DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
260 pos = s->moveIndex32(pos, -num);
261}
262
263uint32_t
264DataBuilderCollationIterator::getDataCE32(UChar32 c) const {
265 return utrie2_get32utrie2_get32_77(builder.trie, c);
266}
267
268uint32_t
269DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) {
270 if (U_FAILURE(errorCode)) { return 0; }
271 U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)) /* expands to: (static_cast <bool> (Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)) ? void (0) : __assert_fail ("Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)) */;
272 if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) {
273 UChar32 jamo = Collation::indexFromCE32(ce32);
274 return utrie2_get32utrie2_get32_77(builder.trie, jamo);
275 } else {
276 ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32);
277 if (cond == nullptr) {
278 errorCode = U_INTERNAL_PROGRAM_ERROR;
279 // TODO: ICU-21531 figure out why this happens.
280 return 0;
281 }
282 if(cond->builtCE32 == Collation::NO_CE32 || cond->era != builder.contextsEra) {
283 // Build the context-sensitive mappings into their runtime form and cache the result.
284 cond->builtCE32 = builder.buildContext(cond, errorCode);
285 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
286 errorCode = U_ZERO_ERROR;
287 builder.clearContexts();
288 cond->builtCE32 = builder.buildContext(cond, errorCode);
289 }
290 cond->era = builder.contextsEra;
291 builderData.contexts = builder.contexts.getBuffer();
292 }
293 return cond->builtCE32;
294 }
295}
296
297// ------------------------------------------------------------------------- ***
298
299CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
300 : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
301 base(nullptr), baseSettings(nullptr),
302 trie(nullptr),
303 ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode),
304 modified(false),
305 icu4xMode(icu4xMode),
306 fastLatinEnabled(false), fastLatinBuilder(nullptr),
307 collIter(nullptr) {
308 // Reserve the first CE32 for U+0000.
309 if (!icu4xMode) {
310 ce32s.addElement(0, errorCode);
311 }
312 conditionalCE32s.setDeleter(uprv_deleteConditionalCE32uprv_deleteConditionalCE32_77);
313}
314
315CollationDataBuilder::~CollationDataBuilder() {
316 utrie2_closeutrie2_close_77(trie);
317 delete fastLatinBuilder;
318 delete collIter;
319}
320
321void
322CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) {
323 if(U_FAILURE(errorCode)) { return; }
324 if(trie != nullptr) {
325 errorCode = U_INVALID_STATE_ERROR;
326 return;
327 }
328 if(b == nullptr) {
329 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
330 return;
331 }
332 base = b;
333
334 // For a tailoring, the default is to fall back to the base.
335 // For ICU4X, use the same value for fallback as for the default
336 // to avoid having to have different blocks for the two.
337 trie = utrie2_openutrie2_open_77(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode);
338
339 if (!icu4xMode) {
340 // Set the Latin-1 letters block so that it is allocated first in the data array,
341 // to try to improve locality of reference when sorting Latin-1 text.
342 // Do not use utrie2_setRange32() since that will not actually allocate blocks
343 // that are filled with the default value.
344 // ASCII (0..7F) is already preallocated anyway.
345 for(UChar32 c = 0xc0; c <= 0xff; ++c) {
346 utrie2_set32utrie2_set32_77(trie, c, Collation::FALLBACK_CE32, &errorCode);
347 }
348
349 // Hangul syllables are not tailorable (except via tailoring Jamos).
350 // Always set the Hangul tag to help performance.
351 // Do this here, rather than in buildMappings(),
352 // so that we see the HANGUL_TAG in various assertions.
353 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
354 utrie2_setRange32utrie2_setRange32_77(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
355
356 // Copy the set contents but don't copy/clone the set as a whole because
357 // that would copy the isFrozen state too.
358 unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
359 }
360
361 if(U_FAILURE(errorCode)) { return; }
362}
363
364UBool
365CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end,
366 uint32_t primary, int32_t step,
367 UErrorCode &errorCode) {
368 if(U_FAILURE(errorCode)) { return false; }
369 U_ASSERT(start <= end) /* expands to: (static_cast <bool> (start <= end) ? void (0) : __assert_fail ("start <= end", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)) */;
370 // TODO: Do we need to check what values are currently set for start..end?
371 // An offset range is worth it only if we can achieve an overlap between
372 // adjacent UTrie2 blocks of 32 code points each.
373 // An offset CE is also a little more expensive to look up and compute
374 // than a simple CE.
375 // If the range spans at least three UTrie2 block boundaries (> 64 code points),
376 // then we take it.
377 // If the range spans one or two block boundaries and there are
378 // at least 4 code points on either side, then we take it.
379 // (We could additionally require a minimum range length of, say, 16.)
380 int32_t blockDelta = (end >> 5) - (start >> 5);
381 if(2 <= step && step <= 0x7f &&
382 (blockDelta >= 3 ||
383 (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) {
384 int64_t dataCE = (static_cast<int64_t>(primary) << 32) | (start << 8) | step;
385 if(isCompressiblePrimary(primary)) { dataCE |= 0x80; }
386 int32_t index = addCE(dataCE, errorCode);
387 if(U_FAILURE(errorCode)) { return 0; }
388 if(index > Collation::MAX_INDEX) {
389 errorCode = U_BUFFER_OVERFLOW_ERROR;
390 return 0;
391 }
392 uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index);
393 utrie2_setRange32utrie2_setRange32_77(trie, start, end, offsetCE32, true, &errorCode);
394 modified = true;
395 return true;
396 } else {
397 return false;
398 }
399}
400
401uint32_t
402CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
403 uint32_t primary, int32_t step,
404 UErrorCode &errorCode) {
405 if(U_FAILURE(errorCode)) { return 0; }
406 UBool isCompressible = isCompressiblePrimary(primary);
407 if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) {
408 return Collation::incThreeBytePrimaryByOffset(primary, isCompressible,
409 (end - start + 1) * step);
410 } else {
411 // Short range: Set individual CE32s.
412 for(;;) {
413 utrie2_set32utrie2_set32_77(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode);
414 ++start;
415 primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step);
416 if(start > end) { return primary; }
417 }
418 modified = true;
419 }
420}
421
422uint32_t
423CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const {
424 int32_t i = Collation::indexFromCE32(ce32);
425 int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i);
426 uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
427 return Collation::makeLongPrimaryCE32(p);
428}
429
430UBool
431CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const {
432 return base->isCompressibleLeadByte(b);
433}
434
435UBool
436CollationDataBuilder::isAssigned(UChar32 c) const {
437 return Collation::isAssignedCE32(utrie2_get32utrie2_get32_77(trie, c));
438}
439
440uint32_t
441CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const {
442 uint32_t ce32 = utrie2_get32utrie2_get32_77(trie, c);
443 if(Collation::isLongPrimaryCE32(ce32)) {
444 return Collation::primaryFromLongPrimaryCE32(ce32);
445 } else {
446 return 0;
447 }
448}
449
450int64_t
451CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
452 if(U_FAILURE(errorCode)) { return 0; }
453 // Keep parallel with CollationData::getSingleCE().
454 UBool fromBase = false;
455 uint32_t ce32 = utrie2_get32utrie2_get32_77(trie, c);
456 if(ce32 == Collation::FALLBACK_CE32) {
457 fromBase = true;
458 ce32 = base->getCE32(c);
459 }
460 while(Collation::isSpecialCE32(ce32)) {
461 switch(Collation::tagFromCE32(ce32)) {
462 case Collation::LATIN_EXPANSION_TAG:
463 case Collation::BUILDER_DATA_TAG:
464 case Collation::PREFIX_TAG:
465 case Collation::CONTRACTION_TAG:
466 case Collation::HANGUL_TAG:
467 case Collation::LEAD_SURROGATE_TAG:
468 errorCode = U_UNSUPPORTED_ERROR;
469 return 0;
470 case Collation::FALLBACK_TAG:
471 case Collation::RESERVED_TAG_3:
472 errorCode = U_INTERNAL_PROGRAM_ERROR;
473 return 0;
474 case Collation::LONG_PRIMARY_TAG:
475 return Collation::ceFromLongPrimaryCE32(ce32);
476 case Collation::LONG_SECONDARY_TAG:
477 return Collation::ceFromLongSecondaryCE32(ce32);
478 case Collation::EXPANSION32_TAG:
479 if(Collation::lengthFromCE32(ce32) == 1) {
480 int32_t i = Collation::indexFromCE32(ce32);
481 ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i);
482 break;
483 } else {
484 errorCode = U_UNSUPPORTED_ERROR;
485 return 0;
486 }
487 case Collation::EXPANSION_TAG: {
488 if(Collation::lengthFromCE32(ce32) == 1) {
489 int32_t i = Collation::indexFromCE32(ce32);
490 return fromBase ? base->ces[i] : ce64s.elementAti(i);
491 } else {
492 errorCode = U_UNSUPPORTED_ERROR;
493 return 0;
494 }
495 }
496 case Collation::DIGIT_TAG:
497 // Fetch the non-numeric-collation CE32 and continue.
498 ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32));
499 break;
500 case Collation::U0000_TAG:
501 U_ASSERT(c == 0) /* expands to: (static_cast <bool> (c == 0) ? void (0) : __assert_fail ("c == 0", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)) */;
502 // Fetch the normal ce32 for U+0000 and continue.
503 ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0);
504 break;
505 case Collation::OFFSET_TAG:
506 ce32 = getCE32FromOffsetCE32(fromBase, c, ce32);
507 break;
508 case Collation::IMPLICIT_TAG:
509 return Collation::unassignedCEFromCodePoint(c);
510 }
511 }
512 return Collation::ceFromSimpleCE32(ce32);
513}
514
515int32_t
516CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) {
517 int32_t length = ce64s.size();
518 for(int32_t i = 0; i < length; ++i) {
519 if(ce == ce64s.elementAti(i)) { return i; }
520 }
521 ce64s.addElement(ce, errorCode);
522 return length;
523}
524
525int32_t
526CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) {
527 int32_t length = ce32s.size();
528 for(int32_t i = 0; i < length; ++i) {
529 if (ce32 == static_cast<uint32_t>(ce32s.elementAti(i))) { return i; }
530 }
531 ce32s.addElement(static_cast<int32_t>(ce32), errorCode);
532 return length;
533}
534
535int32_t
536CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32,
537 UErrorCode &errorCode) {
538 if(U_FAILURE(errorCode)) { return -1; }
539 U_ASSERT(!context.isEmpty()) /* expands to: (static_cast <bool> (!context.isEmpty()) ? void (0) : __assert_fail ("!context.isEmpty()", __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)) */;
540 int32_t index = conditionalCE32s.size();
541 if(index > Collation::MAX_INDEX) {
542 errorCode = U_BUFFER_OVERFLOW_ERROR;
543 return -1;
544 }
545 LocalPointer<ConditionalCE32> cond(new ConditionalCE32(context, ce32), errorCode);
546 conditionalCE32s.adoptElement(cond.orphan(), errorCode);
547 if(U_FAILURE(errorCode)) {
548 return -1;
549 }
550 return index;
551}
552
553void
554CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s,
555 const int64_t ces[], int32_t cesLength,
556 UErrorCode &errorCode) {
557 uint32_t ce32 = encodeCEs(ces, cesLength, errorCode);
558 addCE32(prefix, s, ce32, errorCode);
559}
560
561void
562CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s,
563 uint32_t ce32, UErrorCode &errorCode) {
564 if(U_FAILURE(errorCode)) { return; }
565 if(s.isEmpty()) {
566 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
567 return;
568 }
569 if(trie == nullptr || utrie2_isFrozenutrie2_isFrozen_77(trie)) {
570 errorCode = U_INVALID_STATE_ERROR;
571 return;
572 }
573 UChar32 c = s.char32At(0);
574 int32_t cLength = U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
575 uint32_t oldCE32 = utrie2_get32utrie2_get32_77(trie, c);
576 UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
577
578 if (icu4xMode) {
579 if (base && c >= 0x1100 && c < 0x1200) {
580 // Omit jamo tailorings.
581 // TODO(https://github.com/unicode-org/icu4x/issues/1941).
582 }
583 const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode);
584 UnicodeString sInNfd;
585 nfdNormalizer->normalize(s, sInNfd, errorCode);
586 if (s != sInNfd) {
587 // s is not in NFD, so it cannot match in ICU4X, since ICU4X only
588 // does NFD lookups.
589
590 // As of Unicode 16 alpha, the cases that come here are:
591 //
592 // 1. The second character is a special decomposing Tibetan vowel
593 // sign. These are OK to ignore in the precomposed form, since
594 // the decomposed form is added also.
595 // 2. Likewise for KIRAT RAI VOWEL SIGN AA followed by KIRAT RAI VOWEL SIGN AI
596 // and other such cases.
597 // For details see the normalization section of
598 // https://www.unicode.org/review/pri497/pri497-background.html
599 // 3. U+FDD1 followed by U+AC00 is a marker for the alphabetical
600 // index feature of ICU4C, which at this time does not have
601 // a counterpart in ICU4X.
602 return;
603 }
604
605 if (!prefix.isEmpty()) {
606 UnicodeString prefixInNfd;
607 nfdNormalizer->normalize(prefix, prefixInNfd, errorCode);
608 if (prefix != prefixInNfd) {
609 errorCode = U_UNSUPPORTED_ERROR;
610 return;
611 }
612
613 int32_t count = prefix.countChar32();
614 if (count > 2) {
615 // Prefix too long for ICU4X.
616 errorCode = U_UNSUPPORTED_ERROR;
617 return;
618 }
619 UChar32 utf32[4];
620 int32_t len = prefix.toUTF32(utf32, 4, errorCode);
621 if (len != count) {
622 errorCode = U_INVALID_STATE_ERROR;
623 return;
624 }
625 UChar32 c = utf32[0];
626 if (u_getCombiningClassu_getCombiningClass_77(c)) {
627 // Prefix must start with a starter for ICU4X.
628 errorCode = U_UNSUPPORTED_ERROR;
629 return;
630 }
631 // XXX: Korean searchjl has jamo in prefix, so commenting out this
632 // check for now. ICU4X currently ignores non-root jamo tables anyway.
633 // searchjl was added in
634 // https://unicode-org.atlassian.net/browse/CLDR-3560
635 // Contractions were changed to prefixes in
636 // https://unicode-org.atlassian.net/browse/CLDR-6546
637 //
638 // if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) {
639 // errorCode = U_UNSUPPORTED_ERROR;
640 // return;
641 // }
642 if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) {
643 // Second character in prefix, if present, must be a kana voicing mark for ICU4X.
644 errorCode = U_UNSUPPORTED_ERROR;
645 return;
646 }
647 }
648
649 if (s.length() > cLength) {
650 // Check that there's no modern Hangul in contractions.
651 for (int32_t i = 0; i < s.length(); ++i) {
652 char16_t c = s.charAt(i);
653 if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) {
654 errorCode = U_UNSUPPORTED_ERROR;
655 return;
656 }
657 }
658 }
659 }
660
661 if(oldCE32 == Collation::FALLBACK_CE32) {
662 // First tailoring for c.
663 // If c has contextual base mappings or if we add a contextual mapping,
664 // then copy the base mappings.
665 // Otherwise we just override the base mapping.
666 uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c));
667 if(hasContext || Collation::ce32HasContext(baseCE32)) {
668 oldCE32 = copyFromBaseCE32(c, baseCE32, true, errorCode);
669 utrie2_set32utrie2_set32_77(trie, c, oldCE32, &errorCode);
670 if(U_FAILURE(errorCode)) { return; }
671 }
672 }
673 if(!hasContext) {
674 // No prefix, no contraction.
675 if(!isBuilderContextCE32(oldCE32)) {
676 utrie2_set32utrie2_set32_77(trie, c, ce32, &errorCode);
677 } else {
678 ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32);
679 cond->builtCE32 = Collation::NO_CE32;
680 cond->ce32 = ce32;
681 }
682 } else {
683 ConditionalCE32 *cond;
684 if(!isBuilderContextCE32(oldCE32)) {
685 // Replace the simple oldCE32 with a builder context CE32
686 // pointing to a new ConditionalCE32 list head.
687 int32_t index = addConditionalCE32(UnicodeString(static_cast<char16_t>(0)), oldCE32, errorCode);
688 if(U_FAILURE(errorCode)) { return; }
689 uint32_t contextCE32 = makeBuilderContextCE32(index);
690 utrie2_set32utrie2_set32_77(trie, c, contextCE32, &errorCode);
691 contextChars.add(c);
692 cond = getConditionalCE32(index);
693 } else {
694 cond = getConditionalCE32ForCE32(oldCE32);
695 cond->builtCE32 = Collation::NO_CE32;
696 }
697 UnicodeString suffix(s, cLength);
698 UnicodeString context(static_cast<char16_t>(prefix.length()));
699 context.append(prefix).append(suffix);
700 unsafeBackwardSet.addAll(suffix);
701 for(;;) {
702 // invariant: context > cond->context
703 int32_t next = cond->next;
704 if(next < 0) {
705 // Append a new ConditionalCE32 after cond.
706 int32_t index = addConditionalCE32(context, ce32, errorCode);
707 if(U_FAILURE(errorCode)) { return; }
708 cond->next = index;
709 break;
710 }
711 ConditionalCE32 *nextCond = getConditionalCE32(next);
712 int8_t cmp = context.compare(nextCond->context);
713 if(cmp < 0) {
714 // Insert a new ConditionalCE32 between cond and nextCond.
715 int32_t index = addConditionalCE32(context, ce32, errorCode);
716 if(U_FAILURE(errorCode)) { return; }
717 cond->next = index;
718 getConditionalCE32(index)->next = next;
719 break;
720 } else if(cmp == 0) {
721 // Same context as before, overwrite its ce32.
722 nextCond->ce32 = ce32;
723 break;
724 }
725 cond = nextCond;
726 }
727 }
728 modified = true;
729}
730
731uint32_t
732CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) {
733 uint32_t p = static_cast<uint32_t>(ce >> 32);
734 uint32_t lower32 = static_cast<uint32_t>(ce);
735 uint32_t t = static_cast<uint32_t>(ce & 0xffff);
736 U_ASSERT((t & 0xc000) != 0xc000)(static_cast <bool> ((t & 0xc000) != 0xc000) ? void
(0) : __assert_fail ("(t & 0xc000) != 0xc000", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
; // Impossible case bits 11 mark special CE32s.
737 if((ce & INT64_C(0xffff00ff00ff)0xffff00ff00ffL) == 0) {
738 // normal form ppppsstt
739 return p | (lower32 >> 16) | (t >> 8);
740 } else if((ce & INT64_C(0xffffffffff)0xffffffffffL) == Collation::COMMON_SEC_AND_TER_CE) {
741 // long-primary form ppppppC1
742 return Collation::makeLongPrimaryCE32(p);
743 } else if(p == 0 && (t & 0xff) == 0) {
744 // long-secondary form ssssttC2
745 return Collation::makeLongSecondaryCE32(lower32);
746 }
747 return Collation::NO_CE32;
748}
749
750uint32_t
751CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) {
752 // Try to encode one CE as one CE32.
753 uint32_t ce32 = encodeOneCEAsCE32(ce);
754 if(ce32 != Collation::NO_CE32) { return ce32; }
755 int32_t index = addCE(ce, errorCode);
756 if(U_FAILURE(errorCode)) { return 0; }
757 if(index > Collation::MAX_INDEX) {
758 errorCode = U_BUFFER_OVERFLOW_ERROR;
759 return 0;
760 }
761 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1);
762}
763
764uint32_t
765CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
766 UErrorCode &errorCode) {
767 if(U_FAILURE(errorCode)) { return 0; }
768 if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) {
769 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
770 return 0;
771 }
772 if(trie == nullptr || utrie2_isFrozenutrie2_isFrozen_77(trie)) {
773 errorCode = U_INVALID_STATE_ERROR;
774 return 0;
775 }
776 if(cesLength == 0) {
777 // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
778 // Do this here so that callers need not do it.
779 return encodeOneCEAsCE32(0);
780 } else if(cesLength == 1) {
781 return encodeOneCE(ces[0], errorCode);
782 } else if(cesLength == 2 && !icu4xMode) {
783 // Try to encode two CEs as one CE32.
784 // Turn this off for ICU4X, because without the canonical closure
785 // these are so rare that it doesn't make sense to spend a branch
786 // on checking this tag when using the data.
787 int64_t ce0 = ces[0];
788 int64_t ce1 = ces[1];
789 uint32_t p0 = static_cast<uint32_t>(ce0 >> 32);
790 if((ce0 & INT64_C(0xffffffffff00ff)0xffffffffff00ffL) == Collation::COMMON_SECONDARY_CE &&
791 (ce1 & INT64_C(0xffffffff00ffffff)0xffffffff00ffffffL) == Collation::COMMON_TERTIARY_CE &&
792 p0 != 0) {
793 // Latin mini expansion
794 return
795 p0 |
796 ((static_cast<uint32_t>(ce0) & 0xff00u) << 8) |
797 static_cast<uint32_t>(ce1 >> 16) |
798 Collation::SPECIAL_CE32_LOW_BYTE |
799 Collation::LATIN_EXPANSION_TAG;
800 }
801 }
802 // Try to encode two or more CEs as CE32s.
803 int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH];
804 for(int32_t i = 0;; ++i) {
805 if(i == cesLength) {
806 return encodeExpansion32(newCE32s, cesLength, errorCode);
807 }
808 uint32_t ce32 = encodeOneCEAsCE32(ces[i]);
809 if(ce32 == Collation::NO_CE32) { break; }
810 newCE32s[i] = static_cast<int32_t>(ce32);
811 }
812 return encodeExpansion(ces, cesLength, errorCode);
813}
814
815uint32_t
816CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) {
817 if(U_FAILURE(errorCode)) { return 0; }
818 // See if this sequence of CEs has already been stored.
819 int64_t first = ces[0];
820 int32_t ce64sMax = ce64s.size() - length;
821 for(int32_t i = 0; i <= ce64sMax; ++i) {
822 if(first == ce64s.elementAti(i)) {
823 if(i > Collation::MAX_INDEX) {
824 errorCode = U_BUFFER_OVERFLOW_ERROR;
825 return 0;
826 }
827 for(int32_t j = 1;; ++j) {
828 if(j == length) {
829 return Collation::makeCE32FromTagIndexAndLength(
830 Collation::EXPANSION_TAG, i, length);
831 }
832 if(ce64s.elementAti(i + j) != ces[j]) { break; }
833 }
834 }
835 }
836 // Store the new sequence.
837 int32_t i = ce64s.size();
838 if(i > Collation::MAX_INDEX) {
839 errorCode = U_BUFFER_OVERFLOW_ERROR;
840 return 0;
841 }
842 for(int32_t j = 0; j < length; ++j) {
843 ce64s.addElement(ces[j], errorCode);
844 }
845 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length);
846}
847
848uint32_t
849CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length,
850 UErrorCode &errorCode) {
851 if(U_FAILURE(errorCode)) { return 0; }
852 // See if this sequence of CE32s has already been stored.
853 int32_t first = newCE32s[0];
854 int32_t ce32sMax = ce32s.size() - length;
855 for(int32_t i = 0; i <= ce32sMax; ++i) {
856 if(first == ce32s.elementAti(i)) {
857 if(i > Collation::MAX_INDEX) {
858 errorCode = U_BUFFER_OVERFLOW_ERROR;
859 return 0;
860 }
861 for(int32_t j = 1;; ++j) {
862 if(j == length) {
863 return Collation::makeCE32FromTagIndexAndLength(
864 Collation::EXPANSION32_TAG, i, length);
865 }
866 if(ce32s.elementAti(i + j) != newCE32s[j]) { break; }
867 }
868 }
869 }
870 // Store the new sequence.
871 int32_t i = ce32s.size();
872 if(i > Collation::MAX_INDEX) {
873 errorCode = U_BUFFER_OVERFLOW_ERROR;
874 return 0;
875 }
876 for(int32_t j = 0; j < length; ++j) {
877 ce32s.addElement(newCE32s[j], errorCode);
878 }
879 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length);
880}
881
882uint32_t
883CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext,
884 UErrorCode &errorCode) {
885 if(U_FAILURE(errorCode)) { return 0; }
886 if(!Collation::isSpecialCE32(ce32)) { return ce32; }
887 switch(Collation::tagFromCE32(ce32)) {
888 case Collation::LONG_PRIMARY_TAG:
889 case Collation::LONG_SECONDARY_TAG:
890 case Collation::LATIN_EXPANSION_TAG:
891 // copy as is
892 break;
893 case Collation::EXPANSION32_TAG: {
894 const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32);
895 int32_t length = Collation::lengthFromCE32(ce32);
896 ce32 = encodeExpansion32(
897 reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode);
898 break;
899 }
900 case Collation::EXPANSION_TAG: {
901 const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32);
902 int32_t length = Collation::lengthFromCE32(ce32);
903 ce32 = encodeExpansion(baseCEs, length, errorCode);
904 break;
905 }
906 case Collation::PREFIX_TAG: {
907 // Flatten prefixes and nested suffixes (contractions)
908 // into a linear list of ConditionalCE32.
909 const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
910 ce32 = CollationData::readCE32(p); // Default if no prefix match.
911 if(!withContext) {
912 return copyFromBaseCE32(c, ce32, false, errorCode);
913 }
914 ConditionalCE32 head;
915 UnicodeString context(static_cast<char16_t>(0));
916 int32_t index;
917 if(Collation::isContractionCE32(ce32)) {
918 index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
919 } else {
920 ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
921 head.next = index = addConditionalCE32(context, ce32, errorCode);
922 }
923 if(U_FAILURE(errorCode)) { return 0; }
924 ConditionalCE32 *cond = getConditionalCE32(index); // the last ConditionalCE32 so far
925 UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
926 while(prefixes.next(errorCode)) {
927 context = prefixes.getString();
928 context.reverse();
929 context.insert(0, static_cast<char16_t>(context.length()));
930 ce32 = static_cast<uint32_t>(prefixes.getValue());
931 if(Collation::isContractionCE32(ce32)) {
932 index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode);
933 } else {
934 ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
935 cond->next = index = addConditionalCE32(context, ce32, errorCode);
936 }
937 if(U_FAILURE(errorCode)) { return 0; }
938 cond = getConditionalCE32(index);
939 }
940 ce32 = makeBuilderContextCE32(head.next);
941 contextChars.add(c);
942 break;
943 }
944 case Collation::CONTRACTION_TAG: {
945 if(!withContext) {
946 const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
947 ce32 = CollationData::readCE32(p); // Default if no suffix match.
948 return copyFromBaseCE32(c, ce32, false, errorCode);
949 }
950 ConditionalCE32 head;
951 UnicodeString context(static_cast<char16_t>(0));
952 copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
953 ce32 = makeBuilderContextCE32(head.next);
954 contextChars.add(c);
955 break;
956 }
957 case Collation::HANGUL_TAG:
958 errorCode = U_UNSUPPORTED_ERROR; // We forbid tailoring of Hangul syllables.
959 break;
960 case Collation::OFFSET_TAG:
961 ce32 = getCE32FromOffsetCE32(true, c, ce32);
962 break;
963 case Collation::IMPLICIT_TAG:
964 ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode);
965 break;
966 default:
967 UPRV_UNREACHABLE_EXITabort(); // require ce32 == base->getFinalCE32(ce32)
968 }
969 return ce32;
970}
971
972int32_t
973CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
974 ConditionalCE32 *cond, UErrorCode &errorCode) {
975 if(U_FAILURE(errorCode)) { return 0; }
976 const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
977 int32_t index;
978 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
979 // No match on the single code point.
980 // We are underneath a prefix, and the default mapping is just
981 // a fallback to the mappings for a shorter prefix.
982 U_ASSERT(context.length() > 1)(static_cast <bool> (context.length() > 1) ? void (0
) : __assert_fail ("context.length() > 1", __builtin_FILE (
), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
983 index = -1;
984 } else {
985 ce32 = CollationData::readCE32(p); // Default if no suffix match.
986 U_ASSERT(!Collation::isContractionCE32(ce32))(static_cast <bool> (!Collation::isContractionCE32(ce32
)) ? void (0) : __assert_fail ("!Collation::isContractionCE32(ce32)"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
987 ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
988 cond->next = index = addConditionalCE32(context, ce32, errorCode);
989 if(U_FAILURE(errorCode)) { return 0; }
990 cond = getConditionalCE32(index);
991 }
992
993 int32_t suffixStart = context.length();
994 UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
995 while(suffixes.next(errorCode)) {
996 context.append(suffixes.getString());
997 ce32 = copyFromBaseCE32(c, static_cast<uint32_t>(suffixes.getValue()), true, errorCode);
998 cond->next = index = addConditionalCE32(context, ce32, errorCode);
999 if(U_FAILURE(errorCode)) { return 0; }
1000 // No need to update the unsafeBackwardSet because the tailoring set
1001 // is already a copy of the base set.
1002 cond = getConditionalCE32(index);
1003 context.truncate(suffixStart);
1004 }
1005 U_ASSERT(index >= 0)(static_cast <bool> (index >= 0) ? void (0) : __assert_fail
("index >= 0", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
1006 return index;
1007}
1008
1009class CopyHelper {
1010public:
1011 CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d,
1012 const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode)
1013 : src(s), dest(d), modifier(m),
1014 errorCode(initialErrorCode) {}
1015
1016 UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) {
1017 ce32 = copyCE32(ce32);
1018 utrie2_setRange32utrie2_setRange32_77(dest.trie, start, end, ce32, true, &errorCode);
1019 if(CollationDataBuilder::isBuilderContextCE32(ce32)) {
1020 dest.contextChars.add(start, end);
1021 }
1022 return U_SUCCESS(errorCode);
1023 }
1024
1025 uint32_t copyCE32(uint32_t ce32) {
1026 if(!Collation::isSpecialCE32(ce32)) {
1027 int64_t ce = modifier.modifyCE32(ce32);
1028 if(ce != Collation::NO_CE) {
1029 ce32 = dest.encodeOneCE(ce, errorCode);
1030 }
1031 } else {
1032 int32_t tag = Collation::tagFromCE32(ce32);
1033 if(tag == Collation::EXPANSION32_TAG) {
1034 const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer());
1035 srcCE32s += Collation::indexFromCE32(ce32);
1036 int32_t length = Collation::lengthFromCE32(ce32);
1037 // Inspect the source CE32s. Just copy them if none are modified.
1038 // Otherwise copy to modifiedCEs, with modifications.
1039 UBool isModified = false;
1040 for(int32_t i = 0; i < length; ++i) {
1041 ce32 = srcCE32s[i];
1042 int64_t ce;
1043 if(Collation::isSpecialCE32(ce32) ||
1044 (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) {
1045 if(isModified) {
1046 modifiedCEs[i] = Collation::ceFromCE32(ce32);
1047 }
1048 } else {
1049 if(!isModified) {
1050 for(int32_t j = 0; j < i; ++j) {
1051 modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]);
1052 }
1053 isModified = true;
1054 }
1055 modifiedCEs[i] = ce;
1056 }
1057 }
1058 if(isModified) {
1059 ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
1060 } else {
1061 ce32 = dest.encodeExpansion32(
1062 reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode);
1063 }
1064 } else if(tag == Collation::EXPANSION_TAG) {
1065 const int64_t *srcCEs = src.ce64s.getBuffer();
1066 srcCEs += Collation::indexFromCE32(ce32);
1067 int32_t length = Collation::lengthFromCE32(ce32);
1068 // Inspect the source CEs. Just copy them if none are modified.
1069 // Otherwise copy to modifiedCEs, with modifications.
1070 UBool isModified = false;
1071 for(int32_t i = 0; i < length; ++i) {
1072 int64_t srcCE = srcCEs[i];
1073 int64_t ce = modifier.modifyCE(srcCE);
1074 if(ce == Collation::NO_CE) {
1075 if(isModified) {
1076 modifiedCEs[i] = srcCE;
1077 }
1078 } else {
1079 if(!isModified) {
1080 for(int32_t j = 0; j < i; ++j) {
1081 modifiedCEs[j] = srcCEs[j];
1082 }
1083 isModified = true;
1084 }
1085 modifiedCEs[i] = ce;
1086 }
1087 }
1088 if(isModified) {
1089 ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
1090 } else {
1091 ce32 = dest.encodeExpansion(srcCEs, length, errorCode);
1092 }
1093 } else if(tag == Collation::BUILDER_DATA_TAG) {
1094 // Copy the list of ConditionalCE32.
1095 ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32);
1096 U_ASSERT(!cond->hasContext())(static_cast <bool> (!cond->hasContext()) ? void (0)
: __assert_fail ("!cond->hasContext()", __builtin_FILE ()
, __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
1097 int32_t destIndex = dest.addConditionalCE32(
1098 cond->context, copyCE32(cond->ce32), errorCode);
1099 ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex);
1100 while(cond->next >= 0) {
1101 cond = src.getConditionalCE32(cond->next);
1102 ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex);
1103 destIndex = dest.addConditionalCE32(
1104 cond->context, copyCE32(cond->ce32), errorCode);
1105 int32_t suffixStart = cond->prefixLength() + 1;
1106 dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart));
1107 prevDestCond->next = destIndex;
1108 }
1109 } else {
1110 // Just copy long CEs and Latin mini expansions (and other expected values) as is,
1111 // assuming that the modifier would not modify them.
1112 U_ASSERT(tag == Collation::LONG_PRIMARY_TAG ||(static_cast <bool> (tag == Collation::LONG_PRIMARY_TAG
|| tag == Collation::LONG_SECONDARY_TAG || tag == Collation::
LATIN_EXPANSION_TAG || tag == Collation::HANGUL_TAG) ? void (
0) : __assert_fail ("tag == Collation::LONG_PRIMARY_TAG || tag == Collation::LONG_SECONDARY_TAG || tag == Collation::LATIN_EXPANSION_TAG || tag == Collation::HANGUL_TAG"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
1113 tag == Collation::LONG_SECONDARY_TAG ||(static_cast <bool> (tag == Collation::LONG_PRIMARY_TAG
|| tag == Collation::LONG_SECONDARY_TAG || tag == Collation::
LATIN_EXPANSION_TAG || tag == Collation::HANGUL_TAG) ? void (
0) : __assert_fail ("tag == Collation::LONG_PRIMARY_TAG || tag == Collation::LONG_SECONDARY_TAG || tag == Collation::LATIN_EXPANSION_TAG || tag == Collation::HANGUL_TAG"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
1114 tag == Collation::LATIN_EXPANSION_TAG ||(static_cast <bool> (tag == Collation::LONG_PRIMARY_TAG
|| tag == Collation::LONG_SECONDARY_TAG || tag == Collation::
LATIN_EXPANSION_TAG || tag == Collation::HANGUL_TAG) ? void (
0) : __assert_fail ("tag == Collation::LONG_PRIMARY_TAG || tag == Collation::LONG_SECONDARY_TAG || tag == Collation::LATIN_EXPANSION_TAG || tag == Collation::HANGUL_TAG"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
1115 tag == Collation::HANGUL_TAG)(static_cast <bool> (tag == Collation::LONG_PRIMARY_TAG
|| tag == Collation::LONG_SECONDARY_TAG || tag == Collation::
LATIN_EXPANSION_TAG || tag == Collation::HANGUL_TAG) ? void (
0) : __assert_fail ("tag == Collation::LONG_PRIMARY_TAG || tag == Collation::LONG_SECONDARY_TAG || tag == Collation::LATIN_EXPANSION_TAG || tag == Collation::HANGUL_TAG"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
1116 }
1117 }
1118 return ce32;
1119 }
1120
1121 const CollationDataBuilder &src;
1122 CollationDataBuilder &dest;
1123 const CollationDataBuilder::CEModifier &modifier;
1124 int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH];
1125 UErrorCode errorCode;
1126};
1127
1128U_CDECL_BEGINextern "C" {
1129
1130static UBool U_CALLCONV
1131enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1132 return
1133 value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 ||
1134 ((CopyHelper *)context)->copyRangeCE32(start, end, value);
1135}
1136
1137U_CDECL_END}
1138
1139void
1140CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
1141 UErrorCode &errorCode) {
1142 if(U_FAILURE(errorCode)) { return; }
1143 if(trie == nullptr || utrie2_isFrozenutrie2_isFrozen_77(trie)) {
1144 errorCode = U_INVALID_STATE_ERROR;
1145 return;
1146 }
1147 CopyHelper helper(src, *this, modifier, errorCode);
1148 utrie2_enumutrie2_enum_77(src.trie, nullptr, enumRangeForCopy, &helper);
1149 errorCode = helper.errorCode;
1150 // Update the contextChars and the unsafeBackwardSet while copying,
1151 // in case a character had conditional mappings in the source builder
1152 // and they were removed later.
1153 modified |= src.modified;
1154}
1155
1156void
1157CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) {
1158 if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1159 UnicodeSetIterator iter(set);
1160 while(iter.next() && !iter.isString()) {
1161 UChar32 c = iter.getCodepoint();
1162 uint32_t ce32 = utrie2_get32utrie2_get32_77(trie, c);
1163 if(ce32 == Collation::FALLBACK_CE32) {
1164 ce32 = base->getFinalCE32(base->getCE32(c));
1165 ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
1166 utrie2_set32utrie2_set32_77(trie, c, ce32, &errorCode);
1167 }
1168 }
1169 modified = true;
1170}
1171
1172void
1173CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) {
1174 if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1175 UnicodeSetIterator iter(set);
1176 while(iter.next() && !iter.isString()) {
1177 UChar32 c = iter.getCodepoint();
1178 uint32_t ce32 = utrie2_get32utrie2_get32_77(trie, c);
1179 if(ce32 == Collation::FALLBACK_CE32) {
1180 ce32 = base->getFinalCE32(base->getCE32(c));
1181 if(Collation::ce32HasContext(ce32)) {
1182 ce32 = copyFromBaseCE32(c, ce32, false /* without context */, errorCode);
1183 utrie2_set32utrie2_set32_77(trie, c, ce32, &errorCode);
1184 }
1185 } else if(isBuilderContextCE32(ce32)) {
1186 ce32 = getConditionalCE32ForCE32(ce32)->ce32;
1187 // Simply abandon the list of ConditionalCE32.
1188 // The caller will copy this builder in the end,
1189 // eliminating unreachable data.
1190 utrie2_set32utrie2_set32_77(trie, c, ce32, &errorCode);
1191 contextChars.remove(c);
1192 }
1193 }
1194 modified = true;
1195}
1196
1197UBool
1198CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) {
1199 if(U_FAILURE(errorCode)) { return false; }
1200 UBool anyJamoAssigned = base == nullptr; // always set jamoCE32s in the base data
1201 UBool needToCopyFromBase = false;
1202 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types.
1203 UChar32 jamo = jamoCpFromIndex(j);
1204 UBool fromBase = false;
1205 uint32_t ce32 = utrie2_get32utrie2_get32_77(trie, jamo);
1206 anyJamoAssigned |= Collation::isAssignedCE32(ce32);
1207 // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
1208 // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
1209 if(ce32 == Collation::FALLBACK_CE32) {
1210 fromBase = true;
1211 ce32 = base->getCE32(jamo);
1212 }
1213 if(Collation::isSpecialCE32(ce32)) {
1214 switch(Collation::tagFromCE32(ce32)) {
1215 case Collation::LONG_PRIMARY_TAG:
1216 case Collation::LONG_SECONDARY_TAG:
1217 case Collation::LATIN_EXPANSION_TAG:
1218 // Copy the ce32 as-is.
1219 break;
1220 case Collation::EXPANSION32_TAG:
1221 case Collation::EXPANSION_TAG:
1222 case Collation::PREFIX_TAG:
1223 case Collation::CONTRACTION_TAG:
1224 if(fromBase) {
1225 // Defer copying until we know if anyJamoAssigned.
1226 ce32 = Collation::FALLBACK_CE32;
1227 needToCopyFromBase = true;
1228 }
1229 break;
1230 case Collation::IMPLICIT_TAG:
1231 // An unassigned Jamo should only occur in tests with incomplete bases.
1232 U_ASSERT(fromBase)(static_cast <bool> (fromBase) ? void (0) : __assert_fail
("fromBase", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
1233 ce32 = Collation::FALLBACK_CE32;
1234 needToCopyFromBase = true;
1235 break;
1236 case Collation::OFFSET_TAG:
1237 ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
1238 break;
1239 case Collation::FALLBACK_TAG:
1240 case Collation::RESERVED_TAG_3:
1241 case Collation::BUILDER_DATA_TAG:
1242 case Collation::DIGIT_TAG:
1243 case Collation::U0000_TAG:
1244 case Collation::HANGUL_TAG:
1245 case Collation::LEAD_SURROGATE_TAG:
1246 errorCode = U_INTERNAL_PROGRAM_ERROR;
1247 return false;
1248 }
1249 }
1250 jamoCE32s[j] = ce32;
1251 }
1252 if(anyJamoAssigned && needToCopyFromBase) {
1253 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {
1254 if(jamoCE32s[j] == Collation::FALLBACK_CE32) {
1255 UChar32 jamo = jamoCpFromIndex(j);
1256 jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo),
1257 /*withContext=*/ true, errorCode);
1258 }
1259 }
1260 }
1261 return anyJamoAssigned && U_SUCCESS(errorCode);
1262}
1263
1264void
1265CollationDataBuilder::setDigitTags(UErrorCode &errorCode) {
1266 UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]")icu::UnicodeString(true, u"[:Nd:]", -1), errorCode);
1267 if(U_FAILURE(errorCode)) { return; }
1268 UnicodeSetIterator iter(digits);
1269 while(iter.next()) {
1270 U_ASSERT(!iter.isString())(static_cast <bool> (!iter.isString()) ? void (0) : __assert_fail
("!iter.isString()", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
1271 UChar32 c = iter.getCodepoint();
1272 uint32_t ce32 = utrie2_get32utrie2_get32_77(trie, c);
1273 if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) {
1274 int32_t index = addCE32(ce32, errorCode);
1275 if(U_FAILURE(errorCode)) { return; }
1276 if(index > Collation::MAX_INDEX) {
1277 errorCode = U_BUFFER_OVERFLOW_ERROR;
1278 return;
1279 }
1280 ce32 = Collation::makeCE32FromTagIndexAndLength(
1281 Collation::DIGIT_TAG, index, u_charDigitValueu_charDigitValue_77(c));
1282 utrie2_set32utrie2_set32_77(trie, c, ce32, &errorCode);
1283 }
1284 }
1285}
1286
1287U_CDECL_BEGINextern "C" {
1288
1289static UBool U_CALLCONV
1290enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1291 int32_t *pValue = (int32_t *)context;
1292 if(value == Collation::UNASSIGNED_CE32) {
1293 value = Collation::LEAD_ALL_UNASSIGNED;
1294 } else if(value == Collation::FALLBACK_CE32) {
1295 value = Collation::LEAD_ALL_FALLBACK;
1296 } else {
1297 *pValue = Collation::LEAD_MIXED;
1298 return false;
1299 }
1300 if(*pValue < 0) {
1301 *pValue = (int32_t)value;
1302 } else if(*pValue != (int32_t)value) {
1303 *pValue = Collation::LEAD_MIXED;
1304 return false;
1305 }
1306 return true;
1307}
1308
1309U_CDECL_END}
1310
1311void
1312CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) {
1313 for(char16_t lead = 0xd800; lead < 0xdc00; ++lead) {
1314 int32_t value = -1;
1315 utrie2_enumForLeadSurrogateutrie2_enumForLeadSurrogate_77(trie, lead, nullptr, enumRangeLeadValue, &value);
1316 utrie2_set32ForLeadSurrogateCodeUnitutrie2_set32ForLeadSurrogateCodeUnit_77(
1317 trie, lead,
1318 Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | static_cast<uint32_t>(value),
1319 &errorCode);
1320 }
1321}
1322
1323void
1324CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
1325 buildMappings(data, errorCode);
1326 if(base != nullptr) {
1327 data.numericPrimary = base->numericPrimary;
1328 data.compressibleBytes = base->compressibleBytes;
1329 data.numScripts = base->numScripts;
1330 data.scriptsIndex = base->scriptsIndex;
1331 data.scriptStarts = base->scriptStarts;
1332 data.scriptStartsLength = base->scriptStartsLength;
1333 }
1334 buildFastLatinTable(data, errorCode);
1335}
1336
1337void
1338CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) {
1339 if(U_FAILURE(errorCode)) { return; }
1340 if(trie == nullptr || utrie2_isFrozenutrie2_isFrozen_77(trie)) {
1341 errorCode = U_INVALID_STATE_ERROR;
1342 return;
1343 }
1344
1345 buildContexts(errorCode);
1346
1347 uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
1348 int32_t jamoIndex = -1;
1349 if(getJamoCE32s(jamoCE32s, errorCode)) {
1350 jamoIndex = ce32s.size();
1351 for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1352 ce32s.addElement(static_cast<int32_t>(jamoCE32s[i]), errorCode);
1353 }
1354 // Small optimization: Use a bit in the Hangul ce32
1355 // to indicate that none of the Jamo CE32s are isSpecialCE32()
1356 // (as it should be in the root collator).
1357 // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
1358 // In order to still have good trie compression and keep this code simple,
1359 // we only set this flag if a whole block of 588 Hangul syllables starting with
1360 // a common leading consonant (Jamo L) has this property.
1361 UBool isAnyJamoVTSpecial = false;
1362 for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1363 if(Collation::isSpecialCE32(jamoCE32s[i])) {
1364 isAnyJamoVTSpecial = true;
1365 break;
1366 }
1367 }
1368 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
1369 UChar32 c = Hangul::HANGUL_BASE;
1370 for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L
1371 uint32_t ce32 = hangulCE32;
1372 if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) {
1373 ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO;
1374 }
1375 UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1376 utrie2_setRange32utrie2_setRange32_77(trie, c, limit - 1, ce32, true, &errorCode);
1377 c = limit;
1378 }
1379 } else {
1380 // Copy the Hangul CE32s from the base in blocks per Jamo L,
1381 // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
1382 for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) {
1383 uint32_t ce32 = base->getCE32(c);
1384 U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG))(static_cast <bool> (Collation::hasCE32Tag(ce32, Collation
::HANGUL_TAG)) ? void (0) : __assert_fail ("Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG)"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
1385 UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1386 utrie2_setRange32utrie2_setRange32_77(trie, c, limit - 1, ce32, true, &errorCode);
1387 c = limit;
1388 }
1389 }
1390
1391 setDigitTags(errorCode);
1392 setLeadSurrogates(errorCode);
1393
1394 if (!icu4xMode) {
1395 // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1396 ce32s.setElementAt(static_cast<int32_t>(utrie2_get32utrie2_get32_77(trie, 0)), 0);
1397 utrie2_set32utrie2_set32_77(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
1398 }
1399
1400 utrie2_freezeutrie2_freeze_77(trie, UTRIE2_32_VALUE_BITS, &errorCode);
1401 if(U_FAILURE(errorCode)) { return; }
1402
1403 // Mark each lead surrogate as "unsafe"
1404 // if any of its 1024 associated supplementary code points is "unsafe".
1405 UChar32 c = 0x10000;
1406 for(char16_t lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
1407 if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
1408 unsafeBackwardSet.add(lead);
1409 }
1410 }
1411 unsafeBackwardSet.freeze();
1412
1413 data.trie = trie;
1414 data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer());
1415 data.ces = ce64s.getBuffer();
1416 data.contexts = contexts.getBuffer();
1417
1418 data.ce32sLength = ce32s.size();
1419 data.cesLength = ce64s.size();
1420 data.contextsLength = contexts.length();
1421
1422 data.base = base;
1423 if(jamoIndex >= 0) {
1424 data.jamoCE32s = data.ce32s + jamoIndex;
1425 } else {
1426 data.jamoCE32s = base->jamoCE32s;
1427 }
1428 data.unsafeBackwardSet = &unsafeBackwardSet;
1429}
1430
1431void
1432CollationDataBuilder::clearContexts() {
1433 contexts.remove();
1434 // Incrementing the contexts build "era" invalidates all of the builtCE32
1435 // from before this clearContexts() call.
1436 // Simpler than finding and resetting all of those fields.
1437 ++contextsEra;
1438}
1439
1440void
1441CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
1442 if(U_FAILURE(errorCode)) { return; }
1443 // Ignore abandoned lists and the cached builtCE32,
1444 // and build all contexts from scratch.
1445 clearContexts();
1446 UnicodeSetIterator iter(contextChars);
1447 while(U_SUCCESS(errorCode) && iter.next()) {
1448 U_ASSERT(!iter.isString())(static_cast <bool> (!iter.isString()) ? void (0) : __assert_fail
("!iter.isString()", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
1449 UChar32 c = iter.getCodepoint();
1450 uint32_t ce32 = utrie2_get32utrie2_get32_77(trie, c);
1451 if(!isBuilderContextCE32(ce32)) {
1452 // Impossible: No context data for c in contextChars.
1453 errorCode = U_INTERNAL_PROGRAM_ERROR;
1454 return;
1455 }
1456 ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32);
1457 ce32 = buildContext(cond, errorCode);
1458 utrie2_set32utrie2_set32_77(trie, c, ce32, &errorCode);
1459 }
1460}
1461
/**
 * Builds the context data (prefix trie and/or contraction tries) for the list of
 * ConditionalCE32 nodes that starts at head, and returns the CE32 that refers to it.
 *
 * For each distinct prefix, the same-prefix nodes (firstCond..lastCond) are turned into
 * either the node's plain ce32 (no contraction suffix) or a CONTRACTION_TAG CE32 whose
 * index comes from addContextTrie(). That value is stored in firstCond->defaultCE32.
 * If the list has only empty-prefix nodes, that CE32 is returned directly; otherwise the
 * per-prefix values are collected in a prefix trie and a PREFIX_TAG CE32 is returned.
 *
 * @param head      list head; asserted to have no context and to have a successor
 * @param errorCode in/out error code; on entry failure the function returns 0 at once.
 *                  Set to U_BUFFER_OVERFLOW_ERROR when a trie index exceeds Collation::MAX_INDEX.
 * @return the resulting CE32, or 0 whenever errorCode indicates a failure
 */
1462uint32_t
1463CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) {
1464 if(U_FAILURE(errorCode)) { return 0; }
1465 // The list head must have no context.
1466 U_ASSERT(!head->hasContext())(static_cast <bool> (!head->hasContext()) ? void (0)
: __assert_fail ("!head->hasContext()", __builtin_FILE ()
, __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
1467 // The list head must be followed by one or more nodes that all do have context.
1468 U_ASSERT(head->next >= 0)(static_cast <bool> (head->next >= 0) ? void (0) :
__assert_fail ("head->next >= 0", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__))
;
 // prefixBuilder lives for the whole call; contractionBuilder is cleared and reused per prefix.
1469 UCharsTrieBuilder prefixBuilder(errorCode);
1470 UCharsTrieBuilder contractionBuilder(errorCode);
1471 // This outer loop goes from each prefix to the next.
1472 // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond).
1473 // If there are multiple suffixes for the same prefix,
1474 // then an inner loop builds a contraction trie for them.
1475 for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
1476 if(U_FAILURE(errorCode)) { return 0; } // early out for memory allocation errors
1477 // After the list head, the prefix or suffix can be empty, but not both.
1478 U_ASSERT(cond == head || cond->hasContext())(static_cast <bool> (cond == head || cond->hasContext
()) ? void (0) : __assert_fail ("cond == head || cond->hasContext()"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
1479 int32_t prefixLength = cond->prefixLength();
 // The copied prefix keeps the leading length unit of the context string,
 // so it is prefixLength + 1 units long (see suffixStart below).
1480 UnicodeString prefix(cond->context, 0, prefixLength + 1);
1481 // Collect all contraction suffixes for one prefix.
1482 ConditionalCE32 *firstCond = cond;
 // lastCond is always assigned: the do-while body below runs at least once.
1483 ConditionalCE32 *lastCond;
1484 do {
1485 lastCond = cond;
1486 // Clear the defaultCE32 fields as we go.
1487 // They are left over from building a previous version of this list of contexts.
1488 //
1489 // One of the code paths below may copy a preceding defaultCE32
1490 // into its emptySuffixCE32.
1491 // If a new suffix has been inserted before what used to be
1492 // the firstCond for its prefix, then that previous firstCond could still
1493 // contain an outdated defaultCE32 from an earlier buildContext() and
1494 // result in an incorrect emptySuffixCE32.
1495 // So we reset all defaultCE32 before reading and setting new values.
1496 cond->defaultCE32 = Collation::NO_CE32;
 // Note: the loop condition advances cond as a side effect, so when the loop ends
 // because of a different prefix, cond already points past lastCond.
1497 } while(cond->next >= 0 &&
1498 (cond = getConditionalCE32(cond->next))->context.startsWith(prefix));
1499 uint32_t ce32;
1500 int32_t suffixStart = prefixLength + 1; // == prefix.length()
1501 if(lastCond->context.length() == suffixStart) {
1502 // One prefix without contraction suffix.
1503 U_ASSERT(firstCond == lastCond)(static_cast <bool> (firstCond == lastCond) ? void (0) :
__assert_fail ("firstCond == lastCond", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__))
;
1504 ce32 = lastCond->ce32;
 // Rewind cond to the last node handled so that the outer loop's
 // cond->next step continues with the first node of the next prefix.
1505 cond = lastCond;
1506 } else {
1507 // Build the contractions trie.
1508 contractionBuilder.clear();
1509 // Entry for an empty suffix, to be stored before the trie.
1510 uint32_t emptySuffixCE32 = 0;
1511 uint32_t flags = 0;
1512 if(firstCond->context.length() == suffixStart) {
1513 // There is a mapping for the prefix and the single character c. (p|c)
1514 // If no other suffix matches, then we return this value.
1515 emptySuffixCE32 = firstCond->ce32;
 // Skip firstCond: only non-empty suffixes go into the contraction trie.
1516 cond = getConditionalCE32(firstCond->next);
1517 } else {
1518 // There is no mapping for the prefix and just the single character.
1519 // (There is no p|c, only p|cd, p|ce etc.)
1520 flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH;
1521 // When the prefix matches but none of the prefix-specific suffixes,
1522 // then we fall back to the mappings with the next-longest prefix,
1523 // and ultimately to mappings with no prefix.
1524 // Each fallback might be another set of contractions.
1525 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1526 // then in text "pch" we find the ch contraction.
 // Scan from the head up to the first node with this prefix length; the last
 // matching defaultCE32 found on the way becomes the fallback value.
1527 for(cond = head;; cond = getConditionalCE32(cond->next)) {
1528 int32_t length = cond->prefixLength();
1529 if(length == prefixLength) { break; }
1530 if(cond->defaultCE32 != Collation::NO_CE32 &&
1531 (length==0 || prefix.endsWith(cond->context, 1, length))) {
1532 emptySuffixCE32 = cond->defaultCE32;
1533 }
1534 }
1535 cond = firstCond;
1536 }
1537 // Optimization: Set a flag when
1538 // the first character of every contraction suffix has lccc!=0.
1539 // Short-circuits contraction matching when a normal letter follows.
1540 flags |= Collation::CONTRACT_NEXT_CCC;
1541 // Add all of the non-empty suffixes into the contraction trie.
1542 for(;;) {
1543 UnicodeString suffix(cond->context, suffixStart);
1544 uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0));
1545 if(fcd16 <= 0xff) {
1546 flags &= ~Collation::CONTRACT_NEXT_CCC;
1547 }
1548 fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1));
1549 if(fcd16 > 0xff) {
1550 // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1551 flags |= Collation::CONTRACT_TRAILING_CCC;
1552 }
 // In icu4xMode, set CONTRACT_HAS_STARTER once any suffix contains a code point
 // with combining class 0; the scan is skipped after the flag has been set.
1553 if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) {
1554 for (int32_t i = 0; i < suffix.length();) {
1555 UChar32 c = suffix.char32At(i);
1556 if (!u_getCombiningClassu_getCombiningClass_77(c)) {
1557 flags |= Collation::CONTRACT_HAS_STARTER;
1558 break;
1559 }
 // Advance by two code units for a supplementary code point, else by one.
1560 if (c > 0xFFFF) {
1561 i += 2;
1562 } else {
1563 ++i;
1564 }
1565 }
1566 }
1567 contractionBuilder.add(suffix, static_cast<int32_t>(cond->ce32), errorCode);
1568 if(cond == lastCond) { break; }
1569 cond = getConditionalCE32(cond->next);
1570 }
1571 int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode);
1572 if(U_FAILURE(errorCode)) { return 0; }
1573 if(index > Collation::MAX_INDEX) {
1574 errorCode = U_BUFFER_OVERFLOW_ERROR;
1575 return 0;
1576 }
1577 ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags;
1578 }
1579 U_ASSERT(cond == lastCond)(static_cast <bool> (cond == lastCond) ? void (0) : __assert_fail
("cond == lastCond", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
 // Remember the result for this prefix; later prefixes may read it as a fallback (see above).
1580 firstCond->defaultCE32 = ce32;
1581 if(prefixLength == 0) {
1582 if(cond->next < 0) {
1583 // No non-empty prefixes, only contractions.
1584 return ce32;
1585 }
1586 } else {
1587 prefix.remove(0, 1); // Remove the length unit.
 // The prefix is added to the trie in reversed order.
1588 prefix.reverse();
1589 prefixBuilder.add(prefix, static_cast<int32_t>(ce32), errorCode);
1590 if(cond->next < 0) { break; }
1591 }
1592 }
1593 U_ASSERT(head->defaultCE32 != Collation::NO_CE32)(static_cast <bool> (head->defaultCE32 != Collation::
NO_CE32) ? void (0) : __assert_fail ("head->defaultCE32 != Collation::NO_CE32"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
 // Store the prefix trie, with the head's (no-prefix) value as its default entry.
1594 int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode);
1595 if(U_FAILURE(errorCode)) { return 0; }
1596 if(index > Collation::MAX_INDEX) {
1597 errorCode = U_BUFFER_OVERFLOW_ERROR;
1598 return 0;
1599 }
1600 return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index);
1601}
1602
1603int32_t
1604CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
1605 UErrorCode &errorCode) {
1606 UnicodeString context;
1607 context.append(static_cast<char16_t>(defaultCE32 >> 16)).append(static_cast<char16_t>(defaultCE32));
1608 UnicodeString trieString;
1609 context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode));
1610 if(U_FAILURE(errorCode)) { return -1; }
1611 int32_t index = contexts.indexOf(context);
1612 if(index < 0) {
1613 index = contexts.length();
1614 contexts.append(context);
1615 }
1616 return index;
1617}
1618
1619void
1620CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) {
1621 if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; }
1622
1623 delete fastLatinBuilder;
1624 fastLatinBuilder = new CollationFastLatinBuilder(errorCode);
1625 if(fastLatinBuilder == nullptr) {
1626 errorCode = U_MEMORY_ALLOCATION_ERROR;
1627 return;
1628 }
1629 if(fastLatinBuilder->forData(data, errorCode)) {
1630 const uint16_t *table = fastLatinBuilder->getTable();
1631 int32_t length = fastLatinBuilder->lengthOfTable();
1632 if(base != nullptr && length == base->fastLatinTableLength &&
1633 uprv_memcmp(table, base->fastLatinTable, length * 2):: memcmp(table, base->fastLatinTable,length * 2) == 0) {
1634 // Same fast Latin table as in the base, use that one instead.
1635 delete fastLatinBuilder;
1636 fastLatinBuilder = nullptr;
1637 table = base->fastLatinTable;
1638 }
1639 data.fastLatinTable = table;
1640 data.fastLatinTableLength = length;
1641 } else {
1642 delete fastLatinBuilder;
1643 fastLatinBuilder = nullptr;
1644 }
1645}
1646
/**
 * Convenience overload: fetches the CEs for s starting at offset 0
 * by forwarding to getCEs(s, start, ces, cesLength).
 *
 * @return whatever the forwarded-to overload returns
 */
1647int32_t
1648CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) {
1649 return getCEs(s, 0, ces, cesLength);
1
Calling 'CollationDataBuilder::getCEs'
1650}
1651
1652int32_t
1653CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s,
1654 int64_t ces[], int32_t cesLength) {
1655 int32_t prefixLength = prefix.length();
1656 if(prefixLength == 0) {
1657 return getCEs(s, 0, ces, cesLength);
1658 } else {
1659 return getCEs(prefix + s, prefixLength, ces, cesLength);
1660 }
1661}
1662
/**
 * Fetches the CEs for s beginning at offset start, via the builder's collation iterator.
 *
 * The iterator (collIter) is created on first use and kept for later calls.
 *
 * @return the result of collIter->fetchCEs(), or 0 if the iterator could not be allocated
 */
1663int32_t
1664CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start,
1665 int64_t ces[], int32_t cesLength) {
 // Lazily create the iterator; the analyzer path in this report continues
 // into the DataBuilderCollationIterator constructor from here.
1666 if(collIter == nullptr) {
2
Assuming the condition is true
3
Taking true branch
1667 collIter = new DataBuilderCollationIterator(*this);
4
Calling constructor for 'DataBuilderCollationIterator'
1668 if(collIter == nullptr) { return 0; }
1669 }
1670 return collIter->fetchCEs(s, start, ces, cesLength);
1671}
1672
1673U_NAMESPACE_END}
1674
1675#endif // !UCONFIG_NO_COLLATION

/root/firefox-clang/intl/icu/source/i18n/collationiterator.h

1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2010-2014, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* collationiterator.h
9*
10* created on: 2010oct27
11* created by: Markus W. Scherer
12*/
13
14#ifndef __COLLATIONITERATOR_H__
15#define __COLLATIONITERATOR_H__
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION0
20
21#include "cmemory.h"
22#include "collation.h"
23#include "collationdata.h"
24
25U_NAMESPACE_BEGINnamespace icu_77 {
26
27class SkippedState;
28class UCharsTrie;
29class UVector32;
30
31/* Large enough for CEs of most short strings. */
32#define CEBUFFER_INITIAL_CAPACITY40 40
33
34// Export an explicit template instantiation of the MaybeStackArray that
35// is used as a data member of CEBuffer.
36//
37// When building DLLs for Windows this is required even though
38// no direct access to the MaybeStackArray leaks out of the i18n library.
39//
40// See digitlst.h, pluralaffix.h, datefmt.h, and others for similar examples.
41//
42#if U_PF_WINDOWS1000 <= U_PLATFORM4000 && U_PLATFORM4000 <= U_PF_CYGWIN1900
43template class U_I18N_API MaybeStackArray<int64_t, CEBUFFER_INITIAL_CAPACITY40>;
44#endif
45
46/**
47 * Collation element iterator and abstract character iterator.
48 *
49 * When a method returns a code point value, it must be in 0..10FFFF,
50 * except it can be negative as a sentinel value.
51 */
52class U_I18N_API CollationIterator : public UObject {
53private:
 // Buffer of 64-bit CEs backed by a MaybeStackArray;
 // `length` counts the CEs currently stored.
54 class U_I18N_API CEBuffer {
55 private:
56 /** Large enough for CEs of most short strings. */
57 static const int32_t INITIAL_CAPACITY = CEBUFFER_INITIAL_CAPACITY40;
58 public:
59 CEBuffer() : length(0) {}
60 ~CEBuffer();
61
 // Appends ce. While length < INITIAL_CAPACITY no capacity call is made;
 // beyond that, ensureAppendCapacity() must return true or ce is not stored.
62 inline void append(int64_t ce, UErrorCode &errorCode) {
63 if(length < INITIAL_CAPACITY || ensureAppendCapacity(1, errorCode)) {
64 buffer[length++] = ce;
65 }
66 }
67
 // Appends ce without any capacity check.
68 inline void appendUnsafe(int64_t ce) {
69 buffer[length++] = ce;
70 }
71
72 UBool ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode);
73
 // Increments length by one without storing a value.
 // Returns false (and leaves length unchanged) if the capacity check fails.
74 inline UBool incLength(UErrorCode &errorCode) {
75 // Use INITIAL_CAPACITY for a very simple fastpath.
76 // (Rather than buffer.getCapacity().)
77 if(length < INITIAL_CAPACITY || ensureAppendCapacity(1, errorCode)) {
78 ++length;
79 return true;
80 } else {
81 return false;
82 }
83 }
84
 // Stores ce at index i and returns the stored value. No bounds check.
85 inline int64_t set(int32_t i, int64_t ce) {
86 return buffer[i] = ce;
87 }
88 inline int64_t get(int32_t i) const { return buffer[i]; }
89
90 const int64_t *getCEs() const { return buffer.getAlias(); }
91
 // Public: CollationIterator reads and resets this directly
 // (see nextCE() and clearCEs()).
92 int32_t length;
93
94 private:
95 CEBuffer(const CEBuffer &) = delete;
96 void operator=(const CEBuffer &) = delete;
97
98 MaybeStackArray<int64_t, INITIAL_CAPACITY> buffer;
99 };
100
101public:
 // Constructs an iterator over the given data.
 // d->trie is read immediately in the initializer list, so d must point to an
 // object whose trie member already holds a valid value when this runs.
 // NOTE(review): the analyzer path in this report arrives here from the
 // DataBuilderCollationIterator constructor and flags d->trie as uninitialized;
 // presumably that subclass passes the address of one of its own members, which
 // is not yet constructed while the base class is being initialized. Confirm in
 // collationdatabuilder.cpp; a fix would belong there, not in this constructor.
102 CollationIterator(const CollationData *d, UBool numeric)
103 : trie(d->trie),
6
Assigned value is uninitialized
104 data(d),
105 cesIndex(0),
106 skipped(nullptr),
107 numCpFwd(-1),
108 isNumeric(numeric) {}
109
110 virtual ~CollationIterator();
111
112 virtual bool operator==(const CollationIterator &other) const;
 // Defined as the negation of the virtual operator==().
113 inline bool operator!=(const CollationIterator &other) const {
114 return !operator==(other);
115 }
116
117 /**
118 * Resets the iterator state and sets the position to the specified offset.
119 * Subclasses must implement, and must call the parent class method,
120 * or CollationIterator::reset().
121 */
122 virtual void resetToOffset(int32_t newOffset) = 0;
123
124 virtual int32_t getOffset() const = 0;
125
126 /**
127 * Returns the next collation element.
128 */
129 inline int64_t nextCE(UErrorCode &errorCode) {
130 if(cesIndex < ceBuffer.length) {
131 // Return the next buffered CE.
132 return ceBuffer.get(cesIndex++);
133 }
134 // assert cesIndex == ceBuffer.length;
 // Grow the buffer by one slot up front; the inline return paths below
 // fill that slot via ceBuffer.set(cesIndex++, ...).
135 if(!ceBuffer.incLength(errorCode)) {
136 return Collation::NO_CE;
137 }
 // c is an output of handleNextCE32().
138 UChar32 c;
139 uint32_t ce32 = handleNextCE32(c, errorCode);
 // The low byte of the CE32 decides which of the cases below applies.
140 uint32_t t = ce32 & 0xff;
141 if(t < Collation::SPECIAL_CE32_LOW_BYTE) { // Forced-inline of isSpecialCE32(ce32).
142 // Normal CE from the main data.
143 // Forced-inline of ceFromSimpleCE32(ce32).
144 return ceBuffer.set(cesIndex++,
145 (static_cast<int64_t>(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (t << 8));
146 }
 // d is the data object that the special ce32 belongs to (main data or its base).
147 const CollationData *d;
148 // The compiler should be able to optimize the previous and the following
149 // comparisons of t with the same constant.
150 if(t == Collation::SPECIAL_CE32_LOW_BYTE) {
 // c<0 marks the end of the text (see handleNextCE32()).
151 if(c < 0) {
152 return ceBuffer.set(cesIndex++, Collation::NO_CE);
153 }
 // Fallback: look c up in the base data.
154 d = data->base;
155 ce32 = d->getCE32(c);
156 t = ce32 & 0xff;
157 if(t < Collation::SPECIAL_CE32_LOW_BYTE) {
158 // Normal CE from the base data.
159 return ceBuffer.set(cesIndex++,
160 (static_cast<int64_t>(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (t << 8));
161 }
162 } else {
163 d = data;
164 }
165 if(t == Collation::LONG_PRIMARY_CE32_LOW_BYTE) {
166 // Forced-inline of ceFromLongPrimaryCE32(ce32).
167 return ceBuffer.set(cesIndex++,
168 (static_cast<int64_t>(ce32 - t) << 32) | Collation::COMMON_SEC_AND_TER_CE);
169 }
 // All remaining special CE32 values are handled out of line.
170 return nextCEFromCE32(d, c, ce32, errorCode);
171 }
172
173 /**
174 * Fetches all CEs.
175 * @return getCEsLength()
176 */
177 int32_t fetchCEs(UErrorCode &errorCode);
178
179 /**
180 * Overwrites the current CE (the last one returned by nextCE()).
181 */
182 void setCurrentCE(int64_t ce) {
183 // assert cesIndex > 0;
184 ceBuffer.set(cesIndex - 1, ce);
185 }
186
187 /**
188 * Returns the previous collation element.
189 */
190 int64_t previousCE(UVector32 &offsets, UErrorCode &errorCode);
191
 // Number of CEs currently in the buffer.
192 inline int32_t getCEsLength() const {
193 return ceBuffer.length;
194 }
195
 // Returns the buffered CE at index i. No bounds check.
196 inline int64_t getCE(int32_t i) const {
197 return ceBuffer.get(i);
198 }
199
 // Returns a pointer to the buffered CEs.
200 const int64_t *getCEs() const {
201 return ceBuffer.getCEs();
202 }
203
 // Empties the CE buffer and rewinds the read index; both become 0.
204 void clearCEs() {
205 cesIndex = ceBuffer.length = 0;
206 }
207
 // Clears the buffer only when every buffered CE has already been returned.
208 void clearCEsIfNoneRemaining() {
209 if(cesIndex == ceBuffer.length) { clearCEs(); }
210 }
211
212 /**
213 * Returns the next code point (with post-increment).
214 * Public for identical-level comparison and for testing.
215 */
216 virtual UChar32 nextCodePoint(UErrorCode &errorCode) = 0;
217
218 /**
219 * Returns the previous code point (with pre-decrement).
220 * Public for identical-level comparison and for testing.
221 */
222 virtual UChar32 previousCodePoint(UErrorCode &errorCode) = 0;
223
224protected:
225 CollationIterator(const CollationIterator &other);
226
227 void reset();
228
229 /**
230 * Returns the next code point and its local CE32 value.
231 * Returns Collation::FALLBACK_CE32 at the end of the text (c<0)
232 * or when c's CE32 value is to be looked up in the base data (fallback).
233 *
234 * The code point is used for fallbacks, context and implicit weights.
235 * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
236 */
237 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
238
239 /**
240 * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
241 * Returns the trail surrogate in that case and advances past it,
242 * if a trail surrogate follows the lead surrogate.
243 * Otherwise returns any other code unit and does not advance.
244 */
245 virtual char16_t handleGetTrailSurrogate();
246
247 /**
248 * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
249 * (Not needed in Java.)
250 */
251 virtual UBool foundNULTerminator();
252
253 /**
254 * @return false if surrogate code points U+D800..U+DFFF
255 * map to their own implicit primary weights (for UTF-16),
256 * or true if they map to CE(U+FFFD) (for UTF-8)
257 */
258 virtual UBool forbidSurrogateCodePoints() const;
259
260 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) = 0;
261
262 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) = 0;
263
264 /**
265 * Returns the CE32 from the data trie.
266 * Normally the same as data->getCE32(), but overridden in the builder.
267 * Call this only when the faster data->getCE32() cannot be used.
268 */
269 virtual uint32_t getDataCE32(UChar32 c) const;
270
271 virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode);
272
273 void appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
274 UBool forward, UErrorCode &errorCode);
275
276 // Main lookup trie of the data object.
 // Copied from d->trie by the public constructor.
277 const UTrie2 *trie;
278 const CollationData *data;
279
280private:
281 int64_t nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
282 UErrorCode &errorCode);
283
284 uint32_t getCE32FromPrefix(const CollationData *d, uint32_t ce32,
285 UErrorCode &errorCode);
286
287 UChar32 nextSkippedCodePoint(UErrorCode &errorCode);
288
289 void backwardNumSkipped(int32_t n, UErrorCode &errorCode);
290
291 uint32_t nextCE32FromContraction(
292 const CollationData *d, uint32_t contractionCE32,
293 const char16_t *p, uint32_t ce32, UChar32 c,
294 UErrorCode &errorCode);
295
296 uint32_t nextCE32FromDiscontiguousContraction(
297 const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,
298 int32_t lookAhead, UChar32 c,
299 UErrorCode &errorCode);
300
301 /**
302 * Returns the previous CE when data->isUnsafeBackward(c, isNumeric).
303 */
304 int64_t previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode);
305
306 /**
307 * Turns a string of digits (bytes 0..9)
308 * into a sequence of CEs that will sort in numeric order.
309 *
310 * Starts from this ce32's digit value and consumes the following/preceding digits.
311 * The digits string must not be empty and must not have leading zeros.
312 */
313 void appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode);
314
315 /**
316 * Turns 1..254 digits into a sequence of CEs.
317 * Called by appendNumericCEs() for each segment of at most 254 digits.
318 */
319 void appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode);
320
 // Buffered CEs, and the index of the next one that nextCE() returns.
321 CEBuffer ceBuffer;
322 int32_t cesIndex;
323
 // Initialized to nullptr by the public constructor.
324 SkippedState *skipped;
325
326 // Number of code points to read forward, or -1.
327 // Used as a forward iteration limit in previousCEUnsafe().
328 int32_t numCpFwd;
329 // Numeric collation (CollationSettings::NUMERIC).
330 UBool isNumeric;
331};
332
333U_NAMESPACE_END}
334
335#endif // !UCONFIG_NO_COLLATION
336#endif // __COLLATIONITERATOR_H__