Bug Summary

File:root/firefox-clang/intl/icu/source/common/uniset.cpp
Warning:line 1912, column 19
Assigned value is uninitialized

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name uniset.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -ffp-contract=off -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -fcoverage-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -resource-dir /usr/lib/llvm-21/lib/clang/21 -include /root/firefox-clang/config/gcc_hidden.h -include /root/firefox-clang/obj-x86_64-pc-linux-gnu/mozilla-config.h -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/system_wrappers -U _FORTIFY_SOURCE -D _FORTIFY_SOURCE=2 -D _GLIBCXX_ASSERTIONS -D DEBUG=1 -D U_COMMON_IMPLEMENTATION -D _LIBCPP_DISABLE_DEPRECATION_WARNINGS -D U_USING_ICU_NAMESPACE=0 -D U_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -D U_HIDE_OBSOLETE_UTF_OLD_H=1 -D UCONFIG_NO_LEGACY_CONVERSION -D UCONFIG_NO_TRANSLITERATION -D UCONFIG_NO_REGULAR_EXPRESSIONS -D UCONFIG_NO_BREAK_ITERATION -D UCONFIG_NO_IDNA -D UCONFIG_NO_MF2 -D U_CHARSET_IS_UTF8 -D UNISTR_FROM_CHAR_EXPLICIT=explicit -D UNISTR_FROM_STRING_EXPLICIT=explicit -D U_ENABLE_DYLOAD=0 -D U_DEBUG=1 -I /root/firefox-clang/config/external/icu/common -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -I /root/firefox-clang/intl/icu/source/i18n -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nspr -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nss -D MOZILLA_CLIENT -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/x86_64-linux-gnu/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14/backward -internal-isystem /usr/lib/llvm-21/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-error=pessimizing-move -Wno-error=large-by-value-copy=128 -Wno-error=implicit-int-float-conversion -Wno-error=thread-safety-analysis -Wno-error=tautological-type-limit-compare -Wno-invalid-offsetof -Wno-range-loop-analysis -Wno-deprecated-anon-enum-enum-conversion -Wno-deprecated-enum-enum-conversion -Wno-deprecated-this-capture -Wno-inline-new-delete -Wno-error=deprecated-declarations -Wno-error=array-bounds -Wno-error=free-nonheap-object -Wno-error=atomic-alignment -Wno-error=deprecated-builtins -Wno-psabi -Wno-error=builtin-macro-redefined -Wno-vla-cxx-extension -Wno-unknown-warning-option -Wno-comma -Wno-implicit-const-int-float-conversion -Wno-macro-redefined -Wno-microsoft-include -Wno-tautological-unsigned-enum-zero-compare -Wno-unreachable-code-loop-increment -Wno-unreachable-code-return -fdeprecated-macro -ferror-limit 19 -fstrict-flex-arrays=1 -stack-protector 2 -fstack-clash-protection -ftrivial-auto-var-init=pattern -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fno-sized-deallocation -fno-aligned-allocation -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2025-06-27-100320-3286336-1 -x c++ /root/firefox-clang/intl/icu/source/common/uniset.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 1999-2015, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8* Date Name Description
9* 10/20/99 alan Creation.
10**********************************************************************
11*/
12
13#include "unicode/utypes.h"
14#include "unicode/parsepos.h"
15#include "unicode/symtable.h"
16#include "unicode/uniset.h"
17#include "unicode/ustring.h"
18#include "unicode/utf8.h"
19#include "unicode/utf16.h"
20#include "ruleiter.h"
21#include "cmemory.h"
22#include "cstring.h"
23#include "patternprops.h"
24#include "uelement.h"
25#include "util.h"
26#include "uvector.h"
27#include "charstr.h"
28#include "ustrfmt.h"
29#include "uassert.h"
30#include "bmpset.h"
31#include "unisetspan.h"
32
33// HIGH_VALUE > all valid values. 110000 for codepoints
34#define UNICODESET_HIGH0x0110000 0x0110000
35
36// LOW <= all valid values. ZERO for codepoints
37#define UNICODESET_LOW0x000000 0x000000
38
39/** Max list [0, 1, 2, ..., max code point, HIGH] */
40constexpr int32_t MAX_LENGTH = UNICODESET_HIGH0x0110000 + 1;
41
42U_NAMESPACE_BEGINnamespace icu_77 {
43
44SymbolTable::~SymbolTable() {}
45
46UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)UClassID UnicodeSet::getStaticClassID() { static char classID
= 0; return (UClassID)&classID; } UClassID UnicodeSet::getDynamicClassID
() const { return UnicodeSet::getStaticClassID(); }
47
48/**
49 * Modify the given UChar32 variable so that it is in range, by
50 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
51 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
52 * It modifies its argument in-place and also returns it.
53 */
54static inline UChar32 pinCodePoint(UChar32& c) {
55 if (c < UNICODESET_LOW0x000000) {
56 c = UNICODESET_LOW0x000000;
57 } else if (c > (UNICODESET_HIGH0x0110000-1)) {
58 c = (UNICODESET_HIGH0x0110000-1);
59 }
60 return c;
61}
62
63//----------------------------------------------------------------
64// Debugging
65//----------------------------------------------------------------
66
67// DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
68// To enable the debugging, define the symbol DEBUG_MEM in the line
69// below. This will result in text being sent to stdout that looks
70// like this:
71// DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
72// DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
73// Each line lists a construction (ct) or destruction (dt) event, the
74// object address, the number of outstanding objects after the event,
75// and the pattern of the object in question.
76
77// #define DEBUG_MEM
78
79#ifdef DEBUG_MEM
80#include <stdio.h>
81static int32_t _dbgCount = 0;
82
83static inline void _dbgct(UnicodeSet* set) {
84 UnicodeString str;
85 set->toPattern(str, true);
86 char buf[40];
87 str.extract(0, 39, buf, "");
88 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
89}
90
91static inline void _dbgdt(UnicodeSet* set) {
92 UnicodeString str;
93 set->toPattern(str, true);
94 char buf[40];
95 str.extract(0, 39, buf, "");
96 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
97}
98
99#else
100
101#define _dbgct(set)
102#define _dbgdt(set)
103
104#endif
105
106//----------------------------------------------------------------
107// UnicodeString in UVector support
108//----------------------------------------------------------------
109
110static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
111 dst->pointer = new UnicodeString(*static_cast<UnicodeString*>(src->pointer));
112}
113
114static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
115 const UnicodeString& a = *static_cast<const UnicodeString*>(t1.pointer);
116 const UnicodeString& b = *static_cast<const UnicodeString*>(t2.pointer);
117 return a.compare(b);
118}
119
120UBool UnicodeSet::hasStrings() const {
121 return strings_ != nullptr && !strings_->isEmpty();
122}
123
124int32_t UnicodeSet::stringsSize() const {
125 return strings_ == nullptr ? 0 : strings_->size();
126}
127
128UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
129 return strings_ != nullptr && strings_->contains((void*) &s);
130}
131
132//----------------------------------------------------------------
133// Constructors &c
134//----------------------------------------------------------------
135
136/**
137 * Constructs an empty set.
138 */
139UnicodeSet::UnicodeSet() {
140 list[0] = UNICODESET_HIGH0x0110000;
141 _dbgct(this);
142}
143
144/**
145 * Constructs a set containing the given range. If <code>end >
146 * start</code> then an empty set is created.
147 *
148 * @param start first character, inclusive, of range
149 * @param end last character, inclusive, of range
150 */
151UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
152 list[0] = UNICODESET_HIGH0x0110000;
153 add(start, end);
154 _dbgct(this);
155}
156
157/**
158 * Constructs a set that is identical to the given UnicodeSet.
159 */
160UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
161 *this = o;
162 _dbgct(this);
163}
164
165// Copy-construct as thawed.
166UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
167 if (ensureCapacity(o.len)) {
168 // *this = o except for bmpSet and stringSpan
169 len = o.len;
170 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (list != __null) ? void (0) : __assert_fail
("list != __null", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__)); (static_cast <bool> (o.list != __null
) ? void (0) : __assert_fail ("o.list != __null", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); clang
diagnostic pop :: memcpy(list, o.list, (size_t)len*sizeof(UChar32
)); } while (false)
;
171 if (o.hasStrings()) {
172 UErrorCode status = U_ZERO_ERROR;
173 if (!allocateStrings(status) ||
174 (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
175 setToBogus();
176 return;
177 }
178 }
179 if (o.pat) {
180 setPattern(o.pat, o.patLen);
181 }
182 _dbgct(this);
183 }
184}
185
186/**
187 * Destructs the set.
188 */
189UnicodeSet::~UnicodeSet() {
190 _dbgdt(this); // first!
191 if (list != stackList) {
192 uprv_freeuprv_free_77(list);
193 }
194 delete bmpSet;
195 if (buffer != stackList) {
196 uprv_freeuprv_free_77(buffer);
197 }
198 delete strings_;
199 delete stringSpan;
200 releasePattern();
201}
202
203/**
204 * Assigns this object to be a copy of another.
205 */
206UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
207 return copyFrom(o, false);
208}
209
210UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
211 if (this == &o) {
212 return *this;
213 }
214 if (isFrozen()) {
215 return *this;
216 }
217 if (o.isBogus()) {
218 setToBogus();
219 return *this;
220 }
221 if (!ensureCapacity(o.len)) {
222 // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
223 return *this;
224 }
225 len = o.len;
226 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (list != __null) ? void (0) : __assert_fail
("list != __null", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__)); (static_cast <bool> (o.list != __null
) ? void (0) : __assert_fail ("o.list != __null", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); clang
diagnostic pop :: memcpy(list, o.list, (size_t)len*sizeof(UChar32
)); } while (false)
;
227 if (o.bmpSet != nullptr && !asThawed) {
228 bmpSet = new BMPSet(*o.bmpSet, list, len);
229 if (bmpSet == nullptr) { // Check for memory allocation error.
230 setToBogus();
231 return *this;
232 }
233 }
234 if (o.hasStrings()) {
235 UErrorCode status = U_ZERO_ERROR;
236 if ((strings_ == nullptr && !allocateStrings(status)) ||
237 (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
238 setToBogus();
239 return *this;
240 }
241 } else if (hasStrings()) {
242 strings_->removeAllElements();
243 }
244 if (o.stringSpan != nullptr && !asThawed) {
245 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings_);
246 if (stringSpan == nullptr) { // Check for memory allocation error.
247 setToBogus();
248 return *this;
249 }
250 }
251 releasePattern();
252 if (o.pat) {
253 setPattern(o.pat, o.patLen);
254 }
255 return *this;
256}
257
258/**
259 * Returns a copy of this object. All UnicodeMatcher objects have
260 * to support cloning in order to allow classes using
261 * UnicodeMatchers, such as Transliterator, to implement cloning.
262 */
263UnicodeSet* UnicodeSet::clone() const {
264 return new UnicodeSet(*this);
265}
266
267UnicodeSet *UnicodeSet::cloneAsThawed() const {
268 return new UnicodeSet(*this, true);
269}
270
271/**
272 * Compares the specified object with this set for equality. Returns
273 * <tt>true</tt> if the two sets
274 * have the same size, and every member of the specified set is
275 * contained in this set (or equivalently, every member of this set is
276 * contained in the specified set).
277 *
278 * @param o set to be compared for equality with this set.
279 * @return <tt>true</tt> if the specified set is equal to this set.
280 */
281bool UnicodeSet::operator==(const UnicodeSet& o) const {
282 if (len != o.len) return false;
283 for (int32_t i = 0; i < len; ++i) {
284 if (list[i] != o.list[i]) return false;
285 }
286 if (hasStrings() != o.hasStrings()) { return false; }
287 if (hasStrings() && *strings_ != *o.strings_) return false;
288 return true;
289}
290
291/**
292 * Returns the hash code value for this set.
293 *
294 * @return the hash code value for this set.
295 * @see Object#hashCode()
296 */
297int32_t UnicodeSet::hashCode() const {
298 uint32_t result = static_cast<uint32_t>(len);
299 for (int32_t i = 0; i < len; ++i) {
300 result *= 1000003u;
301 result += list[i];
302 }
303 return static_cast<int32_t>(result);
304}
305
306//----------------------------------------------------------------
307// Public API
308//----------------------------------------------------------------
309
310/**
311 * Returns the number of elements in this set (its cardinality),
312 * Note than the elements of a set may include both individual
313 * codepoints and strings.
314 *
315 * @return the number of elements in this set (its cardinality).
316 */
317int32_t UnicodeSet::size() const {
318 int32_t n = 0;
319 int32_t count = getRangeCount();
320 for (int32_t i = 0; i < count; ++i) {
321 n += getRangeEnd(i) - getRangeStart(i) + 1;
322 }
323 return n + stringsSize();
324}
325
326/**
327 * Returns <tt>true</tt> if this set contains no elements.
328 *
329 * @return <tt>true</tt> if this set contains no elements.
330 */
331UBool UnicodeSet::isEmpty() const {
332 return len == 1 && !hasStrings();
333}
334
335/**
336 * Returns true if this set contains the given character.
337 * @param c character to be checked for containment
338 * @return true if the test condition is met
339 */
340UBool UnicodeSet::contains(UChar32 c) const {
341 // Set i to the index of the start item greater than ch
342 // We know we will terminate without length test!
343 // LATER: for large sets, add binary search
344 //int32_t i = -1;
345 //for (;;) {
346 // if (c < list[++i]) break;
347 //}
348 if (bmpSet != nullptr) {
349 return bmpSet->contains(c);
350 }
351 if (stringSpan != nullptr) {
352 return stringSpan->contains(c);
353 }
354 if (c >= UNICODESET_HIGH0x0110000) { // Don't need to check LOW bound
355 return false;
356 }
357 int32_t i = findCodePoint(c);
358 return i & 1; // return true if odd
359}
360
361/**
362 * Returns the smallest value i such that c < list[i]. Caller
363 * must ensure that c is a legal value or this method will enter
364 * an infinite loop. This method performs a binary search.
365 * @param c a character in the range MIN_VALUE..MAX_VALUE
366 * inclusive
367 * @return the smallest integer i in the range 0..len-1,
368 * inclusive, such that c < list[i]
369 */
370int32_t UnicodeSet::findCodePoint(UChar32 c) const {
371 /* Examples:
372 findCodePoint(c)
373 set list[] c=0 1 3 4 7 8
374 === ============== ===========
375 [] [110000] 0 0 0 0 0 0
376 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
377 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
378 [:Any:] [0, 110000] 1 1 1 1 1 1
379 */
380
381 // Return the smallest i such that c < list[i]. Assume
382 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
383 if (c < list[0])
384 return 0;
385 // High runner test. c is often after the last range, so an
386 // initial check for this condition pays off.
387 int32_t lo = 0;
388 int32_t hi = len - 1;
389 if (lo >= hi || c >= list[hi-1])
390 return hi;
391 // invariant: c >= list[lo]
392 // invariant: c < list[hi]
393 for (;;) {
394 int32_t i = (lo + hi) >> 1;
395 if (i == lo) {
396 break; // Found!
397 } else if (c < list[i]) {
398 hi = i;
399 } else {
400 lo = i;
401 }
402 }
403 return hi;
404}
405
406/**
407 * Returns true if this set contains every character
408 * of the given range.
409 * @param start first character, inclusive, of the range
410 * @param end last character, inclusive, of the range
411 * @return true if the test condition is met
412 */
413UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
414 //int32_t i = -1;
415 //for (;;) {
416 // if (start < list[++i]) break;
417 //}
418 int32_t i = findCodePoint(start);
419 return ((i & 1) != 0 && end < list[i]);
420}
421
422/**
423 * Returns <tt>true</tt> if this set contains the given
424 * multicharacter string.
425 * @param s string to be checked for containment
426 * @return <tt>true</tt> if this set contains the specified string
427 */
428UBool UnicodeSet::contains(const UnicodeString& s) const {
429 int32_t cp = getSingleCP(s);
430 if (cp < 0) {
431 return stringsContains(s);
432 } else {
433 return contains(static_cast<UChar32>(cp));
434 }
435}
436
437/**
438 * Returns true if this set contains all the characters and strings
439 * of the given set.
440 * @param c set to be checked for containment
441 * @return true if the test condition is met
442 */
443UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
444 // The specified set is a subset if all of its pairs are contained in
445 // this set. It's possible to code this more efficiently in terms of
446 // direct manipulation of the inversion lists if the need arises.
447 int32_t n = c.getRangeCount();
448 for (int i=0; i<n; ++i) {
449 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
450 return false;
451 }
452 }
453 return !c.hasStrings() || (strings_ != nullptr && strings_->containsAll(*c.strings_));
454}
455
456/**
457 * Returns true if this set contains all the characters
458 * of the given string.
459 * @param s string containing characters to be checked for containment
460 * @return true if the test condition is met
461 */
462UBool UnicodeSet::containsAll(const UnicodeString& s) const {
463 return span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == s.length();
464}
465
466/**
467 * Returns true if this set contains none of the characters
468 * of the given range.
469 * @param start first character, inclusive, of the range
470 * @param end last character, inclusive, of the range
471 * @return true if the test condition is met
472 */
473UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
474 //int32_t i = -1;
475 //for (;;) {
476 // if (start < list[++i]) break;
477 //}
478 int32_t i = findCodePoint(start);
479 return ((i & 1) == 0 && end < list[i]);
480}
481
482/**
483 * Returns true if this set contains none of the characters and strings
484 * of the given set.
485 * @param c set to be checked for containment
486 * @return true if the test condition is met
487 */
488UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
489 // The specified set is a subset if all of its pairs are contained in
490 // this set. It's possible to code this more efficiently in terms of
491 // direct manipulation of the inversion lists if the need arises.
492 int32_t n = c.getRangeCount();
493 for (int32_t i=0; i<n; ++i) {
494 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
495 return false;
496 }
497 }
498 return strings_ == nullptr || !c.hasStrings() || strings_->containsNone(*c.strings_);
499}
500
501/**
502 * Returns true if this set contains none of the characters
503 * of the given string.
504 * @param s string containing characters to be checked for containment
505 * @return true if the test condition is met
506 */
507UBool UnicodeSet::containsNone(const UnicodeString& s) const {
508 return span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == s.length();
509}
510
511/**
512 * Returns <tt>true</tt> if this set contains any character whose low byte
513 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
514 * indexing.
515 */
516UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
517 /* The index value v, in the range [0,255], is contained in this set if
518 * it is contained in any pair of this set. Pairs either have the high
519 * bytes equal, or unequal. If the high bytes are equal, then we have
520 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
521 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
522 * Then v is contained if xx <= v || v <= yy. (This is identical to the
523 * time zone month containment logic.)
524 */
525 int32_t i;
526 int32_t rangeCount=getRangeCount();
527 for (i=0; i<rangeCount; ++i) {
528 UChar32 low = getRangeStart(i);
529 UChar32 high = getRangeEnd(i);
530 if ((low & ~0xFF) == (high & ~0xFF)) {
531 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
532 return true;
533 }
534 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
535 return true;
536 }
537 }
538 if (hasStrings()) {
539 for (i=0; i<strings_->size(); ++i) {
540 const UnicodeString& s = *static_cast<const UnicodeString*>(strings_->elementAt(i));
541 if (s.isEmpty()) {
542 continue; // skip the empty string
543 }
544 UChar32 c = s.char32At(0);
545 if ((c & 0xFF) == v) {
546 return true;
547 }
548 }
549 }
550 return false;
551}
552
553/**
554 * Implementation of UnicodeMatcher::matches(). Always matches the
555 * longest possible multichar string.
556 */
557UMatchDegree UnicodeSet::matches(const Replaceable& text,
558 int32_t& offset,
559 int32_t limit,
560 UBool incremental) {
561 if (offset == limit) {
562 if (contains(U_ETHER((char16_t)0xFFFF))) {
563 return incremental ? U_PARTIAL_MATCH : U_MATCH;
564 } else {
565 return U_MISMATCH;
566 }
567 } else {
568 if (hasStrings()) { // try strings first
569
570 // might separate forward and backward loops later
571 // for now they are combined
572
573 // TODO Improve efficiency of this, at least in the forward
574 // direction, if not in both. In the forward direction we
575 // can assume the strings are sorted.
576
577 int32_t i;
578 UBool forward = offset < limit;
579
580 // firstChar is the leftmost char to match in the
581 // forward direction or the rightmost char to match in
582 // the reverse direction.
583 char16_t firstChar = text.charAt(offset);
584
585 // If there are multiple strings that can match we
586 // return the longest match.
587 int32_t highWaterLength = 0;
588
589 for (i=0; i<strings_->size(); ++i) {
590 const UnicodeString& trial = *static_cast<const UnicodeString*>(strings_->elementAt(i));
591 if (trial.isEmpty()) {
592 continue; // skip the empty string
593 }
594
595 char16_t c = trial.charAt(forward ? 0 : trial.length() - 1);
596
597 // Strings are sorted, so we can optimize in the
598 // forward direction.
599 if (forward && c > firstChar) break;
600 if (c != firstChar) continue;
601
602 int32_t matchLen = matchRest(text, offset, limit, trial);
603
604 if (incremental) {
605 int32_t maxLen = forward ? limit-offset : offset-limit;
606 if (matchLen == maxLen) {
607 // We have successfully matched but only up to limit.
608 return U_PARTIAL_MATCH;
609 }
610 }
611
612 if (matchLen == trial.length()) {
613 // We have successfully matched the whole string.
614 if (matchLen > highWaterLength) {
615 highWaterLength = matchLen;
616 }
617 // In the forward direction we know strings
618 // are sorted so we can bail early.
619 if (forward && matchLen < highWaterLength) {
620 break;
621 }
622 continue;
623 }
624 }
625
626 // We've checked all strings without a partial match.
627 // If we have full matches, return the longest one.
628 if (highWaterLength != 0) {
629 offset += forward ? highWaterLength : -highWaterLength;
630 return U_MATCH;
631 }
632 }
633 return UnicodeFilter::matches(text, offset, limit, incremental);
634 }
635}
636
637/**
638 * Returns the longest match for s in text at the given position.
639 * If limit > start then match forward from start+1 to limit
640 * matching all characters except s.charAt(0). If limit < start,
641 * go backward starting from start-1 matching all characters
642 * except s.charAt(s.length()-1). This method assumes that the
643 * first character, text.charAt(start), matches s, so it does not
644 * check it.
645 * @param text the text to match
646 * @param start the first character to match. In the forward
647 * direction, text.charAt(start) is matched against s.charAt(0).
648 * In the reverse direction, it is matched against
649 * s.charAt(s.length()-1).
650 * @param limit the limit offset for matching, either last+1 in
651 * the forward direction, or last-1 in the reverse direction,
652 * where last is the index of the last character to match.
653 * @return If part of s matches up to the limit, return |limit -
654 * start|. If all of s matches before reaching the limit, return
655 * s.length(). If there is a mismatch between s and text, return
656 * 0
657 */
658int32_t UnicodeSet::matchRest(const Replaceable& text,
659 int32_t start, int32_t limit,
660 const UnicodeString& s) {
661 int32_t i;
662 int32_t maxLen;
663 int32_t slen = s.length();
664 if (start < limit) {
665 maxLen = limit - start;
666 if (maxLen > slen) maxLen = slen;
667 for (i = 1; i < maxLen; ++i) {
668 if (text.charAt(start + i) != s.charAt(i)) return 0;
669 }
670 } else {
671 maxLen = start - limit;
672 if (maxLen > slen) maxLen = slen;
673 --slen; // <=> slen = s.length() - 1;
674 for (i = 1; i < maxLen; ++i) {
675 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
676 }
677 }
678 return maxLen;
679}
680
681/**
682 * Implement of UnicodeMatcher
683 */
684void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
685 toUnionTo.addAll(*this);
686}
687
688/**
689 * Returns the index of the given character within this set, where
690 * the set is ordered by ascending code point. If the character
691 * is not in this set, return -1. The inverse of this method is
692 * <code>charAt()</code>.
693 * @return an index from 0..size()-1, or -1
694 */
695int32_t UnicodeSet::indexOf(UChar32 c) const {
696 if (c < MIN_VALUE || c > MAX_VALUE) {
697 return -1;
698 }
699 int32_t i = 0;
700 int32_t n = 0;
701 for (;;) {
702 UChar32 start = list[i++];
703 if (c < start) {
704 return -1;
705 }
706 UChar32 limit = list[i++];
707 if (c < limit) {
708 return n + c - start;
709 }
710 n += limit - start;
711 }
712}
713
714/**
715 * Returns the character at the given index within this set, where
716 * the set is ordered by ascending code point. If the index is
717 * out of range, return (UChar32)-1. The inverse of this method is
718 * <code>indexOf()</code>.
719 * @param index an index from 0..size()-1
720 * @return the character at the given index, or (UChar32)-1.
721 */
722UChar32 UnicodeSet::charAt(int32_t index) const {
723 if (index >= 0) {
724 // len2 is the largest even integer <= len, that is, it is len
725 // for even values and len-1 for odd values. With odd values
726 // the last entry is UNICODESET_HIGH.
727 int32_t len2 = len & ~1;
728 for (int32_t i=0; i < len2;) {
729 UChar32 start = list[i++];
730 int32_t count = list[i++] - start;
731 if (index < count) {
732 return static_cast<UChar32>(start + index);
733 }
734 index -= count;
735 }
736 }
737 return static_cast<UChar32>(-1);
738}
739
740/**
741 * Make this object represent the range <code>start - end</code>.
742 * If <code>end > start</code> then this object is set to an
743 * an empty range.
744 *
745 * @param start first character in the set, inclusive
746 * @rparam end last character in the set, inclusive
747 */
748UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
749 clear();
750 complement(start, end);
751 return *this;
752}
753
754/**
755 * Adds the specified range to this set if it is not already
756 * present. If this set already contains the specified range,
757 * the call leaves this set unchanged. If <code>end > start</code>
758 * then an empty range is added, leaving the set unchanged.
759 *
760 * @param start first character, inclusive, of range to be added
761 * to this set.
762 * @param end last character, inclusive, of range to be added
763 * to this set.
764 */
765UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
766 if (pinCodePoint(start) < pinCodePoint(end)) {
767 UChar32 limit = end + 1;
768 // Fast path for adding a new range after the last one.
769 // Odd list length: [..., lastStart, lastLimit, HIGH]
770 if ((len & 1) != 0) {
771 // If the list is empty, set lastLimit low enough to not be adjacent to 0.
772 UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
773 if (lastLimit <= start && !isFrozen() && !isBogus()) {
774 if (lastLimit == start) {
775 // Extend the last range.
776 list[len - 2] = limit;
777 if (limit == UNICODESET_HIGH0x0110000) {
778 --len;
779 }
780 } else {
781 list[len - 1] = start;
782 if (limit < UNICODESET_HIGH0x0110000) {
783 if (ensureCapacity(len + 2)) {
784 list[len++] = limit;
785 list[len++] = UNICODESET_HIGH0x0110000;
786 }
787 } else { // limit == UNICODESET_HIGH
788 if (ensureCapacity(len + 1)) {
789 list[len++] = UNICODESET_HIGH0x0110000;
790 }
791 }
792 }
793 releasePattern();
794 return *this;
795 }
796 }
797 // This is slow. Could be much faster using findCodePoint(start)
798 // and modifying the list, dealing with adjacent & overlapping ranges.
799 UChar32 range[3] = { start, limit, UNICODESET_HIGH0x0110000 };
800 add(range, 2, 0);
801 } else if (start == end) {
802 add(start);
803 }
804 return *this;
805}
806
807// #define DEBUG_US_ADD
808
809#ifdef DEBUG_US_ADD
810#include <stdio.h>
811void dump(UChar32 c) {
812 if (c <= 0xFF) {
813 printf("%c", (char)c);
814 } else {
815 printf("U+%04X", c);
816 }
817}
818void dump(const UChar32* list, int32_t len) {
819 printf("[");
820 for (int32_t i=0; i<len; ++i) {
821 if (i != 0) printf(", ");
822 dump(list[i]);
823 }
824 printf("]");
825}
826#endif
827
828/**
829 * Adds the specified character to this set if it is not already
830 * present. If this set already contains the specified character,
831 * the call leaves this set unchanged.
832 */
833UnicodeSet& UnicodeSet::add(UChar32 c) {
834 // find smallest i such that c < list[i]
835 // if odd, then it is IN the set
836 // if even, then it is OUT of the set
837 int32_t i = findCodePoint(pinCodePoint(c));
838
839 // already in set?
840 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
841
842 // HIGH is 0x110000
843 // assert(list[len-1] == HIGH);
844
845 // empty = [HIGH]
846 // [start_0, limit_0, start_1, limit_1, HIGH]
847
848 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
849 // ^
850 // list[i]
851
852 // i == 0 means c is before the first range
853
854#ifdef DEBUG_US_ADD
855 printf("Add of ");
856 dump(c);
857 printf(" found at %d", i);
858 printf(": ");
859 dump(list, len);
860 printf(" => ");
861#endif
862
863 if (c == list[i]-1) {
864 // c is before start of next range
865 list[i] = c;
866 // if we touched the HIGH mark, then add a new one
867 if (c == (UNICODESET_HIGH0x0110000 - 1)) {
868 if (!ensureCapacity(len+1)) {
869 // ensureCapacity will mark the object as Bogus if OOM failure happens.
870 return *this;
871 }
872 list[len++] = UNICODESET_HIGH0x0110000;
873 }
874 if (i > 0 && c == list[i-1]) {
875 // collapse adjacent ranges
876
877 // [..., start_k-1, c, c, limit_k, ..., HIGH]
878 // ^
879 // list[i]
880
881 //for (int32_t k=i-1; k<len-2; ++k) {
882 // list[k] = list[k+2];
883 //}
884 UChar32* dst = list + i - 1;
885 UChar32* src = dst + 2;
886 UChar32* srclimit = list + len;
887 while (src < srclimit) *(dst++) = *(src++);
888
889 len -= 2;
890 }
891 }
892
893 else if (i > 0 && c == list[i-1]) {
894 // c is after end of prior range
895 list[i-1]++;
896 // no need to check for collapse here
897 }
898
899 else {
900 // At this point we know the new char is not adjacent to
901 // any existing ranges, and it is not 10FFFF.
902
903
904 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
905 // ^
906 // list[i]
907
908 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
909 // ^
910 // list[i]
911
912 if (!ensureCapacity(len+2)) {
913 // ensureCapacity will mark the object as Bogus if OOM failure happens.
914 return *this;
915 }
916
917 UChar32 *p = list + i;
918 uprv_memmove(p + 2, p, (len - i) * sizeof(*p))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (p + 2 != __null) ? void (0) : __assert_fail
("p + 2 != __null", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__)); (static_cast <bool> (p != __null
) ? void (0) : __assert_fail ("p != __null", __builtin_FILE (
), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); clang
diagnostic pop :: memmove(p + 2, p, (len - i) * sizeof(*p))
; } while (false)
;
919 list[i] = c;
920 list[i+1] = c+1;
921 len += 2;
922 }
923
924#ifdef DEBUG_US_ADD
925 dump(list, len);
926 printf("\n");
927
928 for (i=1; i<len; ++i) {
929 if (list[i] <= list[i-1]) {
930 // Corrupt array!
931 printf("ERROR: list has been corrupted\n");
932 exit(1);
933 }
934 }
935#endif
936
937 releasePattern();
938 return *this;
939}
940
941/**
942 * Adds the specified multicharacter to this set if it is not already
943 * present. If this set already contains the multicharacter,
944 * the call leaves this set unchanged.
945 * Thus "ch" => {"ch"}
946 *
947 * @param s the source string
948 * @return the modified set, for chaining
949 */
950UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
951 if (isFrozen() || isBogus()) return *this;
952 int32_t cp = getSingleCP(s);
953 if (cp < 0) {
954 if (!stringsContains(s)) {
955 _add(s);
956 releasePattern();
957 }
958 } else {
959 add(static_cast<UChar32>(cp));
960 }
961 return *this;
962}
963
964/**
965 * Adds the given string, in order, to 'strings_'. The given string
966 * must have been checked by the caller to not already be in 'strings_'.
967 */
968void UnicodeSet::_add(const UnicodeString& s) {
969 if (isFrozen() || isBogus()) {
970 return;
971 }
972 UErrorCode ec = U_ZERO_ERROR;
973 if (strings_ == nullptr && !allocateStrings(ec)) {
974 setToBogus();
975 return;
976 }
977 UnicodeString* t = new UnicodeString(s);
978 if (t == nullptr) { // Check for memory allocation error.
979 setToBogus();
980 return;
981 }
982 strings_->sortedInsert(t, compareUnicodeString, ec);
983 if (U_FAILURE(ec)) {
984 setToBogus();
985 }
986}
987
988/**
989 * @return a code point IF the string consists of a single one.
990 * otherwise returns -1.
991 * @param string to test
992 */
993int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
994 int32_t sLength = s.length();
995 if (sLength == 1) return s.charAt(0);
996 if (sLength == 2) {
997 UChar32 cp = s.char32At(0);
998 if (cp > 0xFFFF) { // is surrogate pair
999 return cp;
1000 }
1001 }
1002 return -1;
1003}
1004
1005/**
1006 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1007 * If this set already any particular character, it has no effect on that character.
1008 * @param the source string
1009 * @return the modified set, for chaining
1010 */
1011UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1012 UChar32 cp;
1013 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)((uint32_t)(cp)<=0xffff ? 1 : 2)) {
1014 cp = s.char32At(i);
1015 add(cp);
1016 }
1017 return *this;
1018}
1019
1020/**
1021 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1022 * If this set already any particular character, it has no effect on that character.
1023 * @param the source string
1024 * @return the modified set, for chaining
1025 */
1026UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1027 UnicodeSet set;
1028 set.addAll(s);
1029 retainAll(set);
1030 return *this;
1031}
1032
1033/**
1034 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1035 * If this set already any particular character, it has no effect on that character.
1036 * @param the source string
1037 * @return the modified set, for chaining
1038 */
1039UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1040 UnicodeSet set;
1041 set.addAll(s);
1042 complementAll(set);
1043 return *this;
1044}
1045
1046/**
1047 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1048 * If this set already any particular character, it has no effect on that character.
1049 * @param the source string
1050 * @return the modified set, for chaining
1051 */
1052UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1053 UnicodeSet set;
1054 set.addAll(s);
1055 removeAll(set);
1056 return *this;
1057}
1058
1059UnicodeSet& UnicodeSet::removeAllStrings() {
1060 if (!isFrozen() && hasStrings()) {
1061 strings_->removeAllElements();
1062 releasePattern();
1063 }
1064 return *this;
1065}
1066
1067
1068/**
1069 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1070 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1071 * @param the source string
1072 * @return a newly created set containing the given string
1073 */
1074UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1075 UnicodeSet *set = new UnicodeSet();
1076 if (set != nullptr) { // Check for memory allocation error.
1077 set->add(s);
1078 }
1079 return set;
1080}
1081
1082
1083/**
1084 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1085 * @param the source string
1086 * @return a newly created set containing the given characters
1087 */
1088UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1089 UnicodeSet *set = new UnicodeSet();
1090 if (set != nullptr) { // Check for memory allocation error.
1091 set->addAll(s);
1092 }
1093 return set;
1094}
1095
1096/**
1097 * Retain only the elements in this set that are contained in the
1098 * specified range. If <code>end > start</code> then an empty range is
1099 * retained, leaving the set empty.
1100 *
1101 * @param start first character, inclusive, of range to be retained
1102 * to this set.
1103 * @param end last character, inclusive, of range to be retained
1104 * to this set.
1105 */
1106UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1107 if (pinCodePoint(start) <= pinCodePoint(end)) {
1108 UChar32 range[3] = { start, end+1, UNICODESET_HIGH0x0110000 };
1109 retain(range, 2, 0);
1110 } else {
1111 clear();
1112 }
1113 return *this;
1114}
1115
1116UnicodeSet& UnicodeSet::retain(UChar32 c) {
1117 return retain(c, c);
1118}
1119
1120UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
1121 if (isFrozen() || isBogus()) { return *this; }
1122 UChar32 cp = getSingleCP(s);
1123 if (cp < 0) {
1124 bool isIn = stringsContains(s);
1125 // Check for getRangeCount() first to avoid somewhat-expensive size()
1126 // when there are single code points.
1127 if (isIn && getRangeCount() == 0 && size() == 1) {
1128 return *this;
1129 }
1130 clear();
1131 if (isIn) {
1132 _add(s);
1133 }
1134 } else {
1135 retain(cp, cp);
1136 }
1137 return *this;
1138}
1139
1140/**
1141 * Removes the specified range from this set if it is present.
1142 * The set will not contain the specified range once the call
1143 * returns. If <code>end > start</code> then an empty range is
1144 * removed, leaving the set unchanged.
1145 *
1146 * @param start first character, inclusive, of range to be removed
1147 * from this set.
1148 * @param end last character, inclusive, of range to be removed
1149 * from this set.
1150 */
1151UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1152 if (pinCodePoint(start) <= pinCodePoint(end)) {
6
Taking true branch
1153 UChar32 range[3] = { start, end+1, UNICODESET_HIGH0x0110000 };
1154 retain(range, 2, 2);
7
Calling 'UnicodeSet::retain'
1155 }
1156 return *this;
1157}
1158
1159/**
1160 * Removes the specified character from this set if it is present.
1161 * The set will not contain the specified range once the call
1162 * returns.
1163 */
1164UnicodeSet& UnicodeSet::remove(UChar32 c) {
1165 return remove(c, c);
1166}
1167
1168/**
1169 * Removes the specified string from this set if it is present.
1170 * The set will not contain the specified character once the call
1171 * returns.
1172 * @param the source string
1173 * @return the modified set, for chaining
1174 */
1175UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1176 if (isFrozen() || isBogus()) return *this;
1
Assuming the condition is false
2
Assuming the condition is false
3
Taking false branch
1177 int32_t cp = getSingleCP(s);
1178 if (cp
3.1
'cp' is >= 0
< 0) {
4
Taking false branch
1179 if (strings_ != nullptr && strings_->removeElement((void*) &s)) {
1180 releasePattern();
1181 }
1182 } else {
1183 remove(static_cast<UChar32>(cp), static_cast<UChar32>(cp));
5
Calling 'UnicodeSet::remove'
1184 }
1185 return *this;
1186}
1187
1188/**
1189 * Complements the specified range in this set. Any character in
1190 * the range will be removed if it is in this set, or will be
1191 * added if it is not in this set. If <code>end > start</code>
1192 * then an empty range is xor'ed, leaving the set unchanged.
1193 *
1194 * @param start first character, inclusive, of range to be removed
1195 * from this set.
1196 * @param end last character, inclusive, of range to be removed
1197 * from this set.
1198 */
1199UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1200 if (isFrozen() || isBogus()) {
1201 return *this;
1202 }
1203 if (pinCodePoint(start) <= pinCodePoint(end)) {
1204 UChar32 range[3] = { start, end+1, UNICODESET_HIGH0x0110000 };
1205 exclusiveOr(range, 2, 0);
1206 }
1207 releasePattern();
1208 return *this;
1209}
1210
1211UnicodeSet& UnicodeSet::complement(UChar32 c) {
1212 return complement(c, c);
1213}
1214
1215/**
1216 * This is equivalent to
1217 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1218 */
1219UnicodeSet& UnicodeSet::complement() {
1220 if (isFrozen() || isBogus()) {
1221 return *this;
1222 }
1223 if (list[0] == UNICODESET_LOW0x000000) {
1224 uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (list != __null) ? void (0) : __assert_fail
("list != __null", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__)); (static_cast <bool> (list + 1 !=
__null) ? void (0) : __assert_fail ("list + 1 != __null", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); clang
diagnostic pop :: memmove(list, list + 1, (size_t)(len-1)*sizeof
(UChar32)); } while (false)
;
1225 --len;
1226 } else {
1227 if (!ensureCapacity(len+1)) {
1228 return *this;
1229 }
1230 uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (list + 1 != __null) ? void (0) :
__assert_fail ("list + 1 != __null", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__)); (static_cast <bool
> (list != __null) ? void (0) : __assert_fail ("list != __null"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
)); clang diagnostic pop :: memmove(list + 1, list, (size_t)
len*sizeof(UChar32)); } while (false)
;
1231 list[0] = UNICODESET_LOW0x000000;
1232 ++len;
1233 }
1234 releasePattern();
1235 return *this;
1236}
1237
1238/**
1239 * Complement the specified string in this set.
1240 * The set will not contain the specified string once the call
1241 * returns.
1242 *
1243 * @param s the string to complement
1244 * @return this object, for chaining
1245 */
1246UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1247 if (isFrozen() || isBogus()) return *this;
1248 int32_t cp = getSingleCP(s);
1249 if (cp < 0) {
1250 if (stringsContains(s)) {
1251 strings_->removeElement((void*) &s);
1252 } else {
1253 _add(s);
1254 }
1255 releasePattern();
1256 } else {
1257 complement(static_cast<UChar32>(cp), static_cast<UChar32>(cp));
1258 }
1259 return *this;
1260}
1261
1262/**
1263 * Adds all of the elements in the specified set to this set if
1264 * they're not already present. This operation effectively
1265 * modifies this set so that its value is the <i>union</i> of the two
1266 * sets. The behavior of this operation is unspecified if the specified
1267 * collection is modified while the operation is in progress.
1268 *
1269 * @param c set whose elements are to be added to this set.
1270 * @see #add(char, char)
1271 */
1272UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1273 if ( c.len>0 && c.list!=nullptr ) {
1274 add(c.list, c.len, 0);
1275 }
1276
1277 // Add strings in order
1278 if ( c.strings_!=nullptr ) {
1279 for (int32_t i=0; i<c.strings_->size(); ++i) {
1280 const UnicodeString* s = static_cast<const UnicodeString*>(c.strings_->elementAt(i));
1281 if (!stringsContains(*s)) {
1282 _add(*s);
1283 }
1284 }
1285 }
1286 return *this;
1287}
1288
1289/**
1290 * Retains only the elements in this set that are contained in the
1291 * specified set. In other words, removes from this set all of
1292 * its elements that are not contained in the specified set. This
1293 * operation effectively modifies this set so that its value is
1294 * the <i>intersection</i> of the two sets.
1295 *
1296 * @param c set that defines which elements this set will retain.
1297 */
1298UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1299 if (isFrozen() || isBogus()) {
1300 return *this;
1301 }
1302 retain(c.list, c.len, 0);
1303 if (hasStrings()) {
1304 if (!c.hasStrings()) {
1305 strings_->removeAllElements();
1306 } else {
1307 strings_->retainAll(*c.strings_);
1308 }
1309 }
1310 return *this;
1311}
1312
1313/**
1314 * Removes from this set all of its elements that are contained in the
1315 * specified set. This operation effectively modifies this
1316 * set so that its value is the <i>asymmetric set difference</i> of
1317 * the two sets.
1318 *
1319 * @param c set that defines which elements will be removed from
1320 * this set.
1321 */
1322UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1323 if (isFrozen() || isBogus()) {
1324 return *this;
1325 }
1326 retain(c.list, c.len, 2);
1327 if (hasStrings() && c.hasStrings()) {
1328 strings_->removeAll(*c.strings_);
1329 }
1330 return *this;
1331}
1332
1333/**
1334 * Complements in this set all elements contained in the specified
1335 * set. Any character in the other set will be removed if it is
1336 * in this set, or will be added if it is not in this set.
1337 *
1338 * @param c set that defines which elements will be xor'ed from
1339 * this set.
1340 */
1341UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1342 if (isFrozen() || isBogus()) {
1343 return *this;
1344 }
1345 exclusiveOr(c.list, c.len, 0);
1346
1347 if (c.strings_ != nullptr) {
1348 for (int32_t i=0; i<c.strings_->size(); ++i) {
1349 void* e = c.strings_->elementAt(i);
1350 if (strings_ == nullptr || !strings_->removeElement(e)) {
1351 _add(*static_cast<const UnicodeString*>(e));
1352 }
1353 }
1354 }
1355 return *this;
1356}
1357
1358/**
1359 * Removes all of the elements from this set. This set will be
1360 * empty after this call returns.
1361 */
1362UnicodeSet& UnicodeSet::clear() {
1363 if (isFrozen()) {
1364 return *this;
1365 }
1366 list[0] = UNICODESET_HIGH0x0110000;
1367 len = 1;
1368 releasePattern();
1369 if (strings_ != nullptr) {
1370 strings_->removeAllElements();
1371 }
1372 // Remove bogus
1373 fFlags = 0;
1374 return *this;
1375}
1376
1377/**
1378 * Iteration method that returns the number of ranges contained in
1379 * this set.
1380 * @see #getRangeStart
1381 * @see #getRangeEnd
1382 */
1383int32_t UnicodeSet::getRangeCount() const {
1384 return len/2;
1385}
1386
1387/**
1388 * Iteration method that returns the first character in the
1389 * specified range of this set.
1390 * @see #getRangeCount
1391 * @see #getRangeEnd
1392 */
1393UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1394 return list[index*2];
1395}
1396
1397/**
1398 * Iteration method that returns the last character in the
1399 * specified range of this set.
1400 * @see #getRangeStart
1401 * @see #getRangeEnd
1402 */
1403UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1404 return list[index*2 + 1] - 1;
1405}
1406
1407const UnicodeString* UnicodeSet::getString(int32_t index) const {
1408 return static_cast<const UnicodeString*>(strings_->elementAt(index));
1409}
1410
1411/**
1412 * Reallocate this objects internal structures to take up the least
1413 * possible space, without changing this object's value.
1414 */
1415UnicodeSet& UnicodeSet::compact() {
1416 if (isFrozen() || isBogus()) {
1417 return *this;
1418 }
1419 // Delete buffer first to defragment memory less.
1420 if (buffer != stackList) {
1421 uprv_freeuprv_free_77(buffer);
1422 buffer = nullptr;
1423 bufferCapacity = 0;
1424 }
1425 if (list == stackList) {
1426 // pass
1427 } else if (len <= INITIAL_CAPACITY) {
1428 uprv_memcpy(stackList, list, len * sizeof(UChar32))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (stackList != __null) ? void (0) :
__assert_fail ("stackList != __null", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__)); (static_cast <bool
> (list != __null) ? void (0) : __assert_fail ("list != __null"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
)); clang diagnostic pop :: memcpy(stackList, list, len * sizeof
(UChar32)); } while (false)
;
1429 uprv_freeuprv_free_77(list);
1430 list = stackList;
1431 capacity = INITIAL_CAPACITY;
1432 } else if ((len + 7) < capacity) {
1433 // If we have more than a little unused capacity, shrink it to len.
1434 UChar32* temp = static_cast<UChar32*>(uprv_reallocuprv_realloc_77(list, sizeof(UChar32) * len));
1435 if (temp) {
1436 list = temp;
1437 capacity = len;
1438 }
1439 // else what the heck happened?! We allocated less memory!
1440 // Oh well. We'll keep our original array.
1441 }
1442 if (strings_ != nullptr && strings_->isEmpty()) {
1443 delete strings_;
1444 strings_ = nullptr;
1445 }
1446 return *this;
1447}
1448
1449#ifdef DEBUG_SERIALIZE
1450#include <stdio.h>
1451#endif
1452
1453/**
1454 * Deserialize constructor.
1455 */
1456UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1457 UErrorCode &ec) {
1458
1459 if(U_FAILURE(ec)) {
1460 setToBogus();
1461 return;
1462 }
1463
1464 if( (serialization != kSerialized)
1465 || (data==nullptr)
1466 || (dataLen < 1)) {
1467 ec = U_ILLEGAL_ARGUMENT_ERROR;
1468 setToBogus();
1469 return;
1470 }
1471
1472 // bmp?
1473 int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1474 int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1475
1476 int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1477#ifdef DEBUG_SERIALIZE
1478 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1479#endif
1480 if(!ensureCapacity(newLength + 1)) { // +1 for HIGH
1481 return;
1482 }
1483 // copy bmp
1484 int32_t i;
1485 for(i = 0; i< bmpLength;i++) {
1486 list[i] = data[i+headerSize];
1487#ifdef DEBUG_SERIALIZE
1488 printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1489#endif
1490 }
1491 // copy smp
1492 for(i=bmpLength;i<newLength;i++) {
1493 list[i] = (static_cast<UChar32>(data[headerSize + bmpLength + (i - bmpLength) * 2 + 0]) << 16) +
1494 static_cast<UChar32>(data[headerSize + bmpLength + (i - bmpLength) * 2 + 1]);
1495#ifdef DEBUG_SERIALIZE
1496 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1497#endif
1498 }
1499 U_ASSERT(i == newLength)(static_cast <bool> (i == newLength) ? void (0) : __assert_fail
("i == newLength", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
1500 if (i == 0 || list[i - 1] != UNICODESET_HIGH0x0110000) {
1501 list[i++] = UNICODESET_HIGH0x0110000;
1502 }
1503 len = i;
1504}
1505
1506
1507int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1508 int32_t bmpLength, length, destLength;
1509
1510 if (U_FAILURE(ec)) {
1511 return 0;
1512 }
1513
1514 if (destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
1515 ec=U_ILLEGAL_ARGUMENT_ERROR;
1516 return 0;
1517 }
1518
1519 /* count necessary 16-bit units */
1520 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1521 // assert(length>=0);
1522 if (length==0) {
1523 /* empty set */
1524 if (destCapacity>0) {
1525 *dest=0;
1526 } else {
1527 ec=U_BUFFER_OVERFLOW_ERROR;
1528 }
1529 return 1;
1530 }
1531 /* now length>0 */
1532
1533 if (this->list[length-1]<=0xffff) {
1534 /* all BMP */
1535 bmpLength=length;
1536 } else if (this->list[0]>=0x10000) {
1537 /* all supplementary */
1538 bmpLength=0;
1539 length*=2;
1540 } else {
1541 /* some BMP, some supplementary */
1542 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1543 length=bmpLength+2*(length-bmpLength);
1544 }
1545#ifdef DEBUG_SERIALIZE
1546 printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1547#endif
1548 /* length: number of 16-bit array units */
1549 if (length>0x7fff) {
1550 /* there are only 15 bits for the length in the first serialized word */
1551 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1552 return 0;
1553 }
1554
1555 /*
1556 * total serialized length:
1557 * number of 16-bit array units (length) +
1558 * 1 length unit (always) +
1559 * 1 bmpLength unit (if there are supplementary values)
1560 */
1561 destLength=length+((length>bmpLength)?2:1);
1562 if (destLength<=destCapacity) {
1563 const UChar32 *p;
1564 int32_t i;
1565
1566#ifdef DEBUG_SERIALIZE
1567 printf("writeHdr\n");
1568#endif
1569 *dest = static_cast<uint16_t>(length);
1570 if (length>bmpLength) {
1571 *dest|=0x8000;
1572 *++dest = static_cast<uint16_t>(bmpLength);
1573 }
1574 ++dest;
1575
1576 /* write the BMP part of the array */
1577 p=this->list;
1578 for (i=0; i<bmpLength; ++i) {
1579#ifdef DEBUG_SERIALIZE
1580 printf("writebmp: %x\n", (int)*p);
1581#endif
1582 *dest++ = static_cast<uint16_t>(*p++);
1583 }
1584
1585 /* write the supplementary part of the array */
1586 for (; i<length; i+=2) {
1587#ifdef DEBUG_SERIALIZE
1588 printf("write32: %x\n", (int)*p);
1589#endif
1590 *dest++ = static_cast<uint16_t>(*p >> 16);
1591 *dest++ = static_cast<uint16_t>(*p++);
1592 }
1593 } else {
1594 ec=U_BUFFER_OVERFLOW_ERROR;
1595 }
1596 return destLength;
1597}
1598
1599//----------------------------------------------------------------
1600// Implementation: Utility methods
1601//----------------------------------------------------------------
1602
1603/**
1604 * Allocate our strings vector and return true if successful.
1605 */
1606UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1607 if (U_FAILURE(status)) {
1608 return false;
1609 }
1610 strings_ = new UVector(uprv_deleteUObjectuprv_deleteUObject_77,
1611 uhash_compareUnicodeStringuhash_compareUnicodeString_77, 1, status);
1612 if (strings_ == nullptr) { // Check for memory allocation error.
1613 status = U_MEMORY_ALLOCATION_ERROR;
1614 return false;
1615 }
1616 if (U_FAILURE(status)) {
1617 delete strings_;
1618 strings_ = nullptr;
1619 return false;
1620 }
1621 return true;
1622}
1623
1624int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1625 // Grow exponentially to reduce the frequency of allocations.
1626 if (minCapacity < INITIAL_CAPACITY) {
1627 return minCapacity + INITIAL_CAPACITY;
1628 } else if (minCapacity <= 2500) {
1629 return 5 * minCapacity;
1630 } else {
1631 int32_t newCapacity = 2 * minCapacity;
1632 if (newCapacity > MAX_LENGTH) {
1633 newCapacity = MAX_LENGTH;
1634 }
1635 return newCapacity;
1636 }
1637}
1638
1639bool UnicodeSet::ensureCapacity(int32_t newLen) {
1640 if (newLen > MAX_LENGTH) {
1641 newLen = MAX_LENGTH;
1642 }
1643 if (newLen <= capacity) {
1644 return true;
1645 }
1646 int32_t newCapacity = nextCapacity(newLen);
1647 UChar32* temp = static_cast<UChar32*>(uprv_mallocuprv_malloc_77(newCapacity * sizeof(UChar32)));
1648 if (temp == nullptr) {
1649 setToBogus(); // set the object to bogus state if an OOM failure occurred.
1650 return false;
1651 }
1652 // Copy only the actual contents.
1653 uprv_memcpy(temp, list, len * sizeof(UChar32))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (temp != __null) ? void (0) : __assert_fail
("temp != __null", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__)); (static_cast <bool> (list != __null
) ? void (0) : __assert_fail ("list != __null", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); clang
diagnostic pop :: memcpy(temp, list, len * sizeof(UChar32))
; } while (false)
;
1654 if (list != stackList) {
1655 uprv_freeuprv_free_77(list);
1656 }
1657 list = temp;
1658 capacity = newCapacity;
1659 return true;
1660}
1661
1662bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1663 if (newLen > MAX_LENGTH) {
1664 newLen = MAX_LENGTH;
1665 }
1666 if (newLen <= bufferCapacity) {
1667 return true;
1668 }
1669 int32_t newCapacity = nextCapacity(newLen);
1670 UChar32* temp = static_cast<UChar32*>(uprv_mallocuprv_malloc_77(newCapacity * sizeof(UChar32)));
1671 if (temp == nullptr) {
1672 setToBogus();
1673 return false;
1674 }
1675 // The buffer has no contents to be copied.
1676 // It is always filled from scratch after this call.
1677 if (buffer != stackList) {
1678 uprv_freeuprv_free_77(buffer);
1679 }
1680 buffer = temp;
1681 bufferCapacity = newCapacity;
1682 return true;
1683}
1684
1685/**
1686 * Swap list and buffer.
1687 */
1688void UnicodeSet::swapBuffers() {
1689 // swap list and buffer
1690 UChar32* temp = list;
1691 list = buffer;
1692 buffer = temp;
1693
1694 int32_t c = capacity;
1695 capacity = bufferCapacity;
1696 bufferCapacity = c;
1697}
1698
1699void UnicodeSet::setToBogus() {
1700 clear(); // Remove everything in the set.
1701 fFlags = kIsBogus;
1702}
1703
1704//----------------------------------------------------------------
1705// Implementation: Fundamental operators
1706//----------------------------------------------------------------
1707
1708static inline UChar32 max(UChar32 a, UChar32 b) {
1709 return (a > b) ? a : b;
1710}
1711
1712// polarity = 0, 3 is normal: x xor y
1713// polarity = 1, 2: x xor ~y == x === y
1714
1715void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1716 if (isFrozen() || isBogus()) {
1717 return;
1718 }
1719 if (!ensureBufferCapacity(len + otherLen)) {
1720 return;
1721 }
1722
1723 int32_t i = 0, j = 0, k = 0;
1724 UChar32 a = list[i++];
1725 UChar32 b;
1726 if (polarity == 1 || polarity == 2) {
1727 b = UNICODESET_LOW0x000000;
1728 if (other[j] == UNICODESET_LOW0x000000) { // skip base if already LOW
1729 ++j;
1730 b = other[j];
1731 }
1732 } else {
1733 b = other[j++];
1734 }
1735 // simplest of all the routines
1736 // sort the values, discarding identicals!
1737 for (;;) {
1738 if (a < b) {
1739 buffer[k++] = a;
1740 a = list[i++];
1741 } else if (b < a) {
1742 buffer[k++] = b;
1743 b = other[j++];
1744 } else if (a != UNICODESET_HIGH0x0110000) { // at this point, a == b
1745 // discard both values!
1746 a = list[i++];
1747 b = other[j++];
1748 } else { // DONE!
1749 buffer[k++] = UNICODESET_HIGH0x0110000;
1750 len = k;
1751 break;
1752 }
1753 }
1754 swapBuffers();
1755 releasePattern();
1756}
1757
1758// polarity = 0 is normal: x union y
1759// polarity = 2: x union ~y
1760// polarity = 1: ~x union y
1761// polarity = 3: ~x union ~y
1762
1763void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1764 if (isFrozen() || isBogus() || other==nullptr) {
1765 return;
1766 }
1767 if (!ensureBufferCapacity(len + otherLen)) {
1768 return;
1769 }
1770
1771 int32_t i = 0, j = 0, k = 0;
1772 UChar32 a = list[i++];
1773 UChar32 b = other[j++];
1774 // change from xor is that we have to check overlapping pairs
1775 // polarity bit 1 means a is second, bit 2 means b is.
1776 for (;;) {
1777 switch (polarity) {
1778 case 0: // both first; take lower if unequal
1779 if (a < b) { // take a
1780 // Back up over overlapping ranges in buffer[]
1781 if (k > 0 && a <= buffer[k-1]) {
1782 // Pick latter end value in buffer[] vs. list[]
1783 a = max(list[i], buffer[--k]);
1784 } else {
1785 // No overlap
1786 buffer[k++] = a;
1787 a = list[i];
1788 }
1789 i++; // Common if/else code factored out
1790 polarity ^= 1;
1791 } else if (b < a) { // take b
1792 if (k > 0 && b <= buffer[k-1]) {
1793 b = max(other[j], buffer[--k]);
1794 } else {
1795 buffer[k++] = b;
1796 b = other[j];
1797 }
1798 j++;
1799 polarity ^= 2;
1800 } else { // a == b, take a, drop b
1801 if (a == UNICODESET_HIGH0x0110000) goto loop_end;
1802 // This is symmetrical; it doesn't matter if
1803 // we backtrack with a or b. - liu
1804 if (k > 0 && a <= buffer[k-1]) {
1805 a = max(list[i], buffer[--k]);
1806 } else {
1807 // No overlap
1808 buffer[k++] = a;
1809 a = list[i];
1810 }
1811 i++;
1812 polarity ^= 1;
1813 b = other[j++];
1814 polarity ^= 2;
1815 }
1816 break;
1817 case 3: // both second; take higher if unequal, and drop other
1818 if (b <= a) { // take a
1819 if (a == UNICODESET_HIGH0x0110000) goto loop_end;
1820 buffer[k++] = a;
1821 } else { // take b
1822 if (b == UNICODESET_HIGH0x0110000) goto loop_end;
1823 buffer[k++] = b;
1824 }
1825 a = list[i++];
1826 polarity ^= 1; // factored common code
1827 b = other[j++];
1828 polarity ^= 2;
1829 break;
1830 case 1: // a second, b first; if b < a, overlap
1831 if (a < b) { // no overlap, take a
1832 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1833 } else if (b < a) { // OVERLAP, drop b
1834 b = other[j++];
1835 polarity ^= 2;
1836 } else { // a == b, drop both!
1837 if (a == UNICODESET_HIGH0x0110000) goto loop_end;
1838 a = list[i++];
1839 polarity ^= 1;
1840 b = other[j++];
1841 polarity ^= 2;
1842 }
1843 break;
1844 case 2: // a first, b second; if a < b, overlap
1845 if (b < a) { // no overlap, take b
1846 buffer[k++] = b;
1847 b = other[j++];
1848 polarity ^= 2;
1849 } else if (a < b) { // OVERLAP, drop a
1850 a = list[i++];
1851 polarity ^= 1;
1852 } else { // a == b, drop both!
1853 if (a == UNICODESET_HIGH0x0110000) goto loop_end;
1854 a = list[i++];
1855 polarity ^= 1;
1856 b = other[j++];
1857 polarity ^= 2;
1858 }
1859 break;
1860 }
1861 }
1862 loop_end:
1863 buffer[k++] = UNICODESET_HIGH0x0110000; // terminate
1864 len = k;
1865 swapBuffers();
1866 releasePattern();
1867}
1868
1869// polarity = 0 is normal: x intersect y
1870// polarity = 2: x intersect ~y == set-minus
1871// polarity = 1: ~x intersect y
1872// polarity = 3: ~x intersect ~y
1873
1874void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1875 if (isFrozen() || isBogus()) {
8
Assuming the condition is false
9
Assuming the condition is false
10
Taking false branch
1876 return;
1877 }
1878 if (!ensureBufferCapacity(len + otherLen)) {
11
Taking false branch
1879 return;
1880 }
1881
1882 int32_t i = 0, j = 0, k = 0;
1883 UChar32 a = list[i++];
1884 UChar32 b = other[j++];
1885 // change from xor is that we have to check overlapping pairs
1886 // polarity bit 1 means a is second, bit 2 means b is.
1887 for (;;) {
12
Loop condition is true. Entering loop body
17
Loop condition is true. Entering loop body
26
Loop condition is true. Entering loop body
1888 switch (polarity) {
13
Control jumps to 'case 2:' at line 1939
18
Control jumps to 'case 0:' at line 1889
27
Control jumps to 'case 3:' at line 1905
1889 case 0: // both first; drop the smaller
1890 if (a < b) { // drop a
19
Assuming 'a' is >= 'b'
20
Taking false branch
1891 a = list[i++];
1892 polarity ^= 1;
1893 } else if (b < a) { // drop b
21
Assuming 'b' is >= 'a'
22
Taking false branch
1894 b = other[j++];
1895 polarity ^= 2;
1896 } else { // a == b, take one, drop other
1897 if (a == UNICODESET_HIGH0x0110000) goto loop_end;
23
Assuming 'a' is not equal to UNICODESET_HIGH
24
Taking false branch
1898 buffer[k++] = a;
1899 a = list[i++];
1900 polarity ^= 1;
1901 b = other[j++];
1902 polarity ^= 2;
1903 }
1904 break;
25
Execution continues on line 1887
1905 case 3: // both second; take lower if unequal
1906 if (a < b) { // take a
28
Assuming 'a' is >= 'b'
29
Taking false branch
1907 buffer[k++] = a;
1908 a = list[i++];
1909 polarity ^= 1;
1910 } else if (b < a) { // take b
30
Assuming 'b' is < 'a'
31
Taking true branch
1911 buffer[k++] = b;
1912 b = other[j++];
32
Assigned value is uninitialized
1913 polarity ^= 2;
1914 } else { // a == b, take one, drop other
1915 if (a == UNICODESET_HIGH0x0110000) goto loop_end;
1916 buffer[k++] = a;
1917 a = list[i++];
1918 polarity ^= 1;
1919 b = other[j++];
1920 polarity ^= 2;
1921 }
1922 break;
1923 case 1: // a second, b first;
1924 if (a < b) { // NO OVERLAP, drop a
1925 a = list[i++];
1926 polarity ^= 1;
1927 } else if (b < a) { // OVERLAP, take b
1928 buffer[k++] = b;
1929 b = other[j++];
1930 polarity ^= 2;
1931 } else { // a == b, drop both!
1932 if (a == UNICODESET_HIGH0x0110000) goto loop_end;
1933 a = list[i++];
1934 polarity ^= 1;
1935 b = other[j++];
1936 polarity ^= 2;
1937 }
1938 break;
1939 case 2: // a first, b second; if a < b, overlap
1940 if (b < a) { // no overlap, drop b
14
Assuming 'b' is < 'a'
15
Taking true branch
1941 b = other[j++];
1942 polarity ^= 2;
1943 } else if (a < b) { // OVERLAP, take a
1944 buffer[k++] = a;
1945 a = list[i++];
1946 polarity ^= 1;
1947 } else { // a == b, drop both!
1948 if (a == UNICODESET_HIGH0x0110000) goto loop_end;
1949 a = list[i++];
1950 polarity ^= 1;
1951 b = other[j++];
1952 polarity ^= 2;
1953 }
1954 break;
16
Execution continues on line 1887
1955 }
1956 }
1957 loop_end:
1958 buffer[k++] = UNICODESET_HIGH0x0110000; // terminate
1959 len = k;
1960 swapBuffers();
1961 releasePattern();
1962}
1963
1964/**
1965 * Append the <code>toPattern()</code> representation of a
1966 * string to the given <code>StringBuffer</code>.
1967 */
1968void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
1969 UChar32 cp;
1970 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)((uint32_t)(cp)<=0xffff ? 1 : 2)) {
1971 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1972 }
1973}
1974
1975/**
1976 * Append the <code>toPattern()</code> representation of a
1977 * character to the given <code>StringBuffer</code>.
1978 */
1979void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
1980 if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
1981 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1982 // unprintable
1983 ICU_Utility::escape(buf, c);
1984 return;
1985 }
1986 // Okay to let ':' pass through
1987 switch (c) {
1988 case u'[':
1989 case u']':
1990 case u'-':
1991 case u'^':
1992 case u'&':
1993 case u'\\':
1994 case u'{':
1995 case u'}':
1996 case u':':
1997 case SymbolTable::SYMBOL_REF:
1998 buf.append(u'\\');
1999 break;
2000 default:
2001 // Escape whitespace
2002 if (PatternProps::isWhiteSpace(c)) {
2003 buf.append(u'\\');
2004 }
2005 break;
2006 }
2007 buf.append(c);
2008}
2009
2010void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
2011 UBool escapeUnprintable) {
2012 _appendToPat(result, start, escapeUnprintable);
2013 if (start != end) {
2014 if ((start+1) != end ||
2015 // Avoid writing what looks like a lead+trail surrogate pair.
2016 start == 0xdbff) {
2017 result.append(u'-');
2018 }
2019 _appendToPat(result, end, escapeUnprintable);
2020 }
2021}
2022
2023/**
2024 * Append a string representation of this set to result. This will be
2025 * a cleaned version of the string passed to applyPattern(), if there
2026 * is one. Otherwise it will be generated.
2027 */
2028UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2029 UBool escapeUnprintable) const
2030{
2031 if (pat != nullptr) {
2032 int32_t i;
2033 int32_t backslashCount = 0;
2034 for (i=0; i<patLen; ) {
2035 UChar32 c;
2036 U16_NEXT(pat, i, patLen, c)do { (c)=(pat)[(i)++]; if((((c)&0xfffffc00)==0xd800)) { uint16_t
__c2; if((i)!=(patLen) && (((__c2=(pat)[(i)])&0xfffffc00
)==0xdc00)) { ++(i); (c)=(((UChar32)((c))<<10UL)+(UChar32
)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); } } } while (
false)
;
2037 if (escapeUnprintable ?
2038 ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
2039 // If the unprintable character is preceded by an odd
2040 // number of backslashes, then it has been escaped.
2041 // Before unescaping it, we delete the final
2042 // backslash.
2043 if ((backslashCount % 2) == 1) {
2044 result.truncate(result.length() - 1);
2045 }
2046 ICU_Utility::escape(result, c);
2047 backslashCount = 0;
2048 } else {
2049 result.append(c);
2050 if (c == u'\\') {
2051 ++backslashCount;
2052 } else {
2053 backslashCount = 0;
2054 }
2055 }
2056 }
2057 return result;
2058 }
2059
2060 return _generatePattern(result, escapeUnprintable);
2061}
2062
2063/**
2064 * Returns a string representation of this set. If the result of
2065 * calling this function is passed to a UnicodeSet constructor, it
2066 * will produce another set that is equal to this one.
2067 */
2068UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2069 UBool escapeUnprintable) const
2070{
2071 result.truncate(0);
2072 return _toPattern(result, escapeUnprintable);
2073}
2074
2075/**
2076 * Generate and append a string representation of this set to result.
2077 * This does not use this.pat, the cleaned up copy of the string
2078 * passed to applyPattern().
2079 */
2080UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2081 UBool escapeUnprintable) const
2082{
2083 result.append(u'[');
2084
2085 int32_t i = 0;
2086 int32_t limit = len & ~1; // = 2 * getRangeCount()
2087
2088 // If the set contains at least 2 intervals and includes both
2089 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2090 // be more economical.
2091 // if (getRangeCount() >= 2 &&
2092 // getRangeStart(0) == MIN_VALUE &&
2093 // getRangeEnd(last) == MAX_VALUE)
2094 // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
2095 // If limit == len then len is even and the last range ends with MAX_VALUE.
2096 //
2097 // *But* do not write the inverse (complement) if there are strings.
2098 // Since ICU 70, the '^' performs a code point complement which removes all strings.
2099 if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
2100 // Emit the inverse
2101 result.append(u'^');
2102 // Offsetting the inversion list index by one lets us
2103 // iterate over the ranges of the set complement.
2104 i = 1;
2105 --limit;
2106 }
2107
2108 // Emit the ranges as pairs.
2109 while (i < limit) {
2110 UChar32 start = list[i]; // getRangeStart()
2111 UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
2112 if (!(0xd800 <= end && end <= 0xdbff)) {
2113 _appendToPat(result, start, end, escapeUnprintable);
2114 i += 2;
2115 } else {
2116 // The range ends with a lead surrogate.
2117 // Avoid writing what looks like a lead+trail surrogate pair.
2118 // 1. Postpone ranges that start with a lead surrogate code point.
2119 int32_t firstLead = i;
2120 while ((i += 2) < limit && list[i] <= 0xdbff) {}
2121 int32_t firstAfterLead = i;
2122 // 2. Write following ranges that start with a trail surrogate code point.
2123 while (i < limit && (start = list[i]) <= 0xdfff) {
2124 _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
2125 i += 2;
2126 }
2127 // 3. Now write the postponed ranges.
2128 for (int j = firstLead; j < firstAfterLead; j += 2) {
2129 _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
2130 }
2131 }
2132 }
2133
2134 if (strings_ != nullptr) {
2135 for (int32_t i = 0; i<strings_->size(); ++i) {
2136 result.append(u'{');
2137 _appendToPat(result,
2138 *static_cast<const UnicodeString*>(strings_->elementAt(i)),
2139 escapeUnprintable);
2140 result.append(u'}');
2141 }
2142 }
2143 return result.append(u']');
2144}
2145
2146/**
2147* Release existing cached pattern
2148*/
2149void UnicodeSet::releasePattern() {
2150 if (pat) {
2151 uprv_freeuprv_free_77(pat);
2152 pat = nullptr;
2153 patLen = 0;
2154 }
2155}
2156
2157/**
2158* Set the new pattern to cache.
2159*/
2160void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2161 releasePattern();
2162 pat = static_cast<char16_t*>(uprv_mallocuprv_malloc_77((newPatLen + 1) * sizeof(char16_t)));
2163 if (pat) {
2164 patLen = newPatLen;
2165 u_memcpyu_memcpy_77(pat, newPat, patLen);
2166 pat[patLen] = 0;
2167 }
2168 // else we don't care if malloc failed. This was just a nice cache.
2169 // We can regenerate an equivalent pattern later when requested.
2170}
2171
2172UnicodeSet *UnicodeSet::freeze() {
2173 if(!isFrozen() && !isBogus()) {
2174 compact();
2175
2176 // Optimize contains() and span() and similar functions.
2177 if (hasStrings()) {
2178 stringSpan = new UnicodeSetStringSpan(*this, *strings_, UnicodeSetStringSpan::ALL);
2179 if (stringSpan == nullptr) {
2180 setToBogus();
2181 return this;
2182 } else if (!stringSpan->needsStringSpanUTF16()) {
2183 // All strings are irrelevant for span() etc. because
2184 // all of each string's code points are contained in this set.
2185 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2186 // many relevant strings as UTF-16.
2187 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2188 delete stringSpan;
2189 stringSpan = nullptr;
2190 }
2191 }
2192 if (stringSpan == nullptr) {
2193 // No span-relevant strings: Optimize for code point spans.
2194 bmpSet=new BMPSet(list, len);
2195 if (bmpSet == nullptr) { // Check for memory allocation error.
2196 setToBogus();
2197 }
2198 }
2199 }
2200 return this;
2201}
2202
2203int32_t UnicodeSet::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
2204 if(length>0 && bmpSet!=nullptr) {
2205 return static_cast<int32_t>(bmpSet->span(s, s + length, spanCondition) - s);
2206 }
2207 if(length<0) {
2208 length=u_strlenu_strlen_77(s);
2209 }
2210 if(length==0) {
2211 return 0;
2212 }
2213 if(stringSpan!=nullptr) {
2214 return stringSpan->span(s, length, spanCondition);
2215 } else if(hasStrings()) {
2216 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2217 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2218 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2219 UnicodeSetStringSpan strSpan(*this, *strings_, which);
2220 if(strSpan.needsStringSpanUTF16()) {
2221 return strSpan.span(s, length, spanCondition);
2222 }
2223 }
2224
2225 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2226 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2227 }
2228
2229 UChar32 c;
2230 int32_t start=0, prev=0;
2231 do {
2232 U16_NEXT(s, start, length, c)do { (c)=(s)[(start)++]; if((((c)&0xfffffc00)==0xd800)) {
uint16_t __c2; if((start)!=(length) && (((__c2=(s)[(
start)])&0xfffffc00)==0xdc00)) { ++(start); (c)=(((UChar32
)((c))<<10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00
-0x10000)); } } } while (false)
;
2233 if(spanCondition!=contains(c)) {
2234 break;
2235 }
2236 } while((prev=start)<length);
2237 return prev;
2238}
2239
2240int32_t UnicodeSet::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
2241 if(length>0 && bmpSet!=nullptr) {
2242 return static_cast<int32_t>(bmpSet->spanBack(s, s + length, spanCondition) - s);
2243 }
2244 if(length<0) {
2245 length=u_strlenu_strlen_77(s);
2246 }
2247 if(length==0) {
2248 return 0;
2249 }
2250 if(stringSpan!=nullptr) {
2251 return stringSpan->spanBack(s, length, spanCondition);
2252 } else if(hasStrings()) {
2253 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2254 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2255 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2256 UnicodeSetStringSpan strSpan(*this, *strings_, which);
2257 if(strSpan.needsStringSpanUTF16()) {
2258 return strSpan.spanBack(s, length, spanCondition);
2259 }
2260 }
2261
2262 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2263 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2264 }
2265
2266 UChar32 c;
2267 int32_t prev=length;
2268 do {
2269 U16_PREV(s, 0, length, c)do { (c)=(s)[--(length)]; if((((c)&0xfffffc00)==0xdc00)) {
uint16_t __c2; if((length)>(0) && (((__c2=(s)[(length
)-1])&0xfffffc00)==0xd800)) { --(length); (c)=(((UChar32)
(__c2)<<10UL)+(UChar32)((c))-((0xd800<<10UL)+0xdc00
-0x10000)); } } } while (false)
;
2270 if(spanCondition!=contains(c)) {
2271 break;
2272 }
2273 } while((prev=length)>0);
2274 return prev;
2275}
2276
2277int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2278 if(length>0 && bmpSet!=nullptr) {
2279 const uint8_t* s0 = reinterpret_cast<const uint8_t*>(s);
2280 return static_cast<int32_t>(bmpSet->spanUTF8(s0, length, spanCondition) - s0);
2281 }
2282 if(length<0) {
2283 length = static_cast<int32_t>(uprv_strlen(s):: strlen(s));
2284 }
2285 if(length==0) {
2286 return 0;
2287 }
2288 if(stringSpan!=nullptr) {
2289 return stringSpan->spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
2290 } else if(hasStrings()) {
2291 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2292 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2293 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2294 UnicodeSetStringSpan strSpan(*this, *strings_, which);
2295 if(strSpan.needsStringSpanUTF8()) {
2296 return strSpan.spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
2297 }
2298 }
2299
2300 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2301 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2302 }
2303
2304 UChar32 c;
2305 int32_t start=0, prev=0;
2306 do {
2307 U8_NEXT_OR_FFFD(s, start, length, c)do { (c)=(uint8_t)(s)[(start)++]; if(!(((c)&0x80)==0)) { uint8_t
__t = 0; if((start)!=(length) && ((c)>=0xe0 ? ((c
)<0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[(c)&=0xf]&(1<<((__t=(s)[start])>>5)) &&
(__t&=0x3f, 1) : ((c)-=0xf0)<=4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t=(s)[start])>>4]&(1<<(c)) && ((c
)=((c)<<6)|(__t&0x3f), ++(start)!=(length)) &&
(__t=(s)[start]-0x80)<=0x3f) && ((c)=((c)<<
6)|__t, ++(start)!=(length)) : (c)>=0xc2 && ((c)&=
0x1f, 1)) && (__t=(s)[start]-0x80)<=0x3f &&
((c)=((c)<<6)|__t, ++(start), 1)) { } else { (c)=(0xfffd
); } } } while (false)
;
2308 if(spanCondition!=contains(c)) {
2309 break;
2310 }
2311 } while((prev=start)<length);
2312 return prev;
2313}
2314
2315int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2316 if(length>0 && bmpSet!=nullptr) {
2317 const uint8_t* s0 = reinterpret_cast<const uint8_t*>(s);
2318 return bmpSet->spanBackUTF8(s0, length, spanCondition);
2319 }
2320 if(length<0) {
2321 length = static_cast<int32_t>(uprv_strlen(s):: strlen(s));
2322 }
2323 if(length==0) {
2324 return 0;
2325 }
2326 if(stringSpan!=nullptr) {
2327 return stringSpan->spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
2328 } else if(hasStrings()) {
2329 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2330 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2331 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2332 UnicodeSetStringSpan strSpan(*this, *strings_, which);
2333 if(strSpan.needsStringSpanUTF8()) {
2334 return strSpan.spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
2335 }
2336 }
2337
2338 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2339 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2340 }
2341
2342 UChar32 c;
2343 int32_t prev=length;
2344 do {
2345 U8_PREV_OR_FFFD(s, 0, length, c)do { (c)=(uint8_t)(s)[--(length)]; if(!(((c)&0x80)==0)) {
(c)=utf8_prevCharSafeBody_77((const uint8_t *)s, 0, &(length
), c, -3); } } while (false)
;
2346 if(spanCondition!=contains(c)) {
2347 break;
2348 }
2349 } while((prev=length)>0);
2350 return prev;
2351}
2352
2353U_NAMESPACE_END}