Bug Summary

File:root/firefox-clang/intl/icu/source/common/normalizer2impl.h
Warning:line 641, column 25
Addition of a null pointer (via field 'extraData') and a probably nonzero integer value may result in undefined behavior

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -O2 -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name normalizer2impl.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -ffp-contract=off -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -fcoverage-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -resource-dir /usr/lib/llvm-22/lib/clang/22 -include /root/firefox-clang/config/gcc_hidden.h -include /root/firefox-clang/obj-x86_64-pc-linux-gnu/mozilla-config.h -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/system_wrappers -U _FORTIFY_SOURCE -D _FORTIFY_SOURCE=2 -D _GLIBCXX_ASSERTIONS -D DEBUG=1 -D U_COMMON_IMPLEMENTATION -D _LIBCPP_DISABLE_DEPRECATION_WARNINGS -D U_USING_ICU_NAMESPACE=0 -D U_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -D U_HIDE_OBSOLETE_UTF_OLD_H=1 -D UCONFIG_NO_LEGACY_CONVERSION -D UCONFIG_NO_TRANSLITERATION -D UCONFIG_NO_REGULAR_EXPRESSIONS -D UCONFIG_NO_BREAK_ITERATION -D UCONFIG_NO_IDNA -D UCONFIG_NO_MF2 -D U_CHARSET_IS_UTF8 -D UNISTR_FROM_CHAR_EXPLICIT=explicit -D UNISTR_FROM_STRING_EXPLICIT=explicit -D U_ENABLE_DYLOAD=0 -D U_DEBUG=1 -I 
/root/firefox-clang/config/external/icu/common -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/common -I /root/firefox-clang/intl/icu/source/i18n -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nspr -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nss -D MOZILLA_CLIENT -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/15/../../../../include/c++/15 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/15/../../../../include/x86_64-linux-gnu/c++/15 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/15/../../../../include/c++/15/backward -internal-isystem /usr/lib/llvm-22/lib/clang/22/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/15/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -Wno-error=pessimizing-move -Wno-error=large-by-value-copy=128 -Wno-error=implicit-int-float-conversion -Wno-error=thread-safety-analysis -Wno-error=tautological-type-limit-compare -Wno-invalid-offsetof -Wno-range-loop-analysis -Wno-deprecated-anon-enum-enum-conversion -Wno-deprecated-enum-enum-conversion -Wno-deprecated-this-capture -Wno-inline-new-delete -Wno-error=deprecated-declarations -Wno-error=array-bounds -Wno-error=free-nonheap-object -Wno-error=atomic-alignment -Wno-error=deprecated-builtins -Wno-psabi -Wno-error=builtin-macro-redefined -Wno-vla-cxx-extension -Wno-unknown-warning-option -Wno-character-conversion -Wno-comma -Wno-implicit-const-int-float-conversion -Wno-macro-redefined -Wno-microsoft-include -Wno-tautological-unsigned-enum-zero-compare -Wno-unreachable-code-loop-increment -Wno-unreachable-code-return -fdeprecated-macro -ferror-limit 19 -fstrict-flex-arrays=1 -stack-protector 2 -fstack-clash-protection -ftrivial-auto-var-init=pattern -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fno-sized-deallocation 
-fno-aligned-allocation -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -fdwarf2-cfi-asm -o /tmp/scan-build-2026-01-17-100050-2808198-1 -x c++ /root/firefox-clang/intl/icu/source/common/normalizer2impl.cpp

/root/firefox-clang/intl/icu/source/common/normalizer2impl.cpp

1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2impl.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19// #define UCPTRIE_DEBUG
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_NORMALIZATION0
24
25#include "unicode/bytestream.h"
26#include "unicode/edits.h"
27#include "unicode/normalizer2.h"
28#include "unicode/stringoptions.h"
29#include "unicode/ucptrie.h"
30#include "unicode/udata.h"
31#include "unicode/umutablecptrie.h"
32#include "unicode/ustring.h"
33#include "unicode/utf16.h"
34#include "unicode/utf8.h"
35#include "bytesinkutil.h"
36#include "cmemory.h"
37#include "mutex.h"
38#include "normalizer2impl.h"
39#include "putilimp.h"
40#include "uassert.h"
41#include "ucptrie_impl.h"
42#include "uset_imp.h"
43#include "uvector.h"
44
45U_NAMESPACE_BEGINnamespace icu_77 {
46
47namespace {
48
49/**
50 * UTF-8 lead byte for minNoMaybeCP.
51 * Can be lower than the actual lead byte for c.
52 * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
53 */
54inline uint8_t leadByteForCP(UChar32 c) {
55 if (c <= 0x7f) {
56 return static_cast<uint8_t>(c);
57 } else if (c <= 0x7ff) {
58 return static_cast<uint8_t>(0xc0 + (c >> 6));
59 } else {
60 // Should not occur because ccc(U+0300)!=0.
61 return 0xe0;
62 }
63}
64
65/**
66 * Returns the code point from one single well-formed UTF-8 byte sequence
67 * between cpStart and cpLimit.
68 *
69 * Trie UTF-8 macros do not assemble whole code points (for efficiency).
70 * When we do need the code point, we call this function.
71 * We should not need it for normalization-inert data (norm16==0).
72 * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
73 */
74UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {
75 // Similar to U8_NEXT_UNSAFE(s, i, c).
76 U_ASSERT(cpStart < cpLimit)(static_cast <bool> (cpStart < cpLimit) ? void (0) :
__assert_fail ("cpStart < cpLimit", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__))
;
77 uint8_t c = *cpStart;
78 switch(cpLimit-cpStart) {
79 case 1:
80 return c;
81 case 2:
82 return ((c&0x1f)<<6) | (cpStart[1]&0x3f);
83 case 3:
84 // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (char16_t)
85 return static_cast<char16_t>((c << 12) | ((cpStart[1] & 0x3f) << 6) | (cpStart[2] & 0x3f));
86 case 4:
87 return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);
88 default:
89 UPRV_UNREACHABLE_EXITabort(); // Should not occur.
90 }
91}
92
93/**
94 * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
95 * Otherwise returns a negative value.
96 */
97UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {
98 if ((p - start) >= 3) {
99 p -= 3;
100 uint8_t l = *p;
101 uint8_t t1, t2;
102 if (0xe1 <= l && l <= 0xed &&
103 (t1 = static_cast<uint8_t>(p[1] - 0x80)) <= 0x3f &&
104 (t2 = static_cast<uint8_t>(p[2] - 0x80)) <= 0x3f &&
105 (l < 0xed || t1 <= 0x1f)) {
106 return ((l & 0xf) << 12) | (t1 << 6) | t2;
107 }
108 }
109 return U_SENTINEL(-1);
110}
111
/**
 * Checks whether [src, limit[ begins with a single Jamo T code point.
 * Returns that code point's offset from the Jamo T base (a positive value) if so,
 * otherwise a negative value.
 */
int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
    // Jamo T: E1 86 A8..E1 87 82
    if ((limit - src) < 3 || src[0] != 0xe1) {
        return -1;
    }
    const uint8_t second = src[1];
    const uint8_t third = src[2];
    if (second == 0x86) {
        // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
        // Offset 0 does not correspond to any conjoining Jamo.
        if (0xa8 <= third && third <= 0xbf) {
            return third - 0xa7;
        }
    } else if (second == 0x87) {
        // Trail bytes 80..82 continue the Jamo T range; 0x40 accounts for the
        // 64 code points covered by the previous second byte.
        if (0x80 <= third && third <= 0x82) {
            return third - (0xa7 - 0x40);
        }
    }
    return -1;
}
135
136void
137appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
138 ByteSink &sink, Edits *edits) {
139 char buffer[U8_MAX_LENGTH4];
140 int32_t length;
141 int32_t cpLength = static_cast<int32_t>(cpLimit - cpStart);
142 if (cpLength == 1) {
143 // The builder makes ASCII map to ASCII.
144 buffer[0] = static_cast<uint8_t>(*cpStart + delta);
145 length = 1;
146 } else {
147 int32_t trail = *(cpLimit-1) + delta;
148 if (0x80 <= trail && trail <= 0xbf) {
149 // The delta only changes the last trail byte.
150 --cpLimit;
151 length = 0;
152 do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit);
153 buffer[length++] = static_cast<uint8_t>(trail);
154 } else {
155 // Decode the code point, add the delta, re-encode.
156 UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
157 length = 0;
158 U8_APPEND_UNSAFE(buffer, length, c)do { uint32_t __uc=(c); if(__uc<=0x7f) { (buffer)[(length)
++]=(uint8_t)__uc; } else { if(__uc<=0x7ff) { (buffer)[(length
)++]=(uint8_t)((__uc>>6)|0xc0); } else { if(__uc<=0xffff
) { (buffer)[(length)++]=(uint8_t)((__uc>>12)|0xe0); } else
{ (buffer)[(length)++]=(uint8_t)((__uc>>18)|0xf0); (buffer
)[(length)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); } (
buffer)[(length)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80
); } (buffer)[(length)++]=(uint8_t)((__uc&0x3f)|0x80); } }
while (false)
;
159 }
160 }
161 if (edits != nullptr) {
162 edits->addReplace(cpLength, length);
163 }
164 sink.Append(buffer, length);
165}
166
167} // namespace
168
169// ReorderingBuffer -------------------------------------------------------- ***
170
171ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
172 UErrorCode &errorCode) :
173 impl(ni), str(dest),
174 start(str.getBuffer(8)), reorderStart(start), limit(start),
175 remainingCapacity(str.getCapacity()), lastCC(0) {
176 if (start == nullptr && U_SUCCESS(errorCode)) {
177 // getBuffer() already did str.setToBogus()
178 errorCode = U_MEMORY_ALLOCATION_ERROR;
179 }
180}
181
182UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
183 int32_t length=str.length();
184 start=str.getBuffer(destCapacity);
185 if(start==nullptr) {
186 // getBuffer() already did str.setToBogus()
187 errorCode=U_MEMORY_ALLOCATION_ERROR;
188 return false;
189 }
190 limit=start+length;
191 remainingCapacity=str.getCapacity()-length;
192 reorderStart=start;
193 if(start==limit) {
194 lastCC=0;
195 } else {
196 setIterator();
197 lastCC=previousCC();
198 // Set reorderStart after the last code point with cc<=1 if there is one.
199 if(lastCC>1) {
200 while(previousCC()>1) {}
201 }
202 reorderStart=codePointLimit;
203 }
204 return true;
205}
206
207UBool ReorderingBuffer::equals(const char16_t *otherStart, const char16_t *otherLimit) const {
208 int32_t length = static_cast<int32_t>(limit - start);
209 return
210 length == static_cast<int32_t>(otherLimit - otherStart) &&
211 0==u_memcmpu_memcmp_77(start, otherStart, length);
212}
213
214UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
215 U_ASSERT((otherLimit - otherStart) <= INT32_MAX)(static_cast <bool> ((otherLimit - otherStart) <= (2147483647
)) ? void (0) : __assert_fail ("(otherLimit - otherStart) <= (2147483647)"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
; // ensured by caller
216 int32_t length = static_cast<int32_t>(limit - start);
217 int32_t otherLength = static_cast<int32_t>(otherLimit - otherStart);
218 // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
219 if (otherLength < length || (otherLength / 3) > length) {
220 return false;
221 }
222 // Compare valid strings from between normalization boundaries.
223 // (Invalid sequences are normalization-inert.)
224 for (int32_t i = 0, j = 0;;) {
225 if (i >= length) {
226 return j >= otherLength;
227 } else if (j >= otherLength) {
228 return false;
229 }
230 // Not at the end of either string yet.
231 UChar32 c, other;
232 U16_NEXT_UNSAFE(start, i, c)do { (c)=(start)[(i)++]; if((((c)&0xfffffc00)==0xd800)) {
(c)=(((UChar32)((c))<<10UL)+(UChar32)((start)[(i)++])-
((0xd800<<10UL)+0xdc00-0x10000)); } } while (false)
;
233 U8_NEXT_UNSAFE(otherStart, j, other)do { (other)=(uint8_t)(otherStart)[(j)++]; if(!(((other)&
0x80)==0)) { if((other)<0xe0) { (other)=(((other)&0x1f
)<<6)|((otherStart)[(j)++]&0x3f); } else if((other)
<0xf0) { (other)=(UChar)(((other)<<12)|(((otherStart
)[j]&0x3f)<<6)|((otherStart)[(j)+1]&0x3f)); (j)
+=2; } else { (other)=(((other)&7)<<18)|(((otherStart
)[j]&0x3f)<<12)|(((otherStart)[(j)+1]&0x3f)<<
6)|((otherStart)[(j)+2]&0x3f); (j)+=3; } } } while (false
)
;
234 if (c != other) {
235 return false;
236 }
237 }
238}
239
240UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
241 if(remainingCapacity<2 && !resize(2, errorCode)) {
242 return false;
243 }
244 if(lastCC<=cc || cc==0) {
245 limit[0]=U16_LEAD(c)(UChar)(((c)>>10)+0xd7c0);
246 limit[1]=U16_TRAIL(c)(UChar)(((c)&0x3ff)|0xdc00);
247 limit+=2;
248 lastCC=cc;
249 if(cc<=1) {
250 reorderStart=limit;
251 }
252 } else {
253 insert(c, cc);
254 }
255 remainingCapacity-=2;
256 return true;
257}
258
259UBool ReorderingBuffer::append(const char16_t *s, int32_t length, UBool isNFD,
260 uint8_t leadCC, uint8_t trailCC,
261 UErrorCode &errorCode) {
262 if(length==0) {
263 return true;
264 }
265 if(remainingCapacity<length && !resize(length, errorCode)) {
266 return false;
267 }
268 remainingCapacity-=length;
269 if(lastCC<=leadCC || leadCC==0) {
270 if(trailCC<=1) {
271 reorderStart=limit+length;
272 } else if(leadCC<=1) {
273 reorderStart=limit+1; // Ok if not a code point boundary.
274 }
275 const char16_t *sLimit=s+length;
276 do { *limit++=*s++; } while(s!=sLimit);
277 lastCC=trailCC;
278 } else {
279 int32_t i=0;
280 UChar32 c;
281 U16_NEXT(s, i, length, c)do { (c)=(s)[(i)++]; if((((c)&0xfffffc00)==0xd800)) { uint16_t
__c2; if((i)!=(length) && (((__c2=(s)[(i)])&0xfffffc00
)==0xdc00)) { ++(i); (c)=(((UChar32)((c))<<10UL)+(UChar32
)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); } } } while (
false)
;
282 insert(c, leadCC); // insert first code point
283 while(i<length) {
284 U16_NEXT(s, i, length, c)do { (c)=(s)[(i)++]; if((((c)&0xfffffc00)==0xd800)) { uint16_t
__c2; if((i)!=(length) && (((__c2=(s)[(i)])&0xfffffc00
)==0xdc00)) { ++(i); (c)=(((UChar32)((c))<<10UL)+(UChar32
)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); } } } while (
false)
;
285 if(i<length) {
286 if (isNFD) {
287 leadCC = Normalizer2Impl::getCCFromYesOrMaybeYes(impl.getRawNorm16(c));
288 } else {
289 leadCC = impl.getCC(impl.getNorm16(c));
290 }
291 } else {
292 leadCC=trailCC;
293 }
294 append(c, leadCC, errorCode);
295 }
296 }
297 return true;
298}
299
300UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
301 int32_t cpLength=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
302 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
303 return false;
304 }
305 remainingCapacity-=cpLength;
306 if(cpLength==1) {
307 *limit++ = static_cast<char16_t>(c);
308 } else {
309 limit[0]=U16_LEAD(c)(UChar)(((c)>>10)+0xd7c0);
310 limit[1]=U16_TRAIL(c)(UChar)(((c)&0x3ff)|0xdc00);
311 limit+=2;
312 }
313 lastCC=0;
314 reorderStart=limit;
315 return true;
316}
317
318UBool ReorderingBuffer::appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode) {
319 if(s==sLimit) {
320 return true;
321 }
322 int32_t length = static_cast<int32_t>(sLimit - s);
323 if(remainingCapacity<length && !resize(length, errorCode)) {
324 return false;
325 }
326 u_memcpyu_memcpy_77(limit, s, length);
327 limit+=length;
328 remainingCapacity-=length;
329 lastCC=0;
330 reorderStart=limit;
331 return true;
332}
333
334void ReorderingBuffer::remove() {
335 reorderStart=limit=start;
336 remainingCapacity=str.getCapacity();
337 lastCC=0;
338}
339
340void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
341 if(suffixLength<(limit-start)) {
342 limit-=suffixLength;
343 remainingCapacity+=suffixLength;
344 } else {
345 limit=start;
346 remainingCapacity=str.getCapacity();
347 }
348 lastCC=0;
349 reorderStart=limit;
350}
351
352UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
353 int32_t reorderStartIndex = static_cast<int32_t>(reorderStart - start);
354 int32_t length = static_cast<int32_t>(limit - start);
355 str.releaseBuffer(length);
356 int32_t newCapacity=length+appendLength;
357 int32_t doubleCapacity=2*str.getCapacity();
358 if(newCapacity<doubleCapacity) {
359 newCapacity=doubleCapacity;
360 }
361 if(newCapacity<256) {
362 newCapacity=256;
363 }
364 start=str.getBuffer(newCapacity);
365 if(start==nullptr) {
366 // getBuffer() already did str.setToBogus()
367 errorCode=U_MEMORY_ALLOCATION_ERROR;
368 return false;
369 }
370 reorderStart=start+reorderStartIndex;
371 limit=start+length;
372 remainingCapacity=str.getCapacity()-length;
373 return true;
374}
375
376void ReorderingBuffer::skipPrevious() {
377 codePointLimit=codePointStart;
378 char16_t c=*--codePointStart;
379 if(U16_IS_TRAIL(c)(((c)&0xfffffc00)==0xdc00) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))(((*(codePointStart-1))&0xfffffc00)==0xd800)) {
380 --codePointStart;
381 }
382}
383
384uint8_t ReorderingBuffer::previousCC() {
385 codePointLimit=codePointStart;
386 if(reorderStart>=codePointStart) {
387 return 0;
388 }
389 UChar32 c=*--codePointStart;
390 char16_t c2;
391 if(U16_IS_TRAIL(c)(((c)&0xfffffc00)==0xdc00) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))(((c2=*(codePointStart-1))&0xfffffc00)==0xd800)) {
392 --codePointStart;
393 c=U16_GET_SUPPLEMENTARY(c2, c)(((UChar32)(c2)<<10UL)+(UChar32)(c)-((0xd800<<10UL
)+0xdc00-0x10000))
;
394 }
395 return impl.getCCFromYesOrMaybeYesCP(c);
396}
397
398// Inserts c somewhere before the last character.
399// Requires 0<cc<lastCC which implies reorderStart<limit.
400void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
401 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
402 // insert c at codePointLimit, after the character with prevCC<=cc
403 char16_t *q=limit;
404 char16_t *r=limit+=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
405 do {
406 *--r=*--q;
407 } while(codePointLimit!=q);
408 writeCodePoint(q, c);
409 if(cc<=1) {
410 reorderStart=r;
411 }
412}
413
414// Normalizer2Impl --------------------------------------------------------- ***
415
// Data holder for canonical-iterator support.
// Only the declaration is visible here; the constructor, destructor and
// addToStartSet() are defined outside the visible part of this listing.
// NOTE(review): presumably the type behind Normalizer2Impl::fCanonIterData,
// which ~Normalizer2Impl deletes — confirm against normalizer2impl.h.
416struct CanonIterData : public UMemory {
417 CanonIterData(UErrorCode &errorCode);
418 ~CanonIterData();
419 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
// NOTE(review): presumably the mutable (builder) form of the trie below — confirm.
420 UMutableCPTrie *mutableTrie;
// addCanonIterPropertyStarts() reads fCanonIterData->trie via ucptrie_getRange().
421 UCPTrie *trie;
422 UVector canonStartSets; // contains UnicodeSet *
423};
424
425Normalizer2Impl::~Normalizer2Impl() {
426 delete fCanonIterData;
427}
428
429void
430Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,
431 const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
432 minDecompNoCP = static_cast<char16_t>(inIndexes[IX_MIN_DECOMP_NO_CP]);
433 minCompNoMaybeCP = static_cast<char16_t>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);
434 minLcccCP = static_cast<char16_t>(inIndexes[IX_MIN_LCCC_CP]);
435
436 minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);
437 minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);
438 minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);
439 minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
440 minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
441 minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);
442 limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);
443 minMaybeNo = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO]);
444 minMaybeNoCombinesFwd = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO_COMBINES_FWD]);
445 minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);
446 U_ASSERT((minMaybeNo & 7) == 0)(static_cast <bool> ((minMaybeNo & 7) == 0) ? void (
0) : __assert_fail ("(minMaybeNo & 7) == 0", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
; // 8-aligned for noNoDelta bit fields
447 centerNoNoDelta = (minMaybeNo >> DELTA_SHIFT) - MAX_DELTA - 1;
448
449 normTrie=inTrie;
450 extraData=inExtraData;
451 smallFCD=inSmallFCD;
452}
453
454U_CDECL_BEGINextern "C" {
455
456static uint32_t U_CALLCONV
457segmentStarterMapper(const void * /*context*/, uint32_t value) {
458 return value&CANON_NOT_SEGMENT_STARTER0x80000000;
459}
460
461U_CDECL_END}
462
463void
464Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
465 UChar32 start = 0, end;
466 uint32_t norm16;
467 while ((end = ucptrie_getRangeucptrie_getRange_77(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
468 nullptr, nullptr, &norm16)) >= 0) {
469 if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
470 norm16 != Normalizer2Impl::JAMO_VT) {
471 set.add(start, end);
472 } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
473 uint16_t fcd16 = getFCD16(start);
474 if (fcd16 > 0xff) { set.add(start, end); }
475 }
476 start = end + 1;
477 }
478}
479
480void
481Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
482 // Add the start code point of each same-value range of the trie.
483 UChar32 start = 0, end;
484 uint32_t value;
485 while ((end = ucptrie_getRangeucptrie_getRange_77(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
486 nullptr, nullptr, &value)) >= 0) {
487 sa->add(sa->set, start);
488 if (start != end && isAlgorithmicNoNo(static_cast<uint16_t>(value)) &&
489 (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
490 // Range of code points with same-norm16-value algorithmic decompositions.
491 // They might have different non-zero FCD16 values.
492 uint16_t prevFCD16 = getFCD16(start);
493 while (++start <= end) {
494 uint16_t fcd16 = getFCD16(start);
495 if (fcd16 != prevFCD16) {
496 sa->add(sa->set, start);
497 prevFCD16 = fcd16;
498 }
499 }
500 }
501 start = end + 1;
502 }
503
504 /* add Hangul LV syllables and LV+1 because of skippables */
505 for(char16_t c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
506 sa->add(sa->set, c);
507 sa->add(sa->set, c+1);
508 }
509 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
510}
511
512void
513Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
514 // Add the start code point of each same-value range of the canonical iterator data trie.
515 if (!ensureCanonIterData(errorCode)) { return; }
516 // Currently only used for the SEGMENT_STARTER property.
517 UChar32 start = 0, end;
518 uint32_t value;
519 while ((end = ucptrie_getRangeucptrie_getRange_77(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
520 segmentStarterMapper, nullptr, &value)) >= 0) {
521 sa->add(sa->set, start);
522 start = end + 1;
523 }
524}
525
526const char16_t *
527Normalizer2Impl::copyLowPrefixFromNulTerminated(const char16_t *src,
528 UChar32 minNeedDataCP,
529 ReorderingBuffer *buffer,
530 UErrorCode &errorCode) const {
531 // Make some effort to support NUL-terminated strings reasonably.
532 // Take the part of the fast quick check loop that does not look up
533 // data and check the first part of the string.
534 // After this prefix, determine the string length to simplify the rest
535 // of the code.
536 const char16_t *prevSrc=src;
537 char16_t c;
538 while((c=*src++)<minNeedDataCP && c!=0) {}
539 // Back out the last character for full processing.
540 // Copy this prefix.
541 if(--src!=prevSrc) {
542 if(buffer!=nullptr) {
543 buffer->appendZeroCC(prevSrc, src, errorCode);
544 }
545 }
546 return src;
547}
548
549UnicodeString &
550Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
551 UErrorCode &errorCode) const {
552 if(U_FAILURE(errorCode)) {
553 dest.setToBogus();
554 return dest;
555 }
556 const char16_t *sArray=src.getBuffer();
557 if(&dest==&src || sArray==nullptr) {
558 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
559 dest.setToBogus();
560 return dest;
561 }
562 decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
563 return dest;
564}
565
566void
567Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit,
568 UnicodeString &dest,
569 int32_t destLengthEstimate,
570 UErrorCode &errorCode) const {
571 if(destLengthEstimate<0 && limit!=nullptr) {
572 destLengthEstimate = static_cast<int32_t>(limit - src);
573 }
574 dest.remove();
575 ReorderingBuffer buffer(*this, dest);
576 if(buffer.init(destLengthEstimate, errorCode)) {
577 decompose(src, limit, &buffer, errorCode);
578 }
579}
580
581// Dual functionality:
582// buffer!=nullptr: normalize
583// buffer==nullptr: isNormalized/spanQuickCheckYes
584const char16_t *
585Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit,
586 ReorderingBuffer *buffer,
587 UErrorCode &errorCode) const {
588 UChar32 minNoCP=minDecompNoCP;
589 if(limit==nullptr) {
590 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
591 if(U_FAILURE(errorCode)) {
592 return src;
593 }
594 limit=u_strchru_strchr_77(src, 0);
595 }
596
597 const char16_t *prevSrc;
598 UChar32 c=0;
599 uint16_t norm16=0;
600
601 // only for quick check
602 const char16_t *prevBoundary=src;
603 uint8_t prevCC=0;
604
605 for(;;) {
606 // count code units below the minimum or with irrelevant data for the quick check
607 for(prevSrc=src; src!=limit;) {
608 if( (c=*src)<minNoCP ||
609 isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((int32_t)(normTrie)->index[(c)
>> UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK
))])
)
610 ) {
611 ++src;
612 } else if(!U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800)) {
613 break;
614 } else {
615 char16_t c2;
616 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])(((c2=src[1])&0xfffffc00)==0xdc00)) {
617 c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
;
618 norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((c) >= (normTrie)->highStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c))])
;
619 if(isMostDecompYesAndZeroCC(norm16)) {
620 src+=2;
621 } else {
622 break;
623 }
624 } else {
625 ++src; // unpaired lead surrogate: inert
626 }
627 }
628 }
629 // copy these code units all at once
630 if(src!=prevSrc) {
631 if(buffer!=nullptr) {
632 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
633 break;
634 }
635 } else {
636 prevCC=0;
637 prevBoundary=src;
638 }
639 }
640 if(src==limit) {
641 break;
642 }
643
644 // Check one above-minimum, relevant code point.
645 src+=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
646 if(buffer!=nullptr) {
647 if(!decompose(c, norm16, *buffer, errorCode)) {
648 break;
649 }
650 } else {
651 if(isDecompYes(norm16)) {
652 uint8_t cc=getCCFromYesOrMaybeYes(norm16);
653 if(prevCC<=cc || cc==0) {
654 prevCC=cc;
655 if(cc<=1) {
656 prevBoundary=src;
657 }
658 continue;
659 }
660 }
661 return prevBoundary; // "no" or cc out of order
662 }
663 }
664 return src;
665}
666
667// Decompose a short piece of text which is likely to contain characters that
668// fail the quick check loop and/or where the quick check loop's overhead
669// is unlikely to be amortized.
670// Called by the compose() and makeFCD() implementations.
671const char16_t *
672Normalizer2Impl::decomposeShort(const char16_t *src, const char16_t *limit,
673 UBool stopAtCompBoundary, UBool onlyContiguous,
674 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
675 if (U_FAILURE(errorCode)) {
676 return nullptr;
677 }
678 while(src<limit) {
679 if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
680 return src;
681 }
682 const char16_t *prevSrc = src;
683 UChar32 c;
684 uint16_t norm16;
685 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16)do { (c) = *(src)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (src
) != (limit) && (((__c2 = *(src))&0xfffffc00)==0xdc00
)) { ++(src); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false)
;
686 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
687 return prevSrc;
688 }
689 if(!decompose(c, norm16, buffer, errorCode)) {
690 return nullptr;
691 }
692 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
693 return src;
694 }
695 }
696 return src;
697}
698
699UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
700 ReorderingBuffer &buffer,
701 UErrorCode &errorCode) const {
702 // get the decomposition and the lead and trail cc's
703 if (norm16 >= limitNoNo) {
704 if (isMaybeYesOrNonZeroCC(norm16)) {
705 return buffer.append(c, getCCFromYesOrMaybeYes(norm16), errorCode);
706 } else if (norm16 < minMaybeNo) {
707 // Maps to an isCompYesAndZeroCC.
708 c=mapAlgorithmic(c, norm16);
709 norm16=getRawNorm16(c);
710 }
711 }
712 if (norm16 < minYesNo) {
713 // c does not decompose
714 return buffer.append(c, 0, errorCode);
715 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
716 // Hangul syllable: decompose algorithmically
717 char16_t jamos[3];
718 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
719 }
720 // c decomposes, get everything from the variable-length extra data
721 const uint16_t *mapping=getData(norm16);
722 uint16_t firstUnit=*mapping;
723 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
724 uint8_t leadCC, trailCC;
725 trailCC = static_cast<uint8_t>(firstUnit >> 8);
726 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
727 leadCC = static_cast<uint8_t>(*(mapping - 1) >> 8);
728 } else {
729 leadCC=0;
730 }
731 return buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true, leadCC, trailCC, errorCode);
732}
733
734// Dual functionality:
735// sink != nullptr: normalize
736// sink == nullptr: isNormalized/spanQuickCheckYes
//
// Decomposes the UTF-8 text [src, limit[; limit must not be nullptr (asserted below).
// With a sink, unchanged stretches go through ByteSinkUtil::appendUnchanged() and
// decomposed stretches through ByteSinkUtil::appendChange(), both with the optional edits.
// Returns:
// - src (== limit) once the whole input has been scanned;
// - in quick-check mode (sink == nullptr), prevBoundary at the first character that
//   is a "no" or whose cc is out of order;
// - the current src when the loop is left early because a sink append returned false
//   or errorCode indicates a failure.
737const uint8_t *
738Normalizer2Impl::decomposeUTF8(uint32_t options,
739 const uint8_t *src, const uint8_t *limit,
740 ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
741 U_ASSERT(limit != nullptr)(static_cast <bool> (limit != nullptr) ? void (0) : __assert_fail
("limit != nullptr", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
742 UnicodeString s16;
// Lead bytes below this value are skipped in the fast path without a trie lookup.
743 uint8_t minNoLead = leadByteForCP(minDecompNoCP);
744
// Start of the text that has not yet been written to the sink.
745 const uint8_t *prevBoundary = src;
746 // only for quick check
747 uint8_t prevCC = 0;
748
749 for (;;) {
750 // Fast path: Scan over a sequence of characters below the minimum "no" code point,
751 // or with (decompYes && ccc==0) properties.
752 const uint8_t *fastStart = src;
753 const uint8_t *prevSrc;
754 uint16_t norm16 = 0;
755
756 for (;;) {
757 if (src == limit) {
// End of input: flush whatever is still pending and finish.
758 if (prevBoundary != limit && sink != nullptr) {
759 ByteSinkUtil::appendUnchanged(prevBoundary, limit,
760 *sink, options, edits, errorCode);
761 }
762 return src;
763 }
764 if (*src < minNoLead) {
765 ++src;
766 } else {
767 prevSrc = src;
768 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16)do { int32_t __lead = (uint8_t)*(src)++; if (!(((__lead)&
0x80)==0)) { uint8_t __t1, __t2, __t3; if ((src) != (limit) &&
(__lead >= 0xe0 ? __lead < 0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[__lead &= 0xf] & (1 << ((__t1 = *(src)) >>
5)) && ++(src) != (limit) && (__t2 = *(src) -
0x80) <= 0x3f && (__lead = ((int32_t)(normTrie)->
index[(__lead << 6) + (__t1 & 0x3f)]) + __t2, 1) : (
__lead -= 0xf0) <= 4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t1 = *(src)) >> 4] & (1 << __lead) &&
(__lead = (__lead << 6) | (__t1 & 0x3f), ++(src) !=
(limit)) && (__t2 = *(src) - 0x80) <= 0x3f &&
++(src) != (limit) && (__t3 = *(src) - 0x80) <= 0x3f
&& (__lead = __lead >= (normTrie)->shifted12HighStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallU8Index_77((normTrie), __lead, __t2, __t3
), 1) : __lead >= 0xc2 && (__t1 = *(src) - 0x80) <=
0x3f && (__lead = (int32_t)(normTrie)->index[__lead
& 0x1f] + __t1, 1))) { ++(src); } else { __lead = (normTrie
)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (
norm16) = ((normTrie)->data.ptr16[__lead]); } while (false
)
;
769 if (!isMostDecompYesAndZeroCC(norm16)) {
770 break;
771 }
772 }
773 }
774 // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
775 // and the current character at [prevSrc..src[ is not a common case with cc=0
776 // (MIN_NORMAL_MAYBE_YES or JAMO_VT).
777 // It could still be a maybeYes with cc=0.
778 if (prevSrc != fastStart) {
779 // The fast path looped over yes/0 characters before the current one.
780 if (sink != nullptr &&
781 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
782 *sink, options, edits, errorCode)) {
783 break;
784 }
785 prevBoundary = prevSrc;
786 prevCC = 0;
787 }
788
789 // Medium-fast path: Quick check.
790 if (isMaybeYesOrNonZeroCC(norm16)) {
791 // Does not decompose.
792 uint8_t cc = getCCFromYesOrMaybeYes(norm16);
793 if (prevCC <= cc || cc == 0) {
794 prevCC = cc;
// Only for cc <= 1 is the text flushed and prevBoundary advanced;
// for larger cc values prevBoundary stays behind the current character.
795 if (cc <= 1) {
796 if (sink != nullptr &&
797 !ByteSinkUtil::appendUnchanged(prevBoundary, src,
798 *sink, options, edits, errorCode)) {
799 break;
800 }
801 prevBoundary = src;
802 }
803 continue;
804 }
805 }
806 if (sink == nullptr) {
807 return prevBoundary; // quick check: "no" or cc out of order
808 }
809
810 // Slow path
811 // Decompose up to and including the current character.
812 if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
813 if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
814 *sink, options, edits, errorCode)) {
815 break;
816 }
817 prevBoundary = prevSrc;
818 }
819 ReorderingBuffer buffer(*this, s16, errorCode);
820 if (U_FAILURE(errorCode)) {
821 break;
822 }
823 decomposeShort(prevBoundary, src, STOP_AT_LIMIT, false /* onlyContiguous */,
824 buffer, errorCode);
825 // Decompose until the next boundary.
826 if (buffer.getLastCC() > 1) {
827 src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, false /* onlyContiguous */,
828 buffer, errorCode);
829 }
830 if (U_FAILURE(errorCode)) {
831 break;
832 }
// The span handed to buffer.equals() below is [prevBoundary, src[, which can be
// longer than [prevSrc, src[, so the length guard must cover that same span.
833 if ((src - prevBoundary) > INT32_MAX(2147483647)) { // guard before buffer.equals()
834 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
835 break;
836 }
837 // We already know there was a change if the original character decomposed;
838 // otherwise compare.
839 if (isMaybeYesOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
840 if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
841 *sink, options, edits, errorCode)) {
842 break;
843 }
844 } else {
845 if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
846 *sink, edits, errorCode)) {
847 break;
848 }
849 }
850 prevBoundary = src;
851 prevCC = 0;
852 }
// Reached only via break: an append failed or errorCode reports a failure.
853 return src;
854}
855
// Decomposes the UTF-8 text [src, limit[ into the ReorderingBuffer, one code point at a time.
// stopAt selects where the loop may stop early:
// - STOP_AT_LIMIT: no early stop; the loop runs until src reaches limit.
// - Any other value: returns prevSrc (the start of the current character) when
//   norm16 < minNoNoCompNoMaybeCC or when the character maps algorithmically.
// - STOP_AT_DECOMP_BOUNDARY: additionally returns prevSrc before a character with
//   cc==0 / leadCC==0, and returns src after an append when buffer.getLastCC() <= 1.
// - STOP_AT_COMP_BOUNDARY: additionally returns src after a character for which
//   norm16HasCompBoundaryAfter(norm16, onlyContiguous) is true.
// Returns nullptr if errorCode is already a failure on entry or if a buffer append
// returns false; otherwise the position where processing stopped.
856const uint8_t *
857Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
858 StopAt stopAt, UBool onlyContiguous,
859 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
860 if (U_FAILURE(errorCode)) {
861 return nullptr;
862 }
863 while (src < limit) {
864 const uint8_t *prevSrc = src;
865 uint16_t norm16;
866 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16)do { int32_t __lead = (uint8_t)*(src)++; if (!(((__lead)&
0x80)==0)) { uint8_t __t1, __t2, __t3; if ((src) != (limit) &&
(__lead >= 0xe0 ? __lead < 0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[__lead &= 0xf] & (1 << ((__t1 = *(src)) >>
5)) && ++(src) != (limit) && (__t2 = *(src) -
0x80) <= 0x3f && (__lead = ((int32_t)(normTrie)->
index[(__lead << 6) + (__t1 & 0x3f)]) + __t2, 1) : (
__lead -= 0xf0) <= 4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t1 = *(src)) >> 4] & (1 << __lead) &&
(__lead = (__lead << 6) | (__t1 & 0x3f), ++(src) !=
(limit)) && (__t2 = *(src) - 0x80) <= 0x3f &&
++(src) != (limit) && (__t3 = *(src) - 0x80) <= 0x3f
&& (__lead = __lead >= (normTrie)->shifted12HighStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallU8Index_77((normTrie), __lead, __t2, __t3
), 1) : __lead >= 0xc2 && (__t1 = *(src) - 0x80) <=
0x3f && (__lead = (int32_t)(normTrie)->index[__lead
& 0x1f] + __t1, 1))) { ++(src); } else { __lead = (normTrie
)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (
norm16) = ((normTrie)->data.ptr16[__lead]); } while (false
)
;
867 // Get the decomposition and the lead and trail cc's.
// c stays negative until the code point is actually needed; it is decoded lazily
// from [prevSrc, src[ via codePointFromValidUTF8().
868 UChar32 c = U_SENTINEL(-1);
869 if (norm16 >= limitNoNo) {
870 if (isMaybeYesOrNonZeroCC(norm16)) {
871 // No comp boundaries around this character.
872 uint8_t cc = getCCFromYesOrMaybeYes(norm16);
873 if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
874 return prevSrc;
875 }
876 c = codePointFromValidUTF8(prevSrc, src);
877 if (!buffer.append(c, cc, errorCode)) {
878 return nullptr;
879 }
880 if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {
881 return src;
882 }
883 continue;
884 } else if (norm16 < minMaybeNo) {
885 // Maps to an isCompYesAndZeroCC.
886 if (stopAt != STOP_AT_LIMIT) {
887 return prevSrc;
888 }
889 c = codePointFromValidUTF8(prevSrc, src);
890 c = mapAlgorithmic(c, norm16);
// Continue below with the properties of the mapped code point.
891 norm16 = getRawNorm16(c);
892 }
893 } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
894 return prevSrc;
895 }
896 // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
897 // We do not see invalid UTF-8 here because
898 // its norm16==INERT is normalization-inert,
899 // so it gets copied unchanged in the fast path,
900 // and we stop the slow path where invalid UTF-8 begins.
901 // c >= 0 is the result of an algorithmic mapping.
902 U_ASSERT(c >= 0 || norm16 != INERT)(static_cast <bool> (c >= 0 || norm16 != INERT) ? void
(0) : __assert_fail ("c >= 0 || norm16 != INERT", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
903 if (norm16 < minYesNo) {
904 if (c < 0) {
905 c = codePointFromValidUTF8(prevSrc, src);
906 }
907 // does not decompose
908 if (!buffer.append(c, 0, errorCode)) {
909 return nullptr;
910 }
911 } else if (isHangulLV(norm16) || isHangulLVT(norm16)) {
912 // Hangul syllable: decompose algorithmically
913 if (c < 0) {
914 c = codePointFromValidUTF8(prevSrc, src);
915 }
916 char16_t jamos[3];
917 if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
918 return nullptr;
919 }
920 } else {
921 // The character decomposes, get everything from the variable-length extra data.
922 const uint16_t *mapping = getData(norm16);
// First unit: mapping length in the low bits (MAPPING_LENGTH_MASK), trail cc in the high byte.
923 uint16_t firstUnit = *mapping;
924 int32_t length = firstUnit & MAPPING_LENGTH_MASK;
925 uint8_t trailCC = static_cast<uint8_t>(firstUnit >> 8);
926 uint8_t leadCC;
927 if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
// The optional ccc/lccc word sits immediately before the first unit.
928 leadCC = static_cast<uint8_t>(*(mapping - 1) >> 8);
929 } else {
930 leadCC = 0;
931 }
932 if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
933 return prevSrc;
934 }
935 if (!buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true, leadCC, trailCC, errorCode)) {
936 return nullptr;
937 }
938 }
939 if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
940 (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {
941 return src;
942 }
943 }
944 return src;
945}
946
// Returns the decomposition of c and sets length, or returns nullptr if c does not decompose
// (c<minDecompNoCP, its norm16 isMaybeYesOrNonZeroCC, or norm16<minYesNo without an
// algorithmic mapping).
// A non-null result points either into the caller's buffer (algorithmic mapping, Hangul)
// or into the variable-length extra data; length is set on every path that returns non-null.
947const char16_t *
948Normalizer2Impl::getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const {
949 uint16_t norm16;
950 if(c<minDecompNoCP || isMaybeYesOrNonZeroCC(norm16=getNorm16(c))) {
951 // c does not decompose
952 return nullptr;
953 }
954 const char16_t *decomp = nullptr;
955 if(isDecompNoAlgorithmic(norm16)) {
956 // Maps to an isCompYesAndZeroCC.
957 c=mapAlgorithmic(c, norm16);
958 decomp=buffer;
959 length=0;
960 U16_APPEND_UNSAFE(buffer, length, c)do { if((uint32_t)(c)<=0xffff) { (buffer)[(length)++]=(uint16_t
)(c); } else { (buffer)[(length)++]=(uint16_t)(((c)>>10
)+0xd7c0); (buffer)[(length)++]=(uint16_t)(((c)&0x3ff)|0xdc00
); } } while (false)
;
961 // The mapping might decompose further.
962 norm16 = getRawNorm16(c);
963 }
964 if (norm16 < minYesNo) {
// Either the algorithmically mapped character (decomp==buffer) or nullptr.
965 return decomp;
966 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
967 // Hangul syllable: decompose algorithmically
968 length=Hangul::decompose(c, buffer);
969 return buffer;
970 }
971 // c decomposes, get everything from the variable-length extra data
972 const uint16_t *mapping=getData(norm16);
973 length=*mapping&MAPPING_LENGTH_MASK;
// The mapping text starts right after the first unit.
974 return reinterpret_cast<const char16_t*>(mapping) + 1;
975}
976
// Returns the raw decomposition of c and sets length, or returns nullptr
// if c does not decompose (c<minDecompNoCP or its norm16 isDecompYes).
// The result points into the caller's buffer (Hangul, algorithmic mapping, or a raw
// mapping rebuilt from rm0 plus the tail of the normal mapping) or into the extra data.
//
977// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
978// so that a raw mapping fits that consists of one unit ("rm0")
979// plus all but the first two code units of the normal mapping.
980// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
981const char16_t *
982Normalizer2Impl::getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const {
983 uint16_t norm16;
984 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
985 // c does not decompose
986 return nullptr;
987 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
988 // Hangul syllable: decompose algorithmically
989 Hangul::getRawDecomposition(c, buffer);
990 length=2;
991 return buffer;
992 } else if(isDecompNoAlgorithmic(norm16)) {
993 c=mapAlgorithmic(c, norm16);
994 length=0;
995 U16_APPEND_UNSAFE(buffer, length, c)do { if((uint32_t)(c)<=0xffff) { (buffer)[(length)++]=(uint16_t
)(c); } else { (buffer)[(length)++]=(uint16_t)(((c)>>10
)+0xd7c0); (buffer)[(length)++]=(uint16_t)(((c)&0x3ff)|0xdc00
); } } while (false)
;
996 return buffer;
997 }
998 // c decomposes, get everything from the variable-length extra data
999 const uint16_t *mapping=getData(norm16);
1000 uint16_t firstUnit=*mapping;
1001 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
1002 if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
1003 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
1004 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
1005 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
1006 uint16_t rm0=*rawMapping;
1007 if(rm0<=MAPPING_LENGTH_MASK) {
// rm0 is a length: the raw mapping text is the rm0 units stored just before rawMapping.
1008 length=rm0;
1009 return reinterpret_cast<const char16_t*>(rawMapping) - rm0;
1010 } else {
1011 // Copy the normal mapping and replace its first two code units with rm0.
1012 buffer[0] = static_cast<char16_t>(rm0);
1013 u_memcpyu_memcpy_77(buffer + 1, reinterpret_cast<const char16_t*>(mapping) + 1 + 2, mLength - 2);
1014 length=mLength-1;
1015 return buffer;
1016 }
1017 } else {
// No separate raw mapping: the raw mapping is the normal mapping.
1018 length=mLength;
1019 return reinterpret_cast<const char16_t*>(mapping) + 1;
1020 }
1021}
1022
// Appends [src, limit[ to the ReorderingBuffer.
// First copies the buffer's reorderable suffix into safeMiddle.
// doDecompose: the text is run through decompose() into the buffer.
// !doDecompose: the text is appended as is; only the leading run of characters with
// cc!=0 is appended with its first and last cc, the rest via appendZeroCC().
// limit may be nullptr; it is then replaced by the position of the first NUL found
// from p before appendZeroCC() is called.
1023void Normalizer2Impl::decomposeAndAppend(const char16_t *src, const char16_t *limit,
1024 UBool doDecompose,
1025 UnicodeString &safeMiddle,
1026 ReorderingBuffer &buffer,
1027 UErrorCode &errorCode) const {
1028 buffer.copyReorderableSuffixTo(safeMiddle);
1029 if(doDecompose) {
1030 decompose(src, limit, &buffer, errorCode);
1031 return;
1032 }
1033 // Just merge the strings at the boundary.
1034 bool isFirst = true;
1035 uint8_t firstCC = 0, prevCC = 0, cc;
1036 const char16_t *p = src;
// Scan the leading characters with cc!=0; stop at (and back up to) the first one with cc==0.
1037 while (p != limit) {
1038 const char16_t *codePointStart = p;
1039 UChar32 c;
1040 uint16_t norm16;
1041 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16)do { (c) = *(p)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (p
) != (limit) && (((__c2 = *(p))&0xfffffc00)==0xdc00
)) { ++(p); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false)
;
1042 if ((cc = getCC(norm16)) == 0) {
1043 p = codePointStart;
1044 break;
1045 }
1046 if (isFirst) {
1047 firstCC = cc;
1048 isFirst = false;
1049 }
1050 prevCC = cc;
1051 }
1052 if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr
1053 limit=u_strchru_strchr_77(p, 0);
1054 }
1055
// [src, p[ is the leading cc!=0 run (possibly empty); [p, limit[ is appended unchanged.
1056 if (buffer.append(src, static_cast<int32_t>(p - src), false, firstCC, prevCC, errorCode)) {
1057 buffer.appendZeroCC(p, limit, errorCode);
1058 }
1059}
1060
// Returns true if there is a decomposition boundary before c.
// Shortcuts that avoid the trie lookup: c below minLcccCP, or a BMP code point for which
// singleLeadMightHaveNonZeroFCD16(c) is false; otherwise decided from c's norm16.
1061UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
1062 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1063 norm16HasDecompBoundaryBefore(getNorm16(c));
1064}
1065
// Returns true if a character with this norm16 value has a decomposition boundary before it:
// - always for norm16 < minNoNoCompNoMaybeCC;
// - for norm16 >= limitNoNo, only if norm16 <= MIN_NORMAL_MAYBE_YES or norm16 == JAMO_VT;
// - otherwise, if the mapping has no ccc/lccc word or the high byte of that word is zero.
1066UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
1067 if (norm16 < minNoNoCompNoMaybeCC) {
1068 return true;
1069 }
1070 if (norm16 >= limitNoNo) {
1071 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1072 }
1073 // c decomposes, get everything from the variable-length extra data
1074 const uint16_t *mapping=getDataForYesOrNo(norm16);
1075 uint16_t firstUnit=*mapping;
1076 // true if leadCC==0 (hasFCDBoundaryBefore())
1077 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
1078}
1079
// Returns true if there is a decomposition boundary after c.
// Shortcuts that avoid the trie lookup: c below minDecompNoCP, or a BMP code point for which
// singleLeadMightHaveNonZeroFCD16(c) is false; otherwise decided from c's norm16.
1080UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
1081 if (c < minDecompNoCP) {
1082 return true;
1083 }
1084 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1085 return true;
1086 }
1087 return norm16HasDecompBoundaryAfter(getNorm16(c));
1088}
1089
// Returns true if a character with this norm16 value has a decomposition boundary after it.
// Values up to and including minYesNo and Hangul LVT values always do; values at or above
// limitNoNo are decided from the norm16 value itself (maybeYes/non-zero-cc range, or the
// DELTA_TCCC bits of an algorithmic mapping); everything else is decided from the first
// unit of the mapping in the extra data and, if present, its ccc/lccc word.
1090UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
1091 if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1092 return true;
1093 }
1094 if (norm16 >= limitNoNo) {
1095 if (isMaybeYesOrNonZeroCC(norm16)) {
1096 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1097 } else if (norm16 < minMaybeNo) {
1098 // Maps to an isCompYesAndZeroCC.
1099 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1100 }
1101 }
1102 // c decomposes, get everything from the variable-length extra data
1103 const uint16_t *mapping=getData(norm16);
1104 uint16_t firstUnit=*mapping;
1105 // decomp after-boundary: same as hasFCDBoundaryAfter(),
1106 // fcd16<=1 || trailCC==0
// The trail cc is the high byte of firstUnit, so the comparisons below test it directly.
1107 if(firstUnit>0x1ff) {
1108 return false; // trailCC>1
1109 }
1110 if(firstUnit<=0xff) {
1111 return true; // trailCC==0
1112 }
1113 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1114 // true if leadCC==0 (hasFCDBoundaryBefore())
1115 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
1116}
1117
1118/*
1119 * Finds the recomposition result for
1120 * a forward-combining "lead" character,
1121 * specified with a pointer to its compositions list,
1122 * and a backward-combining "trail" character.
1123 *
1124 * If the lead and trail characters combine, then this function returns
1125 * the following "compositeAndFwd" value:
1126 * Bits 21..1 composite character
1127 * Bit 0 set if the composite is a forward-combining starter
1128 * otherwise it returns -1.
1129 *
1130 * The compositions list has (trail, compositeAndFwd) pair entries,
1131 * encoded as either pairs or triples of 16-bit units.
1132 * The last entry has the high bit of its first unit set.
1133 *
1134 * The list is sorted by ascending trail characters (there are no duplicates).
1135 * A linear search is used.
1136 *
1137 * See normalizer2impl.h for a more detailed description
1138 * of the compositions list format.
1139 */
// list is dereferenced unconditionally, so it must point to a compositions list.
// Callers in this file pass only code points in 0..0x10ffff as trail
// (see the range check in composePair()).
1140int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
1141 uint16_t key1, firstUnit;
1142 if(trail<COMP_1_TRAIL_LIMIT) {
1143 // trail character is 0..33FF
1144 // result entry may have 2 or 3 units
1145 key1 = static_cast<uint16_t>(trail << 1);
// Skip entries whose first unit is smaller than the key; each entry is 2 or 3 units long.
1146 while(key1>(firstUnit=*list)) {
1147 list+=2+(firstUnit&COMP_1_TRIPLE);
1148 }
1149 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1150 if(firstUnit&COMP_1_TRIPLE) {
1151 return (static_cast<int32_t>(list[1]) << 16) | list[2];
1152 } else {
1153 return list[1];
1154 }
1155 }
1156 } else {
1157 // trail character is 3400..10FFFF
1158 // result entry has 3 units
// The trail character is split into two keys that are matched against the first
// and second unit of an entry.
1159 key1 = static_cast<uint16_t>(COMP_1_TRAIL_LIMIT +
1160 (((trail>>COMP_1_TRAIL_SHIFT))&
1161 ~COMP_1_TRIPLE));
1162 uint16_t key2 = static_cast<uint16_t>(trail << COMP_2_TRAIL_SHIFT);
1163 uint16_t secondUnit;
1164 for(;;) {
1165 if(key1>(firstUnit=*list)) {
1166 list+=2+(firstUnit&COMP_1_TRIPLE);
1167 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1168 if(key2>(secondUnit=list[1])) {
1169 if(firstUnit&COMP_1_LAST_TUPLE) {
1170 break;
1171 } else {
1172 list+=3;
1173 }
1174 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1175 return (static_cast<int32_t>(secondUnit & ~COMP_2_TRAIL_MASK) << 16) | list[2];
1176 } else {
1177 break;
1178 }
1179 } else {
1180 break;
1181 }
1182 }
1183 }
// No entry for this trail character.
1184 return -1;
1185}
1186
1187/**
 * Walks every entry of a compositions list and adds each entry's composite to the set.
 * For a composite whose compositeAndFwd value has bit 0 set, the composite's own
 * compositions list is walked first (recursion).
 *
1188 * @param list some character's compositions list
1189 * @param set recursively receives the composites from these compositions
1190 */
1191void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
1192 uint16_t firstUnit;
1193 int32_t compositeAndFwd;
1194 do {
1195 firstUnit=*list;
1196 if((firstUnit&COMP_1_TRIPLE)==0) {
// Pair entry: the second unit is the compositeAndFwd value.
1197 compositeAndFwd=list[1];
1198 list+=2;
1199 } else {
// Triple entry: compositeAndFwd is spread over the second and third units.
1200 compositeAndFwd = ((static_cast<int32_t>(list[1]) & ~COMP_2_TRAIL_MASK) << 16) | list[2];
1201 list+=3;
1202 }
1203 UChar32 composite=compositeAndFwd>>1;
1204 if((compositeAndFwd&1)!=0) {
1205 addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
1206 }
1207 set.add(composite);
1208 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1209}
1210
1211/*
1212 * Recomposes the buffer text starting at recomposeStartIndex
1213 * (which is in NFD - decomposed and canonically ordered),
1214 * and truncates the buffer contents.
1215 *
1216 * Note that recomposition never lengthens the text:
1217 * Any character consists of either one or two code units;
1218 * a composition may contain at most one more code unit than the original starter,
1219 * while the combining mark that is removed has at least one code unit.
 *
 * The text is edited in place: a composite overwrites its starter, the combining
 * mark is removed by moving the following text over it, and the local limit shrinks
 * accordingly. The final limit is stored with buffer.setReorderingLimit().
 * With onlyContiguous set, any character with cc!=0 that did not combine
 * resets the compositions list, so that later marks cannot combine across it.
1220 */
1221void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
1222 UBool onlyContiguous) const {
1223 char16_t *p=buffer.getStart()+recomposeStartIndex;
1224 char16_t *limit=buffer.getLimit();
1225 if(p==limit) {
19
Assuming 'p' is not equal to 'limit'
20
Taking false branch
1226 return;
1227 }
1228
1229 char16_t *starter, *pRemove, *q, *r;
1230 const uint16_t *compositionsList;
1231 UChar32 c, compositeAndFwd;
1232 uint16_t norm16;
1233 uint8_t cc, prevCC;
1234 UBool starterIsSupplementary;
1235
1236 // Some of the following variables are not used until we have a forward-combining starter
1237 // and are only initialized now to avoid compiler warnings.
1238 compositionsList=nullptr; // used as indicator for whether we have a forward-combining starter
1239 starter=nullptr;
1240 starterIsSupplementary=false;
1241 prevCC=0;
1242
1243 for(;;) {
21
Loop condition is true. Entering loop body
30
Loop condition is true. Entering loop body
1244 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16)do { (c) = *(p)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (p
) != (limit) && (((__c2 = *(p))&0xfffffc00)==0xdc00
)) { ++(p); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false)
;
22
Assuming the condition is true
23
Taking true branch
24
Loop condition is false. Exiting loop
31
Assuming the condition is true
32
Taking true branch
33
Loop condition is false. Exiting loop
1245 cc=getCCFromYesOrMaybeYes(norm16);
1246 if( // this character combines backward and
1247 isMaybe(norm16) &&
1248 // we have seen a starter that combines forward and
1249 compositionsList!=nullptr &&
1250 // the backward-combining character is not blocked
1251 (prevCC<cc || prevCC==0)
1252 ) {
1253 if(isJamoVT(norm16)) {
1254 // c is a Jamo V/T, see if we can compose it with the previous character.
1255 if(c<Hangul::JAMO_T_BASE) {
1256 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1257 char16_t prev = static_cast<char16_t>(*starter - Hangul::JAMO_L_BASE);
1258 if(prev<Hangul::JAMO_L_COUNT) {
1259 pRemove=p-1;
1260 char16_t syllable = static_cast<char16_t>(
1261 Hangul::HANGUL_BASE +
1262 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1263 Hangul::JAMO_T_COUNT);
1264 char16_t t;
1265 if (p != limit && (t = static_cast<char16_t>(*p - Hangul::JAMO_T_BASE)) < Hangul::JAMO_T_COUNT) {
1266 ++p;
1267 syllable+=t; // The next character was a Jamo T.
1268 }
1269 *starter=syllable;
1270 // remove the Jamo V/T
1271 q=pRemove;
1272 r=p;
1273 while(r<limit) {
1274 *q++=*r++;
1275 }
1276 limit=q;
1277 p=pRemove;
1278 }
1279 }
1280 /*
1281 * No "else" for Jamo T:
1282 * Since the input is in NFD, there are no Hangul LV syllables that
1283 * a Jamo T could combine with.
1284 * All Jamo Ts are combined above when handling Jamo Vs.
1285 */
1286 if(p==limit) {
1287 break;
1288 }
1289 compositionsList=nullptr;
1290 continue;
1291 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
1292 // The starter and the combining mark (c) do combine.
1293 UChar32 composite=compositeAndFwd>>1;
1294
1295 // Replace the starter with the composite, remove the combining mark.
1296 pRemove=p-U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2); // pRemove & p: start & limit of the combining mark
1297 if(starterIsSupplementary) {
1298 if(U_IS_SUPPLEMENTARY(composite)((uint32_t)((composite)-0x10000)<=0xfffff)) {
1299 // both are supplementary
1300 starter[0]=U16_LEAD(composite)(UChar)(((composite)>>10)+0xd7c0);
1301 starter[1]=U16_TRAIL(composite)(UChar)(((composite)&0x3ff)|0xdc00);
1302 } else {
1303 *starter = static_cast<char16_t>(composite);
1304 // The composite is shorter than the starter,
1305 // move the intermediate characters forward one.
1306 starterIsSupplementary=false;
1307 q=starter+1;
1308 r=q+1;
1309 while(r<pRemove) {
1310 *q++=*r++;
1311 }
1312 --pRemove;
1313 }
1314 } else if(U_IS_SUPPLEMENTARY(composite)((uint32_t)((composite)-0x10000)<=0xfffff)) {
1315 // The composite is longer than the starter,
1316 // move the intermediate characters back one.
1317 starterIsSupplementary=true;
1318 ++starter; // temporarily increment for the loop boundary
1319 q=pRemove;
1320 r=++pRemove;
1321 while(starter<q) {
1322 *--r=*--q;
1323 }
1324 *starter=U16_TRAIL(composite)(UChar)(((composite)&0x3ff)|0xdc00);
1325 *--starter=U16_LEAD(composite)(UChar)(((composite)>>10)+0xd7c0); // undo the temporary increment
1326 } else {
1327 // both are on the BMP
1328 *starter = static_cast<char16_t>(composite);
1329 }
1330
1331 /* remove the combining mark by moving the following text over it */
1332 if(pRemove<p) {
1333 q=pRemove;
1334 r=p;
1335 while(r<limit) {
1336 *q++=*r++;
1337 }
1338 limit=q;
1339 p=pRemove;
1340 }
1341 // Keep prevCC because we removed the combining mark.
1342
1343 if(p==limit) {
1344 break;
1345 }
1346 // Is the composite a starter that combines forward?
1347 if(compositeAndFwd&1) {
1348 compositionsList=
1349 getCompositionsListForComposite(getRawNorm16(composite));
1350 } else {
1351 compositionsList=nullptr;
1352 }
1353
1354 // We combined; continue with looking for compositions.
1355 continue;
1356 }
1357 }
1358
1359 // no combination this time
1360 prevCC=cc;
1361 if(p==limit) {
25
Assuming 'p' is not equal to 'limit'
26
Taking false branch
34
Assuming 'p' is not equal to 'limit'
35
Taking false branch
1362 break;
1363 }
1364
1365 // If c did not combine, then check if it is a starter.
1366 if(cc
26.1
'cc' is equal to 0
35.1
'cc' is equal to 0
26.1
'cc' is equal to 0
35.1
'cc' is equal to 0
==0) {
27
Taking true branch
36
Taking true branch
1367 // Found a new starter.
// A nullptr result means this starter has no compositions list; starter is then left unchanged.
1368 if((compositionsList=getCompositionsListForDecompYes(norm16))!=nullptr) {
28
Assuming pointer value is null
29
Taking false branch
37
Calling 'Normalizer2Impl::getCompositionsListForDecompYes'
1369 // It may combine with something, prepare for it.
1370 if(U_IS_BMP(c)((uint32_t)(c)<=0xffff)) {
1371 starterIsSupplementary=false;
1372 starter=p-1;
1373 } else {
1374 starterIsSupplementary=true;
1375 starter=p-2;
1376 }
1377 }
1378 } else if(onlyContiguous) {
1379 // FCC: no discontiguous compositions; any intervening character blocks.
1380 compositionsList=nullptr;
1381 }
1382 }
1383 buffer.setReorderingLimit(limit);
1384}
1385
// Returns the composite of the pair (a, b), or U_SENTINEL (-1) if they do not combine.
// Jamo L + Jamo V and Hangul LV + Jamo T are composed arithmetically; every other
// combining 'a' is looked up with combine() in a's compositions list, after b has been
// checked to be in 0..0x10ffff.
1386UChar32
1387Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1388 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16
1389 const uint16_t *list;
1390 if(isInert(norm16)) {
1391 return U_SENTINEL(-1);
1392 } else if(norm16<minYesNoMappingsOnly) {
1393 // a combines forward.
1394 if(isJamoL(norm16)) {
1395 if (b < Hangul::JAMO_V_BASE) {
1396 return U_SENTINEL(-1);
1397 }
1398 b-=Hangul::JAMO_V_BASE;
1399 if(b<Hangul::JAMO_V_COUNT) {
1400 return
1401 (Hangul::HANGUL_BASE+
1402 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1403 Hangul::JAMO_T_COUNT);
1404 } else {
1405 return U_SENTINEL(-1);
1406 }
1407 } else if(isHangulLV(norm16)) {
// b == JAMO_T_BASE is rejected as well: see "not b==0!" below.
1408 if (b <= Hangul::JAMO_T_BASE) {
1409 return U_SENTINEL(-1);
1410 }
1411 b-=Hangul::JAMO_T_BASE;
1412 if(b<Hangul::JAMO_T_COUNT) { // not b==0!
1413 return a+b;
1414 } else {
1415 return U_SENTINEL(-1);
1416 }
1417 } else {
1418 // 'a' has a compositions list in extraData
1419 list=getDataForYesOrNo(norm16);
1420 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
1421 list+= // mapping pointer
1422 1+ // +1 to skip the first unit with the mapping length
1423 (*list&MAPPING_LENGTH_MASK); // + mapping length
1424 }
1425 }
1426 } else if(norm16<minMaybeNoCombinesFwd || MIN_NORMAL_MAYBE_YES<=norm16) {
1427 return U_SENTINEL(-1);
1428 } else {
1429 list=getDataForMaybe(norm16);
1430 if(norm16<minMaybeYes) { // composite 'a' has both mapping & compositions list
1431 list+= // mapping pointer
1432 1+ // +1 to skip the first unit with the mapping length
1433 (*list&MAPPING_LENGTH_MASK); // + mapping length
1434 }
1435 }
1436 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
1437 return U_SENTINEL(-1);
1438 }
// combine() returns -1 when there is no composition; the first variant shifts that
// result directly, the second one tests for a negative result before shifting.
1439#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC1
1440 return combine(list, b)>>1;
1441#else
1442 int32_t compositeAndFwd=combine(list, b);
1443 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL(-1);
1444#endif
1445}
1446
1447// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1448// doCompose: normalize
1449// !doCompose: isNormalized (buffer must be empty and initialized)
1450UBool
1451Normalizer2Impl::compose(const char16_t *src, const char16_t *limit,
1452 UBool onlyContiguous,
1453 UBool doCompose,
1454 ReorderingBuffer &buffer,
1455 UErrorCode &errorCode) const {
1456 const char16_t *prevBoundary=src;
1457 UChar32 minNoMaybeCP=minCompNoMaybeCP;
1458 if(limit==nullptr) {
1
Assuming the condition is false
2
Taking false branch
1459 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1460 doCompose ? &buffer : nullptr,
1461 errorCode);
1462 if(U_FAILURE(errorCode)) {
1463 return false;
1464 }
1465 limit=u_strchru_strchr_77(src, 0);
1466 if (prevBoundary != src) {
1467 if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1468 prevBoundary = src;
1469 } else {
1470 buffer.removeSuffix(1);
1471 prevBoundary = --src;
1472 }
1473 }
1474 }
1475
1476 for (;;) {
3
Loop condition is true. Entering loop body
1477 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1478 // or with (compYes && ccc==0) properties.
1479 const char16_t *prevSrc;
1480 UChar32 c = 0;
1481 uint16_t norm16 = 0;
1482 for (;;) {
4
Loop condition is true. Entering loop body
1483 if (src == limit) {
5
Assuming 'src' is not equal to 'limit'
1484 if (prevBoundary != limit && doCompose) {
1485 buffer.appendZeroCC(prevBoundary, limit, errorCode);
1486 }
1487 return true;
1488 }
1489 if( (c=*src)<minNoMaybeCP ||
6
Assuming the condition is false
7
Taking false branch
1490 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((int32_t)(normTrie)->index[(c)
>> UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK
))])
)
1491 ) {
1492 ++src;
1493 } else {
1494 prevSrc = src++;
1495 if(!U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800)) {
8
Assuming the condition is false
9
Taking true branch
1496 break;
1497 } else {
1498 char16_t c2;
1499 if(src!=limit && U16_IS_TRAIL(c2=*src)(((c2=*src)&0xfffffc00)==0xdc00)) {
1500 ++src;
1501 c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
;
1502 norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((c) >= (normTrie)->highStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c))])
;
1503 if(!isCompYesAndZeroCC(norm16)) {
1504 break;
1505 }
1506 }
1507 }
1508 }
1509 }
1510 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1511 // The current character is either a "noNo" (has a mapping)
1512 // or a "maybeYes" / "maybeNo" (combines backward)
1513 // or a "yesYes" with ccc!=0.
1514 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1515
1516 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1517 if (norm16 < minMaybeNo) { // minNoNo <= norm16 < minMaybeNo
10
Execution continues on line 1517
11
Assuming 'norm16' is >= field 'minMaybeNo'
1518 if (!doCompose) {
1519 return false;
1520 }
1521 // Fast path for mapping a character that is immediately surrounded by boundaries.
1522 // In this case, we need not decompose around the current character.
1523 if (isDecompNoAlgorithmic(norm16)) {
1524 // Maps to a single isCompYesAndZeroCC character
1525 // which also implies hasCompBoundaryBefore.
1526 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1527 hasCompBoundaryBefore(src, limit)) {
1528 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1529 break;
1530 }
1531 if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
1532 break;
1533 }
1534 prevBoundary = src;
1535 continue;
1536 }
1537 } else if (norm16 < minNoNoCompBoundaryBefore) {
1538 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1539 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1540 hasCompBoundaryBefore(src, limit)) {
1541 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1542 break;
1543 }
1544 const char16_t *mapping = reinterpret_cast<const char16_t *>(getDataForYesOrNo(norm16));
1545 int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1546 if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
1547 break;
1548 }
1549 prevBoundary = src;
1550 continue;
1551 }
1552 } else if (norm16 >= minNoNoEmpty) {
1553 // The current character maps to nothing.
1554 // Simply omit it from the output if there is a boundary before _or_ after it.
1555 // The character itself implies no boundaries.
1556 if (hasCompBoundaryBefore(src, limit) ||
1557 hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1558 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1559 break;
1560 }
1561 prevBoundary = src;
1562 continue;
1563 }
1564 }
1565 // Other "noNo" type, or need to examine more text around this character:
1566 // Fall through to the slow path.
1567 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
12
Taking false branch
1568 char16_t prev=*(prevSrc-1);
1569 if(c<Hangul::JAMO_T_BASE) {
1570 // The current character is a Jamo Vowel,
1571 // compose with previous Jamo L and following Jamo T.
1572 char16_t l = static_cast<char16_t>(prev - Hangul::JAMO_L_BASE);
1573 if(l<Hangul::JAMO_L_COUNT) {
1574 if (!doCompose) {
1575 return false;
1576 }
1577 int32_t t;
1578 if (src != limit &&
1579 0 < (t = (static_cast<int32_t>(*src) - Hangul::JAMO_T_BASE)) &&
1580 t < Hangul::JAMO_T_COUNT) {
1581 // The next character is a Jamo T.
1582 ++src;
1583 } else if (hasCompBoundaryBefore(src, limit)) {
1584 // No Jamo T follows, not even via decomposition.
1585 t = 0;
1586 } else {
1587 t = -1;
1588 }
1589 if (t >= 0) {
1590 UChar32 syllable = Hangul::HANGUL_BASE +
1591 (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
1592 Hangul::JAMO_T_COUNT + t;
1593 --prevSrc; // Replace the Jamo L as well.
1594 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1595 break;
1596 }
1597 if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) {
1598 break;
1599 }
1600 prevBoundary = src;
1601 continue;
1602 }
1603 // If we see L+V+x where x!=T then we drop to the slow path,
1604 // decompose and recompose.
1605 // This is to deal with NFKC finding normal L and V but a
1606 // compatibility variant of a T.
1607 // We need to either fully compose that combination here
1608 // (which would complicate the code and may not work with strange custom data)
1609 // or use the slow path.
1610 }
1611 } else if (Hangul::isHangulLV(prev)) {
1612 // The current character is a Jamo Trailing consonant,
1613 // compose with previous Hangul LV that does not contain a Jamo T.
1614 if (!doCompose) {
1615 return false;
1616 }
1617 UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
1618 --prevSrc; // Replace the Hangul LV as well.
1619 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1620 break;
1621 }
1622 if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) {
1623 break;
1624 }
1625 prevBoundary = src;
1626 continue;
1627 }
1628 // No matching context, or may need to decompose surrounding text first:
1629 // Fall through to the slow path.
1630 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC
13
Assuming 'norm16' is <= JAMO_VT
1631 // One or more combining marks that do not combine-back:
1632 // Check for canonical order, copy unchanged if ok and
1633 // if followed by a character with a boundary-before.
1634 uint8_t cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0
1635 if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1636 // Fails FCD test, need to decompose and contiguously recompose.
1637 if (!doCompose) {
1638 return false;
1639 }
1640 } else {
1641 // If !onlyContiguous (not FCC), then we ignore the tccc of
1642 // the previous character which passed the quick check "yes && ccc==0" test.
1643 const char16_t *nextSrc;
1644 uint16_t n16;
1645 for (;;) {
1646 if (src == limit) {
1647 if (doCompose) {
1648 buffer.appendZeroCC(prevBoundary, limit, errorCode);
1649 }
1650 return true;
1651 }
1652 uint8_t prevCC = cc;
1653 nextSrc = src;
1654 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16)do { (c) = *(nextSrc)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (nextSrc
) != (limit) && (((__c2 = *(nextSrc))&0xfffffc00)
==0xdc00)) { ++(nextSrc); (c) = (((UChar32)((c))<<10UL)
+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); __index
= ((c) >= (normTrie)->highStart ? (normTrie)->dataLength
- UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (n16) = ((normTrie)
->data.ptr16[__index]); } while (false)
;
1655 if (n16 >= MIN_YES_YES_WITH_CC) {
1656 cc = getCCFromNormalYesOrMaybe(n16);
1657 if (prevCC > cc) {
1658 if (!doCompose) {
1659 return false;
1660 }
1661 break;
1662 }
1663 } else {
1664 break;
1665 }
1666 src = nextSrc;
1667 }
1668 // src is after the last in-order combining mark.
1669 // If there is a boundary here, then we continue with no change.
1670 if (norm16HasCompBoundaryBefore(n16)) {
1671 if (isCompYesAndZeroCC(n16)) {
1672 src = nextSrc;
1673 }
1674 continue;
1675 }
1676 // Use the slow path. There is no boundary in [prevSrc, src[.
1677 }
1678 }
1679
1680 // Slow path: Find the nearest boundaries around the current character,
1681 // decompose and recompose.
1682 if (prevBoundary
13.1
'prevBoundary' is equal to 'prevSrc'
13.1
'prevBoundary' is equal to 'prevSrc'
!= prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1683 const char16_t *p = prevSrc;
1684 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (prevBoundary) && (((__c2 = *((p) - 1))&0xfffffc00
)==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false)
;
1685 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1686 prevSrc = p;
1687 }
1688 }
1689 if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
14
Assuming 'doCompose' is 0
1690 break;
1691 }
1692 int32_t recomposeStartIndex=buffer.length();
1693 // We know there is not a boundary here.
1694 decomposeShort(prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
1695 buffer, errorCode);
1696 // Decompose until the next boundary.
1697 src = decomposeShort(src, limit, true /* stopAtCompBoundary */, onlyContiguous,
1698 buffer, errorCode);
1699 if (U_FAILURE(errorCode)) {
15
Taking false branch
1700 break;
1701 }
1702 if ((src - prevSrc) > INT32_MAX(2147483647)) { // guard before buffer.equals()
16
Assuming the condition is false
17
Taking false branch
1703 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1704 return true;
1705 }
1706 recompose(buffer, recomposeStartIndex, onlyContiguous);
18
Calling 'Normalizer2Impl::recompose'
1707 if(!doCompose) {
1708 if(!buffer.equals(prevSrc, src)) {
1709 return false;
1710 }
1711 buffer.remove();
1712 }
1713 prevBoundary=src;
1714 }
1715 return true;
1716}
1717
1718// Very similar to compose(): Make the same changes in both places if relevant.
1719// pQCResult==nullptr: spanQuickCheckYes
1720// pQCResult!=nullptr: quickCheck (*pQCResult must be UNORM_YES)
// Scans [src, limit[ without writing any output and reports how far the text
// passes the composition quick check.
//
// src, limit:     UTF-16 input; limit==nullptr selects the NUL-terminated path below.
// onlyContiguous: forwarded to the boundary tests and enables the extra
//                 canonical-order test marked "FCC" below.
// pQCResult:      nullptr, or a result slot that this function only ever
//                 downgrades to UNORM_MAYBE or UNORM_NO (it never writes UNORM_YES).
//
// Returns src (== limit) when the scan reaches the end of the input, otherwise
// prevBoundary, the most recent position recorded as a boundary before the
// character that stopped the scan.
1721const char16_t *
1722Normalizer2Impl::composeQuickCheck(const char16_t *src, const char16_t *limit,
1723 UBool onlyContiguous,
1724 UNormalizationCheckResult *pQCResult) const {
1725 const char16_t *prevBoundary=src;
1726 UChar32 minNoMaybeCP=minCompNoMaybeCP;
1727 if(limit==nullptr) {
     // NUL-terminated input. errorCode is a throwaway local: it is passed along
     // but never inspected in this function.
1728 UErrorCode errorCode=U_ZERO_ERROR;
1729 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, nullptr, errorCode);
1730 limit=u_strchru_strchr_77(src, 0);
1731 if (prevBoundary != src) {
     // Some prefix was skipped. If the last skipped unit has no boundary after it,
     // back up by one unit so that it is examined again by the main loop.
1732 if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1733 prevBoundary = src;
1734 } else {
1735 prevBoundary = --src;
1736 }
1737 }
1738 }
1739
1740 for(;;) {
1741 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1742 // or with (compYes && ccc==0) properties.
1743 const char16_t *prevSrc;
1744 UChar32 c = 0;
1745 uint16_t norm16 = 0;
1746 for (;;) {
1747 if(src==limit) {
1748 return src;
1749 }
1750 if( (c=*src)<minNoMaybeCP ||
1751 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((int32_t)(normTrie)->index[(c)
>> UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK
))])
)
1752 ) {
1753 ++src;
1754 } else {
     // prevSrc marks the start of the character that left the fast path.
1755 prevSrc = src++;
1756 if(!U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800)) {
1757 break;
1758 } else {
     // Lead surrogate: only a complete pair gets a supplementary lookup. An
     // unpaired lead stays in the fast path (neither branch below breaks).
1759 char16_t c2;
1760 if(src!=limit && U16_IS_TRAIL(c2=*src)(((c2=*src)&0xfffffc00)==0xdc00)) {
1761 ++src;
1762 c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
;
1763 norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((c) >= (normTrie)->highStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c))])
;
1764 if(!isCompYesAndZeroCC(norm16)) {
1765 break;
1766 }
1767 }
1768 }
1769 }
1770 }
1771 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1772 // The current character is either a "noNo" (has a mapping)
1773 // or a "maybeYes" / "maybeNo" (combines backward)
1774 // or a "yesYes" with ccc!=0.
1775 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1776
     // Move prevBoundary as close to prevSrc as the boundary properties allow.
     // prevNorm16 keeps the data of the preceding character only when that
     // character has no boundary after it; otherwise it stays INERT.
1777 uint16_t prevNorm16 = INERT;
1778 if (prevBoundary != prevSrc) {
1779 if (norm16HasCompBoundaryBefore(norm16)) {
1780 prevBoundary = prevSrc;
1781 } else {
1782 const char16_t *p = prevSrc;
1783 uint16_t n16;
1784 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (prevBoundary) && (((__c2 = *((p) - 1))&0xfffffc00
)==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (n16) = ((normTrie)
->data.ptr16[__index]); } while (false)
;
1785 if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1786 prevBoundary = prevSrc;
1787 } else {
1788 prevBoundary = p;
1789 prevNorm16 = n16;
1790 }
1791 }
1792 }
1793
1794 if (norm16 >= minMaybeNo) {
1795 uint16_t fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16);
     // High byte of fcd16; the low byte is used as prevCC further down.
1796 uint8_t cc = fcd16 >> 8;
1797 if (onlyContiguous /* FCC */ && cc != 0 &&
1798 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1799 // The [prevBoundary..prevSrc[ character
1800 // passed the quick check "yes && ccc==0" test
1801 // but is out of canonical order with the current combining mark.
1802 } else {
1803 // If !onlyContiguous (not FCC), then we ignore the tccc of
1804 // the previous character which passed the quick check "yes && ccc==0" test.
1805 const char16_t *nextSrc;
1806 for (;;) {
     // A value below MIN_YES_YES_WITH_CC here yields "maybe": recorded in
     // *pQCResult when a slot was supplied, otherwise the span ends at prevBoundary.
1807 if (norm16 < MIN_YES_YES_WITH_CC) {
1808 if (pQCResult != nullptr) {
1809 *pQCResult = UNORM_MAYBE;
1810 } else {
1811 return prevBoundary;
1812 }
1813 }
1814 if (src == limit) {
1815 return src;
1816 }
     // Implicit narrowing: prevCC receives only the low 8 bits of fcd16.
1817 uint8_t prevCC = fcd16;
1818 nextSrc = src;
1819 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16)do { (c) = *(nextSrc)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (nextSrc
) != (limit) && (((__c2 = *(nextSrc))&0xfffffc00)
==0xdc00)) { ++(nextSrc); (c) = (((UChar32)((c))<<10UL)
+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); __index
= ((c) >= (normTrie)->highStart ? (normTrie)->dataLength
- UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false)
;
1820 if (norm16 >= minMaybeNo) {
1821 fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16);
1822 cc = fcd16 >> 8;
     // Leave the loop when the next mark is out of order relative to prevCC.
1823 if (!(prevCC <= cc || cc == 0)) {
1824 break;
1825 }
1826 } else {
1827 break;
1828 }
     // The looked-up character is accepted; only now is src advanced past it.
1829 src = nextSrc;
1830 }
1831 // src is after the last in-order combining mark.
1832 if (isCompYesAndZeroCC(norm16)) {
1833 prevBoundary = src;
1834 src = nextSrc;
1835 continue;
1836 }
1837 }
1838 }
     // Every path that reaches this point reports "no" and ends the scan at prevBoundary.
1839 if(pQCResult!=nullptr) {
1840 *pQCResult=UNORM_NO;
1841 }
1842 return prevBoundary;
1843 }
1844}
1845
// Appends [src, limit[ to buffer, first re-composing the seam between the
// existing buffer contents and the start of src when src does not begin at a
// composition boundary.
//
// doCompose:  true composes the remainder of src; false appends it unchanged
//             via appendZeroCC().
// safeMiddle: receives a copy of the suffix removed from buffer (taken before
//             any src text is appended to it); it is assigned only when the
//             seam is re-composed.
// errorCode:  the only failure channel; the return values of compose() and
//             appendZeroCC() are not used here.
//
// NOTE(review): limit is passed to findNextCompBoundary() whenever buffer is
// non-empty, but the only limit==nullptr handling is in the !doCompose branch
// at the end. Confirm that callers never pass limit==nullptr together with a
// non-empty buffer.
1846void Normalizer2Impl::composeAndAppend(const char16_t *src, const char16_t *limit,
1847 UBool doCompose,
1848 UBool onlyContiguous,
1849 UnicodeString &safeMiddle,
1850 ReorderingBuffer &buffer,
1851 UErrorCode &errorCode) const {
1852 if(!buffer.isEmpty()) {
1853 const char16_t *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);
1854 if(src!=firstStarterInSrc) {
     // Cut the tail of the destination back to its last boundary ...
1855 const char16_t *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1856 buffer.getLimit(), onlyContiguous);
1857 int32_t destSuffixLength = static_cast<int32_t>(buffer.getLimit() - lastStarterInDest);
1858 UnicodeString middle(lastStarterInDest, destSuffixLength);
1859 buffer.removeSuffix(destSuffixLength);
1860 safeMiddle=middle;
     // ... join it with the head of src up to its first boundary, and compose
     // that joined piece back into the buffer.
1861 middle.append(src, static_cast<int32_t>(firstStarterInSrc - src));
1862 const char16_t *middleStart=middle.getBuffer();
1863 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1864 true, buffer, errorCode);
1865 if(U_FAILURE(errorCode)) {
1866 return;
1867 }
     // The head of src has been consumed by the seam handling.
1868 src=firstStarterInSrc;
1869 }
1870 }
1871 if(doCompose) {
1872 compose(src, limit, onlyContiguous, true, buffer, errorCode);
1873 } else {
1874 if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr
1875 limit=u_strchru_strchr_77(src, 0);
1876 }
1877 buffer.appendZeroCC(src, limit, errorCode);
1878 }
1879}
1880
// UTF-8 composition over [src, limit[ (limit must not be nullptr, see the assert).
//
// sink:  destination for output. sink==nullptr selects check-only mode: nothing
//        is written and the function returns false as soon as it finds text
//        that would have to change.
// edits: optional change log; it is null-checked before the one direct use in
//        this function and otherwise forwarded to the ByteSinkUtil helpers.
//
// Returns false only in check-only mode. Every other exit returns true,
// including the `break` paths taken when an append helper reports failure or
// errorCode holds a failure, so callers must inspect errorCode as well.
1881UBool
1882Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
1883 const uint8_t *src, const uint8_t *limit,
1884 ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
1885 U_ASSERT(limit != nullptr)(static_cast <bool> (limit != nullptr) ? void (0) : __assert_fail
("limit != nullptr", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
     // s16 is the backing string for the ReorderingBuffer that each slow-path
     // pass constructs further down.
1886 UnicodeString s16;
1887 uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
     // Start of the input that has been scanned but not yet written to the sink.
1888 const uint8_t *prevBoundary = src;
1889
1890 for (;;) {
1891 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1892 // or with (compYes && ccc==0) properties.
1893 const uint8_t *prevSrc;
1894 uint16_t norm16 = 0;
1895 for (;;) {
1896 if (src == limit) {
     // End of input: flush the pending unchanged run, if there is a sink.
1897 if (prevBoundary != limit && sink != nullptr) {
1898 ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1899 *sink, options, edits, errorCode);
1900 }
1901 return true;
1902 }
1903 if (*src < minNoMaybeLead) {
1904 ++src;
1905 } else {
1906 prevSrc = src;
1907 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16)do { int32_t __lead = (uint8_t)*(src)++; if (!(((__lead)&
0x80)==0)) { uint8_t __t1, __t2, __t3; if ((src) != (limit) &&
(__lead >= 0xe0 ? __lead < 0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[__lead &= 0xf] & (1 << ((__t1 = *(src)) >>
5)) && ++(src) != (limit) && (__t2 = *(src) -
0x80) <= 0x3f && (__lead = ((int32_t)(normTrie)->
index[(__lead << 6) + (__t1 & 0x3f)]) + __t2, 1) : (
__lead -= 0xf0) <= 4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t1 = *(src)) >> 4] & (1 << __lead) &&
(__lead = (__lead << 6) | (__t1 & 0x3f), ++(src) !=
(limit)) && (__t2 = *(src) - 0x80) <= 0x3f &&
++(src) != (limit) && (__t3 = *(src) - 0x80) <= 0x3f
&& (__lead = __lead >= (normTrie)->shifted12HighStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallU8Index_77((normTrie), __lead, __t2, __t3
), 1) : __lead >= 0xc2 && (__t1 = *(src) - 0x80) <=
0x3f && (__lead = (int32_t)(normTrie)->index[__lead
& 0x1f] + __t1, 1))) { ++(src); } else { __lead = (normTrie
)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (
norm16) = ((normTrie)->data.ptr16[__lead]); } while (false
)
;
1908 if (!isCompYesAndZeroCC(norm16)) {
1909 break;
1910 }
1911 }
1912 }
1913 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1914 // The current character is either a "noNo" (has a mapping)
1915 // or a "maybeYes" / "maybeNo" (combines backward)
1916 // or a "yesYes" with ccc!=0.
1917 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1918
1919 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1920 if (norm16 < minMaybeNo) { // minNoNo <= norm16 < minMaybeNo
     // Check-only mode: a value in this range already means "not normalized".
1921 if (sink == nullptr) {
1922 return false;
1923 }
1924 // Fast path for mapping a character that is immediately surrounded by boundaries.
1925 // In this case, we need not decompose around the current character.
1926 if (isDecompNoAlgorithmic(norm16)) {
1927 // Maps to a single isCompYesAndZeroCC character
1928 // which also implies hasCompBoundaryBefore.
1929 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1930 hasCompBoundaryBefore(src, limit)) {
1931 if (prevBoundary != prevSrc &&
1932 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1933 *sink, options, edits, errorCode)) {
1934 break;
1935 }
1936 appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
1937 prevBoundary = src;
1938 continue;
1939 }
1940 } else if (norm16 < minNoNoCompBoundaryBefore) {
1941 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1942 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1943 hasCompBoundaryBefore(src, limit)) {
1944 if (prevBoundary != prevSrc &&
1945 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1946 *sink, options, edits, errorCode)) {
1947 break;
1948 }
     // The first unit of the mapping data carries the length in its
     // MAPPING_LENGTH_MASK bits; the mapped text follows it.
1949 const uint16_t *mapping = getDataForYesOrNo(norm16);
1950 int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1951 if (!ByteSinkUtil::appendChange(prevSrc, src, reinterpret_cast<const char16_t*>(mapping), length,
1952 *sink, edits, errorCode)) {
1953 break;
1954 }
1955 prevBoundary = src;
1956 continue;
1957 }
1958 } else if (norm16 >= minNoNoEmpty) {
1959 // The current character maps to nothing.
1960 // Simply omit it from the output if there is a boundary before _or_ after it.
1961 // The character itself implies no boundaries.
1962 if (hasCompBoundaryBefore(src, limit) ||
1963 hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1964 if (prevBoundary != prevSrc &&
1965 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1966 *sink, options, edits, errorCode)) {
1967 break;
1968 }
     // Nothing is written to the sink; only the deletion is logged.
1969 if (edits != nullptr) {
1970 edits->addReplace(static_cast<int32_t>(src - prevSrc), 0);
1971 }
1972 prevBoundary = src;
1973 continue;
1974 }
1975 }
1976 // Other "noNo" type, or need to examine more text around this character:
1977 // Fall through to the slow path.
1978 } else if (isJamoVT(norm16)) {
1979 // Jamo L: E1 84 80..92
1980 // Jamo V: E1 85 A1..B5
1981 // Jamo T: E1 86 A8..E1 87 82
1982 U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1)(static_cast <bool> ((src - prevSrc) == 3 && *prevSrc
== 0xe1) ? void (0) : __assert_fail ("(src - prevSrc) == 3 && *prevSrc == 0xe1"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
1983 UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
     // Per the byte table above, a second byte of 0x85 identifies a Jamo V.
1984 if (prevSrc[1] == 0x85) {
1985 // The current character is a Jamo Vowel,
1986 // compose with previous Jamo L and following Jamo T.
1987 UChar32 l = prev - Hangul::JAMO_L_BASE;
     // The unsigned compare also rejects negative l (prev below JAMO_L_BASE).
1988 if (static_cast<uint32_t>(l) < Hangul::JAMO_L_COUNT) {
1989 if (sink == nullptr) {
1990 return false;
1991 }
1992 int32_t t = getJamoTMinusBase(src, limit);
1993 if (t >= 0) {
1994 // The next character is a Jamo T.
1995 src += 3;
1996 } else if (hasCompBoundaryBefore(src, limit)) {
1997 // No Jamo T follows, not even via decomposition.
1998 t = 0;
1999 }
     // If neither branch above ran, t is still negative and control drops to
     // the slow path below.
2000 if (t >= 0) {
2001 UChar32 syllable = Hangul::HANGUL_BASE +
2002 (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
2003 Hangul::JAMO_T_COUNT + t;
2004 prevSrc -= 3; // Replace the Jamo L as well.
2005 if (prevBoundary != prevSrc &&
2006 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2007 *sink, options, edits, errorCode)) {
2008 break;
2009 }
2010 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
2011 prevBoundary = src;
2012 continue;
2013 }
2014 // If we see L+V+x where x!=T then we drop to the slow path,
2015 // decompose and recompose.
2016 // This is to deal with NFKC finding normal L and V but a
2017 // compatibility variant of a T.
2018 // We need to either fully compose that combination here
2019 // (which would complicate the code and may not work with strange custom data)
2020 // or use the slow path.
2021 }
2022 } else if (Hangul::isHangulLV(prev)) {
2023 // The current character is a Jamo Trailing consonant,
2024 // compose with previous Hangul LV that does not contain a Jamo T.
2025 if (sink == nullptr) {
2026 return false;
2027 }
2028 UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
2029 prevSrc -= 3; // Replace the Hangul LV as well.
2030 if (prevBoundary != prevSrc &&
2031 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2032 *sink, options, edits, errorCode)) {
2033 break;
2034 }
2035 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
2036 prevBoundary = src;
2037 continue;
2038 }
2039 // No matching context, or may need to decompose surrounding text first:
2040 // Fall through to the slow path.
2041 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC
2042 // One or more combining marks that do not combine-back:
2043 // Check for canonical order, copy unchanged if ok and
2044 // if followed by a character with a boundary-before.
2045 uint8_t cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0
2046 if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
2047 // Fails FCD test, need to decompose and contiguously recompose.
2048 if (sink == nullptr) {
2049 return false;
2050 }
2051 } else {
2052 // If !onlyContiguous (not FCC), then we ignore the tccc of
2053 // the previous character which passed the quick check "yes && ccc==0" test.
2054 const uint8_t *nextSrc;
2055 uint16_t n16;
2056 for (;;) {
2057 if (src == limit) {
2058 if (sink != nullptr) {
2059 ByteSinkUtil::appendUnchanged(prevBoundary, limit,
2060 *sink, options, edits, errorCode);
2061 }
2062 return true;
2063 }
2064 uint8_t prevCC = cc;
     // Look ahead with nextSrc; src only advances once the mark is accepted.
2065 nextSrc = src;
2066 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16)do { int32_t __lead = (uint8_t)*(nextSrc)++; if (!(((__lead)&
0x80)==0)) { uint8_t __t1, __t2, __t3; if ((nextSrc) != (limit
) && (__lead >= 0xe0 ? __lead < 0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[__lead &= 0xf] & (1 << ((__t1 = *(nextSrc)) >>
5)) && ++(nextSrc) != (limit) && (__t2 = *(nextSrc
) - 0x80) <= 0x3f && (__lead = ((int32_t)(normTrie
)->index[(__lead << 6) + (__t1 & 0x3f)]) + __t2,
1) : (__lead -= 0xf0) <= 4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t1 = *(nextSrc)) >> 4] & (1 << __lead) &&
(__lead = (__lead << 6) | (__t1 & 0x3f), ++(nextSrc
) != (limit)) && (__t2 = *(nextSrc) - 0x80) <= 0x3f
&& ++(nextSrc) != (limit) && (__t3 = *(nextSrc
) - 0x80) <= 0x3f && (__lead = __lead >= (normTrie
)->shifted12HighStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallU8Index_77((normTrie), __lead, __t2, __t3
), 1) : __lead >= 0xc2 && (__t1 = *(nextSrc) - 0x80
) <= 0x3f && (__lead = (int32_t)(normTrie)->index
[__lead & 0x1f] + __t1, 1))) { ++(nextSrc); } else { __lead
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (n16) = ((normTrie)->data.ptr16[__lead]); } while (false
)
;
2067 if (n16 >= MIN_YES_YES_WITH_CC) {
2068 cc = getCCFromNormalYesOrMaybe(n16);
2069 if (prevCC > cc) {
     // Out-of-order marks: check-only mode fails here, otherwise slow path.
2070 if (sink == nullptr) {
2071 return false;
2072 }
2073 break;
2074 }
2075 } else {
2076 break;
2077 }
2078 src = nextSrc;
2079 }
2080 // src is after the last in-order combining mark.
2081 // If there is a boundary here, then we continue with no change.
2082 if (norm16HasCompBoundaryBefore(n16)) {
2083 if (isCompYesAndZeroCC(n16)) {
2084 src = nextSrc;
2085 }
2086 continue;
2087 }
2088 // Use the slow path. There is no boundary in [prevSrc, src[.
2089 }
2090 }
2091
2092 // Slow path: Find the nearest boundaries around the current character,
2093 // decompose and recompose.
2094 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
     // Extend the segment backward by one character unless that character has
     // a boundary after it. norm16 is overwritten by the lookup.
2095 const uint8_t *p = prevSrc;
2096 UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16)do { int32_t __index = (uint8_t)*--(p); if (!(((__index)&
0x80)==0)) { __index = ucptrie_internalU8PrevIndex_77((normTrie
), __index, (const uint8_t *)(prevBoundary), (const uint8_t *
)(p)); (p) -= __index & 7; __index >>= 3; } (norm16
) = ((normTrie)->data.ptr16[__index]); } while (false)
;
2097 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2098 prevSrc = p;
2099 }
2100 }
     // A new ReorderingBuffer over s16 is constructed on every slow-path pass.
2101 ReorderingBuffer buffer(*this, s16, errorCode);
2102 if (U_FAILURE(errorCode)) {
2103 break;
2104 }
2105 // We know there is not a boundary here.
2106 decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
2107 buffer, errorCode);
2108 // Decompose until the next boundary.
2109 src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
2110 buffer, errorCode);
2111 if (U_FAILURE(errorCode)) {
2112 break;
2113 }
2114 if ((src - prevSrc) > INT32_MAX(2147483647)) { // guard before buffer.equals()
2115 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
2116 return true;
2117 }
2118 recompose(buffer, 0, onlyContiguous);
     // Output is produced only when recomposition changed the segment. If it did
     // not, prevBoundary stays put and the bytes remain part of the pending
     // unchanged run.
2119 if (!buffer.equals(prevSrc, src)) {
2120 if (sink == nullptr) {
2121 return false;
2122 }
2123 if (prevBoundary != prevSrc &&
2124 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2125 *sink, options, edits, errorCode)) {
2126 break;
2127 }
2128 if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
2129 *sink, edits, errorCode)) {
2130 break;
2131 }
2132 prevBoundary = src;
2133 }
2134 }
     // Reached only through a `break` above; the result is still true.
2135 return true;
2136}
2137
// UTF-16 overload: tests for a composition boundary before the code point at src.
// Returns true for an empty range (src == limit) or when the first code unit is
// below minCompNoMaybeCP. Otherwise it looks up the data value of the code point
// at src in normTrie and returns norm16HasCompBoundaryBefore() of that value.
// src is a by-value parameter, so the advance done by the lookup macro is not
// visible to the caller.
2138UBool Normalizer2Impl::hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const {
2139 if (src == limit || *src < minCompNoMaybeCP) {
2140 return true;
2141 }
     // c receives the decoded code point but is not used afterwards.
2142 UChar32 c;
2143 uint16_t norm16;
2144 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16)do { (c) = *(src)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (src
) != (limit) && (((__c2 = *(src))&0xfffffc00)==0xdc00
)) { ++(src); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false)
;
2145 return norm16HasCompBoundaryBefore(norm16);
2146}
2147
// UTF-8 overload: tests for a composition boundary before the character at src.
// Returns true for an empty range (src == limit). Otherwise it looks up the data
// value of the UTF-8 sequence at src in normTrie and returns
// norm16HasCompBoundaryBefore() of that value. Unlike the UTF-16 overload, this
// one has no early exit on minCompNoMaybeCP.
// src is a by-value parameter, so the advance done by the lookup macro is not
// visible to the caller.
2148UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
2149 if (src == limit) {
2150 return true;
2151 }
2152 uint16_t norm16;
2153 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16)do { int32_t __lead = (uint8_t)*(src)++; if (!(((__lead)&
0x80)==0)) { uint8_t __t1, __t2, __t3; if ((src) != (limit) &&
(__lead >= 0xe0 ? __lead < 0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[__lead &= 0xf] & (1 << ((__t1 = *(src)) >>
5)) && ++(src) != (limit) && (__t2 = *(src) -
0x80) <= 0x3f && (__lead = ((int32_t)(normTrie)->
index[(__lead << 6) + (__t1 & 0x3f)]) + __t2, 1) : (
__lead -= 0xf0) <= 4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t1 = *(src)) >> 4] & (1 << __lead) &&
(__lead = (__lead << 6) | (__t1 & 0x3f), ++(src) !=
(limit)) && (__t2 = *(src) - 0x80) <= 0x3f &&
++(src) != (limit) && (__t3 = *(src) - 0x80) <= 0x3f
&& (__lead = __lead >= (normTrie)->shifted12HighStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallU8Index_77((normTrie), __lead, __t2, __t3
), 1) : __lead >= 0xc2 && (__t1 = *(src) - 0x80) <=
0x3f && (__lead = (int32_t)(normTrie)->index[__lead
& 0x1f] + __t1, 1))) { ++(src); } else { __lead = (normTrie
)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (
norm16) = ((normTrie)->data.ptr16[__lead]); } while (false
)
;
2154 return norm16HasCompBoundaryBefore(norm16);
2155}
2156
// UTF-16 overload: tests for a composition boundary after the code point that
// ends at p, looking no further back than start.
// Returns true for an empty range (start == p). Otherwise it looks up the data
// value of the code point before p in normTrie and returns
// norm16HasCompBoundaryAfter() of that value with the given onlyContiguous flag.
// p is a by-value parameter, so the step back done by the lookup macro is not
// visible to the caller.
2157UBool Normalizer2Impl::hasCompBoundaryAfter(const char16_t *start, const char16_t *p,
2158 UBool onlyContiguous) const {
2159 if (start == p) {
2160 return true;
2161 }
     // c receives the decoded code point but is not used afterwards.
2162 UChar32 c;
2163 uint16_t norm16;
2164 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (start) && (((__c2 = *((p) - 1))&0xfffffc00)
==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false)
;
2165 return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2166}
2167
// UTF-8 overload: tests for a composition boundary after the character that ends
// at p, looking no further back than start.
// Returns true for an empty range (start == p). Otherwise it looks up the data
// value of the UTF-8 sequence before p in normTrie and returns
// norm16HasCompBoundaryAfter() of that value with the given onlyContiguous flag.
// p is a by-value parameter, so the step back done by the lookup macro is not
// visible to the caller.
2168UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
2169 UBool onlyContiguous) const {
2170 if (start == p) {
2171 return true;
2172 }
2173 uint16_t norm16;
2174 UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16)do { int32_t __index = (uint8_t)*--(p); if (!(((__index)&
0x80)==0)) { __index = ucptrie_internalU8PrevIndex_77((normTrie
), __index, (const uint8_t *)(start), (const uint8_t *)(p)); (
p) -= __index & 7; __index >>= 3; } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false)
;
2175 return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2176}
2177
// Walks backward from p toward start, one code point per iteration, and returns
// the first composition boundary it meets:
//  - the position just after a code point that has a boundary after it, or
//  - the position just before a code point that has a boundary before it.
// The "after" test is made first for each code point. If no boundary is found,
// the loop ends with p == start and start is returned.
2178const char16_t *Normalizer2Impl::findPreviousCompBoundary(const char16_t *start, const char16_t *p,
2179 UBool onlyContiguous) const {
2180 while (p != start) {
     // Remember the end of the code point; the lookup macro moves p to its start.
2181 const char16_t *codePointLimit = p;
2182 UChar32 c;
2183 uint16_t norm16;
2184 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (start) && (((__c2 = *((p) - 1))&0xfffffc00)
==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false)
;
2185 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2186 return codePointLimit;
2187 }
2188 if (hasCompBoundaryBefore(c, norm16)) {
2189 return p;
2190 }
2191 }
2192 return p;
2193}
2194
2195const char16_t *Normalizer2Impl::findNextCompBoundary(const char16_t *p, const char16_t *limit,
2196 UBool onlyContiguous) const {
2197 while (p != limit) {
2198 const char16_t *codePointStart = p;
2199 UChar32 c;
2200 uint16_t norm16;
2201 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16)do { (c) = *(p)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (p
) != (limit) && (((__c2 = *(p))&0xfffffc00)==0xdc00
)) { ++(p); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false)
;
2202 if (hasCompBoundaryBefore(c, norm16)) {
2203 return codePointStart;
2204 }
2205 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2206 return p;
2207 }
2208 }
2209 return p;
2210}
2211
2212uint8_t Normalizer2Impl::getPreviousTrailCC(const char16_t *start, const char16_t *p) const {
2213 if (start == p) {
2214 return 0;
2215 }
2216 int32_t i = static_cast<int32_t>(p - start);
2217 UChar32 c;
2218 U16_PREV(start, 0, i, c)do { (c)=(start)[--(i)]; if((((c)&0xfffffc00)==0xdc00)) {
uint16_t __c2; if((i)>(0) && (((__c2=(start)[(i)-
1])&0xfffffc00)==0xd800)) { --(i); (c)=(((UChar32)(__c2)<<
10UL)+(UChar32)((c))-((0xd800<<10UL)+0xdc00-0x10000)); }
} } while (false)
;
2219 return static_cast<uint8_t>(getFCD16(c));
2220}
2221
2222uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
2223 if (start == p) {
2224 return 0;
2225 }
2226 int32_t i = static_cast<int32_t>(p - start);
2227 UChar32 c;
2228 U8_PREV(start, 0, i, c)do { (c)=(uint8_t)(start)[--(i)]; if(!(((c)&0x80)==0)) { (
c)=utf8_prevCharSafeBody_77((const uint8_t *)start, 0, &(
i), c, -1); } } while (false)
;
2229 return static_cast<uint8_t>(getFCD16(c));
2230}
2231
2232// Note: normalizer2impl.cpp r30982 (2011-nov-27)
2233// still had getFCDTrie() which built and cached an FCD trie.
2234// That provided faster access to FCD data than getFCD16FromNormData()
2235// but required synchronization and consumed some 10kB of heap memory
2236// in any process that uses FCD (e.g., via collation).
2237// minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
2238// at least for ASCII & CJK.
2239
2240// Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this
2241// function on Windows ARM64. As a work-around, we disable optimizations for this function.
2242// This work-around could/should be removed once the following versions of Visual Studio are no
2243// longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
2244#if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2245#pragma optimize( "", off )
2246#endif
2247// Gets the FCD value from the regular normalization data.
2248uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
2249 uint16_t norm16=getNorm16(c);
2250 if (norm16 >= limitNoNo) {
2251 if(norm16>=MIN_NORMAL_MAYBE_YES) {
2252 // combining mark
2253 norm16=getCCFromNormalYesOrMaybe(norm16);
2254 return norm16|(norm16<<8);
2255 } else if(norm16>=minMaybeYes) {
2256 return 0;
2257 } else if(norm16<minMaybeNo) { // isDecompNoAlgorithmic(norm16)
2258 uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;
2259 if (deltaTrailCC <= DELTA_TCCC_1) {
2260 return deltaTrailCC >> OFFSET_SHIFT;
2261 }
2262 // Maps to an isCompYesAndZeroCC.
2263 c=mapAlgorithmic(c, norm16);
2264 norm16=getRawNorm16(c);
2265 }
2266 }
2267 if(norm16<=minYesNo || isHangulLVT(norm16)) {
2268 // no decomposition or Hangul syllable, all zeros
2269 return 0;
2270 }
2271 // c decomposes, get everything from the variable-length extra data
2272 const uint16_t *mapping=getData(norm16);
2273 uint16_t firstUnit=*mapping;
2274 norm16=firstUnit>>8; // tccc
2275 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
2276 norm16|=*(mapping-1)&0xff00; // lccc
2277 }
2278 return norm16;
2279}
2280#if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2281#pragma optimize( "", on )
2282#endif
2283
2284uint16_t Normalizer2Impl::getFCD16FromMaybeOrNonZeroCC(uint16_t norm16) const {
2285 U_ASSERT(norm16 >= minMaybeNo)(static_cast <bool> (norm16 >= minMaybeNo) ? void (0
) : __assert_fail ("norm16 >= minMaybeNo", __builtin_FILE (
), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__))
;
2286 if (norm16 >= MIN_NORMAL_MAYBE_YES) {
2287 // combining mark
2288 norm16 = getCCFromNormalYesOrMaybe(norm16);
2289 return norm16 | (norm16<<8);
2290 } else if (norm16 >= minMaybeYes) {
2291 return 0;
2292 }
2293 // c decomposes, get everything from the variable-length extra data
2294 const uint16_t *mapping = getDataForMaybe(norm16);
2295 uint16_t firstUnit = *mapping;
2296 // maybeNo has lccc = 0
2297 U_ASSERT((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0 || (*(mapping - 1) & 0xff00) == 0)(static_cast <bool> ((firstUnit & MAPPING_HAS_CCC_LCCC_WORD
) == 0 || (*(mapping - 1) & 0xff00) == 0) ? void (0) : __assert_fail
("(firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0 || (*(mapping - 1) & 0xff00) == 0"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
2298 return firstUnit >> 8; // tccc
2299}
2300
2301// Dual functionality:
2302// buffer!=nullptr: normalize
2303// buffer==nullptr: isNormalized/quickCheck/spanQuickCheckYes
//
// src/limit delimit the UTF-16 input. limit==nullptr means the input is NUL-terminated:
// the low prefix is handled by copyLowPrefixFromNulTerminated() and the limit is then
// located with u_strchr(src, 0).
// Returns the position reached in src. With buffer==nullptr, returns prevBoundary
// (the last FCD-safe boundary) as soon as a combining-class ordering violation is seen.
// The main loop stops early when an append fails or decomposeShort() sets errorCode.
2304const char16_t *
2305Normalizer2Impl::makeFCD(const char16_t *src, const char16_t *limit,
2306 ReorderingBuffer *buffer,
2307 UErrorCode &errorCode) const {
2308 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
2309 // Similar to the prevBoundary in the compose() implementation.
2310 const char16_t *prevBoundary=src;
2311 int32_t prevFCD16=0;
2312 if(limit==nullptr) {
2313 src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
2314 if(U_FAILURE(errorCode)) {
2315 return src;
2316 }
2317 if(prevBoundary<src) {
2318 prevBoundary=src;
2319 // We know that the previous character's lccc==0.
2320 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
2321 prevFCD16=getFCD16(*(src-1));
2322 if(prevFCD16>1) {
2323 --prevBoundary;
2324 }
2325 }
2326 limit=u_strchru_strchr_77(src, 0);
2327 }
2328
2329 // Note: In this function we use buffer->appendZeroCC() because we track
2330 // the lead and trail combining classes here, rather than leaving it to
2331 // the ReorderingBuffer.
2332 // The exception is the call to decomposeShort() which uses the buffer
2333 // in the normal way.
2334
2335 const char16_t *prevSrc;
2336 UChar32 c=0;
2337 uint16_t fcd16=0;
2338
2339 for(;;) {
2340 // count code units with lccc==0
2341 for(prevSrc=src; src!=limit;) {
2342 if((c=*src)<minLcccCP) {
 // Defer the FCD lookup: remember the code unit as a negative value (~c);
 // it is resolved after this inner loop only if it is still needed.
2343 prevFCD16=~c;
2344 ++src;
2345 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
2346 prevFCD16=0;
2347 ++src;
2348 } else {
2349 if(U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800)) {
2350 char16_t c2;
2351 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])(((c2=src[1])&0xfffffc00)==0xdc00)) {
2352 c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
;
2353 }
2354 }
 // fcd16<=0xff means the high byte (lccc) is 0: keep scanning.
2355 if((fcd16=getFCD16FromNormData(c))<=0xff) {
2356 prevFCD16=fcd16;
2357 src+=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
2358 } else {
2359 break;
2360 }
2361 }
2362 }
2363 // copy these code units all at once
2364 if(src!=prevSrc) {
2365 if(buffer!=nullptr && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
2366 break;
2367 }
2368 if(src==limit) {
2369 break;
2370 }
2371 prevBoundary=src;
2372 // We know that the previous character's lccc==0.
2373 if(prevFCD16<0) {
2374 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
2375 UChar32 prev=~prevFCD16;
2376 if(prev<minDecompNoCP) {
2377 prevFCD16=0;
2378 } else {
2379 prevFCD16=getFCD16FromNormData(prev);
2380 if(prevFCD16>1) {
2381 --prevBoundary;
2382 }
2383 }
2384 } else {
2385 const char16_t *p=src-1;
2386 if(U16_IS_TRAIL(*p)(((*p)&0xfffffc00)==0xdc00) && prevSrc<p && U16_IS_LEAD(*(p-1))(((*(p-1))&0xfffffc00)==0xd800)) {
2387 --p;
2388 // Need to fetch the previous character's FCD value because
2389 // prevFCD16 was just for the trail surrogate code point.
2390 prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])(((UChar32)(p[0])<<10UL)+(UChar32)(p[1])-((0xd800<<
10UL)+0xdc00-0x10000))
);
2391 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
2392 }
2393 if(prevFCD16>1) {
2394 prevBoundary=p;
2395 }
2396 }
2397 // The start of the current character (c).
2398 prevSrc=src;
2399 } else if(src==limit) {
2400 break;
2401 }
2402
2403 src+=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);
2404 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
2405 // Check for proper order, and decompose locally if necessary.
2406 if((prevFCD16&0xff)<=(fcd16>>8)) {
2407 // proper order: prev tccc <= current lccc
2408 if((fcd16&0xff)<=1) {
2409 prevBoundary=src;
2410 }
2411 if(buffer!=nullptr && !buffer->appendZeroCC(c, errorCode)) {
2412 break;
2413 }
2414 prevFCD16=fcd16;
2415 continue;
2416 } else if(buffer==nullptr) {
2417 return prevBoundary; // quick check "no"
2418 } else {
2419 /*
2420 * Back out the part of the source that we copied or appended
2421 * already but is now going to be decomposed.
2422 * prevSrc is set to after what was copied/appended.
2423 */
2424 buffer->removeSuffix(static_cast<int32_t>(prevSrc - prevBoundary));
2425 /*
2426 * Find the part of the source that needs to be decomposed,
2427 * up to the next safe boundary.
2428 */
2429 src=findNextFCDBoundary(src, limit);
2430 /*
2431 * The source text does not fulfill the conditions for FCD.
2432 * Decompose and reorder a limited piece of the text.
2433 */
2434 decomposeShort(prevBoundary, src, false, false, *buffer, errorCode);
2435 if (U_FAILURE(errorCode)) {
2436 break;
2437 }
2438 prevBoundary=src;
2439 prevFCD16=0;
2440 }
2441 }
2442 return src;
2443}
2444
2445void Normalizer2Impl::makeFCDAndAppend(const char16_t *src, const char16_t *limit,
2446 UBool doMakeFCD,
2447 UnicodeString &safeMiddle,
2448 ReorderingBuffer &buffer,
2449 UErrorCode &errorCode) const {
2450 if(!buffer.isEmpty()) {
2451 const char16_t *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
2452 if(src!=firstBoundaryInSrc) {
2453 const char16_t *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
2454 buffer.getLimit());
2455 int32_t destSuffixLength = static_cast<int32_t>(buffer.getLimit() - lastBoundaryInDest);
2456 UnicodeString middle(lastBoundaryInDest, destSuffixLength);
2457 buffer.removeSuffix(destSuffixLength);
2458 safeMiddle=middle;
2459 middle.append(src, static_cast<int32_t>(firstBoundaryInSrc - src));
2460 const char16_t *middleStart=middle.getBuffer();
2461 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
2462 if(U_FAILURE(errorCode)) {
2463 return;
2464 }
2465 src=firstBoundaryInSrc;
2466 }
2467 }
2468 if(doMakeFCD) {
2469 makeFCD(src, limit, &buffer, errorCode);
2470 } else {
2471 if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr
2472 limit=u_strchru_strchr_77(src, 0);
2473 }
2474 buffer.appendZeroCC(src, limit, errorCode);
2475 }
2476}
2477
2478const char16_t *Normalizer2Impl::findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const {
2479 while(start<p) {
2480 const char16_t *codePointLimit = p;
2481 UChar32 c;
2482 uint16_t norm16;
2483 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (start) && (((__c2 = *((p) - 1))&0xfffffc00)
==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false)
;
2484 if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) {
2485 return codePointLimit;
2486 }
2487 if (norm16HasDecompBoundaryBefore(norm16)) {
2488 return p;
2489 }
2490 }
2491 return p;
2492}
2493
2494const char16_t *Normalizer2Impl::findNextFCDBoundary(const char16_t *p, const char16_t *limit) const {
2495 while(p<limit) {
2496 const char16_t *codePointStart=p;
2497 UChar32 c;
2498 uint16_t norm16;
2499 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16)do { (c) = *(p)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (p
) != (limit) && (((__c2 = *(p))&0xfffffc00)==0xdc00
)) { ++(p); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false)
;
2500 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16)) {
2501 return codePointStart;
2502 }
2503 if (norm16HasDecompBoundaryAfter(norm16)) {
2504 return p;
2505 }
2506 }
2507 return p;
2508}
2509
2510// CanonicalIterator data -------------------------------------------------- ***
2511
// Constructs empty CanonicalIterator data: opens mutableTrie with
// umutablecptrie_open(0, 0, ...), leaves the immutable trie unset (nullptr),
// and constructs canonStartSets with uprv_deleteUObject as its first argument
// (presumably the element deleter - confirm against UVector).
// Failures are reported through errorCode.
2512CanonIterData::CanonIterData(UErrorCode &errorCode) :
2513 mutableTrie(umutablecptrie_openumutablecptrie_open_77(0, 0, &errorCode)), trie(nullptr),
2514 canonStartSets(uprv_deleteUObjectuprv_deleteUObject_77, nullptr, errorCode) {}
2515
// Closes both tries. Either pointer may be nullptr at this point
// (trie is initialized to nullptr; doInit() resets mutableTrie to nullptr after building).
2516CanonIterData::~CanonIterData() {
2517 umutablecptrie_closeumutablecptrie_close_77(mutableTrie);
2518 ucptrie_closeucptrie_close_77(trie);
2519}
2520
2521void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
2522 uint32_t canonValue = umutablecptrie_getumutablecptrie_get_77(mutableTrie, decompLead);
2523 if((canonValue&(CANON_HAS_SET0x200000|CANON_VALUE_MASK0x1fffff))==0 && origin!=0) {
2524 // origin is the first character whose decomposition starts with
2525 // the character for which we are setting the value.
2526 umutablecptrie_setumutablecptrie_set_77(mutableTrie, decompLead, canonValue|origin, &errorCode);
2527 } else {
2528 // origin is not the first character, or it is U+0000.
2529 UnicodeSet *set;
2530 if((canonValue&CANON_HAS_SET0x200000)==0) {
2531 LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode);
2532 set=lpSet.getAlias();
2533 if(U_FAILURE(errorCode)) {
2534 return;
2535 }
2536 UChar32 firstOrigin = static_cast<UChar32>(canonValue & CANON_VALUE_MASK0x1fffff);
2537 canonValue = (canonValue & ~CANON_VALUE_MASK0x1fffff) | CANON_HAS_SET0x200000 | static_cast<uint32_t>(canonStartSets.size());
2538 umutablecptrie_setumutablecptrie_set_77(mutableTrie, decompLead, canonValue, &errorCode);
2539 canonStartSets.adoptElement(lpSet.orphan(), errorCode);
2540 if (U_FAILURE(errorCode)) {
2541 return;
2542 }
2543 if(firstOrigin!=0) {
2544 set->add(firstOrigin);
2545 }
2546 } else {
2547 set = static_cast<UnicodeSet*>(canonStartSets[static_cast<int32_t>(canonValue & CANON_VALUE_MASK0x1fffff)]);
2548 }
2549 set->add(origin);
2550 }
2551}
2552
2553// C++ class for friend access to private Normalizer2Impl members.
// doInit() builds impl->fCanonIterData and reports failures through errorCode.
2554class InitCanonIterData {
2555public:
2556 static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
2557};
2558
2559U_CDECL_BEGINextern "C" {
2560
2561// UInitOnce instantiation function for CanonIterData
// Thin trampoline that forwards to InitCanonIterData::doInit();
// it is passed to umtx_initOnce() by Normalizer2Impl::ensureCanonIterData().
2562static void U_CALLCONV
2563initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
2564 InitCanonIterData::doInit(impl, errorCode);
2565}
2566
2567U_CDECL_END}
2568
2569void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
2570 U_ASSERT(impl->fCanonIterData == nullptr)(static_cast <bool> (impl->fCanonIterData == nullptr
) ? void (0) : __assert_fail ("impl->fCanonIterData == nullptr"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
2571 impl->fCanonIterData = new CanonIterData(errorCode);
2572 if (impl->fCanonIterData == nullptr) {
2573 errorCode=U_MEMORY_ALLOCATION_ERROR;
2574 }
2575 if (U_SUCCESS(errorCode)) {
2576 UChar32 start = 0, end;
2577 uint32_t value;
2578 while ((end = ucptrie_getRangeucptrie_getRange_77(impl->normTrie, start,
2579 UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
2580 nullptr, nullptr, &value)) >= 0) {
2581 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2582 if (value != Normalizer2Impl::INERT) {
2583 impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
2584 }
2585 start = end + 1;
2586 }
2587#ifdef UCPTRIE_DEBUG
2588 umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");
2589#endif
2590 impl->fCanonIterData->trie = umutablecptrie_buildImmutableumutablecptrie_buildImmutable_77(
2591 impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);
2592 umutablecptrie_closeumutablecptrie_close_77(impl->fCanonIterData->mutableTrie);
2593 impl->fCanonIterData->mutableTrie = nullptr;
2594 }
2595 if (U_FAILURE(errorCode)) {
2596 delete impl->fCanonIterData;
2597 impl->fCanonIterData = nullptr;
2598 }
2599}
2600
// Adds CanonicalIterator data to newData for every code point in [start, end],
// all of which share the trie value norm16.
// For each code point c this ORs flag bits (CANON_NOT_SEGMENT_STARTER,
// CANON_HAS_COMPOSITIONS) into c's value in newData.mutableTrie and, for
// one-way decompositions, registers c in the start set of the first code point
// of its mapping via newData.addToStartSet().
// Failures from the trie/set operations are reported through errorCode;
// this function itself does not check errorCode between iterations.
2601void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
2602 CanonIterData &newData,
2603 UErrorCode &errorCode) const {
2604 if(isInert(norm16) ||
2605 (minYesNo<=norm16 && norm16<minNoNo) ||
2606 (minMaybeNo<=norm16 && norm16<minMaybeYes)) {
2607 // Inert, or 2-way mapping (including Hangul syllable).
2608 // We do not write a canonStartSet for any yesNo/maybeNo character.
2609 // Composites from 2-way mappings are added at runtime from the
2610 // starter's compositions list, and the other characters in
2611 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2612 // "maybe" characters.
2613 return;
2614 }
2615 for(UChar32 c=start; c<=end; ++c) {
2616 uint32_t oldValue = umutablecptrie_getumutablecptrie_get_77(newData.mutableTrie, c);
2617 uint32_t newValue=oldValue;
2618 if(isMaybeYesOrNonZeroCC(norm16)) {
2619 // not a segment starter if it occurs in a decomposition or has cc!=0
2620 newValue|=CANON_NOT_SEGMENT_STARTER0x80000000;
2621 if(norm16<MIN_NORMAL_MAYBE_YES) {
2622 newValue|=CANON_HAS_COMPOSITIONS0x40000000;
2623 }
2624 } else if(norm16<minYesNo) {
2625 newValue|=CANON_HAS_COMPOSITIONS0x40000000;
2626 } else {
2627 // c has a one-way decomposition
2628 UChar32 c2=c;
2629 // Do not modify the whole-range norm16 value.
2630 uint16_t norm16_2=norm16;
2631 if (isDecompNoAlgorithmic(norm16_2)) {
2632 // Maps to an isCompYesAndZeroCC.
2633 c2 = mapAlgorithmic(c2, norm16_2);
2634 norm16_2 = getRawNorm16(c2);
2635 // No compatibility mappings for the CanonicalIterator.
2636 U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)))(static_cast <bool> (!(isHangulLV(norm16_2) || isHangulLVT
(norm16_2))) ? void (0) : __assert_fail ("!(isHangulLV(norm16_2) || isHangulLVT(norm16_2))"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
))
;
2637 }
2638 if (norm16_2 > minYesNo) {
2639 // c decomposes, get everything from the variable-length extra data
2640 const uint16_t *mapping=getDataForYesOrNo(norm16_2);
2641 uint16_t firstUnit=*mapping;
2642 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
2643 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
 // The unit before the mapping holds the ccc/lccc word; its low byte is tested here.
2644 if(c==c2 && (*(mapping-1)&0xff)!=0) {
2645 newValue|=CANON_NOT_SEGMENT_STARTER0x80000000; // original c has cc!=0
2646 }
2647 }
2648 // Skip empty mappings (no characters in the decomposition).
2649 if(length!=0) {
2650 ++mapping; // skip over the firstUnit
2651 // add c to first code point's start set
2652 int32_t i=0;
2653 U16_NEXT_UNSAFE(mapping, i, c2)do { (c2)=(mapping)[(i)++]; if((((c2)&0xfffffc00)==0xd800
)) { (c2)=(((UChar32)((c2))<<10UL)+(UChar32)((mapping)[
(i)++])-((0xd800<<10UL)+0xdc00-0x10000)); } } while (false
)
;
2654 newData.addToStartSet(c, c2, errorCode);
2655 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2656 // one-way mapping. A 2-way mapping is possible here after
2657 // intermediate algorithmic mapping.
2658 if(norm16_2>=minNoNo) {
2659 while(i<length) {
2660 U16_NEXT_UNSAFE(mapping, i, c2)do { (c2)=(mapping)[(i)++]; if((((c2)&0xfffffc00)==0xd800
)) { (c2)=(((UChar32)((c2))<<10UL)+(UChar32)((mapping)[
(i)++])-((0xd800<<10UL)+0xdc00-0x10000)); } } while (false
)
;
2661 uint32_t c2Value = umutablecptrie_getumutablecptrie_get_77(newData.mutableTrie, c2);
2662 if((c2Value&CANON_NOT_SEGMENT_STARTER0x80000000)==0) {
2663 umutablecptrie_setumutablecptrie_set_77(newData.mutableTrie, c2,
2664 c2Value|CANON_NOT_SEGMENT_STARTER0x80000000, &errorCode);
2665 }
2666 }
2667 }
2668 }
2669 } else {
2670 // c decomposed to c2 algorithmically; c has cc==0
2671 newData.addToStartSet(c, c2, errorCode);
2672 }
2673 }
 // Write back only when a flag was actually added.
2674 if(newValue!=oldValue) {
2675 umutablecptrie_setumutablecptrie_set_77(newData.mutableTrie, c, newValue, &errorCode);
2676 }
2677 }
2678}
2679
2680UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
2681 // Logically const: Synchronized instantiation.
2682 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
2683 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
2684 return U_SUCCESS(errorCode);
2685}
2686
2687int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
2688 return static_cast<int32_t>(ucptrie_getucptrie_get_77(fCanonIterData->trie, c));
2689}
2690
2691const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
2692 return *static_cast<const UnicodeSet*>(fCanonIterData->canonStartSets[n]);
2693}
2694
2695UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
2696 return getCanonValue(c)>=0;
2697}
2698
2699UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
2700 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER0x80000000;
2701 if(canonValue==0) {
2702 return false;
2703 }
2704 set.clear();
2705 int32_t value=canonValue&CANON_VALUE_MASK0x1fffff;
2706 if((canonValue&CANON_HAS_SET0x200000)!=0) {
2707 set.addAll(getCanonStartSet(value));
2708 } else if(value!=0) {
2709 set.add(value);
2710 }
2711 if((canonValue&CANON_HAS_COMPOSITIONS0x40000000)!=0) {
2712 uint16_t norm16=getRawNorm16(c);
2713 if(norm16==JAMO_L) {
2714 UChar32 syllable=
2715 static_cast<UChar32>(Hangul::HANGUL_BASE + (c - Hangul::JAMO_L_BASE) * Hangul::JAMO_VT_COUNT);
2716 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
2717 } else {
2718 addComposites(getCompositionsList(norm16), set);
2719 }
2720 }
2721 return true;
2722}
2723
2724U_NAMESPACE_END}
2725
2726// Normalizer2 data swapping ----------------------------------------------- ***
2727
2728U_NAMESPACE_USEusing namespace icu_77;
2729
// Swaps Normalizer2 ("Nrm2") data between platform byte orders.
// Checks the data format and format version (1..5), reads the indexes[] header,
// and - only when length>=0 - copies the data and swaps its three leading
// sections in order: the int32_t indexes[], the trie, and the uint16_t extraData[].
// The trailing uint8_t smallFCD[] needs no swapping.
// When length is negative the body of the data is not swapped; only the indexes
// are read in order to compute the returned size.
// Returns headerSize plus the total data size from indexes[IX_TOTAL_SIZE],
// or 0 when pErrorCode is null or indicates a failure.
2730U_CAPIextern "C" int32_t U_EXPORT2
2731unorm2_swapunorm2_swap_77(const UDataSwapper *ds,
2732 const void *inData, int32_t length, void *outData,
2733 UErrorCode *pErrorCode) {
2734 const UDataInfo *pInfo;
2735 int32_t headerSize;
2736
2737 const uint8_t *inBytes;
2738 uint8_t *outBytes;
2739
2740 const int32_t *inIndexes;
2741 int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];
2742
2743 int32_t i, offset, nextOffset, size;
2744
2745 /* udata_swapDataHeader checks the arguments */
2746 headerSize=udata_swapDataHeaderudata_swapDataHeader_77(ds, inData, length, outData, pErrorCode);
2747 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
2748 return 0;
2749 }
2750
2751 /* check data format and format version */
2752 pInfo=(const UDataInfo *)((const char *)inData+4);
2753 uint8_t formatVersion0=pInfo->formatVersion[0];
2754 if(!(
2755 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
2756 pInfo->dataFormat[1]==0x72 &&
2757 pInfo->dataFormat[2]==0x6d &&
2758 pInfo->dataFormat[3]==0x32 &&
2759 (1<=formatVersion0 && formatVersion0<=5)
2760 )) {
2761 udata_printErrorudata_printError_77(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2762 pInfo->dataFormat[0], pInfo->dataFormat[1],
2763 pInfo->dataFormat[2], pInfo->dataFormat[3],
2764 pInfo->formatVersion[0]);
2765 *pErrorCode=U_UNSUPPORTED_ERROR;
2766 return 0;
2767 }
2768
2769 inBytes=(const uint8_t *)inData+headerSize;
2770 outBytes=(outData == nullptr) ? nullptr : (uint8_t *)outData+headerSize;
2771
2772 inIndexes=(const int32_t *)inBytes;
 // The minimum number of indexes that must be present depends on the format version.
2773 int32_t minIndexesLength;
2774 if(formatVersion0==1) {
2775 minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;
2776 } else if(formatVersion0==2) {
2777 minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;
2778 } else if(formatVersion0<=4) {
2779 minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;
2780 } else {
2781 minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD+1;
2782 }
2783
2784 if(length>=0) {
 // From here on, length counts only the bytes after the header.
2785 length-=headerSize;
2786 if(length<minIndexesLength*4) {
2787 udata_printErrorudata_printError_77(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2788 length);
2789 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2790 return 0;
2791 }
2792 }
2793
2794 /* read the first few indexes */
2795 for(i=0; i<UPRV_LENGTHOF(indexes)(int32_t)(sizeof(indexes)/sizeof((indexes)[0])); ++i) {
2796 indexes[i]=udata_readInt32udata_readInt32_77(ds, inIndexes[i]);
2797 }
2798
2799 /* get the total length of the data */
2800 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2801
2802 if(length>=0) {
2803 if(length<size) {
2804 udata_printErrorudata_printError_77(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2805 length);
2806 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2807 return 0;
2808 }
2809
2810 /* copy the data for inaccessible bytes */
2811 if(inBytes!=outBytes) {
2812 uprv_memcpy(outBytes, inBytes, size)do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (outBytes != __null) ? void (0) :
__assert_fail ("outBytes != __null", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__)); (static_cast <bool
> (inBytes != __null) ? void (0) : __assert_fail ("inBytes != __null"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
)); clang diagnostic pop :: memcpy(outBytes, inBytes, size);
} while (false)
;
2813 }
2814
 // Each section spans [offset, nextOffset); the section limits come from indexes[].
2815 offset=0;
2816
2817 /* swap the int32_t indexes[] */
2818 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2819 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2820 offset=nextOffset;
2821
2822 /* swap the trie */
2823 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2824 utrie_swapAnyVersionutrie_swapAnyVersion_77(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2825 offset=nextOffset;
2826
2827 /* swap the uint16_t extraData[] */
2828 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2829 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2830 offset=nextOffset;
2831
2832 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2833 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2834 offset=nextOffset;
2835
2836 U_ASSERT(offset==size)(static_cast <bool> (offset==size) ? void (0) : __assert_fail
("offset==size", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__))
;
2837 }
2838
2839 return headerSize+size;
2840}
2841
2842#endif // !UCONFIG_NO_NORMALIZATION

/root/firefox-clang/intl/icu/source/common/normalizer2impl.h

1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2impl.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19#ifndef __NORMALIZER2IMPL_H__
20#define __NORMALIZER2IMPL_H__
21
22#include "unicode/utypes.h"
23
24#if !UCONFIG_NO_NORMALIZATION0
25
26#include "unicode/normalizer2.h"
27#include "unicode/ucptrie.h"
28#include "unicode/unistr.h"
29#include "unicode/unorm.h"
30#include "unicode/utf.h"
31#include "unicode/utf16.h"
32#include "mutex.h"
33#include "udataswp.h"
34#include "uset_imp.h"
35
36// When the nfc.nrm data is *not* hardcoded into the common library
37// (with this constant set to 0),
38// then it needs to be built into the data package:
39// Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT
40#define NORM2_HARDCODE_NFC_DATA1 1
41
42U_NAMESPACE_BEGINnamespace icu_77 {
43
44struct CanonIterData;
45
46class ByteSink;
47class Edits;
48class InitCanonIterData;
49class LcccContext;
50
51class U_COMMON_API Hangul {
52public:
53 /* Korean Hangul and Jamo constants */
54 enum {
55 JAMO_L_BASE=0x1100, /* "lead" jamo */
56 JAMO_L_END=0x1112,
57 JAMO_V_BASE=0x1161, /* "vowel" jamo */
58 JAMO_V_END=0x1175,
59 JAMO_T_BASE=0x11a7, /* "trail" jamo */
60 JAMO_T_END=0x11c2,
61
62 HANGUL_BASE=0xac00,
63 HANGUL_END=0xd7a3,
64
65 JAMO_L_COUNT=19,
66 JAMO_V_COUNT=21,
67 JAMO_T_COUNT=28,
68
69 JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,
70
71 HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
72 HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
73 };
74
75 static inline UBool isHangul(UChar32 c) {
76 return HANGUL_BASE<=c && c<HANGUL_LIMIT;
77 }
78 static inline UBool
79 isHangulLV(UChar32 c) {
80 c-=HANGUL_BASE;
81 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
82 }
83 static inline UBool isJamoL(UChar32 c) {
84 return static_cast<uint32_t>(c - JAMO_L_BASE) < JAMO_L_COUNT;
85 }
86 static inline UBool isJamoV(UChar32 c) {
87 return static_cast<uint32_t>(c - JAMO_V_BASE) < JAMO_V_COUNT;
88 }
89 static inline UBool isJamoT(UChar32 c) {
90 int32_t t=c-JAMO_T_BASE;
91 return 0<t && t<JAMO_T_COUNT; // not JAMO_T_BASE itself
92 }
93 static UBool isJamo(UChar32 c) {
94 return JAMO_L_BASE<=c && c<=JAMO_T_END &&
95 (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
96 }
97
98 /**
99 * Decomposes c, which must be a Hangul syllable, into buffer
100 * and returns the length of the decomposition (2 or 3).
101 */
102 static inline int32_t decompose(UChar32 c, char16_t buffer[3]) {
103 c-=HANGUL_BASE;
104 UChar32 c2=c%JAMO_T_COUNT;
105 c/=JAMO_T_COUNT;
106 buffer[0] = static_cast<char16_t>(JAMO_L_BASE + c / JAMO_V_COUNT);
107 buffer[1] = static_cast<char16_t>(JAMO_V_BASE + c % JAMO_V_COUNT);
108 if(c2==0) {
109 return 2;
110 } else {
111 buffer[2] = static_cast<char16_t>(JAMO_T_BASE + c2);
112 return 3;
113 }
114 }
115
116 /**
117 * Decomposes c, which must be a Hangul syllable, into buffer.
118 * This is the raw, not recursive, decomposition. Its length is always 2.
119 */
120 static inline void getRawDecomposition(UChar32 c, char16_t buffer[2]) {
121 UChar32 orig=c;
122 c-=HANGUL_BASE;
123 UChar32 c2=c%JAMO_T_COUNT;
124 if(c2==0) {
125 c/=JAMO_T_COUNT;
126 buffer[0] = static_cast<char16_t>(JAMO_L_BASE + c / JAMO_V_COUNT);
127 buffer[1] = static_cast<char16_t>(JAMO_V_BASE + c % JAMO_V_COUNT);
128 } else {
129 buffer[0] = static_cast<char16_t>(orig - c2); // LV syllable
130 buffer[1] = static_cast<char16_t>(JAMO_T_BASE + c2);
131 }
132 }
133private:
134 Hangul() = delete; // no instantiation
135};
136
137class Normalizer2Impl;
138
139class U_COMMON_API ReorderingBuffer : public UMemory {
140public:
141 /** Constructs only; init() should be called. */
142 ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
143 impl(ni), str(dest),
144 start(nullptr), reorderStart(nullptr), limit(nullptr),
145 remainingCapacity(0), lastCC(0) {}
146 /** Constructs, removes the string contents, and initializes for a small initial capacity. */
147 ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);
148 ~ReorderingBuffer() {
149 if (start != nullptr) {
150 str.releaseBuffer(static_cast<int32_t>(limit - start));
151 }
152 }
153 UBool init(int32_t destCapacity, UErrorCode &errorCode);
154
155 UBool isEmpty() const { return start==limit; }
156 int32_t length() const { return static_cast<int32_t>(limit - start); }
157 char16_t *getStart() { return start; }
158 char16_t *getLimit() { return limit; }
159 uint8_t getLastCC() const { return lastCC; }
160
161 UBool equals(const char16_t *start, const char16_t *limit) const;
162 UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;
163
164 UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
165 return (c<=0xffff) ?
166 appendBMP(static_cast<char16_t>(c), cc, errorCode) :
167 appendSupplementary(c, cc, errorCode);
168 }
169 UBool append(const char16_t *s, int32_t length, UBool isNFD,
170 uint8_t leadCC, uint8_t trailCC,
171 UErrorCode &errorCode);
172 UBool appendBMP(char16_t c, uint8_t cc, UErrorCode &errorCode) {
173 if(remainingCapacity==0 && !resize(1, errorCode)) {
174 return false;
175 }
176 if(lastCC<=cc || cc==0) {
177 *limit++=c;
178 lastCC=cc;
179 if(cc<=1) {
180 reorderStart=limit;
181 }
182 } else {
183 insert(c, cc);
184 }
185 --remainingCapacity;
186 return true;
187 }
188 UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
189 UBool appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode);
190 void remove();
191 void removeSuffix(int32_t suffixLength);
192 void setReorderingLimit(char16_t *newLimit) {
193 remainingCapacity += static_cast<int32_t>(limit - newLimit);
194 reorderStart=limit=newLimit;
195 lastCC=0;
196 }
197 void copyReorderableSuffixTo(UnicodeString &s) const {
198 s.setTo(ConstChar16Ptr(reorderStart), static_cast<int32_t>(limit - reorderStart));
199 }
200private:
201 /*
202 * TODO: Revisit whether it makes sense to track reorderStart.
203 * It is set to after the last known character with cc<=1,
204 * which stops previousCC() before it reads that character and looks up its cc.
205 * previousCC() is normally only called from insert().
206 * In other words, reorderStart speeds up the insertion of a combining mark
207 * into a multi-combining mark sequence where it does not belong at the end.
208 * This might not be worth the trouble.
209 * On the other hand, it's not a huge amount of trouble.
210 *
211 * We probably need it for UNORM_SIMPLE_APPEND.
212 */
213
214 UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
215 void insert(UChar32 c, uint8_t cc);
216 static void writeCodePoint(char16_t *p, UChar32 c) {
217 if(c<=0xffff) {
218 *p = static_cast<char16_t>(c);
219 } else {
220 p[0]=U16_LEAD(c)(UChar)(((c)>>10)+0xd7c0);
221 p[1]=U16_TRAIL(c)(UChar)(((c)&0x3ff)|0xdc00);
222 }
223 }
224 UBool resize(int32_t appendLength, UErrorCode &errorCode);
225
226 const Normalizer2Impl &impl;
227 UnicodeString &str;
228 char16_t *start, *reorderStart, *limit;
229 int32_t remainingCapacity;
230 uint8_t lastCC;
231
232 // private backward iterator
233 void setIterator() { codePointStart=limit; }
234 void skipPrevious(); // Requires start<codePointStart.
235 uint8_t previousCC(); // Returns 0 if there is no previous character.
236
237 char16_t *codePointStart, *codePointLimit;
238};
239
240/**
241 * Low-level implementation of the Unicode Normalization Algorithm.
242 * For the data structure and details see the documentation at the end of
243 * this normalizer2impl.h and in the design doc at
244 * https://unicode-org.github.io/icu/design/normalization/custom.html
245 */
246class U_COMMON_API Normalizer2Impl : public UObject {
247public:
248 Normalizer2Impl() : normTrie(nullptr), fCanonIterData(nullptr) {}
249 virtual ~Normalizer2Impl();
250
251 void init(const int32_t *inIndexes, const UCPTrie *inTrie,
252 const uint16_t *inExtraData, const uint8_t *inSmallFCD);
253
254 void addLcccChars(UnicodeSet &set) const;
255 void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
256 void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
257
258 // low-level properties ------------------------------------------------ ***
259
260 UBool ensureCanonIterData(UErrorCode &errorCode) const;
261
262 // The trie stores values for lead surrogate code *units*.
263 // Surrogate code *points* are inert.
264 uint16_t getNorm16(UChar32 c) const {
265 return U_IS_LEAD(c)(((c)&0xfffffc00)==0xd800) ?
266 static_cast<uint16_t>(INERT) :
267 UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((uint32_t)(c) <= (uint32_t)(0xffff
) ? ((int32_t)(normTrie)->index[(c) >> UCPTRIE_FAST_SHIFT
] + ((c) & UCPTRIE_FAST_DATA_MASK)) : (uint32_t)(c) <=
0x10ffff ? ((c) >= (normTrie)->highStart ? (normTrie)->
dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)) : (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
)])
;
268 }
269 uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((uint32_t)(c) <= (uint32_t)(0xffff
) ? ((int32_t)(normTrie)->index[(c) >> UCPTRIE_FAST_SHIFT
] + ((c) & UCPTRIE_FAST_DATA_MASK)) : (uint32_t)(c) <=
0x10ffff ? ((c) >= (normTrie)->highStart ? (normTrie)->
dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)) : (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
)])
; }
270
271 UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
272 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
273 return UNORM_YES;
274 } else if(minMaybeNo<=norm16) {
275 return UNORM_MAYBE;
276 } else {
277 return UNORM_NO;
278 }
279 }
280 UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeNo; }
281 UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeNo; }
282 UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }
283
284 uint8_t getCC(uint16_t norm16) const {
285 if(norm16>=MIN_NORMAL_MAYBE_YES) {
286 return getCCFromNormalYesOrMaybe(norm16);
287 }
288 if(norm16<minNoNo || limitNoNo<=norm16) {
289 return 0;
290 }
291 return getCCFromNoNo(norm16);
292 }
293 static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) {
294 return static_cast<uint8_t>(norm16 >> OFFSET_SHIFT);
295 }
296 static uint8_t getCCFromYesOrMaybeYes(uint16_t norm16) {
297 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
298 }
299 uint8_t getCCFromYesOrMaybeYesCP(UChar32 c) const {
300 if (c < minCompNoMaybeCP) { return 0; }
301 return getCCFromYesOrMaybeYes(getNorm16(c));
302 }
303
304 /**
305 * Returns the FCD data for code point c.
306 * @param c A Unicode code point.
307 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
308 */
309 uint16_t getFCD16(UChar32 c) const {
310 if(c<minDecompNoCP) {
311 return 0;
312 } else if(c<=0xffff) {
313 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
314 }
315 return getFCD16FromNormData(c);
316 }
317 /**
318 * Returns the FCD data for the next code point (post-increment).
319 * Might skip only a lead surrogate rather than the whole surrogate pair if none of
320 * the supplementary code points associated with the lead surrogate have non-zero FCD data.
321 * @param s A valid pointer into a string. Requires s!=limit.
322 * @param limit The end of the string, or NULL.
323 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
324 */
325 uint16_t nextFCD16(const char16_t *&s, const char16_t *limit) const {
326 UChar32 c=*s++;
327 if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) {
328 return 0;
329 }
330 char16_t c2;
331 if(U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800) && s!=limit && U16_IS_TRAIL(c2=*s)(((c2=*s)&0xfffffc00)==0xdc00)) {
332 c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
;
333 ++s;
334 }
335 return getFCD16FromNormData(c);
336 }
337 /**
338 * Returns the FCD data for the previous code point (pre-decrement).
339 * @param start The start of the string.
340 * @param s A valid pointer into a string. Requires start<s.
341 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
342 */
343 uint16_t previousFCD16(const char16_t *start, const char16_t *&s) const {
344 UChar32 c=*--s;
345 if(c<minDecompNoCP) {
346 return 0;
347 }
348 if(!U16_IS_TRAIL(c)(((c)&0xfffffc00)==0xdc00)) {
349 if(!singleLeadMightHaveNonZeroFCD16(c)) {
350 return 0;
351 }
352 } else {
353 char16_t c2;
354 if(start<s && U16_IS_LEAD(c2=*(s-1))(((c2=*(s-1))&0xfffffc00)==0xd800)) {
355 c=U16_GET_SUPPLEMENTARY(c2, c)(((UChar32)(c2)<<10UL)+(UChar32)(c)-((0xd800<<10UL
)+0xdc00-0x10000))
;
356 --s;
357 }
358 }
359 return getFCD16FromNormData(c);
360 }
361
362 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
363 UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
364 // 0<=lead<=0xffff
365 uint8_t bits=smallFCD[lead>>8];
366 if(bits==0) { return false; }
367 return (bits >> ((lead >> 5) & 7)) & 1;
368 }
369 /** Returns the FCD value from the regular normalization data. */
370 uint16_t getFCD16FromNormData(UChar32 c) const;
371
372 uint16_t getFCD16FromMaybeOrNonZeroCC(uint16_t norm16) const;
373
374 /**
375 * Gets the decomposition for one code point.
376 * @param c code point
377 * @param buffer out-only buffer for algorithmic decompositions
378 * @param length out-only, takes the length of the decomposition, if any
379 * @return pointer to the decomposition, or NULL if none
380 */
381 const char16_t *getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const;
382
383 /**
384 * Gets the raw decomposition for one code point.
385 * @param c code point
386 * @param buffer out-only buffer for algorithmic decompositions
387 * @param length out-only, takes the length of the decomposition, if any
388 * @return pointer to the decomposition, or NULL if none
389 */
390 const char16_t *getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const;
391
392 UChar32 composePair(UChar32 a, UChar32 b) const;
393
394 UBool isCanonSegmentStarter(UChar32 c) const;
395 UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
396
397 enum {
398 // Fixed norm16 values.
399 MIN_YES_YES_WITH_CC=0xfe02,
400 JAMO_VT=0xfe00,
401 MIN_NORMAL_MAYBE_YES=0xfc00,
402 JAMO_L=2, // offset=1 hasCompBoundaryAfter=false
403 INERT=1, // offset=0 hasCompBoundaryAfter=true
404
405 // norm16 bit 0 is comp-boundary-after.
406 HAS_COMP_BOUNDARY_AFTER=1,
407 OFFSET_SHIFT=1,
408
409 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
410 // tccc (0, 1, >1) for quick FCC boundary-after tests.
411 DELTA_TCCC_0=0,
412 DELTA_TCCC_1=2,
413 DELTA_TCCC_GT_1=4,
414 DELTA_TCCC_MASK=6,
415 DELTA_SHIFT=3,
416
417 MAX_DELTA=0x40
418 };
419
420 enum {
421 // Byte offsets from the start of the data, after the generic header.
422 IX_NORM_TRIE_OFFSET,
423 IX_EXTRA_DATA_OFFSET,
424 IX_SMALL_FCD_OFFSET,
425 IX_RESERVED3_OFFSET,
426 IX_RESERVED4_OFFSET,
427 IX_RESERVED5_OFFSET,
428 IX_RESERVED6_OFFSET,
429 IX_TOTAL_SIZE,
430
431 // Code point thresholds for quick check codes.
432 IX_MIN_DECOMP_NO_CP,
433 IX_MIN_COMP_NO_MAYBE_CP,
434
435 // Norm16 value thresholds for quick check combinations and types of extra data.
436
437 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
438 IX_MIN_YES_NO,
439 /** Mappings are comp-normalized. */
440 IX_MIN_NO_NO,
441 IX_LIMIT_NO_NO,
442 IX_MIN_MAYBE_YES,
443
444 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
445 IX_MIN_YES_NO_MAPPINGS_ONLY,
446 /** Mappings are not comp-normalized but have a comp boundary before. */
447 IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
448 /** Mappings do not have a comp boundary before. */
449 IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
450 /** Mappings to the empty string. */
451 IX_MIN_NO_NO_EMPTY,
452
453 IX_MIN_LCCC_CP,
454 IX_RESERVED19,
455
456 /** Two-way mappings; each starts with a character that combines backward. */
457 IX_MIN_MAYBE_NO, // 20
458 /** Two-way mappings & compositions. */
459 IX_MIN_MAYBE_NO_COMBINES_FWD,
460
461 IX_COUNT // 22
462 };
463
// Bit fields in the first 16-bit unit of a mapping in extraData.
464 enum {
// When set, getCCFromNoNo() reads the unit just before the first unit
// and returns its low byte as the cc.
465 MAPPING_HAS_CCC_LCCC_WORD=0x80,
// NOTE(review): presumably flags a separate raw mapping stored with the
// regular one; not used in the inline code of this header -- confirm in the .cpp.
466 MAPPING_HAS_RAW_MAPPING=0x40,
467 // unused bit 0x20,
// Mask for the mapping length; getCompositionsListForComposite() skips
// 1 + (firstUnit & MAPPING_LENGTH_MASK) units to reach the compositions list.
468 MAPPING_LENGTH_MASK=0x1f
469 };
470
471 enum {
472 COMP_1_LAST_TUPLE=0x8000,
473 COMP_1_TRIPLE=1,
474 COMP_1_TRAIL_LIMIT=0x3400,
475 COMP_1_TRAIL_MASK=0x7ffe,
476 COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit
477 COMP_2_TRAIL_SHIFT=6,
478 COMP_2_TRAIL_MASK=0xffc0
479 };
480
481 // higher-level functionality ------------------------------------------ ***
482
483 // NFD without an NFD Normalizer2 instance.
484 UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest,
485 UErrorCode &errorCode) const;
486 /**
487 * Decomposes [src, limit[ and writes the result to dest.
488 * limit can be NULL if src is NUL-terminated.
489 * destLengthEstimate is the initial dest buffer capacity and can be -1.
490 */
491 void decompose(const char16_t *src, const char16_t *limit,
492 UnicodeString &dest, int32_t destLengthEstimate,
493 UErrorCode &errorCode) const;
494
495 const char16_t *decompose(const char16_t *src, const char16_t *limit,
496 ReorderingBuffer *buffer, UErrorCode &errorCode) const;
497 void decomposeAndAppend(const char16_t *src, const char16_t *limit,
498 UBool doDecompose,
499 UnicodeString &safeMiddle,
500 ReorderingBuffer &buffer,
501 UErrorCode &errorCode) const;
502
503 /** sink==nullptr: isNormalized()/spanQuickCheckYes() */
504 const uint8_t *decomposeUTF8(uint32_t options,
505 const uint8_t *src, const uint8_t *limit,
506 ByteSink *sink, Edits *edits, UErrorCode &errorCode) const;
507
508 UBool compose(const char16_t *src, const char16_t *limit,
509 UBool onlyContiguous,
510 UBool doCompose,
511 ReorderingBuffer &buffer,
512 UErrorCode &errorCode) const;
513 const char16_t *composeQuickCheck(const char16_t *src, const char16_t *limit,
514 UBool onlyContiguous,
515 UNormalizationCheckResult *pQCResult) const;
516 void composeAndAppend(const char16_t *src, const char16_t *limit,
517 UBool doCompose,
518 UBool onlyContiguous,
519 UnicodeString &safeMiddle,
520 ReorderingBuffer &buffer,
521 UErrorCode &errorCode) const;
522
523 /** sink==nullptr: isNormalized() */
524 UBool composeUTF8(uint32_t options, UBool onlyContiguous,
525 const uint8_t *src, const uint8_t *limit,
526 ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const;
527
528 const char16_t *makeFCD(const char16_t *src, const char16_t *limit,
529 ReorderingBuffer *buffer, UErrorCode &errorCode) const;
530 void makeFCDAndAppend(const char16_t *src, const char16_t *limit,
531 UBool doMakeFCD,
532 UnicodeString &safeMiddle,
533 ReorderingBuffer &buffer,
534 UErrorCode &errorCode) const;
535
536 UBool hasDecompBoundaryBefore(UChar32 c) const;
537 UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;
538 UBool hasDecompBoundaryAfter(UChar32 c) const;
539 UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;
540 UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
541
542 UBool hasCompBoundaryBefore(UChar32 c) const {
543 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
544 }
545 UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const {
546 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
547 }
548 UBool isCompInert(UChar32 c, UBool onlyContiguous) const {
549 uint16_t norm16=getNorm16(c);
550 return isCompYesAndZeroCC(norm16) &&
551 (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
552 (!onlyContiguous || isInert(norm16) || *getDataForYesOrNo(norm16) <= 0x1ff);
553 // The last check fetches the mapping's first unit and checks tccc<=1.
554 }
555
556 UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); }
557 UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); }
558 UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
559private:
560 friend class InitCanonIterData;
561 friend class LcccContext;
562
563 UBool isMaybe(uint16_t norm16) const { return minMaybeNo<=norm16 && norm16<=JAMO_VT; }
564 UBool isMaybeYesOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
565 static UBool isInert(uint16_t norm16) { return norm16==INERT; }
566 static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; }
567 static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
568 uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
569 UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; }
570 UBool isHangulLVT(uint16_t norm16) const {
571 return norm16==hangulLVT();
572 }
573 UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
574 // UBool isCompYes(uint16_t norm16) const {
575 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
576 // }
577 // UBool isCompYesOrMaybe(uint16_t norm16) const {
578 // return norm16<minNoNo || minMaybeNo<=norm16;
579 // }
580 // UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
581 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
582 // }
583 UBool isDecompYesAndZeroCC(uint16_t norm16) const {
584 return norm16<minYesNo ||
585 norm16==JAMO_VT ||
586 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
587 }
588 /**
589 * A little faster and simpler than isDecompYesAndZeroCC() but does not include
590 * the MaybeYes which combine-forward and have ccc=0.
591 */
592 UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
593 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
594 }
595 /** Since formatVersion 5: same as isAlgorithmicNoNo() */
596 UBool isDecompNoAlgorithmic(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeNo; }
597
598 // For use with isCompYes().
599 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
600 // static uint8_t getCCFromYes(uint16_t norm16) {
601 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
602 // }
603 uint8_t getCCFromNoNo(uint16_t norm16) const {
604 const uint16_t *mapping=getDataForYesOrNo(norm16);
605 if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
606 return static_cast<uint8_t>(*(mapping - 1));
607 } else {
608 return 0;
609 }
610 }
611 // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
612 uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const {
613 if(norm16<=minYesNo) {
614 return 0; // yesYes and Hangul LV have ccc=tccc=0
615 } else {
616 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
617 return static_cast<uint8_t>(*getDataForYesOrNo(norm16) >> 8); // tccc from yesNo
618 }
619 }
620 uint8_t getPreviousTrailCC(const char16_t *start, const char16_t *p) const;
621 uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;
622
623 // Requires algorithmic-NoNo.
624 UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
625 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
626 }
627 UChar32 getAlgorithmicDelta(uint16_t norm16) const {
628 return (norm16>>DELTA_SHIFT)-centerNoNoDelta;
629 }
630
631 const uint16_t *getDataForYesOrNo(uint16_t norm16) const {
632 return extraData+(norm16>>OFFSET_SHIFT);
633 }
634 const uint16_t *getDataForMaybe(uint16_t norm16) const {
635 return extraData+((norm16-minMaybeNo+limitNoNo)>>OFFSET_SHIFT);
636 }
637 const uint16_t *getData(uint16_t norm16) const {
638 if(norm16>=minMaybeNo) {
41
Assuming 'norm16' is < field 'minMaybeNo'
42
Taking false branch
639 norm16=norm16-minMaybeNo+limitNoNo;
640 }
641 return extraData+(norm16>>OFFSET_SHIFT);
43
Addition of a null pointer (via field 'extraData') and a probably nonzero integer value may result in undefined behavior
642 }
643 const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
644 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16
38.1
'norm16' is < MIN_NORMAL_MAYBE_YES
38.1
'norm16' is < MIN_NORMAL_MAYBE_YES
) {
38
Assuming 'norm16' is >= JAMO_L
39
Taking false branch
645 return nullptr;
646 } else {
647 // if yesYes: if Jamo L: harmless empty list
648 return getData(norm16);
40
Calling 'Normalizer2Impl::getData'
649 }
650 }
651 const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
652 // A composite has both mapping & compositions list.
653 const uint16_t *list=getData(norm16);
654 return list+ // mapping pointer
655 1+ // +1 to skip the first unit with the mapping length
656 (*list&MAPPING_LENGTH_MASK); // + mapping length
657 }
658 /**
659 * @param c code point must have compositions
660 * @return compositions list pointer
661 */
662 const uint16_t *getCompositionsList(uint16_t norm16) const {
663 return isDecompYes(norm16) ?
664 getCompositionsListForDecompYes(norm16) :
665 getCompositionsListForComposite(norm16);
666 }
667
668 const char16_t *copyLowPrefixFromNulTerminated(const char16_t *src,
669 UChar32 minNeedDataCP,
670 ReorderingBuffer *buffer,
671 UErrorCode &errorCode) const;
672
673 enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY };
674
675 const char16_t *decomposeShort(const char16_t *src, const char16_t *limit,
676 UBool stopAtCompBoundary, UBool onlyContiguous,
677 ReorderingBuffer &buffer, UErrorCode &errorCode) const;
678 UBool decompose(UChar32 c, uint16_t norm16,
679 ReorderingBuffer &buffer, UErrorCode &errorCode) const;
680
681 const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
682 StopAt stopAt, UBool onlyContiguous,
683 ReorderingBuffer &buffer, UErrorCode &errorCode) const;
684
685 static int32_t combine(const uint16_t *list, UChar32 trail);
686 void addComposites(const uint16_t *list, UnicodeSet &set) const;
687 void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
688 UBool onlyContiguous) const;
689
690 UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
691 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
692 }
693 UBool norm16HasCompBoundaryBefore(uint16_t norm16) const {
694 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
695 }
696 UBool hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const;
697 UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
698 UBool hasCompBoundaryAfter(const char16_t *start, const char16_t *p,
699 UBool onlyContiguous) const;
700 UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
701 UBool onlyContiguous) const;
702 UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const {
703 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
704 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
705 }
706 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
707 UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const {
708 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
709 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getDataForYesOrNo(norm16) <= 0x1ff);
710 }
711
712 const char16_t *findPreviousCompBoundary(const char16_t *start, const char16_t *p,
713 UBool onlyContiguous) const;
714 const char16_t *findNextCompBoundary(const char16_t *p, const char16_t *limit,
715 UBool onlyContiguous) const;
716
717 const char16_t *findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const;
718 const char16_t *findNextFCDBoundary(const char16_t *p, const char16_t *limit) const;
719
720 void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
721 CanonIterData &newData, UErrorCode &errorCode) const;
722
723 int32_t getCanonValue(UChar32 c) const;
724 const UnicodeSet &getCanonStartSet(int32_t n) const;
725
726 // UVersionInfo dataVersion;
727
728 // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
729 char16_t minDecompNoCP;
730 char16_t minCompNoMaybeCP;
731 char16_t minLcccCP;
732
733 // Norm16 value thresholds for quick check combinations and types of extra data.
734 uint16_t minYesNo;
735 uint16_t minYesNoMappingsOnly;
736 uint16_t minNoNo;
737 uint16_t minNoNoCompBoundaryBefore;
738 uint16_t minNoNoCompNoMaybeCC;
739 uint16_t minNoNoEmpty;
740 uint16_t limitNoNo;
741 uint16_t centerNoNoDelta;
742 uint16_t minMaybeNo;
743 uint16_t minMaybeNoCombinesFwd;
744 uint16_t minMaybeYes;
745
746 const UCPTrie *normTrie;
747 const uint16_t *extraData; // mappings and/or compositions
748 const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
749
750 UInitOnce fCanonIterDataInitOnce {};
751 CanonIterData *fCanonIterData;
752};
753
// bits in canonIterData
// Two flag bits at the top, one "has set" flag, and a value field in the
// low 21 bits; the four masks do not overlap.
#define CANON_NOT_SEGMENT_STARTER 0x80000000
#define CANON_HAS_COMPOSITIONS 0x40000000
#define CANON_HAS_SET 0x200000
#define CANON_VALUE_MASK 0x1fffff
759
760/**
761 * ICU-internal shortcut for quick access to standard Unicode normalization.
762 */
// Static-only class: every member function is static and the default
// constructor is deleted, so it cannot be instantiated.
763class U_COMMON_API Normalizer2Factory {
764public:
// Accessors for Normalizer2 instances. Each takes an in/out UErrorCode.
// NOTE(review): presumably they return nullptr when errorCode reports a
// failure -- confirm against the implementation.
765 static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
766 static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
767 static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
768
// Selects an instance by UNormalizationMode.
769 static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
770
// Accessors for the low-level Normalizer2Impl objects.
771 static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
772 static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
773 static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
774
775 // Get the Impl instance of the Normalizer2.
776 // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
777 static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
778private:
779 Normalizer2Factory() = delete; // No instantiation.
780};
781
782U_NAMESPACE_END}
783
784U_CAPIextern "C" int32_t U_EXPORT2
785unorm2_swapunorm2_swap_77(const UDataSwapper *ds,
786 const void *inData, int32_t length, void *outData,
787 UErrorCode *pErrorCode);
788
789/**
790 * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
791 * @internal
792 */
793U_CFUNCextern "C" UNormalizationCheckResult
794unorm_getQuickCheckunorm_getQuickCheck_77(UChar32 c, UNormalizationMode mode);
795
796/**
797 * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
798 * @internal
799 */
800U_CFUNCextern "C" uint16_t
801unorm_getFCD16unorm_getFCD16_77(UChar32 c);
802
803/**
804 * Format of Normalizer2 .nrm data files.
805 * Format version 5.0.
806 *
807 * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
808 * ICU ships with data files for standard Unicode Normalization Forms
809 * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm),
810 * NFKC_Casefold (nfkc_cf.nrm) and NFKC_Simple_Casefold (nfkc_scf.nrm).
811 * Custom (application-specific) data can be built into additional .nrm files
812 * with the gennorm2 build tool.
813 * ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
814 *
815 * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
816 * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
817 *
818 * A .nrm file begins with a standard ICU data file header
819 * (DataHeader, see ucmndata.h and unicode/udata.h).
820 * The UDataInfo.dataVersion field usually contains the Unicode version
821 * for which the data was generated.
822 *
823 * After the header, the file contains the following parts.
824 * Constants are defined as enum values of the Normalizer2Impl class.
825 *
826 * Many details of the data structures are described in the design doc
827 * which is at https://unicode-org.github.io/icu/design/normalization/custom.html
828 *
829 * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
830 *
831 * The first eight indexes are byte offsets in ascending order.
832 * Each byte offset marks the start of the next part in the data file,
833 * and the end of the previous one.
834 * When two consecutive byte offsets are the same, then the corresponding part is empty.
835 * Byte offsets are offsets from after the header,
836 * that is, from the beginning of the indexes[].
837 * Each part starts at an offset with proper alignment for its data.
838 * If necessary, the previous part may include padding bytes to achieve this alignment.
839 *
840 * minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
841 * with a decomposition mapping, that is, with NF*D_QC=No.
842 * minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
843 * with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
844 * minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
845 * is the lowest code point with lccc!=0.
846 *
847 * The next eight indexes are thresholds of 16-bit trie values for ranges of
848 * values indicating multiple normalization properties.
849 * Format version 5 adds the two minMaybeNo* threshold indexes.
850 * The thresholds are listed here in threshold order,
851 * not in the order they are stored in the indexes.
852 * minYesNo=indexes[IX_MIN_YES_NO];
853 * minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
854 * minNoNo=indexes[IX_MIN_NO_NO];
855 * minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
856 * minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
857 * minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
858 * limitNoNo=indexes[IX_LIMIT_NO_NO];
859 * minMaybeNo=indexes[IX_MIN_MAYBE_NO];
860 * minMaybeNoCombinesFwd=indexes[IX_MIN_MAYBE_NO_COMBINES_FWD];
861 * minMaybeYes=indexes[IX_MIN_MAYBE_YES];
862 * See the normTrie description below and the design doc for details.
863 *
864 * UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie
865 *
866 * The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
867 * Rather than using independent bits in the value (which would require more than 16 bits),
868 * information is extracted primarily via range checks.
869 * Except, format version 3+ uses bit 0 for hasCompBoundaryAfter().
870 * For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
871 * means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
872 * which means it has a two-way (round-trip) decomposition mapping.
873 * Values in the ranges 2<=norm16<limitNoNo and minMaybeNo<=norm16<minMaybeYes
874 * are also directly indexes into the extraData
875 * pointing to mappings, compositions lists, or both.
876 * Value norm16==INERT (0 in versions 1 & 2, 1 in version 3+)
877 * means that the character is normalization-inert, that is,
878 * it does not have a mapping, does not participate in composition, has a zero
879 * canonical combining class, and forms a boundary where text before it and after it
880 * can be normalized independently.
881 * For details about how multiple properties are encoded in 16-bit values
882 * see the design doc.
883 * Note that the encoding cannot express all combinations of the properties involved;
884 * it only supports those combinations that are allowed by
885 * the Unicode Normalization algorithms. Details are in the design doc as well.
886 * The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
887 *
888 * The trie has a value for each lead surrogate code unit representing the "worst case"
889 * properties of the 1024 supplementary characters whose UTF-16 form starts with
890 * the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
891 * then their lead surrogate code unit has the trie value INERT.
892 * When the lead surrogate unit's value exceeds the quick check minimum during processing,
893 * the properties for the full supplementary code point need to be looked up.
894 *
895 * uint16_t extraData[];
896 *
897 * The extraData array contains many per-character data sections.
898 * Each section contains mappings and/or composition lists.
899 * The norm16 value of each character that has such data is directly an index to
900 * a section of the extraData array.
901 *
902 * In version 3+, the norm16 values must be shifted right by OFFSET_SHIFT
903 * for accessing extraData.
904 *
905 * The data structures for composition lists and mappings are described in the design doc.
906 *
907 * In version 4 and below, the composition lists for MaybeYes characters were stored before
908 * the data for other characters.
909 * This sub-array had a length of MIN_NORMAL_MAYBE_YES-minMaybeYes.
910 * In version 3 & 4, the difference must be shifted right by OFFSET_SHIFT.
911 *
912 * In version 5, the data for MaybeNo and MaybeYes characters is stored after
913 * the data for other characters.
914 *
915 * If there are no MaybeNo and no MaybeYes characters,
916 * then minMaybeYes==minMaybeNo==MIN_NORMAL_MAYBE_YES.
917 * If there are such characters, then minMaybeNo is subtracted from their norm16 values
918 * to get the index into the extraData.
919 * In version 4 and below, the data index for Yes* and No* characters needs to be
920 * offset by the length of the MaybeYes data.
921 * In version 5, the data index for Maybe* characters needs to be offset by limitNoNo.
922 *
923 * Version 5 is the first to support MaybeNo characters, and
924 * adds the minMaybeNo and minMaybeNoCombinesFwd thresholds and
925 * the corresponding sections of the extraData.
926 *
927 * uint8_t smallFCD[0x100]; -- new in format version 2
928 *
929 * This is a bit set to help speed up FCD value lookups in the absence of a full
930 * UTrie2 or other large data structure with the full FCD value mapping.
931 *
932 * Each smallFCD bit is set if any of the corresponding 32 BMP code points
933 * has a non-zero FCD value (lccc!=0 or tccc!=0).
934 * Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
935 * A bit for 32 lead surrogates is set if any of the 32k corresponding
936 * _supplementary_ code points has a non-zero FCD value.
937 *
938 * This bit set is most useful for the large blocks of CJK characters with FCD=0.
939 *
940 * Changes from format version 1 to format version 2 ---------------------------
941 *
942 * - Addition of data for raw (not recursively decomposed) mappings.
943 * + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
944 * the mapping is to an empty string or when the character combines-forward.
945 * This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
946 * is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
947 * + For details see the design doc.
948 * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
949 * distinct ranges (combines-forward vs. not)
950 * so that a range check can be used to find out if there is a compositions list.
951 * This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
952 * It is needed for the new (in ICU 49) composePair(), not for other normalization.
953 * - Addition of the smallFCD[] bit set.
954 *
955 * Changes from format version 2 to format version 3 (ICU 60) ------------------
956 *
957 * - norm16 bit 0 indicates hasCompBoundaryAfter(),
958 * except that for contiguous composition (FCC) the tccc must be checked as well.
959 * Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
960 * Thresholds like minNoNo are tested before shifting.
961 *
962 * - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
963 * to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
964 * See DELTA_TCCC_MASK etc.
965 * This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
966 * minMaybeNo is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
967 *
968 * - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
969 * and ASCII characters are mapped algorithmically only to other ASCII characters.
970 * This helps with hasCompBoundaryBefore() and compose() fast paths.
971 * It is no longer necessary to loop for algorithmic mappings.
972 *
973 * - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
974 * indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
975 * and separation of the noNo extraData into distinct ranges.
976 * With this, the noNo norm16 value indicates whether the mapping is
977 * compose-normalized, not normalized but hasCompBoundaryBefore(),
978 * not even that, or maps to an empty string.
979 * hasCompBoundaryBefore() can be determined solely from the norm16 value.
980 *
981 * - The norm16 value for Hangul LVT is now different from that for Hangul LV,
982 * so that hasCompBoundaryAfter() need not check for the syllable type.
983 * For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
984 * For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
985 * The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
986 * to simplify some code.
987 *
988 * - The extraData firstUnit bit 5 is no longer necessary
989 * (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
990 * is reserved again, and always set to 0.
991 *
992 * - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
993 * This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
994 * U+00AD Soft Hyphen maps to an empty string,
995 * which is artificially assigned "worst case" values lccc=1 and tccc=255.
996 *
997 * - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
998 *
999 * Changes from format version 3 to format version 4 (ICU 63) ------------------
1000 *
1001 * Switched from UTrie2 to UCPTrie/CodePointTrie.
1002 *
1003 * The new trie no longer stores different values for surrogate code *units* vs.
1004 * surrogate code *points*.
1005 * Lead surrogates still have values for optimized UTF-16 string processing.
1006 * When looking up code point properties, the code now checks for lead surrogates and
1007 * treats them as inert.
1008 *
1009 * gennorm2 now has to reject mappings for surrogate code points.
1010 * UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its
1011 * custom normalization data file.
1012 *
1013 * Changes from format version 4 to format version 5 (ICU 76) ------------------
1014 *
1015 * Unicode 16 adds the first MaybeYes characters which combine both backward and forward,
1016 * taking this formerly theoretical data structure into reality.
1017 *
1018 * Unicode 16 also adds the first characters that have two-way mappings whose first characters
1019 * combine backward. In order for normalization and the quick check to work properly,
1020 * these composite characters also must be marked as NFC_QC=Maybe,
1021 * corresponding to "combines back", although the composites themselves do not combine backward.
1022 * Format version 5 adds two new ranges between "algorithmic NoNo" and MaybeYes,
1023 * with thresholds minMaybeNo and minMaybeNoCombinesFwd,
1024 * and indexes[IX_MIN_MAYBE_NO] and indexes[IX_MIN_MAYBE_NO_COMBINES_FWD],
1025 * and corresponding mappings and composition lists in the extraData.
1026 *
1027 * Format version 5 moves the data for Maybe* characters from the start of the extraData array
1028 * to its end.
1029 */
1030
1031#endif /* !UCONFIG_NO_NORMALIZATION */
1032#endif /* __NORMALIZER2IMPL_H__ */