/root/firefox-clang/intl/icu/source/common/normalizer2impl.cpp

1

2

// License & terms of use: http://www.unicode.org/copyright.html

3

/*

4

*******************************************************************************

5

*

6

7

8

*

9

*******************************************************************************

10

* file name: normalizer2impl.cpp

11

* encoding: UTF-8

12

* tab size: 8 (not used)

13

* indentation:4

14

*

15

* created on: 2009nov22

16

* created by: Markus W. Scherer

17

*/

18

19

// #define UCPTRIE_DEBUG

20

21

#include "unicode/utypes.h"

22

23

#if !UCONFIG_NO_NORMALIZATION0

24

25

#include "unicode/bytestream.h"

26

#include "unicode/edits.h"

27

#include "unicode/normalizer2.h"

28

#include "unicode/stringoptions.h"

29

#include "unicode/ucptrie.h"

30

#include "unicode/udata.h"

31

#include "unicode/umutablecptrie.h"

32

#include "unicode/ustring.h"

33

#include "unicode/utf16.h"

34

#include "unicode/utf8.h"

35

#include "bytesinkutil.h"

36

#include "cmemory.h"

37

#include "mutex.h"

38

#include "normalizer2impl.h"

39

#include "putilimp.h"

40

#include "uassert.h"

41

#include "ucptrie_impl.h"

42

#include "uset_imp.h"

43

#include "uvector.h"

44

45

U_NAMESPACE_BEGINnamespace icu_77 {

46

47

namespace {

48

49

/**

50

* UTF-8 lead byte for minNoMaybeCP.

51

* Can be lower than the actual lead byte for c.

52

* Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.

53

*/

54

inline uint8_t leadByteForCP(UChar32 c) {

55

if (c <= 0x7f) {

56

return static_cast<uint8_t>(c);

57

} else if (c <= 0x7ff) {

58

return static_cast<uint8_t>(0xc0 + (c >> 6));

59

} else {

60

// Should not occur because ccc(U+0300)!=0.

61

return 0xe0;

62

}

63

}

64

65

/**

66

* Returns the code point from one single well-formed UTF-8 byte sequence

67

* between cpStart and cpLimit.

68

*

69

* Trie UTF-8 macros do not assemble whole code points (for efficiency).

70

* When we do need the code point, we call this function.

71

* We should not need it for normalization-inert data (norm16==0).

72

* Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.

73

*/

74

UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {

75

// Similar to U8_NEXT_UNSAFE(s, i, c).

76

U_ASSERT(cpStart < cpLimit)(static_cast <bool> (cpStart < cpLimit) ? void (0) :
__assert_fail ("cpStart < cpLimit", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__));

77

uint8_t c = *cpStart;

78

switch(cpLimit-cpStart) {

79

case 1:

80

return c;

81

case 2:

82

return ((c&0x1f)<<6) | (cpStart[1]&0x3f);

83

case 3:

84

// no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (char16_t)

85

return static_cast<char16_t>((c << 12) | ((cpStart[1] & 0x3f) << 6) | (cpStart[2] & 0x3f));

86

case 4:

87

return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);

88

default:

89

UPRV_UNREACHABLE_EXITabort(); // Should not occur.

90

}

91

}

92

93

/**

94

* Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.

95

* Otherwise returns a negative value.

96

*/

97

UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {

98

if ((p - start) >= 3) {

99

p -= 3;

100

uint8_t l = *p;

101

uint8_t t1, t2;

102

if (0xe1 <= l && l <= 0xed &&

103

(t1 = static_cast<uint8_t>(p[1] - 0x80)) <= 0x3f &&

104

(t2 = static_cast<uint8_t>(p[2] - 0x80)) <= 0x3f &&

105

(l < 0xed || t1 <= 0x1f)) {

106

return ((l & 0xf) << 12) | (t1 << 6) | t2;

107

}

108

}

109

return U_SENTINEL(-1);

110

}

111

112

/**
 * Checks whether [src, limit[ starts with the UTF-8 encoding of a single
 * Jamo T code point (byte sequences E1 86 A8 .. E1 87 82).
 *
 * @param src   start of the UTF-8 text
 * @param limit end of the UTF-8 text
 * @return the offset of that Jamo T from the Jamo T base (a value >= 1),
 *         or -1 if the text does not start with a Jamo T.
 */
int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
    // Jamo T: E1 86 A8..E1 87 82
    if ((limit - src) < 3 || src[0] != 0xe1) {
        return -1;
    }
    const uint8_t second = src[1];
    const uint8_t third = src[2];
    if (second == 0x86) {
        // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
        // Offset 0 does not correspond to any conjoining Jamo.
        if (0xa8 <= third && third <= 0xbf) {
            return third - 0xa7;
        }
    } else if (second == 0x87) {
        // Explicit unsigned range check for trail bytes 80..82; this avoids
        // relying on an out-of-range conversion to a signed 8-bit type.
        if (0x80 <= third && third <= 0x82) {
            // The second byte advanced by one, which is worth 0x40 code points.
            return third - (0xa7 - 0x40);
        }
    }
    return -1;
}

135

136

void

137

appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,

138

ByteSink &sink, Edits *edits) {

139

char buffer[U8_MAX_LENGTH4];

140

int32_t length;

141

int32_t cpLength = static_cast<int32_t>(cpLimit - cpStart);

142

if (cpLength == 1) {

143

// The builder makes ASCII map to ASCII.

144

buffer[0] = static_cast<uint8_t>(*cpStart + delta);

145

length = 1;

146

} else {

147

int32_t trail = *(cpLimit-1) + delta;

148

if (0x80 <= trail && trail <= 0xbf) {

149

// The delta only changes the last trail byte.

150

--cpLimit;

151

length = 0;

152

do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit);

153

buffer[length++] = static_cast<uint8_t>(trail);

154

} else {

155

// Decode the code point, add the delta, re-encode.

156

UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;

157

length = 0;

158

U8_APPEND_UNSAFE(buffer, length, c)do { uint32_t __uc=(c); if(__uc<=0x7f) { (buffer)[(length)
++]=(uint8_t)__uc; } else { if(__uc<=0x7ff) { (buffer)[(length
)++]=(uint8_t)((__uc>>6)|0xc0); } else { if(__uc<=0xffff
) { (buffer)[(length)++]=(uint8_t)((__uc>>12)|0xe0); } else
{ (buffer)[(length)++]=(uint8_t)((__uc>>18)|0xf0); (buffer
)[(length)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); } (
buffer)[(length)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80
); } (buffer)[(length)++]=(uint8_t)((__uc&0x3f)|0x80); } }
while (false);

159

}

160

}

161

if (edits != nullptr) {

162

edits->addReplace(cpLength, length);

163

}

164

sink.Append(buffer, length);

165

}

166

167

} // namespace

168

169

// ReorderingBuffer -------------------------------------------------------- ***

170

171

// Opens dest's buffer with a small minimum capacity and starts with an empty
// reordering window. Reports U_MEMORY_ALLOCATION_ERROR if no buffer is available
// and errorCode does not already indicate a failure.
ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
                                   UErrorCode &errorCode) :
        impl(ni), str(dest),
        start(str.getBuffer(8)), reorderStart(start), limit(start),
        remainingCapacity(str.getCapacity()), lastCC(0) {
    if (U_SUCCESS(errorCode) && start == nullptr) {
        // getBuffer() already did str.setToBogus()
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
}

181

182

// (Re)opens the destination string's buffer with at least destCapacity units,
// keeps the string's current contents, and recomputes lastCC and reorderStart
// from the end of those contents.
// Returns false and sets U_MEMORY_ALLOCATION_ERROR if no buffer is available.
UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
    const int32_t oldLength = str.length();
    start = str.getBuffer(destCapacity);
    if (start == nullptr) {
        // getBuffer() already did str.setToBogus()
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    limit = start + oldLength;
    remainingCapacity = str.getCapacity() - oldLength;
    reorderStart = start;
    if (oldLength == 0) {
        // Nothing in the buffer yet.
        lastCC = 0;
        return true;
    }
    setIterator();
    lastCC = previousCC();
    // Set reorderStart after the last code point with cc<=1 if there is one.
    if (lastCC > 1) {
        while (previousCC() > 1) {}
    }
    reorderStart = codePointLimit;
    return true;
}

206

207

// Returns whether the buffer contents are identical, code unit for code unit,
// to the UTF-16 text [otherStart, otherLimit[.
UBool ReorderingBuffer::equals(const char16_t *otherStart, const char16_t *otherLimit) const {
    const int32_t length = static_cast<int32_t>(limit - start);
    if (length != static_cast<int32_t>(otherLimit - otherStart)) {
        return false;
    }
    return 0 == u_memcmp(start, otherStart, length);
}

213

214

// Returns whether the UTF-16 buffer contents encode the same code point
// sequence as the UTF-8 text [otherStart, otherLimit[.
// The caller must ensure that the UTF-8 length fits into int32_t.
UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
    U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller
    const int32_t length = static_cast<int32_t>(limit - start);
    const int32_t otherLength = static_cast<int32_t>(otherLimit - otherStart);
    // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
    if (otherLength < length || (otherLength / 3) > length) {
        return false;
    }
    // Compare valid strings from between normalization boundaries.
    // (Invalid sequences are normalization-inert.)
    int32_t i = 0;
    int32_t j = 0;
    while (i < length && j < otherLength) {
        // Not at the end of either string yet.
        UChar32 c, other;
        U16_NEXT_UNSAFE(start, i, c);
        U8_NEXT_UNSAFE(otherStart, j, other);
        if (c != other) {
            return false;
        }
    }
    // Equal only if both strings were consumed completely.
    return i >= length && j >= otherLength;
}

239

240

// Adds the supplementary code point c with combining class cc:
// at the end if that keeps canonical order, otherwise inserted further back.
// Returns false if the buffer could not be enlarged.
UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
    if (remainingCapacity < 2 && !resize(2, errorCode)) {
        return false;
    }
    if (cc != 0 && cc < lastCC) {
        // Out of order relative to the last character.
        insert(c, cc);
    } else {
        limit[0] = U16_LEAD(c);
        limit[1] = U16_TRAIL(c);
        limit += 2;
        lastCC = cc;
        if (cc <= 1) {
            reorderStart = limit;
        }
    }
    remainingCapacity -= 2;
    return true;
}

258

259

// Appends the UTF-16 string s (length code units), whose first code point has
// combining class leadCC and whose last one has trailCC.
//
// If the string can go at the end without violating canonical order it is
// copied in one go. Otherwise its first code point is inserted in order and
// the remaining code points are appended one at a time, looking up each
// combining class (via the raw norm16 when isNFD, else via getNorm16()).
//
// Returns true on success (including length==0). Returns false, with
// errorCode set by resize(), if the buffer could not be enlarged.
UBool ReorderingBuffer::append(const char16_t *s, int32_t length, UBool isNFD,
                               uint8_t leadCC, uint8_t trailCC,
                               UErrorCode &errorCode) {
    if(length==0) {
        return true;
    }
    if(remainingCapacity<length && !resize(length, errorCode)) {
        return false;
    }
    remainingCapacity-=length;
    if(lastCC<=leadCC || leadCC==0) {
        if(trailCC<=1) {
            reorderStart=limit+length;
        } else if(leadCC<=1) {
            reorderStart=limit+1;  // Ok if not a code point boundary.
        }
        const char16_t *sLimit=s+length;
        do { *limit++=*s++; } while(s!=sLimit);
        lastCC=trailCC;
    } else {
        int32_t i=0;
        UChar32 c;
        U16_NEXT(s, i, length, c);
        insert(c, leadCC);  // insert first code point
        while(i<length) {
            U16_NEXT(s, i, length, c);
            if(i<length) {
                if (isNFD) {
                    leadCC = Normalizer2Impl::getCCFromYesOrMaybeYes(impl.getRawNorm16(c));
                } else {
                    leadCC = impl.getCC(impl.getNorm16(c));
                }
            } else {
                leadCC=trailCC;
            }
            // The single-code-point append() may call resize(); if that fails,
            // stop here instead of continuing to write and reporting success.
            if(!append(c, leadCC, errorCode)) {
                return false;
            }
        }
    }
    return true;
}

299

300

// Appends the single code point c, treating it as having combining class 0:
// afterwards lastCC is 0 and the reordering window starts after c.
// Returns false if the buffer could not be enlarged.
UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
    const int32_t cpLength = U16_LENGTH(c);
    if (remainingCapacity < cpLength && !resize(cpLength, errorCode)) {
        return false;
    }
    remainingCapacity -= cpLength;
    if (cpLength != 1) {
        // Supplementary code point: write the surrogate pair.
        limit[0] = U16_LEAD(c);
        limit[1] = U16_TRAIL(c);
        limit += 2;
    } else {
        *limit++ = static_cast<char16_t>(c);
    }
    lastCC = 0;
    reorderStart = limit;
    return true;
}

317

318

// Appends the UTF-16 text [s, sLimit[ verbatim and resets the reordering
// state (lastCC=0, reorderStart at the new end). An empty range is a no-op.
// Returns false if the buffer could not be enlarged.
UBool ReorderingBuffer::appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode) {
    if (s == sLimit) {
        return true;
    }
    const int32_t count = static_cast<int32_t>(sLimit - s);
    if (remainingCapacity < count && !resize(count, errorCode)) {
        return false;
    }
    u_memcpy(limit, s, count);
    limit += count;
    remainingCapacity -= count;
    lastCC = 0;
    reorderStart = limit;
    return true;
}

333

334

void ReorderingBuffer::remove() {

335

reorderStart=limit=start;

336

remainingCapacity=str.getCapacity();

337

lastCC=0;

338

}

339

340

void ReorderingBuffer::removeSuffix(int32_t suffixLength) {

341

if(suffixLength<(limit-start)) {

342

limit-=suffixLength;

343

remainingCapacity+=suffixLength;

344

} else {

345

limit=start;

346

remainingCapacity=str.getCapacity();

347

}

348

lastCC=0;

349

reorderStart=limit;

350

}

351

352

// Enlarges the string buffer so that at least appendLength more code units fit.
//
// The new capacity is the larger of (current length + appendLength), twice the
// current capacity, and 256. start, reorderStart and limit are re-based onto
// the new buffer and remainingCapacity is recomputed.
//
// Returns false and sets U_MEMORY_ALLOCATION_ERROR if the required size cannot
// be represented or the buffer cannot be obtained; in that case start is
// nullptr and the string is bogus.
UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
    int32_t reorderStartIndex = static_cast<int32_t>(reorderStart - start);
    int32_t length = static_cast<int32_t>(limit - start);
    str.releaseBuffer(length);
    // Do the size arithmetic in 64 bits: both the sum and the doubling
    // could overflow int32_t for very long strings.
    const int64_t required = static_cast<int64_t>(length) + appendLength;
    if (required > INT32_MAX) {
        // Mirror the state that a failed getBuffer() leaves behind.
        str.setToBogus();
        start = nullptr;
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    int32_t newCapacity = static_cast<int32_t>(required);
    const int64_t doubleCapacity = 2 * static_cast<int64_t>(str.getCapacity());
    // Grow geometrically, but only if the doubled capacity is representable;
    // otherwise settle for exactly what is required.
    if (newCapacity < doubleCapacity && doubleCapacity <= INT32_MAX) {
        newCapacity = static_cast<int32_t>(doubleCapacity);
    }
    if (newCapacity < 256) {
        newCapacity = 256;
    }
    start = str.getBuffer(newCapacity);
    if (start == nullptr) {
        // getBuffer() already did str.setToBogus()
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    reorderStart = start + reorderStartIndex;
    limit = start + length;
    remainingCapacity = str.getCapacity() - length;
    return true;
}

375

376

void ReorderingBuffer::skipPrevious() {

377

codePointLimit=codePointStart;

378

char16_t c=*--codePointStart;

379

if(U16_IS_TRAIL(c)(((c)&0xfffffc00)==0xdc00) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))(((*(codePointStart-1))&0xfffffc00)==0xd800)) {

380

--codePointStart;

381

}

382

}

383

384

uint8_t ReorderingBuffer::previousCC() {

385

codePointLimit=codePointStart;

386

if(reorderStart>=codePointStart) {

387

return 0;

388

}

389

UChar32 c=*--codePointStart;

390

char16_t c2;

391

if(U16_IS_TRAIL(c)(((c)&0xfffffc00)==0xdc00) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))(((c2=*(codePointStart-1))&0xfffffc00)==0xd800)) {

392

--codePointStart;

393

c=U16_GET_SUPPLEMENTARY(c2, c)(((UChar32)(c2)<<10UL)+(UChar32)(c)-((0xd800<<10UL
)+0xdc00-0x10000));

394

}

395

return impl.getCCFromYesOrMaybeYesCP(c);

396

}

397

398

// Inserts c somewhere before the last character.

399

// Requires 0<cc<lastCC which implies reorderStart<limit.

400

void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {

401

for(setIterator(), skipPrevious(); previousCC()>cc;) {}

402

// insert c at codePointLimit, after the character with prevCC<=cc

403

char16_t *q=limit;

404

char16_t *r=limit+=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);

405

do {

406

*--r=*--q;

407

} while(codePointLimit!=q);

408

writeCodePoint(q, c);

409

if(cc<=1) {

410

reorderStart=r;

411

}

412

}

413

414

// Normalizer2Impl --------------------------------------------------------- ***

415

416

struct CanonIterData : public UMemory {

417

CanonIterData(UErrorCode &errorCode);

418

~CanonIterData();

419

void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);

420

UMutableCPTrie *mutableTrie;

421

UCPTrie *trie;

422

UVector canonStartSets; // contains UnicodeSet *

423

};

424

425

// Releases the canonical-iterator data, if any (deleting nullptr is a no-op).
Normalizer2Impl::~Normalizer2Impl() {
    delete fCanonIterData;
}

428

429

void

430

Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,

431

const uint16_t *inExtraData, const uint8_t *inSmallFCD) {

432

minDecompNoCP = static_cast<char16_t>(inIndexes[IX_MIN_DECOMP_NO_CP]);

433

minCompNoMaybeCP = static_cast<char16_t>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);

434

minLcccCP = static_cast<char16_t>(inIndexes[IX_MIN_LCCC_CP]);

435

436

minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);

437

minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);

438

minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);

439

minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);

440

minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);

441

minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);

442

limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);

443

minMaybeNo = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO]);

444

minMaybeNoCombinesFwd = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO_COMBINES_FWD]);

445

minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);

446

U_ASSERT((minMaybeNo & 7) == 0)(static_cast <bool> ((minMaybeNo & 7) == 0) ? void (
0) : __assert_fail ("(minMaybeNo & 7) == 0", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__)); // 8-aligned for noNoDelta bit fields

447

centerNoNoDelta = (minMaybeNo >> DELTA_SHIFT) - MAX_DELTA - 1;

448

449

normTrie=inTrie;

450

extraData=inExtraData;

451

smallFCD=inSmallFCD;

452

}

453

454

U_CDECL_BEGINextern "C" {

455

456

static uint32_t U_CALLCONV

457

segmentStarterMapper(const void * /*context*/, uint32_t value) {

458

return value&CANON_NOT_SEGMENT_STARTER0x80000000;

459

}

460

461

U_CDECL_END}

462

463

void

464

Normalizer2Impl::addLcccChars(UnicodeSet &set) const {

465

UChar32 start = 0, end;

466

uint32_t norm16;

467

while ((end = ucptrie_getRangeucptrie_getRange_77(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,

468

nullptr, nullptr, &norm16)) >= 0) {

469

if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&

470

norm16 != Normalizer2Impl::JAMO_VT) {

471

set.add(start, end);

472

} else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {

473

uint16_t fcd16 = getFCD16(start);

474

if (fcd16 > 0xff) { set.add(start, end); }

475

}

476

start = end + 1;

477

}

478

}

479

480

void

481

Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {

482

// Add the start code point of each same-value range of the trie.

483

UChar32 start = 0, end;

484

uint32_t value;

485

while ((end = ucptrie_getRangeucptrie_getRange_77(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,

486

nullptr, nullptr, &value)) >= 0) {

487

sa->add(sa->set, start);

488

if (start != end && isAlgorithmicNoNo(static_cast<uint16_t>(value)) &&

489

(value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {

490

// Range of code points with same-norm16-value algorithmic decompositions.

491

// They might have different non-zero FCD16 values.

492

uint16_t prevFCD16 = getFCD16(start);

493

while (++start <= end) {

494

uint16_t fcd16 = getFCD16(start);

495

if (fcd16 != prevFCD16) {

496

sa->add(sa->set, start);

497

prevFCD16 = fcd16;

498

}

499

}

500

}

501

start = end + 1;

502

}

503

504

/* add Hangul LV syllables and LV+1 because of skippables */

505

for(char16_t c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {

506

sa->add(sa->set, c);

507

sa->add(sa->set, c+1);

508

}

509

sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */

510

}

511

512

void

513

Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {

514

// Add the start code point of each same-value range of the canonical iterator data trie.

515

if (!ensureCanonIterData(errorCode)) { return; }

516

// Currently only used for the SEGMENT_STARTER property.

517

UChar32 start = 0, end;

518

uint32_t value;

519

while ((end = ucptrie_getRangeucptrie_getRange_77(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,

520

segmentStarterMapper, nullptr, &value)) >= 0) {

521

sa->add(sa->set, start);

522

start = end + 1;

523

}

524

}

525

526

const char16_t *

527

Normalizer2Impl::copyLowPrefixFromNulTerminated(const char16_t *src,

528

UChar32 minNeedDataCP,

529

ReorderingBuffer *buffer,

530

UErrorCode &errorCode) const {

531

// Make some effort to support NUL-terminated strings reasonably.

532

// Take the part of the fast quick check loop that does not look up

533

// data and check the first part of the string.

534

// After this prefix, determine the string length to simplify the rest

535

// of the code.

536

const char16_t *prevSrc=src;

537

char16_t c;

538

while((c=*src++)<minNeedDataCP && c!=0) {}

539

// Back out the last character for full processing.

540

// Copy this prefix.

541

if(--src!=prevSrc) {

542

if(buffer!=nullptr) {

543

buffer->appendZeroCC(prevSrc, src, errorCode);

544

}

545

}

546

return src;

547

}

548

549

// Decomposes src into dest and returns dest.
// dest is set to bogus if errorCode already indicates a failure on entry.
// If dest and src are the same object, or src has no readable buffer,
// errorCode is set to U_ILLEGAL_ARGUMENT_ERROR and dest is set to bogus.
UnicodeString &
Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
                           UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        dest.setToBogus();
        return dest;
    }
    const char16_t *text = src.getBuffer();
    const bool badArguments = (&dest == &src) || (text == nullptr);
    if (badArguments) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        dest.setToBogus();
        return dest;
    }
    const int32_t textLength = src.length();
    decompose(text, text + textLength, dest, textLength, errorCode);
    return dest;
}

565

566

void

567

Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit,

568

UnicodeString &dest,

569

int32_t destLengthEstimate,

570

UErrorCode &errorCode) const {

571

if(destLengthEstimate<0 && limit!=nullptr) {

572

destLengthEstimate = static_cast<int32_t>(limit - src);

573

}

574

dest.remove();

575

ReorderingBuffer buffer(*this, dest);

576

if(buffer.init(destLengthEstimate, errorCode)) {

577

decompose(src, limit, &buffer, errorCode);

578

}

579

}

580

581

// Dual functionality:

582

// buffer!=nullptr: normalize

583

// buffer==nullptr: isNormalized/spanQuickCheckYes

584

const char16_t *

585

Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit,

586

ReorderingBuffer *buffer,

587

UErrorCode &errorCode) const {

588

UChar32 minNoCP=minDecompNoCP;

589

if(limit==nullptr) {

590

src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);

591

if(U_FAILURE(errorCode)) {

592

return src;

593

}

594

limit=u_strchru_strchr_77(src, 0);

595

}

596

597

const char16_t *prevSrc;

598

UChar32 c=0;

599

uint16_t norm16=0;

600

601

// only for quick check

602

const char16_t *prevBoundary=src;

603

uint8_t prevCC=0;

604

605

for(;;) {

606

// count code units below the minimum or with irrelevant data for the quick check

607

for(prevSrc=src; src!=limit;) {

608

if( (c=*src)<minNoCP ||

609

isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((int32_t)(normTrie)->index[(c)
>> UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK
))]))

610

) {

611

++src;

612

} else if(!U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800)) {

613

break;

614

} else {

615

char16_t c2;

616

if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])(((c2=src[1])&0xfffffc00)==0xdc00)) {

617

c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000));

618

norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((c) >= (normTrie)->highStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c))]);

619

if(isMostDecompYesAndZeroCC(norm16)) {

620

src+=2;

621

} else {

622

break;

623

}

624

} else {

625

++src; // unpaired lead surrogate: inert

626

}

627

}

628

}

629

// copy these code units all at once

630

if(src!=prevSrc) {

631

if(buffer!=nullptr) {

632

if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {

633

break;

634

}

635

} else {

636

prevCC=0;

637

prevBoundary=src;

638

}

639

}

640

if(src==limit) {

641

break;

642

}

643

644

// Check one above-minimum, relevant code point.

645

src+=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);

646

if(buffer!=nullptr) {

647

if(!decompose(c, norm16, *buffer, errorCode)) {

648

break;

649

}

650

} else {

651

if(isDecompYes(norm16)) {

652

uint8_t cc=getCCFromYesOrMaybeYes(norm16);

653

if(prevCC<=cc || cc==0) {

654

prevCC=cc;

655

if(cc<=1) {

656

prevBoundary=src;

657

}

658

continue;

659

}

660

}

661

return prevBoundary; // "no" or cc out of order

662

}

663

}

664

return src;

665

}

666

667

// Decompose a short piece of text which is likely to contain characters that

668

// fail the quick check loop and/or where the quick check loop's overhead

669

// is unlikely to be amortized.

670

// Called by the compose() and makeFCD() implementations.

671

const char16_t *

672

Normalizer2Impl::decomposeShort(const char16_t *src, const char16_t *limit,

673

UBool stopAtCompBoundary, UBool onlyContiguous,

674

ReorderingBuffer &buffer, UErrorCode &errorCode) const {

675

if (U_FAILURE(errorCode)) {

676

return nullptr;

677

}

678

while(src<limit) {

679

if (stopAtCompBoundary && *src < minCompNoMaybeCP) {

680

return src;

681

}

682

const char16_t *prevSrc = src;

683

UChar32 c;

684

uint16_t norm16;

685

UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16)do { (c) = *(src)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (src
) != (limit) && (((__c2 = *(src))&0xfffffc00)==0xdc00
)) { ++(src); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false);

686

if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {

687

return prevSrc;

688

}

689

if(!decompose(c, norm16, buffer, errorCode)) {

690

return nullptr;

691

}

692

if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {

693

return src;

694

}

695

}

696

return src;

697

}

698

699

// Appends the decomposition of the single code point c, whose trie value is
// norm16, to the buffer. Returns the result of the buffer append call.
UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
                                 ReorderingBuffer &buffer,
                                 UErrorCode &errorCode) const {
    // Get the decomposition and the lead and trail cc's.
    if (norm16 >= limitNoNo) {
        if (isMaybeYesOrNonZeroCC(norm16)) {
            return buffer.append(c, getCCFromYesOrMaybeYes(norm16), errorCode);
        }
        if (norm16 < minMaybeNo) {
            // Maps to an isCompYesAndZeroCC.
            c = mapAlgorithmic(c, norm16);
            norm16 = getRawNorm16(c);
        }
    }
    if (norm16 < minYesNo) {
        // c does not decompose
        return buffer.append(c, 0, errorCode);
    }
    if (isHangulLV(norm16) || isHangulLVT(norm16)) {
        // Hangul syllable: decompose algorithmically
        char16_t jamos[3];
        const int32_t jamoCount = Hangul::decompose(c, jamos);
        return buffer.appendZeroCC(jamos, jamos + jamoCount, errorCode);
    }
    // c decomposes, get everything from the variable-length extra data:
    // the first unit holds the mapping length and, in its high byte, the trail cc;
    // an optional preceding word holds the lead cc in its high byte.
    const uint16_t *mapping = getData(norm16);
    const uint16_t firstUnit = mapping[0];
    const int32_t length = firstUnit & MAPPING_LENGTH_MASK;
    const uint8_t trailCC = static_cast<uint8_t>(firstUnit >> 8);
    uint8_t leadCC = 0;
    if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
        leadCC = static_cast<uint8_t>(mapping[-1] >> 8);
    }
    return buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true,
                         leadCC, trailCC, errorCode);
}

733

734

// Dual functionality:

735

// sink != nullptr: normalize

736

// sink == nullptr: isNormalized/spanQuickCheckYes

737

const uint8_t *

738

Normalizer2Impl::decomposeUTF8(uint32_t options,

739

const uint8_t *src, const uint8_t *limit,

740

ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {

741

U_ASSERT(limit != nullptr)(static_cast <bool> (limit != nullptr) ? void (0) : __assert_fail
("limit != nullptr", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__));

742

UnicodeString s16;

743

uint8_t minNoLead = leadByteForCP(minDecompNoCP);

744

745

const uint8_t *prevBoundary = src;

746

// only for quick check

747

uint8_t prevCC = 0;

748

749

for (;;) {

750

// Fast path: Scan over a sequence of characters below the minimum "no" code point,

751

// or with (decompYes && ccc==0) properties.

752

const uint8_t *fastStart = src;

753

const uint8_t *prevSrc;

754

uint16_t norm16 = 0;

755

756

for (;;) {

757

if (src == limit) {

758

if (prevBoundary != limit && sink != nullptr) {

759

ByteSinkUtil::appendUnchanged(prevBoundary, limit,

760

*sink, options, edits, errorCode);

761

}

762

return src;

763

}

764

if (*src < minNoLead) {

765

++src;

766

} else {

767

prevSrc = src;

768

UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16)do { int32_t __lead = (uint8_t)*(src)++; if (!(((__lead)&
0x80)==0)) { uint8_t __t1, __t2, __t3; if ((src) != (limit) &&
(__lead >= 0xe0 ? __lead < 0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[__lead &= 0xf] & (1 << ((__t1 = *(src)) >>
5)) && ++(src) != (limit) && (__t2 = *(src) -
0x80) <= 0x3f && (__lead = ((int32_t)(normTrie)->
index[(__lead << 6) + (__t1 & 0x3f)]) + __t2, 1) : (
__lead -= 0xf0) <= 4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t1 = *(src)) >> 4] & (1 << __lead) &&
(__lead = (__lead << 6) | (__t1 & 0x3f), ++(src) !=
(limit)) && (__t2 = *(src) - 0x80) <= 0x3f &&
++(src) != (limit) && (__t3 = *(src) - 0x80) <= 0x3f
&& (__lead = __lead >= (normTrie)->shifted12HighStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallU8Index_77((normTrie), __lead, __t2, __t3
), 1) : __lead >= 0xc2 && (__t1 = *(src) - 0x80) <=
0x3f && (__lead = (int32_t)(normTrie)->index[__lead
& 0x1f] + __t1, 1))) { ++(src); } else { __lead = (normTrie
)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (
norm16) = ((normTrie)->data.ptr16[__lead]); } while (false
);

769

if (!isMostDecompYesAndZeroCC(norm16)) {

770

break;

771

}

772

}

773

}

774

// isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,

775

// and the current character at [prevSrc..src[ is not a common case with cc=0

776

// (MIN_NORMAL_MAYBE_YES or JAMO_VT).

777

// It could still be a maybeYes with cc=0.

778

if (prevSrc != fastStart) {

779

// The fast path looped over yes/0 characters before the current one.

780

if (sink != nullptr &&

781

!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,

782

*sink, options, edits, errorCode)) {

783

break;

784

}

785

prevBoundary = prevSrc;

786

prevCC = 0;

787

}

788

789

// Medium-fast path: Quick check.

790

if (isMaybeYesOrNonZeroCC(norm16)) {

791

// Does not decompose.

792

uint8_t cc = getCCFromYesOrMaybeYes(norm16);

793

if (prevCC <= cc || cc == 0) {

794

prevCC = cc;

795

if (cc <= 1) {

796

if (sink != nullptr &&

797

!ByteSinkUtil::appendUnchanged(prevBoundary, src,

798

*sink, options, edits, errorCode)) {

799

break;

800

}

801

prevBoundary = src;

802

}

803

continue;

804

}

805

}

806

if (sink == nullptr) {

807

return prevBoundary; // quick check: "no" or cc out of order

808

}

809

810

// Slow path

811

// Decompose up to and including the current character.

812

if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {

813

if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,

814

*sink, options, edits, errorCode)) {

815

break;

816

}

817

prevBoundary = prevSrc;

818

}

819

ReorderingBuffer buffer(*this, s16, errorCode);

820

if (U_FAILURE(errorCode)) {

821

break;

822

}

823

decomposeShort(prevBoundary, src, STOP_AT_LIMIT, false /* onlyContiguous */,

824

buffer, errorCode);

825

// Decompose until the next boundary.

826

if (buffer.getLastCC() > 1) {

827

src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, false /* onlyContiguous */,

828

buffer, errorCode);

829

}

830

if (U_FAILURE(errorCode)) {

831

break;

832

}

833

if ((src - prevSrc) > INT32_MAX(2147483647)) { // guard before buffer.equals()

834

errorCode = U_INDEX_OUTOFBOUNDS_ERROR;

835

break;

836

}

837

// We already know there was a change if the original character decomposed;

838

// otherwise compare.

839

if (isMaybeYesOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {

840

if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,

841

*sink, options, edits, errorCode)) {

842

break;

843

}

844

} else {

845

if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),

846

*sink, edits, errorCode)) {

847

break;

848

}

849

}

850

prevBoundary = src;

851

prevCC = 0;

852

}

853

return src;

854

}

855

856

const uint8_t *

857

Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,

858

StopAt stopAt, UBool onlyContiguous,

859

ReorderingBuffer &buffer, UErrorCode &errorCode) const {

860

if (U_FAILURE(errorCode)) {

861

return nullptr;

862

}

863

while (src < limit) {

864

const uint8_t *prevSrc = src;

865

uint16_t norm16;

866

UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16)do { int32_t __lead = (uint8_t)*(src)++; if (!(((__lead)&
0x80)==0)) { uint8_t __t1, __t2, __t3; if ((src) != (limit) &&
(__lead >= 0xe0 ? __lead < 0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[__lead &= 0xf] & (1 << ((__t1 = *(src)) >>
5)) && ++(src) != (limit) && (__t2 = *(src) -
0x80) <= 0x3f && (__lead = ((int32_t)(normTrie)->
index[(__lead << 6) + (__t1 & 0x3f)]) + __t2, 1) : (
__lead -= 0xf0) <= 4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t1 = *(src)) >> 4] & (1 << __lead) &&
(__lead = (__lead << 6) | (__t1 & 0x3f), ++(src) !=
(limit)) && (__t2 = *(src) - 0x80) <= 0x3f &&
++(src) != (limit) && (__t3 = *(src) - 0x80) <= 0x3f
&& (__lead = __lead >= (normTrie)->shifted12HighStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallU8Index_77((normTrie), __lead, __t2, __t3
), 1) : __lead >= 0xc2 && (__t1 = *(src) - 0x80) <=
0x3f && (__lead = (int32_t)(normTrie)->index[__lead
& 0x1f] + __t1, 1))) { ++(src); } else { __lead = (normTrie
)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (
norm16) = ((normTrie)->data.ptr16[__lead]); } while (false
);

867

// Get the decomposition and the lead and trail cc's.

868

UChar32 c = U_SENTINEL(-1);

869

if (norm16 >= limitNoNo) {

870

if (isMaybeYesOrNonZeroCC(norm16)) {

871

// No comp boundaries around this character.

872

uint8_t cc = getCCFromYesOrMaybeYes(norm16);

873

if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {

874

return prevSrc;

875

}

876

c = codePointFromValidUTF8(prevSrc, src);

877

if (!buffer.append(c, cc, errorCode)) {

878

return nullptr;

879

}

880

if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {

881

return src;

882

}

883

continue;

884

} else if (norm16 < minMaybeNo) {

885

// Maps to an isCompYesAndZeroCC.

886

if (stopAt != STOP_AT_LIMIT) {

887

return prevSrc;

888

}

889

c = codePointFromValidUTF8(prevSrc, src);

890

c = mapAlgorithmic(c, norm16);

891

norm16 = getRawNorm16(c);

892

}

893

} else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {

894

return prevSrc;

895

}

896

// norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.

897

// We do not see invalid UTF-8 here because

898

// its norm16==INERT is normalization-inert,

899

// so it gets copied unchanged in the fast path,

900

// and we stop the slow path where invalid UTF-8 begins.

901

// c >= 0 is the result of an algorithmic mapping.

902

U_ASSERT(c >= 0 || norm16 != INERT)(static_cast <bool> (c >= 0 || norm16 != INERT) ? void
(0) : __assert_fail ("c >= 0 || norm16 != INERT", __builtin_FILE
(), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__));

903

if (norm16 < minYesNo) {

904

if (c < 0) {

905

c = codePointFromValidUTF8(prevSrc, src);

906

}

907

// does not decompose

908

if (!buffer.append(c, 0, errorCode)) {

909

return nullptr;

910

}

911

} else if (isHangulLV(norm16) || isHangulLVT(norm16)) {

912

// Hangul syllable: decompose algorithmically

913

if (c < 0) {

914

c = codePointFromValidUTF8(prevSrc, src);

915

}

916

char16_t jamos[3];

917

if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {

918

return nullptr;

919

}

920

} else {

921

// The character decomposes, get everything from the variable-length extra data.

922

const uint16_t *mapping = getData(norm16);

923

uint16_t firstUnit = *mapping;

924

int32_t length = firstUnit & MAPPING_LENGTH_MASK;

925

uint8_t trailCC = static_cast<uint8_t>(firstUnit >> 8);

926

uint8_t leadCC;

927

if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {

928

leadCC = static_cast<uint8_t>(*(mapping - 1) >> 8);

929

} else {

930

leadCC = 0;

931

}

932

if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {

933

return prevSrc;

934

}

935

if (!buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true, leadCC, trailCC, errorCode)) {

936

return nullptr;

937

}

938

}

939

if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||

940

(stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {

941

return src;

942

}

943

}

944

return src;

945

}

946

947

const char16_t *

948

Normalizer2Impl::getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const {

949

uint16_t norm16;

950

if(c<minDecompNoCP || isMaybeYesOrNonZeroCC(norm16=getNorm16(c))) {

951

// c does not decompose

952

return nullptr;

953

}

954

const char16_t *decomp = nullptr;

955

if(isDecompNoAlgorithmic(norm16)) {

956

// Maps to an isCompYesAndZeroCC.

957

c=mapAlgorithmic(c, norm16);

958

decomp=buffer;

959

length=0;

960

U16_APPEND_UNSAFE(buffer, length, c)do { if((uint32_t)(c)<=0xffff) { (buffer)[(length)++]=(uint16_t
)(c); } else { (buffer)[(length)++]=(uint16_t)(((c)>>10
)+0xd7c0); (buffer)[(length)++]=(uint16_t)(((c)&0x3ff)|0xdc00
); } } while (false);

961

// The mapping might decompose further.

962

norm16 = getRawNorm16(c);

963

}

964

if (norm16 < minYesNo) {

965

return decomp;

966

} else if(isHangulLV(norm16) || isHangulLVT(norm16)) {

967

// Hangul syllable: decompose algorithmically

968

length=Hangul::decompose(c, buffer);

969

return buffer;

970

}

971

// c decomposes, get everything from the variable-length extra data

972

const uint16_t *mapping=getData(norm16);

973

length=*mapping&MAPPING_LENGTH_MASK;

974

return reinterpret_cast<const char16_t*>(mapping) + 1;

975

}

976

977

// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1

978

// so that a raw mapping fits that consists of one unit ("rm0")

979

// plus all but the first two code units of the normal mapping.

980

// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.

981

const char16_t *

982

Normalizer2Impl::getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const {

983

uint16_t norm16;

984

if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {

985

// c does not decompose

986

return nullptr;

987

} else if(isHangulLV(norm16) || isHangulLVT(norm16)) {

988

// Hangul syllable: decompose algorithmically

989

Hangul::getRawDecomposition(c, buffer);

990

length=2;

991

return buffer;

992

} else if(isDecompNoAlgorithmic(norm16)) {

993

c=mapAlgorithmic(c, norm16);

994

length=0;

995

U16_APPEND_UNSAFE(buffer, length, c)do { if((uint32_t)(c)<=0xffff) { (buffer)[(length)++]=(uint16_t
)(c); } else { (buffer)[(length)++]=(uint16_t)(((c)>>10
)+0xd7c0); (buffer)[(length)++]=(uint16_t)(((c)&0x3ff)|0xdc00
); } } while (false);

996

return buffer;

997

}

998

// c decomposes, get everything from the variable-length extra data

999

const uint16_t *mapping=getData(norm16);

1000

uint16_t firstUnit=*mapping;

1001

int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping

1002

if(firstUnit&MAPPING_HAS_RAW_MAPPING) {

1003

// Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.

1004

// Bit 7=MAPPING_HAS_CCC_LCCC_WORD

1005

const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;

1006

uint16_t rm0=*rawMapping;

1007

if(rm0<=MAPPING_LENGTH_MASK) {

1008

length=rm0;

1009

return reinterpret_cast<const char16_t*>(rawMapping) - rm0;

1010

} else {

1011

// Copy the normal mapping and replace its first two code units with rm0.

1012

buffer[0] = static_cast<char16_t>(rm0);

1013

u_memcpyu_memcpy_77(buffer + 1, reinterpret_cast<const char16_t*>(mapping) + 1 + 2, mLength - 2);

1014

length=mLength-1;

1015

return buffer;

1016

}

1017

} else {

1018

length=mLength;

1019

return reinterpret_cast<const char16_t*>(mapping) + 1;

1020

}

1021

}

1022

1023

void Normalizer2Impl::decomposeAndAppend(const char16_t *src, const char16_t *limit,

1024

UBool doDecompose,

1025

UnicodeString &safeMiddle,

1026

ReorderingBuffer &buffer,

1027

UErrorCode &errorCode) const {

1028

buffer.copyReorderableSuffixTo(safeMiddle);

1029

if(doDecompose) {

1030

decompose(src, limit, &buffer, errorCode);

1031

return;

1032

}

1033

// Just merge the strings at the boundary.

1034

bool isFirst = true;

1035

uint8_t firstCC = 0, prevCC = 0, cc;

1036

const char16_t *p = src;

1037

while (p != limit) {

1038

const char16_t *codePointStart = p;

1039

UChar32 c;

1040

uint16_t norm16;

1041

UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16)do { (c) = *(p)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (p
) != (limit) && (((__c2 = *(p))&0xfffffc00)==0xdc00
)) { ++(p); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false);

1042

if ((cc = getCC(norm16)) == 0) {

1043

p = codePointStart;

1044

break;

1045

}

1046

if (isFirst) {

1047

firstCC = cc;

1048

isFirst = false;

1049

}

1050

prevCC = cc;

1051

}

1052

if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr

1053

limit=u_strchru_strchr_77(p, 0);

1054

}

1055

1056

if (buffer.append(src, static_cast<int32_t>(p - src), false, firstCC, prevCC, errorCode)) {

1057

buffer.appendZeroCC(p, limit, errorCode);

1058

}

1059

}

1060

1061

/**
 * Returns true if c is below minLcccCP, or is a BMP code point for which
 * singleLeadMightHaveNonZeroFCD16(c) is false, or if
 * norm16HasDecompBoundaryBefore() is true for its norm16 value.
 * The first two tests avoid the trie lookup.
 */
UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
    return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
        norm16HasDecompBoundaryBefore(getNorm16(c));
}

1065

1066

/**
 * Tests whether a character with this norm16 value has a decomposition
 * boundary before it.
 *
 * @param norm16 the normalization trie value of the character
 * @return true if norm16 is below minNoNoCompNoMaybeCC;
 *         for norm16 at or above limitNoNo, true only if it is at most
 *         MIN_NORMAL_MAYBE_YES or equal to JAMO_VT;
 *         otherwise true if the mapping has no ccc/lccc word or its lead cc is 0
 */
UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
    if (norm16 < minNoNoCompNoMaybeCC) {
        return true;
    }
    if (norm16 >= limitNoNo) {
        return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
    }
    // c decomposes, get everything from the variable-length extra data
    const uint16_t *mapping=getDataForYesOrNo(norm16);
    uint16_t firstUnit=*mapping;
    // true if leadCC==0 (hasFCDBoundaryBefore())
    return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
}

1079

1080

/**
 * Returns true if c is below minDecompNoCP, or is a BMP code point for which
 * singleLeadMightHaveNonZeroFCD16(c) is false, or if
 * norm16HasDecompBoundaryAfter() is true for its norm16 value.
 * The first two tests avoid the trie lookup.
 */
UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
    if (c < minDecompNoCP) {
        return true;
    }
    if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
        return true;
    }
    return norm16HasDecompBoundaryAfter(getNorm16(c));
}

1089

1090

/**
 * Tests whether a character with this norm16 value has a decomposition
 * boundary after it.
 *
 * @param norm16 the normalization trie value of the character
 * @return true if there is a boundary after the character, as determined
 *         from norm16 itself or from the first unit (and optional ccc/lccc word)
 *         of its mapping in the extra data
 */
UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
    if(norm16 <= minYesNo || isHangulLVT(norm16)) {
        return true;
    }
    if (norm16 >= limitNoNo) {
        if (isMaybeYesOrNonZeroCC(norm16)) {
            return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
        } else if (norm16 < minMaybeNo) {
            // Maps to an isCompYesAndZeroCC.
            return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
        }
    }
    // c decomposes, get everything from the variable-length extra data
    const uint16_t *mapping=getData(norm16);
    uint16_t firstUnit=*mapping;
    // decomp after-boundary: same as hasFCDBoundaryAfter(),
    // fcd16<=1 || trailCC==0
    if(firstUnit>0x1ff) {
        return false;  // trailCC>1
    }
    if(firstUnit<=0xff) {
        return true;  // trailCC==0
    }
    // if(trailCC==1) test leadCC==0, same as checking for before-boundary
    // true if leadCC==0 (hasFCDBoundaryBefore())
    return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
}

1117

1118

/*

1119

* Finds the recomposition result for

1120

* a forward-combining "lead" character,

1121

* specified with a pointer to its compositions list,

1122

* and a backward-combining "trail" character.

1123

*

1124

* If the lead and trail characters combine, then this function returns

1125

* the following "compositeAndFwd" value:

1126

* Bits 21..1 composite character

1127

* Bit 0 set if the composite is a forward-combining starter

1128

* otherwise it returns -1.

1129

*

1130

* The compositions list has (trail, compositeAndFwd) pair entries,

1131

* encoded as either pairs or triples of 16-bit units.

1132

* The last entry has the high bit of its first unit set.

1133

*

1134

* The list is sorted by ascending trail characters (there are no duplicates).

1135

* A linear search is used.

1136

*

1137

* See normalizer2impl.h for a more detailed description

1138

* of the compositions list format.

1139

*/

1140

int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {

1141

uint16_t key1, firstUnit;

1142

if(trail<COMP_1_TRAIL_LIMIT) {

1143

// trail character is 0..33FF

1144

// result entry may have 2 or 3 units

1145

key1 = static_cast<uint16_t>(trail << 1);

1146

while(key1>(firstUnit=*list)) {

1147

list+=2+(firstUnit&COMP_1_TRIPLE);

1148

}

1149

if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {

1150

if(firstUnit&COMP_1_TRIPLE) {

1151

return (static_cast<int32_t>(list[1]) << 16) | list[2];

1152

} else {

1153

return list[1];

1154

}

1155

}

1156

} else {

1157

// trail character is 3400..10FFFF

1158

// result entry has 3 units

1159

key1 = static_cast<uint16_t>(COMP_1_TRAIL_LIMIT +

1160

(((trail>>COMP_1_TRAIL_SHIFT))&

1161

~COMP_1_TRIPLE));

1162

uint16_t key2 = static_cast<uint16_t>(trail << COMP_2_TRAIL_SHIFT);

1163

uint16_t secondUnit;

1164

for(;;) {

1165

if(key1>(firstUnit=*list)) {

1166

list+=2+(firstUnit&COMP_1_TRIPLE);

1167

} else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {

1168

if(key2>(secondUnit=list[1])) {

1169

if(firstUnit&COMP_1_LAST_TUPLE) {

1170

break;

1171

} else {

1172

list+=3;

1173

}

1174

} else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {

1175

return (static_cast<int32_t>(secondUnit & ~COMP_2_TRAIL_MASK) << 16) | list[2];

1176

} else {

1177

break;

1178

}

1179

} else {

1180

break;

1181

}

1182

}

1183

}

1184

return -1;

1185

}

1186

1187

/**

1188

* @param list some character's compositions list

1189

* @param set recursively receives the composites from these compositions

1190

*/

1191

void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {

1192

uint16_t firstUnit;

1193

int32_t compositeAndFwd;

1194

do {

1195

firstUnit=*list;

1196

if((firstUnit&COMP_1_TRIPLE)==0) {

1197

compositeAndFwd=list[1];

1198

list+=2;

1199

} else {

1200

compositeAndFwd = ((static_cast<int32_t>(list[1]) & ~COMP_2_TRAIL_MASK) << 16) | list[2];

1201

list+=3;

1202

}

1203

UChar32 composite=compositeAndFwd>>1;

1204

if((compositeAndFwd&1)!=0) {

1205

addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);

1206

}

1207

set.add(composite);

1208

} while((firstUnit&COMP_1_LAST_TUPLE)==0);

1209

}

1210

1211

/*

1212

* Recomposes the buffer text starting at recomposeStartIndex

1213

* (which is in NFD - decomposed and canonically ordered),

1214

* and truncates the buffer contents.

1215

*

1216

* Note that recomposition never lengthens the text:

1217

* Any character consists of either one or two code units;

1218

* a composition may contain at most one more code unit than the original starter,

1219

* while the combining mark that is removed has at least one code unit.

1220

*/

1221

void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,

1222

UBool onlyContiguous) const {

1223

char16_t *p=buffer.getStart()+recomposeStartIndex;

1224

char16_t *limit=buffer.getLimit();

1225

if(p==limit) {

1226

return;

1227

}

1228

1229

char16_t *starter, *pRemove, *q, *r;

1230

const uint16_t *compositionsList;

1231

UChar32 c, compositeAndFwd;

1232

uint16_t norm16;

1233

uint8_t cc, prevCC;

1234

UBool starterIsSupplementary;

1235

1236

// Some of the following variables are not used until we have a forward-combining starter

1237

// and are only initialized now to avoid compiler warnings.

1238

compositionsList=nullptr; // used as indicator for whether we have a forward-combining starter

1239

starter=nullptr;

1240

starterIsSupplementary=false;

1241

prevCC=0;

1242

1243

for(;;) {

1244

UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16)do { (c) = *(p)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (p
) != (limit) && (((__c2 = *(p))&0xfffffc00)==0xdc00
)) { ++(p); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false);

1245

cc=getCCFromYesOrMaybeYes(norm16);

1246

if( // this character combines backward and

1247

isMaybe(norm16) &&

1248

// we have seen a starter that combines forward and

1249

compositionsList!=nullptr &&

1250

// the backward-combining character is not blocked

1251

(prevCC<cc || prevCC==0)

1252

) {

1253

if(isJamoVT(norm16)) {

1254

// c is a Jamo V/T, see if we can compose it with the previous character.

1255

if(c<Hangul::JAMO_T_BASE) {

1256

// c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.

1257

char16_t prev = static_cast<char16_t>(*starter - Hangul::JAMO_L_BASE);

1258

if(prev<Hangul::JAMO_L_COUNT) {

1259

pRemove=p-1;

1260

char16_t syllable = static_cast<char16_t>(

1261

Hangul::HANGUL_BASE +

1262

(prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*

1263

Hangul::JAMO_T_COUNT);

1264

char16_t t;

1265

if (p != limit && (t = static_cast<char16_t>(*p - Hangul::JAMO_T_BASE)) < Hangul::JAMO_T_COUNT) {

1266

++p;

1267

syllable+=t; // The next character was a Jamo T.

1268

}

1269

*starter=syllable;

1270

// remove the Jamo V/T

1271

q=pRemove;

1272

r=p;

1273

while(r<limit) {

1274

*q++=*r++;

1275

}

1276

limit=q;

1277

p=pRemove;

1278

}

1279

}

1280

/*

1281

* No "else" for Jamo T:

1282

* Since the input is in NFD, there are no Hangul LV syllables that

1283

* a Jamo T could combine with.

1284

* All Jamo Ts are combined above when handling Jamo Vs.

1285

*/

1286

if(p==limit) {

1287

break;

1288

}

1289

compositionsList=nullptr;

1290

continue;

1291

} else if((compositeAndFwd=combine(compositionsList, c))>=0) {

1292

// The starter and the combining mark (c) do combine.

1293

UChar32 composite=compositeAndFwd>>1;

1294

1295

// Replace the starter with the composite, remove the combining mark.

1296

pRemove=p-U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2); // pRemove & p: start & limit of the combining mark

1297

if(starterIsSupplementary) {

1298

if(U_IS_SUPPLEMENTARY(composite)((uint32_t)((composite)-0x10000)<=0xfffff)) {

1299

// both are supplementary

1300

starter[0]=U16_LEAD(composite)(UChar)(((composite)>>10)+0xd7c0);

1301

starter[1]=U16_TRAIL(composite)(UChar)(((composite)&0x3ff)|0xdc00);

1302

} else {

1303

*starter = static_cast<char16_t>(composite);

1304

// The composite is shorter than the starter,

1305

// move the intermediate characters forward one.

1306

starterIsSupplementary=false;

1307

q=starter+1;

1308

r=q+1;

1309

while(r<pRemove) {

1310

*q++=*r++;

1311

}

1312

--pRemove;

1313

}

1314

} else if(U_IS_SUPPLEMENTARY(composite)((uint32_t)((composite)-0x10000)<=0xfffff)) {

1315

// The composite is longer than the starter,

1316

// move the intermediate characters back one.

1317

starterIsSupplementary=true;

1318

++starter; // temporarily increment for the loop boundary

1319

q=pRemove;

1320

r=++pRemove;

1321

while(starter<q) {

1322

*--r=*--q;

1323

}

1324

*starter=U16_TRAIL(composite)(UChar)(((composite)&0x3ff)|0xdc00);

1325

*--starter=U16_LEAD(composite)(UChar)(((composite)>>10)+0xd7c0); // undo the temporary increment

1326

} else {

1327

// both are on the BMP

1328

*starter = static_cast<char16_t>(composite);

1329

}

1330

1331

/* remove the combining mark by moving the following text over it */

1332

if(pRemove<p) {

1333

q=pRemove;

1334

r=p;

1335

while(r<limit) {

1336

*q++=*r++;

1337

}

1338

limit=q;

1339

p=pRemove;

1340

}

1341

// Keep prevCC because we removed the combining mark.

1342

1343

if(p==limit) {

1344

break;

1345

}

1346

// Is the composite a starter that combines forward?

1347

if(compositeAndFwd&1) {

1348

compositionsList=

1349

getCompositionsListForComposite(getRawNorm16(composite));

1350

} else {

1351

compositionsList=nullptr;

1352

}

1353

1354

// We combined; continue with looking for compositions.

1355

continue;

1356

}

1357

}

1358

1359

// no combination this time

1360

prevCC=cc;

1361

if(p==limit) {

1362

break;

1363

}

1364

1365

// If c did not combine, then check if it is a starter.

1366

if(cc==0) {

1367

// Found a new starter.

1368

if((compositionsList=getCompositionsListForDecompYes(norm16))!=nullptr) {

1369

// It may combine with something, prepare for it.

1370

if(U_IS_BMP(c)((uint32_t)(c)<=0xffff)) {

1371

starterIsSupplementary=false;

1372

starter=p-1;

1373

} else {

1374

starterIsSupplementary=true;

1375

starter=p-2;

1376

}

1377

}

1378

} else if(onlyContiguous) {

1379

// FCC: no discontiguous compositions; any intervening character blocks.

1380

compositionsList=nullptr;

1381

}

1382

}

1383

buffer.setReorderingLimit(limit);

1384

}

1385

1386

/**
 * Composes the pair of code points (a, b).
 *
 * Hangul pairs (Jamo L + Jamo V, Hangul LV + Jamo T) are composed arithmetically;
 * other pairs are looked up with combine() in the compositions list of a.
 *
 * @param a the first (lead) code point
 * @param b the second (trail) code point
 * @return the composite code point, or U_SENTINEL if a and b do not combine
 */
UChar32
Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16
    const uint16_t *list;
    if(isInert(norm16)) {
        return U_SENTINEL;
    } else if(norm16<minYesNoMappingsOnly) {
        // a combines forward.
        if(isJamoL(norm16)) {
            if (b < Hangul::JAMO_V_BASE) {
                return U_SENTINEL;
            }
            b-=Hangul::JAMO_V_BASE;
            if(b<Hangul::JAMO_V_COUNT) {
                return
                    (Hangul::HANGUL_BASE+
                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
                     Hangul::JAMO_T_COUNT);
            } else {
                return U_SENTINEL;
            }
        } else if(isHangulLV(norm16)) {
            if (b <= Hangul::JAMO_T_BASE) {
                return U_SENTINEL;
            }
            b-=Hangul::JAMO_T_BASE;
            if(b<Hangul::JAMO_T_COUNT) {  // not b==0!
                return a+b;
            } else {
                return U_SENTINEL;
            }
        } else {
            // 'a' has a compositions list in extraData
            list=getDataForYesOrNo(norm16);
            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
                list+=  // mapping pointer
                    1+  // +1 to skip the first unit with the mapping length
                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
            }
        }
    } else if(norm16<minMaybeNoCombinesFwd || MIN_NORMAL_MAYBE_YES<=norm16) {
        return U_SENTINEL;
    } else {
        list=getDataForMaybe(norm16);
        if(norm16<minMaybeYes) {  // composite 'a' has both mapping & compositions list
            list+=  // mapping pointer
                1+  // +1 to skip the first unit with the mapping length
                (*list&MAPPING_LENGTH_MASK);  // + mapping length
        }
    }
    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
        return U_SENTINEL;
    }
    // combine() returns -1 if there is no match. With an arithmetic right shift,
    // -1>>1 is still -1, so the result can be shifted unconditionally.
#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
    return combine(list, b)>>1;
#else
    int32_t compositeAndFwd=combine(list, b);
    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
#endif
}

1446

1447

// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.

1448

// doCompose: normalize

1449

// !doCompose: isNormalized (buffer must be empty and initialized)

1450

UBool

1451

Normalizer2Impl::compose(const char16_t *src, const char16_t *limit,

1452

UBool onlyContiguous,

1453

UBool doCompose,

1454

ReorderingBuffer &buffer,

1455

UErrorCode &errorCode) const {

1456

const char16_t *prevBoundary=src;

1457

UChar32 minNoMaybeCP=minCompNoMaybeCP;

1458

if(limit==nullptr) {

1459

src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,

1460

doCompose ? &buffer : nullptr,

1461

errorCode);

1462

if(U_FAILURE(errorCode)) {

1463

return false;

1464

}

1465

limit=u_strchru_strchr_77(src, 0);

1466

if (prevBoundary != src) {

1467

if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {

1468

prevBoundary = src;

1469

} else {

1470

buffer.removeSuffix(1);

1471

prevBoundary = --src;

1472

}

1473

}

1474

}

1475

1476

for (;;) {

1477

// Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,

1478

// or with (compYes && ccc==0) properties.

1479

const char16_t *prevSrc;

1480

UChar32 c = 0;

1481

uint16_t norm16 = 0;

1482

for (;;) {

1483

if (src == limit) {

1484

if (prevBoundary != limit && doCompose) {

1485

buffer.appendZeroCC(prevBoundary, limit, errorCode);

1486

}

1487

return true;

1488

}

1489

if( (c=*src)<minNoMaybeCP ||

1490

isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((int32_t)(normTrie)->index[(c)
>> UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK
))]))

1491

) {

1492

++src;

1493

} else {

1494

prevSrc = src++;

1495

if(!U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800)) {

1496

break;

1497

} else {

1498

char16_t c2;

1499

if(src!=limit && U16_IS_TRAIL(c2=*src)(((c2=*src)&0xfffffc00)==0xdc00)) {

1500

++src;

1501

c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000));

1502

norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((c) >= (normTrie)->highStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c))]);

1503

if(!isCompYesAndZeroCC(norm16)) {

1504

break;

1505

}

1506

}

1507

}

1508

}

1509

}

1510

// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.

1511

// The current character is either a "noNo" (has a mapping)

1512

// or a "maybeYes" / "maybeNo" (combines backward)

1513

// or a "yesYes" with ccc!=0.

1514

// It is not a Hangul syllable or Jamo L because those have "yes" properties.

1515

1516

// Medium-fast path: Handle cases that do not require full decomposition and recomposition.

1517

if (norm16 < minMaybeNo) { // minNoNo <= norm16 < minMaybeNo

1518

if (!doCompose) {

1519

return false;

1520

}

1521

// Fast path for mapping a character that is immediately surrounded by boundaries.

1522

// In this case, we need not decompose around the current character.

1523

if (isDecompNoAlgorithmic(norm16)) {

1524

// Maps to a single isCompYesAndZeroCC character

1525

// which also implies hasCompBoundaryBefore.

1526

if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||

1527

hasCompBoundaryBefore(src, limit)) {

1528

if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {

1529

break;

1530

}

1531

if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {

1532

break;

1533

}

1534

prevBoundary = src;

1535

continue;

1536

}

1537

} else if (norm16 < minNoNoCompBoundaryBefore) {

1538

// The mapping is comp-normalized which also implies hasCompBoundaryBefore.

1539

if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||

1540

hasCompBoundaryBefore(src, limit)) {

1541

if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {

1542

break;

1543

}

1544

const char16_t *mapping = reinterpret_cast<const char16_t *>(getDataForYesOrNo(norm16));

1545

int32_t length = *mapping++ & MAPPING_LENGTH_MASK;

1546

if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {

1547

break;

1548

}

1549

prevBoundary = src;

1550

continue;

1551

}

1552

} else if (norm16 >= minNoNoEmpty) {

1553

// The current character maps to nothing.

1554

// Simply omit it from the output if there is a boundary before _or_ after it.

1555

// The character itself implies no boundaries.

1556

if (hasCompBoundaryBefore(src, limit) ||

1557

hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {

1558

if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {

1559

break;

1560

}

1561

prevBoundary = src;

1562

continue;

1563

}

1564

}

1565

// Other "noNo" type, or need to examine more text around this character:

1566

// Fall through to the slow path.

1567

} else if (isJamoVT(norm16) && prevBoundary != prevSrc) {

1568

char16_t prev=*(prevSrc-1);

1569

if(c<Hangul::JAMO_T_BASE) {

1570

// The current character is a Jamo Vowel,

1571

// compose with previous Jamo L and following Jamo T.

1572

char16_t l = static_cast<char16_t>(prev - Hangul::JAMO_L_BASE);

1573

if(l<Hangul::JAMO_L_COUNT) {

1574

if (!doCompose) {

1575

return false;

1576

}

1577

int32_t t;

1578

if (src != limit &&

1579

0 < (t = (static_cast<int32_t>(*src) - Hangul::JAMO_T_BASE)) &&

1580

t < Hangul::JAMO_T_COUNT) {

1581

// The next character is a Jamo T.

1582

++src;

1583

} else if (hasCompBoundaryBefore(src, limit)) {

1584

// No Jamo T follows, not even via decomposition.

1585

t = 0;

1586

} else {

1587

t = -1;

1588

}

1589

if (t >= 0) {

1590

UChar32 syllable = Hangul::HANGUL_BASE +

1591

(l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *

1592

Hangul::JAMO_T_COUNT + t;

1593

--prevSrc; // Replace the Jamo L as well.

1594

if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {

1595

break;

1596

}

1597

if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) {

1598

break;

1599

}

1600

prevBoundary = src;

1601

continue;

1602

}

1603

// If we see L+V+x where x!=T then we drop to the slow path,

1604

// decompose and recompose.

1605

// This is to deal with NFKC finding normal L and V but a

1606

// compatibility variant of a T.

1607

// We need to either fully compose that combination here

1608

// (which would complicate the code and may not work with strange custom data)

1609

// or use the slow path.

1610

}

1611

} else if (Hangul::isHangulLV(prev)) {

1612

// The current character is a Jamo Trailing consonant,

1613

// compose with previous Hangul LV that does not contain a Jamo T.

1614

if (!doCompose) {

1615

return false;

1616

}

1617

UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;

1618

--prevSrc; // Replace the Hangul LV as well.

1619

if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {

1620

break;

1621

}

1622

if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) {

1623

break;

1624

}

1625

prevBoundary = src;

1626

continue;

1627

}

1628

// No matching context, or may need to decompose surrounding text first:

1629

// Fall through to the slow path.

1630

} else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC

1631

// One or more combining marks that do not combine-back:

1632

// Check for canonical order, copy unchanged if ok and

1633

// if followed by a character with a boundary-before.

1634

uint8_t cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0

1635

if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {

1636

// Fails FCD test, need to decompose and contiguously recompose.

1637

if (!doCompose) {

1638

return false;

1639

}

1640

} else {

1641

// If !onlyContiguous (not FCC), then we ignore the tccc of

1642

// the previous character which passed the quick check "yes && ccc==0" test.

1643

const char16_t *nextSrc;

1644

uint16_t n16;

1645

for (;;) {

1646

if (src == limit) {

1647

if (doCompose) {

1648

buffer.appendZeroCC(prevBoundary, limit, errorCode);

1649

}

1650

return true;

1651

}

1652

uint8_t prevCC = cc;

1653

nextSrc = src;

1654

UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16)do { (c) = *(nextSrc)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (nextSrc
) != (limit) && (((__c2 = *(nextSrc))&0xfffffc00)
==0xdc00)) { ++(nextSrc); (c) = (((UChar32)((c))<<10UL)
+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); __index
= ((c) >= (normTrie)->highStart ? (normTrie)->dataLength
- UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (n16) = ((normTrie)
->data.ptr16[__index]); } while (false);

1655

if (n16 >= MIN_YES_YES_WITH_CC) {

1656

cc = getCCFromNormalYesOrMaybe(n16);

1657

if (prevCC > cc) {

1658

if (!doCompose) {

1659

return false;

1660

}

1661

break;

1662

}

1663

} else {

1664

break;

1665

}

1666

src = nextSrc;

1667

}

1668

// src is after the last in-order combining mark.

1669

// If there is a boundary here, then we continue with no change.

1670

if (norm16HasCompBoundaryBefore(n16)) {

1671

if (isCompYesAndZeroCC(n16)) {

1672

src = nextSrc;

1673

}

1674

continue;

1675

}

1676

// Use the slow path. There is no boundary in [prevSrc, src[.

1677

}

1678

}

1679

1680

// Slow path: Find the nearest boundaries around the current character,

1681

// decompose and recompose.

1682

if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {

1683

const char16_t *p = prevSrc;

1684

UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (prevBoundary) && (((__c2 = *((p) - 1))&0xfffffc00
)==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false);

1685

if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {

1686

prevSrc = p;

1687

}

1688

}

1689

if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {

1690

break;

1691

}

1692

int32_t recomposeStartIndex=buffer.length();

1693

// We know there is not a boundary here.

1694

decomposeShort(prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,

1695

buffer, errorCode);

1696

// Decompose until the next boundary.

1697

src = decomposeShort(src, limit, true /* stopAtCompBoundary */, onlyContiguous,

1698

buffer, errorCode);

1699

if (U_FAILURE(errorCode)) {

1700

break;

1701

}

1702

if ((src - prevSrc) > INT32_MAX(2147483647)) { // guard before buffer.equals()

1703

errorCode = U_INDEX_OUTOFBOUNDS_ERROR;

1704

return true;

1705

}

1706

recompose(buffer, recomposeStartIndex, onlyContiguous);

1707

if(!doCompose) {

1708

if(!buffer.equals(prevSrc, src)) {

1709

return false;

1710

}

1711

buffer.remove();

1712

}

1713

prevBoundary=src;

1714

}

1715

return true;

1716

}

1717

1718

// Very similar to compose(): Make the same changes in both places if relevant.

1719

// pQCResult==nullptr: spanQuickCheckYes

1720

// pQCResult!=nullptr: quickCheck (*pQCResult must be UNORM_YES)

1721

const char16_t *

1722

Normalizer2Impl::composeQuickCheck(const char16_t *src, const char16_t *limit,

1723

UBool onlyContiguous,

1724

UNormalizationCheckResult *pQCResult) const {

1725

const char16_t *prevBoundary=src;

1726

UChar32 minNoMaybeCP=minCompNoMaybeCP;

1727

if(limit==nullptr) {

1728

UErrorCode errorCode=U_ZERO_ERROR;

1729

src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, nullptr, errorCode);

1730

limit=u_strchru_strchr_77(src, 0);

1731

if (prevBoundary != src) {

1732

if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {

1733

prevBoundary = src;

1734

} else {

1735

prevBoundary = --src;

1736

}

1737

}

1738

}

1739

1740

for(;;) {

1741

// Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,

1742

// or with (compYes && ccc==0) properties.

1743

const char16_t *prevSrc;

1744

UChar32 c = 0;

1745

uint16_t norm16 = 0;

1746

for (;;) {

1747

if(src==limit) {

1748

return src;

1749

}

1750

if( (c=*src)<minNoMaybeCP ||

1751

isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((int32_t)(normTrie)->index[(c)
>> UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK
))]))

1752

) {

1753

++src;

1754

} else {

1755

prevSrc = src++;

1756

if(!U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800)) {

1757

break;

1758

} else {

1759

char16_t c2;

1760

if(src!=limit && U16_IS_TRAIL(c2=*src)(((c2=*src)&0xfffffc00)==0xdc00)) {

1761

++src;

1762

c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000));

1763

norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c)((normTrie)->data.ptr16[((c) >= (normTrie)->highStart
? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c))]);

1764

if(!isCompYesAndZeroCC(norm16)) {

1765

break;

1766

}

1767

}

1768

}

1769

}

1770

}

1771

// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.

1772

// The current character is either a "noNo" (has a mapping)

1773

// or a "maybeYes" / "maybeNo" (combines backward)

1774

// or a "yesYes" with ccc!=0.

1775

// It is not a Hangul syllable or Jamo L because those have "yes" properties.

1776

1777

uint16_t prevNorm16 = INERT;

1778

if (prevBoundary != prevSrc) {

1779

if (norm16HasCompBoundaryBefore(norm16)) {

1780

prevBoundary = prevSrc;

1781

} else {

1782

const char16_t *p = prevSrc;

1783

uint16_t n16;

1784

UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (prevBoundary) && (((__c2 = *((p) - 1))&0xfffffc00
)==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (n16) = ((normTrie)
->data.ptr16[__index]); } while (false);

1785

if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {

1786

prevBoundary = prevSrc;

1787

} else {

1788

prevBoundary = p;

1789

prevNorm16 = n16;

1790

}

1791

}

1792

}

1793

1794

if (norm16 >= minMaybeNo) {

1795

uint16_t fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16);

1796

uint8_t cc = fcd16 >> 8;

1797

if (onlyContiguous /* FCC */ && cc != 0 &&

1798

getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {

1799

// The [prevBoundary..prevSrc[ character

1800

// passed the quick check "yes && ccc==0" test

1801

// but is out of canonical order with the current combining mark.

1802

} else {

1803

// If !onlyContiguous (not FCC), then we ignore the tccc of

1804

// the previous character which passed the quick check "yes && ccc==0" test.

1805

const char16_t *nextSrc;

1806

for (;;) {

1807

if (norm16 < MIN_YES_YES_WITH_CC) {

1808

if (pQCResult != nullptr) {

1809

*pQCResult = UNORM_MAYBE;

1810

} else {

1811

return prevBoundary;

1812

}

1813

}

1814

if (src == limit) {

1815

return src;

1816

}

1817

uint8_t prevCC = fcd16;

1818

nextSrc = src;

1819

UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16)do { (c) = *(nextSrc)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (nextSrc
) != (limit) && (((__c2 = *(nextSrc))&0xfffffc00)
==0xdc00)) { ++(nextSrc); (c) = (((UChar32)((c))<<10UL)
+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); __index
= ((c) >= (normTrie)->highStart ? (normTrie)->dataLength
- UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false);

1820

if (norm16 >= minMaybeNo) {

1821

fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16);

1822

cc = fcd16 >> 8;

1823

if (!(prevCC <= cc || cc == 0)) {

1824

break;

1825

}

1826

} else {

1827

break;

1828

}

1829

src = nextSrc;

1830

}

1831

// src is after the last in-order combining mark.

1832

if (isCompYesAndZeroCC(norm16)) {

1833

prevBoundary = src;

1834

src = nextSrc;

1835

continue;

1836

}

1837

}

1838

}

1839

if(pQCResult!=nullptr) {

1840

*pQCResult=UNORM_NO;

1841

}

1842

return prevBoundary;

1843

}

1844

}

1845

1846

void Normalizer2Impl::composeAndAppend(const char16_t *src, const char16_t *limit,

1847

UBool doCompose,

1848

UBool onlyContiguous,

1849

UnicodeString &safeMiddle,

1850

ReorderingBuffer &buffer,

1851

UErrorCode &errorCode) const {

1852

if(!buffer.isEmpty()) {

1853

const char16_t *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);

1854

if(src!=firstStarterInSrc) {

1855

const char16_t *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),

1856

buffer.getLimit(), onlyContiguous);

1857

int32_t destSuffixLength = static_cast<int32_t>(buffer.getLimit() - lastStarterInDest);

1858

UnicodeString middle(lastStarterInDest, destSuffixLength);

1859

buffer.removeSuffix(destSuffixLength);

1860

safeMiddle=middle;

1861

middle.append(src, static_cast<int32_t>(firstStarterInSrc - src));

1862

const char16_t *middleStart=middle.getBuffer();

1863

compose(middleStart, middleStart+middle.length(), onlyContiguous,

1864

true, buffer, errorCode);

1865

if(U_FAILURE(errorCode)) {

1866

return;

1867

}

1868

src=firstStarterInSrc;

1869

}

1870

}

1871

if(doCompose) {

1872

compose(src, limit, onlyContiguous, true, buffer, errorCode);

1873

} else {

1874

if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr

1875

limit=u_strchru_strchr_77(src, 0);

1876

}

1877

buffer.appendZeroCC(src, limit, errorCode);

1878

}

1879

}

1880

1881

// UTF-8 composition of [src, limit[; limit must not be nullptr.
// With sink!=nullptr the composed text is written to *sink, and the changes
// are recorded in *edits if edits!=nullptr.
// With sink==nullptr nothing is written and the function returns false as
// soon as it finds text that would have to change.
// In all other cases, including after a failure reported through errorCode,
// the function returns true.
UBool
Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
                             const uint8_t *src, const uint8_t *limit,
                             ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
    U_ASSERT(limit != nullptr);
    UnicodeString scratch;
    const uint8_t minLead = leadByteForCP(minCompNoMaybeCP);
    const uint8_t *prevBoundary = src;

    for (;;) {
        const uint8_t *prevSrc;
        uint16_t norm16 = 0;

        // Writes the pending unchanged bytes [prevBoundary, prevSrc[ to the sink.
        // Evaluates to false only if appendUnchanged() was called and returned false.
        // Must only be invoked when sink!=nullptr.
        const auto flushUnchanged = [&]() -> bool {
            return prevBoundary == prevSrc ||
                ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                              *sink, options, edits, errorCode);
        };

        // Fast path: skip bytes below the lead byte of the minimum
        // "no or maybe" code point, and characters with (compYes && ccc==0) properties.
        for (;;) {
            if (src == limit) {
                if (prevBoundary != limit && sink != nullptr) {
                    ByteSinkUtil::appendUnchanged(prevBoundary, limit,
                                                  *sink, options, edits, errorCode);
                }
                return true;
            }
            if (*src < minLead) {
                ++src;
                continue;
            }
            prevSrc = src;
            UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
            if (!isCompYesAndZeroCC(norm16)) {
                break;
            }
        }
        // Here isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is one of:
        // - a "noNo" (has a mapping)
        // - a "maybeYes" / "maybeNo" (combines backward)
        // - a "yesYes" with ccc!=0
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // Medium-fast path: cases that do not need full decomposition and recomposition.
        if (norm16 < minMaybeNo) {  // minNoNo <= norm16 < minMaybeNo
            if (sink == nullptr) {
                return false;
            }
            // A character that is immediately surrounded by boundaries can be
            // mapped directly, without decomposing the text around it.
            if (isDecompNoAlgorithmic(norm16)) {
                // Maps to a single isCompYesAndZeroCC character
                // which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (!flushUnchanged()) {
                        break;
                    }
                    appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 < minNoNoCompBoundaryBefore) {
                // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (!flushUnchanged()) {
                        break;
                    }
                    // The first unit holds the mapping length; the mapping follows it.
                    const uint16_t *mapping = getDataForYesOrNo(norm16);
                    const int32_t mappingLength = *mapping & MAPPING_LENGTH_MASK;
                    if (!ByteSinkUtil::appendChange(
                            prevSrc, src,
                            reinterpret_cast<const char16_t*>(mapping + 1), mappingLength,
                            *sink, edits, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 >= minNoNoEmpty) {
                // The current character maps to nothing.
                // It can simply be dropped if there is a boundary before _or_ after it.
                // The character itself implies no boundaries.
                if (hasCompBoundaryBefore(src, limit) ||
                        hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
                    if (!flushUnchanged()) {
                        break;
                    }
                    if (edits != nullptr) {
                        edits->addReplace(static_cast<int32_t>(src - prevSrc), 0);
                    }
                    prevBoundary = src;
                    continue;
                }
            }
            // Other "noNo" type, or more surrounding text must be examined:
            // Fall through to the slow path.
        } else if (isJamoVT(norm16)) {
            // Jamo L: E1 84 80..92
            // Jamo V: E1 85 A1..B5
            // Jamo T: E1 86 A8..E1 87 82
            U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);
            UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
            if (prevSrc[1] == 0x85) {
                // The current character is a Jamo Vowel,
                // compose with previous Jamo L and following Jamo T.
                UChar32 l = prev - Hangul::JAMO_L_BASE;
                if (static_cast<uint32_t>(l) < Hangul::JAMO_L_COUNT) {
                    if (sink == nullptr) {
                        return false;
                    }
                    int32_t t = getJamoTMinusBase(src, limit);
                    if (t >= 0) {
                        // The next character is a Jamo T.
                        src += 3;
                    } else if (hasCompBoundaryBefore(src, limit)) {
                        // No Jamo T follows, not even via decomposition.
                        t = 0;
                    }
                    if (t >= 0) {
                        UChar32 syllable = Hangul::HANGUL_BASE +
                            (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
                            Hangul::JAMO_T_COUNT + t;
                        prevSrc -= 3;  // Replace the Jamo L as well.
                        if (!flushUnchanged()) {
                            break;
                        }
                        ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
                        prevBoundary = src;
                        continue;
                    }
                    // L+V+x where x!=T goes to the slow path for
                    // decomposition and recomposition.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T.
                    // The alternative would be to fully compose that combination here,
                    // which would complicate the code and may not work with strange custom data.
                }
            } else if (Hangul::isHangulLV(prev)) {
                // The current character is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if (sink == nullptr) {
                    return false;
                }
                UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
                prevSrc -= 3;  // Replace the Hangul LV as well.
                if (!flushUnchanged()) {
                    break;
                }
                ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
                prevBoundary = src;
                continue;
            }
            // No matching context, or surrounding text may need to be decomposed first:
            // Fall through to the slow path.
        } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
            // One or more combining marks that do not combine-back:
            // Check for canonical order, copy unchanged if ok and
            // if followed by a character with a boundary-before.
            uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
            if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if (sink == nullptr) {
                    return false;
                }
            } else {
                // If !onlyContiguous (not FCC), then the tccc of the previous character,
                // which passed the quick check "yes && ccc==0" test, is ignored.
                const uint8_t *nextSrc;
                uint16_t nextNorm16;
                for (;;) {
                    if (src == limit) {
                        if (sink != nullptr) {
                            ByteSinkUtil::appendUnchanged(prevBoundary, limit,
                                                          *sink, options, edits, errorCode);
                        }
                        return true;
                    }
                    const uint8_t prevCC = cc;
                    nextSrc = src;
                    UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, nextNorm16);
                    if (nextNorm16 < MIN_YES_YES_WITH_CC) {
                        break;
                    }
                    cc = getCCFromNormalYesOrMaybe(nextNorm16);
                    if (prevCC > cc) {
                        if (sink == nullptr) {
                            return false;
                        }
                        break;
                    }
                    src = nextSrc;
                }
                // src is after the last in-order combining mark.
                // If there is a boundary here, then continue with no change.
                if (norm16HasCompBoundaryBefore(nextNorm16)) {
                    if (isCompYesAndZeroCC(nextNorm16)) {
                        src = nextSrc;
                    }
                    continue;
                }
                // Use the slow path. There is no boundary in [prevSrc, src[.
            }
        }

        // Slow path: Find the nearest boundaries around the current character,
        // decompose and recompose.
        if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
            const uint8_t *p = prevSrc;
            UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);
            if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
                prevSrc = p;
            }
        }
        ReorderingBuffer buffer(*this, scratch, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        // We know there is not a boundary here.
        decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
                       buffer, errorCode);
        // Decompose until the next boundary.
        src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
                             buffer, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return true;
        }
        recompose(buffer, 0, onlyContiguous);
        if (buffer.equals(prevSrc, src)) {
            // The segment is already composed; it stays pending as unchanged text.
            continue;
        }
        if (sink == nullptr) {
            return false;
        }
        if (!flushUnchanged()) {
            break;
        }
        if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
                                        *sink, edits, errorCode)) {
            break;
        }
        prevBoundary = src;
    }
    return true;
}

2137

2138

// UTF-16: Is there a composition boundary before the code point at src?
// True at the end of the text and for a code unit below minCompNoMaybeCP;
// otherwise decided by the trie value of the code point starting at src.
UBool Normalizer2Impl::hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const {
    if (src == limit) {
        return true;
    }
    if (*src < minCompNoMaybeCP) {
        return true;
    }
    UChar32 cp;
    uint16_t props;
    UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, cp, props);
    return norm16HasCompBoundaryBefore(props);
}

2147

2148

// UTF-8: Is there a composition boundary before the character at src?
// True at the end of the text; otherwise decided by the trie value
// of the UTF-8 sequence starting at src.
UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
    if (src != limit) {
        uint16_t props;
        UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, props);
        return norm16HasCompBoundaryBefore(props);
    }
    return true;
}

2156

2157

// UTF-16: Is there a composition boundary after the code point that ends at p?
// True when p is at the start of the text; otherwise decided by the trie value
// of the code point immediately before p.
UBool Normalizer2Impl::hasCompBoundaryAfter(const char16_t *start, const char16_t *p,
                                            UBool onlyContiguous) const {
    if (p != start) {
        UChar32 cp;
        uint16_t props;
        UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, cp, props);
        return norm16HasCompBoundaryAfter(props, onlyContiguous);
    }
    return true;
}

2167

2168

// UTF-8: Is there a composition boundary after the character that ends at p?
// True when p is at the start of the text; otherwise decided by the trie value
// of the UTF-8 sequence immediately before p.
UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
                                            UBool onlyContiguous) const {
    if (p != start) {
        uint16_t props;
        UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, props);
        return norm16HasCompBoundaryAfter(props, onlyContiguous);
    }
    return true;
}

2177

2178

const char16_t *Normalizer2Impl::findPreviousCompBoundary(const char16_t *start, const char16_t *p,

2179

UBool onlyContiguous) const {

2180

while (p != start) {

2181

const char16_t *codePointLimit = p;

2182

UChar32 c;

2183

uint16_t norm16;

2184

UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (start) && (((__c2 = *((p) - 1))&0xfffffc00)
==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false);

2185

if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {

2186

return codePointLimit;

2187

}

2188

if (hasCompBoundaryBefore(c, norm16)) {

2189

return p;

2190

}

2191

}

2192

return p;

2193

}

2194

2195

const char16_t *Normalizer2Impl::findNextCompBoundary(const char16_t *p, const char16_t *limit,

2196

UBool onlyContiguous) const {

2197

while (p != limit) {

2198

const char16_t *codePointStart = p;

2199

UChar32 c;

2200

uint16_t norm16;

2201

UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16)do { (c) = *(p)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (p
) != (limit) && (((__c2 = *(p))&0xfffffc00)==0xdc00
)) { ++(p); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false);

2202

if (hasCompBoundaryBefore(c, norm16)) {

2203

return codePointStart;

2204

}

2205

if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {

2206

return p;

2207

}

2208

}

2209

return p;

2210

}

2211

2212

/**
 * Returns the low byte of the FCD16 value of the code point that ends at p
 * in the UTF-16 text [start, p[, or 0 if that range is empty.
 */
uint8_t Normalizer2Impl::getPreviousTrailCC(const char16_t *start, const char16_t *p) const {
    if (p == start) {
        return 0;
    }
    int32_t index = static_cast<int32_t>(p - start);
    UChar32 prev;
    // Reads the code point before index, combining a surrogate pair if present.
    U16_PREV(start, 0, index, prev);
    const uint16_t prevFCD16 = getFCD16(prev);
    return static_cast<uint8_t>(prevFCD16);
}

2221

2222

/**
 * Returns the low byte of the FCD16 value of the code point that ends at p
 * in the UTF-8 text [start, p[, or 0 if that range is empty.
 */
uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
    if (p == start) {
        return 0;
    }
    int32_t index = static_cast<int32_t>(p - start);
    UChar32 prev;
    // Reads the code point before index from the UTF-8 bytes.
    U8_PREV(start, 0, index, prev);
    const uint16_t prevFCD16 = getFCD16(prev);
    return static_cast<uint8_t>(prevFCD16);
}

2231

2232

// Note: normalizer2impl.cpp r30982 (2011-nov-27)

2233

// still had getFCDTrie() which built and cached an FCD trie.

2234

// That provided faster access to FCD data than getFCD16FromNormData()

2235

// but required synchronization and consumed some 10kB of heap memory

2236

// in any process that uses FCD (e.g., via collation).

2237

// minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,

2238

// at least for ASCII & CJK.

2239

2240

// Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this

2241

// function on Windows ARM64. As a work-around, we disable optimizations for this function.

2242

// This work-around could/should be removed once the following versions of Visual Studio are no

2243

// longer supported: All versions of VS2017, and versions of VS2019 below 16.4.

2244

#if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))

2245

#pragma optimize( "", off )

2246

#endif

2247

// Gets the FCD value from the regular normalization data.

2248

uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {

2249

uint16_t norm16=getNorm16(c);

2250

if (norm16 >= limitNoNo) {

2251

if(norm16>=MIN_NORMAL_MAYBE_YES) {

2252

// combining mark

2253

norm16=getCCFromNormalYesOrMaybe(norm16);

2254

return norm16|(norm16<<8);

2255

} else if(norm16>=minMaybeYes) {

2256

return 0;

2257

} else if(norm16<minMaybeNo) { // isDecompNoAlgorithmic(norm16)

2258

uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;

2259

if (deltaTrailCC <= DELTA_TCCC_1) {

2260

return deltaTrailCC >> OFFSET_SHIFT;

2261

}

2262

// Maps to an isCompYesAndZeroCC.

2263

c=mapAlgorithmic(c, norm16);

2264

norm16=getRawNorm16(c);

2265

}

2266

}

2267

if(norm16<=minYesNo || isHangulLVT(norm16)) {

2268

// no decomposition or Hangul syllable, all zeros

2269

return 0;

2270

}

2271

// c decomposes, get everything from the variable-length extra data

2272

const uint16_t *mapping=getData(norm16);

2273

uint16_t firstUnit=*mapping;

2274

norm16=firstUnit>>8; // tccc

2275

if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {

2276

norm16|=*(mapping-1)&0xff00; // lccc

2277

}

2278

return norm16;

2279

}

2280

#if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))

2281

#pragma optimize( "", on )

2282

#endif

2283

2284

/**
 * Computes the FCD value (lccc in the upper byte, tccc in the lower byte)
 * directly from a norm16 value that must be at least minMaybeNo.
 */
uint16_t Normalizer2Impl::getFCD16FromMaybeOrNonZeroCC(uint16_t norm16) const {
    U_ASSERT(norm16 >= minMaybeNo);
    if (norm16 >= MIN_NORMAL_MAYBE_YES) {
        // combining mark: lccc == tccc == cc
        const uint16_t cc = getCCFromNormalYesOrMaybe(norm16);
        return static_cast<uint16_t>(cc | (cc << 8));
    }
    if (norm16 >= minMaybeYes) {
        return 0;
    }
    // c decomposes, get everything from the variable-length extra data
    const uint16_t *mapping = getDataForMaybe(norm16);
    const uint16_t firstUnit = *mapping;
    // maybeNo has lccc = 0
    U_ASSERT((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0 || (*(mapping - 1) & 0xff00) == 0);
    return static_cast<uint16_t>(firstUnit >> 8);  // tccc
}

2300

2301

// Dual functionality:

2302

// buffer!=nullptr: normalize

2303

// buffer==nullptr: isNormalized/quickCheck/spanQuickCheckYes

2304

const char16_t *

2305

Normalizer2Impl::makeFCD(const char16_t *src, const char16_t *limit,

2306

ReorderingBuffer *buffer,

2307

UErrorCode &errorCode) const {

2308

// Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.

2309

// Similar to the prevBoundary in the compose() implementation.

2310

const char16_t *prevBoundary=src;

2311

int32_t prevFCD16=0;

2312

if(limit==nullptr) {

2313

src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);

2314

if(U_FAILURE(errorCode)) {

2315

return src;

2316

}

2317

if(prevBoundary<src) {

2318

prevBoundary=src;

2319

// We know that the previous character's lccc==0.

2320

// Fetching the fcd16 value was deferred for this below-U+0300 code point.

2321

prevFCD16=getFCD16(*(src-1));

2322

if(prevFCD16>1) {

2323

--prevBoundary;

2324

}

2325

}

2326

limit=u_strchru_strchr_77(src, 0);

2327

}

2328

2329

// Note: In this function we use buffer->appendZeroCC() because we track

2330

// the lead and trail combining classes here, rather than leaving it to

2331

// the ReorderingBuffer.

2332

// The exception is the call to decomposeShort() which uses the buffer

2333

// in the normal way.

2334

2335

const char16_t *prevSrc;

2336

UChar32 c=0;

2337

uint16_t fcd16=0;

2338

2339

for(;;) {

2340

// count code units with lccc==0

2341

for(prevSrc=src; src!=limit;) {

2342

if((c=*src)<minLcccCP) {

2343

prevFCD16=~c;

2344

++src;

2345

} else if(!singleLeadMightHaveNonZeroFCD16(c)) {

2346

prevFCD16=0;

2347

++src;

2348

} else {

2349

if(U16_IS_LEAD(c)(((c)&0xfffffc00)==0xd800)) {

2350

char16_t c2;

2351

if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])(((c2=src[1])&0xfffffc00)==0xdc00)) {

2352

c=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000));

2353

}

2354

}

2355

if((fcd16=getFCD16FromNormData(c))<=0xff) {

2356

prevFCD16=fcd16;

2357

src+=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);

2358

} else {

2359

break;

2360

}

2361

}

2362

}

2363

// copy these code units all at once

2364

if(src!=prevSrc) {

2365

if(buffer!=nullptr && !buffer->appendZeroCC(prevSrc, src, errorCode)) {

2366

break;

2367

}

2368

if(src==limit) {

2369

break;

2370

}

2371

prevBoundary=src;

2372

// We know that the previous character's lccc==0.

2373

if(prevFCD16<0) {

2374

// Fetching the fcd16 value was deferred for this below-minLcccCP code point.

2375

UChar32 prev=~prevFCD16;

2376

if(prev<minDecompNoCP) {

2377

prevFCD16=0;

2378

} else {

2379

prevFCD16=getFCD16FromNormData(prev);

2380

if(prevFCD16>1) {

2381

--prevBoundary;

2382

}

2383

}

2384

} else {

2385

const char16_t *p=src-1;

2386

if(U16_IS_TRAIL(*p)(((*p)&0xfffffc00)==0xdc00) && prevSrc<p && U16_IS_LEAD(*(p-1))(((*(p-1))&0xfffffc00)==0xd800)) {

2387

--p;

2388

// Need to fetch the previous character's FCD value because

2389

// prevFCD16 was just for the trail surrogate code point.

2390

prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])(((UChar32)(p[0])<<10UL)+(UChar32)(p[1])-((0xd800<<
10UL)+0xdc00-0x10000)));

2391

// Still known to have lccc==0 because its lead surrogate unit had lccc==0.

2392

}

2393

if(prevFCD16>1) {

2394

prevBoundary=p;

2395

}

2396

}

2397

// The start of the current character (c).

2398

prevSrc=src;

2399

} else if(src==limit) {

2400

break;

2401

}

2402

2403

src+=U16_LENGTH(c)((uint32_t)(c)<=0xffff ? 1 : 2);

2404

// The current character (c) at [prevSrc..src[ has a non-zero lead combining class.

2405

// Check for proper order, and decompose locally if necessary.

2406

if((prevFCD16&0xff)<=(fcd16>>8)) {

2407

// proper order: prev tccc <= current lccc

2408

if((fcd16&0xff)<=1) {

2409

prevBoundary=src;

2410

}

2411

if(buffer!=nullptr && !buffer->appendZeroCC(c, errorCode)) {

2412

break;

2413

}

2414

prevFCD16=fcd16;

2415

continue;

2416

} else if(buffer==nullptr) {

2417

return prevBoundary; // quick check "no"

2418

} else {

2419

/*

2420

* Back out the part of the source that we copied or appended

2421

* already but is now going to be decomposed.

2422

* prevSrc is set to after what was copied/appended.

2423

*/

2424

buffer->removeSuffix(static_cast<int32_t>(prevSrc - prevBoundary));

2425

/*

2426

* Find the part of the source that needs to be decomposed,

2427

* up to the next safe boundary.

2428

*/

2429

src=findNextFCDBoundary(src, limit);

2430

/*

2431

* The source text does not fulfill the conditions for FCD.

2432

* Decompose and reorder a limited piece of the text.

2433

*/

2434

decomposeShort(prevBoundary, src, false, false, *buffer, errorCode);

2435

if (U_FAILURE(errorCode)) {

2436

break;

2437

}

2438

prevBoundary=src;

2439

prevFCD16=0;

2440

}

2441

}

2442

return src;

2443

}

2444

2445

void Normalizer2Impl::makeFCDAndAppend(const char16_t *src, const char16_t *limit,

2446

UBool doMakeFCD,

2447

UnicodeString &safeMiddle,

2448

ReorderingBuffer &buffer,

2449

UErrorCode &errorCode) const {

2450

if(!buffer.isEmpty()) {

2451

const char16_t *firstBoundaryInSrc=findNextFCDBoundary(src, limit);

2452

if(src!=firstBoundaryInSrc) {

2453

const char16_t *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),

2454

buffer.getLimit());

2455

int32_t destSuffixLength = static_cast<int32_t>(buffer.getLimit() - lastBoundaryInDest);

2456

UnicodeString middle(lastBoundaryInDest, destSuffixLength);

2457

buffer.removeSuffix(destSuffixLength);

2458

safeMiddle=middle;

2459

middle.append(src, static_cast<int32_t>(firstBoundaryInSrc - src));

2460

const char16_t *middleStart=middle.getBuffer();

2461

makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);

2462

if(U_FAILURE(errorCode)) {

2463

return;

2464

}

2465

src=firstBoundaryInSrc;

2466

}

2467

}

2468

if(doMakeFCD) {

2469

makeFCD(src, limit, &buffer, errorCode);

2470

} else {

2471

if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr

2472

limit=u_strchru_strchr_77(src, 0);

2473

}

2474

buffer.appendZeroCC(src, limit, errorCode);

2475

}

2476

}

2477

2478

const char16_t *Normalizer2Impl::findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const {

2479

while(start<p) {

2480

const char16_t *codePointLimit = p;

2481

UChar32 c;

2482

uint16_t norm16;

2483

UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16)do { (c) = *--(p); int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)!=0) && (p
) != (start) && (((__c2 = *((p) - 1))&0xfffffc00)
==0xd800)) { --(p); (c) = (((UChar32)(__c2)<<10UL)+(UChar32
)((c))-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c)
>= (normTrie)->highStart ? (normTrie)->dataLength -
UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_77
(normTrie, c)); } else { __index = (normTrie)->dataLength -
UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; } } (norm16) = ((normTrie
)->data.ptr16[__index]); } while (false);

2484

if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) {

2485

return codePointLimit;

2486

}

2487

if (norm16HasDecompBoundaryBefore(norm16)) {

2488

return p;

2489

}

2490

}

2491

return p;

2492

}

2493

2494

const char16_t *Normalizer2Impl::findNextFCDBoundary(const char16_t *p, const char16_t *limit) const {

2495

while(p<limit) {

2496

const char16_t *codePointStart=p;

2497

UChar32 c;

2498

uint16_t norm16;

2499

UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16)do { (c) = *(p)++; int32_t __index; if (!(((c)&0xfffff800
)==0xd800)) { __index = ((int32_t)(normTrie)->index[(c) >>
UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK)); }
else { uint16_t __c2; if ((((c)&0x400)==0) && (p
) != (limit) && (((__c2 = *(p))&0xfffffc00)==0xdc00
)) { ++(p); (c) = (((UChar32)((c))<<10UL)+(UChar32)(__c2
)-((0xd800<<10UL)+0xdc00-0x10000)); __index = ((c) >=
(normTrie)->highStart ? (normTrie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
: ucptrie_internalSmallIndex_77(normTrie, c)); } else { __index
= (normTrie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET
; } } (norm16) = ((normTrie)->data.ptr16[__index]); } while
(false);

2500

if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16)) {

2501

return codePointStart;

2502

}

2503

if (norm16HasDecompBoundaryAfter(norm16)) {

2504

return p;

2505

}

2506

}

2507

return p;

2508

}

2509

2510

// CanonicalIterator data -------------------------------------------------- ***

2511

2512

/**
 * Opens an empty mutable code point trie (initial and error value 0),
 * leaves the immutable trie unset, and creates the vector of start sets
 * with uprv_deleteUObject as its element deleter.
 * Failures are reported through errorCode.
 */
CanonIterData::CanonIterData(UErrorCode &errorCode)
        : mutableTrie(umutablecptrie_open(0, 0, &errorCode)),
          trie(nullptr),
          canonStartSets(uprv_deleteUObject, nullptr, errorCode) {
}

2515

2516

/** Closes both the mutable trie and the immutable trie. */
CanonIterData::~CanonIterData() {
    // doInit() resets mutableTrie to nullptr after closing it, so this may be a null close.
    umutablecptrie_close(mutableTrie);
    ucptrie_close(trie);
}

2520

2521

/**
 * Records that the decomposition of origin starts with decompLead.
 * The first non-zero origin is stored inline in the trie value of decompLead.
 * Any further origin (or U+0000) goes into a UnicodeSet kept in canonStartSets,
 * whose index is then stored in the trie value together with CANON_HAS_SET.
 */
void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
    uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead);
    if ((canonValue & (CANON_HAS_SET | CANON_VALUE_MASK)) == 0 && origin != 0) {
        // origin is the first character whose decomposition starts with
        // the character for which we are setting the value.
        umutablecptrie_set(mutableTrie, decompLead, canonValue | origin, &errorCode);
        return;
    }
    // origin is not the first character, or it is U+0000.
    UnicodeSet *startSet;
    if ((canonValue & CANON_HAS_SET) != 0) {
        const int32_t setIndex = static_cast<int32_t>(canonValue & CANON_VALUE_MASK);
        startSet = static_cast<UnicodeSet *>(canonStartSets[setIndex]);
    } else {
        LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode);
        startSet = lpSet.getAlias();
        if (U_FAILURE(errorCode)) {
            return;
        }
        // Replace the inline origin with the index of the new set.
        const UChar32 firstOrigin = static_cast<UChar32>(canonValue & CANON_VALUE_MASK);
        canonValue = (canonValue & ~CANON_VALUE_MASK) | CANON_HAS_SET |
                     static_cast<uint32_t>(canonStartSets.size());
        umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
        canonStartSets.adoptElement(lpSet.orphan(), errorCode);
        if (U_FAILURE(errorCode)) {
            return;
        }
        if (firstOrigin != 0) {
            startSet->add(firstOrigin);
        }
    }
    startSet->add(origin);
}

2552

2553

// C++ class for friend access to private Normalizer2Impl members.

2554

class InitCanonIterData {

2555

public:

2556

static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);

2557

};

2558

2559

U_CDECL_BEGINextern "C" {

2560

2561

// UInitOnce instantiation function for CanonIterData

2562

static void U_CALLCONV

2563

initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {

2564

InitCanonIterData::doInit(impl, errorCode);

2565

}

2566

2567

U_CDECL_END}

2568

2569

/**
 * Creates impl->fCanonIterData: enumerates the ranges of the normalization
 * trie, feeds each non-inert range to makeCanonIterDataFromNorm16(),
 * and then builds the immutable trie from the mutable one.
 * On any failure the partially built data is deleted and the pointer reset.
 */
void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
    U_ASSERT(impl->fCanonIterData == nullptr);
    impl->fCanonIterData = new CanonIterData(errorCode);
    if (impl->fCanonIterData == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_SUCCESS(errorCode)) {
        UChar32 rangeStart = 0;
        for (;;) {
            uint32_t value;
            const UChar32 rangeEnd = ucptrie_getRange(
                impl->normTrie, rangeStart,
                UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
                nullptr, nullptr, &value);
            if (rangeEnd < 0) {
                break;  // no more ranges
            }
            // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
            if (value != Normalizer2Impl::INERT) {
                impl->makeCanonIterDataFromNorm16(
                    rangeStart, rangeEnd, value, *impl->fCanonIterData, errorCode);
            }
            rangeStart = rangeEnd + 1;
        }
#ifdef UCPTRIE_DEBUG
        umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");
#endif
        impl->fCanonIterData->trie = umutablecptrie_buildImmutable(
            impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);
        // The mutable trie is no longer needed once the immutable one has been built.
        umutablecptrie_close(impl->fCanonIterData->mutableTrie);
        impl->fCanonIterData->mutableTrie = nullptr;
    }
    if (U_FAILURE(errorCode)) {
        delete impl->fCanonIterData;
        impl->fCanonIterData = nullptr;
    }
}

2600

2601

void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,

2602

CanonIterData &newData,

2603

UErrorCode &errorCode) const {

2604

if(isInert(norm16) ||

2605

(minYesNo<=norm16 && norm16<minNoNo) ||

2606

(minMaybeNo<=norm16 && norm16<minMaybeYes)) {

2607

// Inert, or 2-way mapping (including Hangul syllable).

2608

// We do not write a canonStartSet for any yesNo/maybeNo character.

2609

// Composites from 2-way mappings are added at runtime from the

2610

// starter's compositions list, and the other characters in

2611

// 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are

2612

// "maybe" characters.

2613

return;

2614

}

2615

for(UChar32 c=start; c<=end; ++c) {

2616

uint32_t oldValue = umutablecptrie_getumutablecptrie_get_77(newData.mutableTrie, c);

2617

uint32_t newValue=oldValue;

2618

if(isMaybeYesOrNonZeroCC(norm16)) {

2619

// not a segment starter if it occurs in a decomposition or has cc!=0

2620

newValue|=CANON_NOT_SEGMENT_STARTER0x80000000;

2621

if(norm16<MIN_NORMAL_MAYBE_YES) {

2622

newValue|=CANON_HAS_COMPOSITIONS0x40000000;

2623

}

2624

} else if(norm16<minYesNo) {

2625

newValue|=CANON_HAS_COMPOSITIONS0x40000000;

2626

} else {

2627

// c has a one-way decomposition

2628

UChar32 c2=c;

2629

// Do not modify the whole-range norm16 value.

2630

uint16_t norm16_2=norm16;

2631

if (isDecompNoAlgorithmic(norm16_2)) {

2632

// Maps to an isCompYesAndZeroCC.

2633

c2 = mapAlgorithmic(c2, norm16_2);

2634

norm16_2 = getRawNorm16(c2);

2635

// No compatibility mappings for the CanonicalIterator.

2636

U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)))(static_cast <bool> (!(isHangulLV(norm16_2) || isHangulLVT
(norm16_2))) ? void (0) : __assert_fail ("!(isHangulLV(norm16_2) || isHangulLVT(norm16_2))"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
));

2637

}

2638

if (norm16_2 > minYesNo) {

2639

// c decomposes, get everything from the variable-length extra data

2640

const uint16_t *mapping=getDataForYesOrNo(norm16_2);

2641

uint16_t firstUnit=*mapping;

2642

int32_t length=firstUnit&MAPPING_LENGTH_MASK;

2643

if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {

2644

if(c==c2 && (*(mapping-1)&0xff)!=0) {

2645

newValue|=CANON_NOT_SEGMENT_STARTER0x80000000; // original c has cc!=0

2646

}

2647

}

2648

// Skip empty mappings (no characters in the decomposition).

2649

if(length!=0) {

2650

++mapping; // skip over the firstUnit

2651

// add c to first code point's start set

2652

int32_t i=0;

2653

U16_NEXT_UNSAFE(mapping, i, c2)do { (c2)=(mapping)[(i)++]; if((((c2)&0xfffffc00)==0xd800
)) { (c2)=(((UChar32)((c2))<<10UL)+(UChar32)((mapping)[
(i)++])-((0xd800<<10UL)+0xdc00-0x10000)); } } while (false
);

2654

newData.addToStartSet(c, c2, errorCode);

2655

// Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a

2656

// one-way mapping. A 2-way mapping is possible here after

2657

// intermediate algorithmic mapping.

2658

if(norm16_2>=minNoNo) {

2659

while(i<length) {

2660

U16_NEXT_UNSAFE(mapping, i, c2)do { (c2)=(mapping)[(i)++]; if((((c2)&0xfffffc00)==0xd800
)) { (c2)=(((UChar32)((c2))<<10UL)+(UChar32)((mapping)[
(i)++])-((0xd800<<10UL)+0xdc00-0x10000)); } } while (false
);

2661

uint32_t c2Value = umutablecptrie_getumutablecptrie_get_77(newData.mutableTrie, c2);

2662

if((c2Value&CANON_NOT_SEGMENT_STARTER0x80000000)==0) {

2663

umutablecptrie_setumutablecptrie_set_77(newData.mutableTrie, c2,

2664

c2Value|CANON_NOT_SEGMENT_STARTER0x80000000, &errorCode);

2665

}

2666

}

2667

}

2668

}

2669

} else {

2670

// c decomposed to c2 algorithmically; c has cc==0

2671

newData.addToStartSet(c, c2, errorCode);

2672

}

2673

}

2674

if(newValue!=oldValue) {

2675

umutablecptrie_setumutablecptrie_set_77(newData.mutableTrie, c, newValue, &errorCode);

2676

}

2677

}

2678

}

2679

2680

/**
 * Makes sure that the CanonicalIterator data has been built,
 * running initCanonIterData() through umtx_initOnce().
 * @return true if errorCode indicates success afterwards
 */
UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
    // Logically const: the data is instantiated lazily via the init-once object.
    Normalizer2Impl *mutableThis = const_cast<Normalizer2Impl *>(this);
    umtx_initOnce(mutableThis->fCanonIterDataInitOnce, &initCanonIterData, mutableThis, errorCode);
    return U_SUCCESS(errorCode);
}

2686

2687

/**
 * Returns the value stored for c in the CanonIterData trie, as a signed integer.
 * Dereferences fCanonIterData without checking it for nullptr.
 */
int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
    const uint32_t trieValue = ucptrie_get(fCanonIterData->trie, c);
    return static_cast<int32_t>(trieValue);
}

2690

2691

/**
 * Returns the n-th start set stored in the CanonIterData.
 * Dereferences fCanonIterData without checking it for nullptr.
 */
const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
    const UnicodeSet *startSet = static_cast<const UnicodeSet *>(fCanonIterData->canonStartSets[n]);
    return *startSet;
}

2694

2695

/**
 * Returns true if the canon value of c is non-negative, that is,
 * if its top bit (CANON_NOT_SEGMENT_STARTER) is not set.
 */
UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
    const int32_t canonValue = getCanonValue(c);
    return canonValue >= 0;
}

2698

2699

/**
 * Fills set with the start-set characters recorded for c in the
 * CanonIterData, plus its composites if it has compositions.
 *
 * @return false, leaving set untouched, if the canon value of c is 0 apart
 *         from the CANON_NOT_SEGMENT_STARTER flag; otherwise true
 */
UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
    const int32_t canonValue = getCanonValue(c) & ~CANON_NOT_SEGMENT_STARTER;
    if (canonValue == 0) {
        return false;
    }
    set.clear();
    // The low bits hold either a set index or a single inline character.
    const int32_t value = canonValue & CANON_VALUE_MASK;
    if ((canonValue & CANON_HAS_SET) != 0) {
        set.addAll(getCanonStartSet(value));
    } else if (value != 0) {
        set.add(value);
    }
    if ((canonValue & CANON_HAS_COMPOSITIONS) == 0) {
        return true;
    }
    const uint16_t norm16 = getRawNorm16(c);
    if (norm16 == JAMO_L) {
        // Add the contiguous range of JAMO_VT_COUNT syllables computed from this Jamo L.
        const UChar32 syllable =
            static_cast<UChar32>(Hangul::HANGUL_BASE + (c - Hangul::JAMO_L_BASE) * Hangul::JAMO_VT_COUNT);
        set.add(syllable, syllable + Hangul::JAMO_VT_COUNT - 1);
    } else {
        addComposites(getCompositionsList(norm16), set);
    }
    return true;
}

2723

2724

U_NAMESPACE_END}

2725

2726

// Normalizer2 data swapping ----------------------------------------------- ***

2727

2728

U_NAMESPACE_USEusing namespace icu_77;

2729

2730

U_CAPIextern "C" int32_t U_EXPORT2

2731

unorm2_swapunorm2_swap_77(const UDataSwapper *ds,

2732

const void *inData, int32_t length, void *outData,

2733

UErrorCode *pErrorCode) {

2734

const UDataInfo *pInfo;

2735

int32_t headerSize;

2736

2737

const uint8_t *inBytes;

2738

uint8_t *outBytes;

2739

2740

const int32_t *inIndexes;

2741

int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];

2742

2743

int32_t i, offset, nextOffset, size;

2744

2745

/* udata_swapDataHeader checks the arguments */

2746

headerSize=udata_swapDataHeaderudata_swapDataHeader_77(ds, inData, length, outData, pErrorCode);

2747

if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {

2748

return 0;

2749

}

2750

2751

/* check data format and format version */

2752

pInfo=(const UDataInfo *)((const char *)inData+4);

2753

uint8_t formatVersion0=pInfo->formatVersion[0];

2754

if(!(

2755

pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */

2756

pInfo->dataFormat[1]==0x72 &&

2757

pInfo->dataFormat[2]==0x6d &&

2758

pInfo->dataFormat[3]==0x32 &&

2759

(1<=formatVersion0 && formatVersion0<=5)

2760

)) {

2761

udata_printErrorudata_printError_77(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",

2762

pInfo->dataFormat[0], pInfo->dataFormat[1],

2763

pInfo->dataFormat[2], pInfo->dataFormat[3],

2764

pInfo->formatVersion[0]);

2765

*pErrorCode=U_UNSUPPORTED_ERROR;

2766

return 0;

2767

}

2768

2769

inBytes=(const uint8_t *)inData+headerSize;

2770

outBytes=(outData == nullptr) ? nullptr : (uint8_t *)outData+headerSize;

2771

2772

inIndexes=(const int32_t *)inBytes;

2773

int32_t minIndexesLength;

2774

if(formatVersion0==1) {

2775

minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;

2776

} else if(formatVersion0==2) {

2777

minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;

2778

} else if(formatVersion0<=4) {

2779

minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;

2780

} else {

2781

minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD+1;

2782

}

2783

2784

if(length>=0) {

2785

length-=headerSize;

2786

if(length<minIndexesLength*4) {

2787

udata_printErrorudata_printError_77(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",

2788

length);

2789

*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

2790

return 0;

2791

}

2792

}

2793

2794

/* read the first few indexes */

2795

for(i=0; i<UPRV_LENGTHOF(indexes)(int32_t)(sizeof(indexes)/sizeof((indexes)[0])); ++i) {

2796

indexes[i]=udata_readInt32udata_readInt32_77(ds, inIndexes[i]);

2797

}

2798

2799

/* get the total length of the data */

2800

size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];

2801

2802

if(length>=0) {

2803

if(length<size) {

2804

udata_printErrorudata_printError_77(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",

2805

length);

2806

*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

2807

return 0;

2808

}

2809

2810

/* copy the data for inaccessible bytes */

2811

if(inBytes!=outBytes) {

2812

uprv_memcpy(outBytes, inBytes, size)do { clang diagnostic push clang diagnostic ignored "-Waddress"
(static_cast <bool> (outBytes != __null) ? void (0) :
__assert_fail ("outBytes != __null", __builtin_FILE (), __builtin_LINE
(), __extension__ __PRETTY_FUNCTION__)); (static_cast <bool
> (inBytes != __null) ? void (0) : __assert_fail ("inBytes != __null"
, __builtin_FILE (), __builtin_LINE (), __extension__ __PRETTY_FUNCTION__
)); clang diagnostic pop :: memcpy(outBytes, inBytes, size);
} while (false);

2813

}

2814

2815

offset=0;

2816

2817

/* swap the int32_t indexes[] */

2818

nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];

2819

ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);

2820

offset=nextOffset;

2821

2822

/* swap the trie */

2823

nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];

2824

utrie_swapAnyVersionutrie_swapAnyVersion_77(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);

2825

offset=nextOffset;

2826

2827

/* swap the uint16_t extraData[] */

2828

nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];

2829

ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);

2830

offset=nextOffset;

2831

2832

/* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */

2833

nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];

2834

offset=nextOffset;

2835

2836

U_ASSERT(offset==size)(static_cast <bool> (offset==size) ? void (0) : __assert_fail
("offset==size", __builtin_FILE (), __builtin_LINE (), __extension__
__PRETTY_FUNCTION__));

2837

}

2838

2839

return headerSize+size;

2840

}

2841

2842

#endif // !UCONFIG_NO_NORMALIZATION

File:	root/firefox-clang/intl/icu/source/common/normalizer2impl.cpp
Warning:	line 2830, column 9 Value stored to 'offset' is never read

Bug Summary

Annotated Source Code