Bug Summary

File:root/firefox-clang/intl/icu/source/i18n/nfrule.cpp
Warning:line 500, column 13
Value stored to 'subEnd' during its initialization is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name nfrule.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -ffp-contract=off -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/i18n -fcoverage-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/i18n -resource-dir /usr/lib/llvm-21/lib/clang/21 -include /root/firefox-clang/config/gcc_hidden.h -include /root/firefox-clang/obj-x86_64-pc-linux-gnu/mozilla-config.h -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/system_wrappers -U _FORTIFY_SOURCE -D _FORTIFY_SOURCE=2 -D _GLIBCXX_ASSERTIONS -D DEBUG=1 -D U_I18N_IMPLEMENTATION -D _LIBCPP_DISABLE_DEPRECATION_WARNINGS -D U_USING_ICU_NAMESPACE=0 -D U_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -D U_HIDE_OBSOLETE_UTF_OLD_H=1 -D UCONFIG_NO_LEGACY_CONVERSION -D UCONFIG_NO_TRANSLITERATION -D UCONFIG_NO_REGULAR_EXPRESSIONS -D UCONFIG_NO_BREAK_ITERATION -D UCONFIG_NO_IDNA -D UCONFIG_NO_MF2 -D U_CHARSET_IS_UTF8 -D UNISTR_FROM_CHAR_EXPLICIT=explicit -D UNISTR_FROM_STRING_EXPLICIT=explicit -D U_ENABLE_DYLOAD=0 -D U_DEBUG=1 -I 
/root/firefox-clang/config/external/icu/i18n -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/config/external/icu/i18n -I /root/firefox-clang/intl/icu/source/common -I /root/firefox-clang/mfbt/double-conversion -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nspr -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nss -D MOZILLA_CLIENT -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/x86_64-linux-gnu/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14/backward -internal-isystem /usr/lib/llvm-21/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-error=pessimizing-move -Wno-error=large-by-value-copy=128 -Wno-error=implicit-int-float-conversion -Wno-error=thread-safety-analysis -Wno-error=tautological-type-limit-compare -Wno-invalid-offsetof -Wno-range-loop-analysis -Wno-deprecated-anon-enum-enum-conversion -Wno-deprecated-enum-enum-conversion -Wno-deprecated-this-capture -Wno-inline-new-delete -Wno-error=deprecated-declarations -Wno-error=array-bounds -Wno-error=free-nonheap-object -Wno-error=atomic-alignment -Wno-error=deprecated-builtins -Wno-psabi -Wno-error=builtin-macro-redefined -Wno-vla-cxx-extension -Wno-unknown-warning-option -Wno-comma -Wno-implicit-const-int-float-conversion -Wno-macro-redefined -Wno-microsoft-include -Wno-tautological-unsigned-enum-zero-compare -Wno-unreachable-code-loop-increment -Wno-unreachable-code-return -fdeprecated-macro -ferror-limit 19 -fstrict-flex-arrays=1 -stack-protector 2 -fstack-clash-protection -ftrivial-auto-var-init=pattern -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf 
-fno-sized-deallocation -fno-aligned-allocation -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2025-06-27-100320-3286336-1 -x c++ /root/firefox-clang/intl/icu/source/i18n/nfrule.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5* Copyright (C) 1997-2015, International Business Machines
6* Corporation and others. All Rights Reserved.
7******************************************************************************
8* file name: nfrule.cpp
9* encoding: UTF-8
10* tab size: 8 (not used)
11* indentation:4
12*
13* Modification history
14* Date Name Comments
15* 10/11/2001 Doug Ported from ICU4J
16*/
17
18#include "nfrule.h"
19
20#if U_HAVE_RBNF1
21
22#include "unicode/localpointer.h"
23#include "unicode/rbnf.h"
24#include "unicode/tblcoll.h"
25#include "unicode/plurfmt.h"
26#include "unicode/upluralrules.h"
27#include "unicode/coleitr.h"
28#include "unicode/uchar.h"
29#include "nfrs.h"
30#include "nfrlist.h"
31#include "nfsubs.h"
32#include "patternprops.h"
33#include "putilimp.h"
34
35U_NAMESPACE_BEGINnamespace icu_77 {
36
37NFRule::NFRule(const RuleBasedNumberFormat* _rbnf, const UnicodeString &_ruleText, UErrorCode &status)
38 : baseValue(static_cast<int32_t>(0))
39 , radix(10)
40 , exponent(0)
41 , decimalPoint(0)
42 , fRuleText(_ruleText)
43 , sub1(nullptr)
44 , sub2(nullptr)
45 , formatter(_rbnf)
46 , rulePatternFormat(nullptr)
47{
48 if (!fRuleText.isEmpty()) {
49 parseRuleDescriptor(fRuleText, status);
50 }
51}
52
53NFRule::~NFRule()
54{
55 if (sub1 != sub2) {
56 delete sub2;
57 sub2 = nullptr;
58 }
59 delete sub1;
60 sub1 = nullptr;
61 delete rulePatternFormat;
62 rulePatternFormat = nullptr;
63}
64
// Single UTF-16 code units used while parsing and printing rule text.
static const char16_t gLeftBracket = u'[';
static const char16_t gRightBracket = u']';
static const char16_t gVerticalLine = u'|';
static const char16_t gColon = u':';
static const char16_t gZero = u'0';
static const char16_t gNine = u'9';
static const char16_t gSpace = u' ';
static const char16_t gSlash = u'/';
static const char16_t gGreaterThan = u'>';
static const char16_t gLessThan = u'<';
static const char16_t gComma = u',';
static const char16_t gDot = u'.';
static const char16_t gTick = u'\'';
//static const char16_t gMinus = u'-';
static const char16_t gSemicolon = u';';
static const char16_t gX = u'x';

// NUL-terminated descriptor tokens for the special rules.
static const char16_t gMinusX[] = u"-x";
static const char16_t gInf[] = u"Inf";
static const char16_t gNaN[] = u"NaN";

// Delimiters of an embedded plural-format section.
static const char16_t gDollarOpenParenthesis[] = u"$(";
static const char16_t gClosedParenthesisDollar[] = u")$";

// Two-character openers of substitution tokens, plus the ">>>" token.
static const char16_t gLessLess[] = u"<<";
static const char16_t gLessPercent[] = u"<%";
static const char16_t gLessHash[] = u"<#";
static const char16_t gLessZero[] = u"<0";
static const char16_t gGreaterGreater[] = u">>";
static const char16_t gGreaterPercent[] = u">%";
static const char16_t gGreaterHash[] = u">#";
static const char16_t gGreaterZero[] = u">0";
static const char16_t gEqualPercent[] = u"=%";
static const char16_t gEqualHash[] = u"=#";
static const char16_t gEqualZero[] = u"=0";
static const char16_t gGreaterGreaterGreater[] = u">>>";

// nullptr-terminated table walked by NFRule::indexOfAnyRulePrefix().
static const char16_t * const RULE_PREFIXES[] = {
    gLessLess, gLessPercent, gLessHash, gLessZero,
    gGreaterGreater, gGreaterPercent, gGreaterHash, gGreaterZero,
    gEqualPercent, gEqualHash, gEqualZero, nullptr
};
107
108void
109NFRule::makeRules(UnicodeString& description,
110 NFRuleSet *owner,
111 const NFRule *predecessor,
112 const RuleBasedNumberFormat *rbnf,
113 NFRuleList& rules,
114 UErrorCode& status)
115{
116 // we know we're making at least one rule, so go ahead and
117 // new it up and initialize its basevalue and divisor
118 // (this also strips the rule descriptor, if any, off the
119 // description string)
120 LocalPointer<NFRule> rule1(new NFRule(rbnf, description, status));
121 /* test for nullptr */
122 if (rule1.isNull()) {
123 status = U_MEMORY_ALLOCATION_ERROR;
124 return;
125 }
126 description = rule1->fRuleText;
127
128 // check the description to see whether there's text enclosed
129 // in brackets
130 int32_t brack1 = description.indexOf(gLeftBracket);
131 int32_t brack2 = brack1 < 0 ? -1 : description.indexOf(gRightBracket);
132
133 // if the description doesn't contain a matched pair of brackets,
134 // or if it's of a type that doesn't recognize bracketed text,
135 // then leave the description alone, initialize the rule's
136 // rule text and substitutions, and return that rule
137 if (brack2 < 0 || brack1 > brack2
138 || rule1->getType() == kProperFractionRule
139 || rule1->getType() == kNegativeNumberRule
140 || rule1->getType() == kInfinityRule
141 || rule1->getType() == kNaNRule)
142 {
143 rule1->extractSubstitutions(owner, description, predecessor, status);
144 }
145 else {
146 // if the description does contain a matched pair of brackets,
147 // then it's really shorthand for two rules (with one exception)
148 LocalPointer<NFRule> rule2;
149 UnicodeString sbuf;
150 int32_t orElseOp = description.indexOf(gVerticalLine);
151
152 // we'll actually only split the rule into two rules if its
153 // base value is an even multiple of its divisor (or it's one
154 // of the special rules)
155 if ((rule1->baseValue > 0
156 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0)
157 || rule1->getType() == kImproperFractionRule
158 || rule1->getType() == kDefaultRule) {
159
160 // if it passes that test, new up the second rule. If the
161 // rule set both rules will belong to is a fraction rule
162 // set, they both have the same base value; otherwise,
163 // increment the original rule's base value ("rule1" actually
164 // goes SECOND in the rule set's rule list)
165 rule2.adoptInstead(new NFRule(rbnf, UnicodeString(), status));
166 /* test for nullptr */
167 if (rule2.isNull()) {
168 status = U_MEMORY_ALLOCATION_ERROR;
169 return;
170 }
171 if (rule1->baseValue >= 0) {
172 rule2->baseValue = rule1->baseValue;
173 if (!owner->isFractionRuleSet()) {
174 ++rule1->baseValue;
175 }
176 }
177
178 // if the description began with "x.x" and contains bracketed
179 // text, it describes both the improper fraction rule and
180 // the proper fraction rule
181 else if (rule1->getType() == kImproperFractionRule) {
182 rule2->setType(kProperFractionRule);
183 }
184
185 // if the description began with "x.0" and contains bracketed
186 // text, it describes both the default rule and the
187 // improper fraction rule
188 else if (rule1->getType() == kDefaultRule) {
189 rule2->baseValue = rule1->baseValue;
190 rule1->setType(kImproperFractionRule);
191 }
192
193 // both rules have the same radix and exponent (i.e., the
194 // same divisor)
195 rule2->radix = rule1->radix;
196 rule2->exponent = rule1->exponent;
197
198 // By default, rule2's rule text omits the stuff in brackets,
199 // unless it contains a | between the brackets.
200 // Initialize its rule text and substitutions accordingly.
201 sbuf.append(description, 0, brack1);
202 if (orElseOp >= 0) {
203 sbuf.append(description, orElseOp + 1, brack2 - orElseOp - 1);
204 }
205 if (brack2 + 1 < description.length()) {
206 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
207 }
208 rule2->extractSubstitutions(owner, sbuf, predecessor, status);
209 }
210
211 // rule1's text includes the text in the brackets but omits
212 // the brackets themselves: initialize _its_ rule text and
213 // substitutions accordingly
214 sbuf.setTo(description, 0, brack1);
215 if (orElseOp >= 0) {
216 sbuf.append(description, brack1 + 1, orElseOp - brack1 - 1);
217 }
218 else {
219 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1);
220 }
221 if (brack2 + 1 < description.length()) {
222 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
223 }
224 rule1->extractSubstitutions(owner, sbuf, predecessor, status);
225
226 // if we only have one rule, return it; if we have two, return
227 // a two-element array containing them (notice that rule2 goes
228 // BEFORE rule1 in the list: in all cases, rule2 OMITS the
229 // material in the brackets and rule1 INCLUDES the material
230 // in the brackets)
231 if (!rule2.isNull()) {
232 if (rule2->baseValue >= kNoBase) {
233 rules.add(rule2.orphan());
234 }
235 else {
236 owner->setNonNumericalRule(rule2.orphan());
237 }
238 }
239 }
240 if (rule1->baseValue >= kNoBase) {
241 rules.add(rule1.orphan());
242 }
243 else {
244 owner->setNonNumericalRule(rule1.orphan());
245 }
246}
247
248/**
249 * This function parses the rule's rule descriptor (i.e., the base
250 * value and/or other tokens that precede the rule's rule text
251 * in the description) and sets the rule's base value, radix, and
252 * exponent according to the descriptor. (If the description doesn't
253 * include a rule descriptor, then this function sets everything to
254 * default values and the rule set sets the rule's real base value).
255 * @param description The rule's description
256 * @return If "description" included a rule descriptor, this is
257 * "description" with the descriptor and any trailing whitespace
258 * stripped off. Otherwise; it's "descriptor" unchangd.
259 */
260void
261NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status)
262{
263 // the description consists of a rule descriptor and a rule body,
264 // separated by a colon. The rule descriptor is optional. If
265 // it's omitted, just set the base value to 0.
266 int32_t p = description.indexOf(gColon);
267 if (p != -1) {
268 // copy the descriptor out into its own string and strip it,
269 // along with any trailing whitespace, out of the original
270 // description
271 UnicodeString descriptor;
272 descriptor.setTo(description, 0, p);
273
274 ++p;
275 while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) {
276 ++p;
277 }
278 description.removeBetween(0, p);
279
280 // check first to see if the rule descriptor matches the token
281 // for one of the special rules. If it does, set the base
282 // value to the correct identifier value
283 int descriptorLength = descriptor.length();
284 char16_t firstChar = descriptor.charAt(0);
285 char16_t lastChar = descriptor.charAt(descriptorLength - 1);
286 if (firstChar >= gZero && firstChar <= gNine && lastChar != gX) {
287 // if the rule descriptor begins with a digit, it's a descriptor
288 // for a normal rule
289 // since we don't have Long.parseLong, and this isn't much work anyway,
290 // just build up the value as we encounter the digits.
291 int64_t val = 0;
292 p = 0;
293 char16_t c = gSpace;
294
295 // begin parsing the descriptor: copy digits
296 // into "tempValue", skip periods, commas, and spaces,
297 // stop on a slash or > sign (or at the end of the string),
298 // and throw an exception on any other character
299 while (p < descriptorLength) {
300 c = descriptor.charAt(p);
301 if (c >= gZero && c <= gNine) {
302 int64_t digit = static_cast<int64_t>(c - gZero);
303 if ((val > 0 && val > (INT64_MAX(9223372036854775807L) - digit) / 10) ||
304 (val < 0 && val < (INT64_MIN(-9223372036854775807L -1) - digit) / 10)) {
305 // out of int64_t range
306 status = U_PARSE_ERROR;
307 return;
308 }
309 val = val * 10 + digit;
310 }
311 else if (c == gSlash || c == gGreaterThan) {
312 break;
313 }
314 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
315 }
316 else {
317 // throw new IllegalArgumentException("Illegal character in rule descriptor");
318 status = U_PARSE_ERROR;
319 return;
320 }
321 ++p;
322 }
323
324 // we have the base value, so set it
325 setBaseValue(val, status);
326
327 // if we stopped the previous loop on a slash, we're
328 // now parsing the rule's radix. Again, accumulate digits
329 // in tempValue, skip punctuation, stop on a > mark, and
330 // throw an exception on anything else
331 if (c == gSlash) {
332 val = 0;
333 ++p;
334 while (p < descriptorLength) {
335 c = descriptor.charAt(p);
336 if (c >= gZero && c <= gNine) {
337 int64_t digit = static_cast<int64_t>(c - gZero);
338 if ((val > 0 && val > (INT64_MAX(9223372036854775807L) - digit) / 10) ||
339 (val < 0 && val < (INT64_MIN(-9223372036854775807L -1) - digit) / 10)) {
340 // out of int64_t range
341 status = U_PARSE_ERROR;
342 return;
343 }
344 val = val * 10 + digit;
345 }
346 else if (c == gGreaterThan) {
347 break;
348 }
349 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
350 }
351 else {
352 // throw new IllegalArgumentException("Illegal character is rule descriptor");
353 status = U_PARSE_ERROR;
354 return;
355 }
356 ++p;
357 }
358
359 // tempValue now contain's the rule's radix. Set it
360 // accordingly, and recalculate the rule's exponent
361 radix = static_cast<int32_t>(val);
362 if (radix == 0) {
363 // throw new IllegalArgumentException("Rule can't have radix of 0");
364 status = U_PARSE_ERROR;
365 }
366
367 exponent = expectedExponent();
368 }
369
370 // if we stopped the previous loop on a > sign, then continue
371 // for as long as we still see > signs. For each one,
372 // decrement the exponent (unless the exponent is already 0).
373 // If we see another character before reaching the end of
374 // the descriptor, that's also a syntax error.
375 if (c == gGreaterThan) {
376 while (p < descriptor.length()) {
377 c = descriptor.charAt(p);
378 if (c == gGreaterThan && exponent > 0) {
379 --exponent;
380 } else {
381 // throw new IllegalArgumentException("Illegal character in rule descriptor");
382 status = U_PARSE_ERROR;
383 return;
384 }
385 ++p;
386 }
387 }
388 }
389 else if (0 == descriptor.compare(gMinusX, 2)) {
390 setType(kNegativeNumberRule);
391 }
392 else if (descriptorLength == 3) {
393 if (firstChar == gZero && lastChar == gX) {
394 setBaseValue(kProperFractionRule, status);
395 decimalPoint = descriptor.charAt(1);
396 }
397 else if (firstChar == gX && lastChar == gX) {
398 setBaseValue(kImproperFractionRule, status);
399 decimalPoint = descriptor.charAt(1);
400 }
401 else if (firstChar == gX && lastChar == gZero) {
402 setBaseValue(kDefaultRule, status);
403 decimalPoint = descriptor.charAt(1);
404 }
405 else if (descriptor.compare(gNaN, 3) == 0) {
406 setBaseValue(kNaNRule, status);
407 }
408 else if (descriptor.compare(gInf, 3) == 0) {
409 setBaseValue(kInfinityRule, status);
410 }
411 }
412 }
413 // else use the default base value for now.
414
415 // finally, if the rule body begins with an apostrophe, strip it off
416 // (this is generally used to put whitespace at the beginning of
417 // a rule's rule text)
418 if (!description.isEmpty() && description.charAt(0) == gTick) {
419 description.removeBetween(0, 1);
420 }
421
422 // return the description with all the stuff we've just waded through
423 // stripped off the front. It now contains just the rule body.
424 // return description;
425}
426
427/**
428* Searches the rule's rule text for the substitution tokens,
429* creates the substitutions, and removes the substitution tokens
430* from the rule's rule text.
431* @param owner The rule set containing this rule
432* @param predecessor The rule preseding this one in "owners" rule list
433* @param ownersOwner The RuleBasedFormat that owns this rule
434*/
435void
436NFRule::extractSubstitutions(const NFRuleSet* ruleSet,
437 const UnicodeString &ruleText,
438 const NFRule* predecessor,
439 UErrorCode& status)
440{
441 if (U_FAILURE(status)) {
442 return;
443 }
444 fRuleText = ruleText;
445 sub1 = extractSubstitution(ruleSet, predecessor, status);
446 if (sub1 == nullptr) {
447 // Small optimization. There is no need to create a redundant NullSubstitution.
448 sub2 = nullptr;
449 }
450 else {
451 sub2 = extractSubstitution(ruleSet, predecessor, status);
452 }
453 int32_t pluralRuleStart = fRuleText.indexOf(gDollarOpenParenthesis, -1, 0);
454 int32_t pluralRuleEnd = (pluralRuleStart >= 0 ? fRuleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) : -1);
455 if (pluralRuleEnd >= 0) {
456 int32_t endType = fRuleText.indexOf(gComma, pluralRuleStart);
457 if (endType < 0) {
458 status = U_PARSE_ERROR;
459 return;
460 }
461 UnicodeString type(fRuleText.tempSubString(pluralRuleStart + 2, endType - pluralRuleStart - 2));
462 UPluralType pluralType;
463 if (type.startsWith(UNICODE_STRING_SIMPLE("cardinal")icu::UnicodeString(true, u"cardinal", -1))) {
464 pluralType = UPLURAL_TYPE_CARDINAL;
465 }
466 else if (type.startsWith(UNICODE_STRING_SIMPLE("ordinal")icu::UnicodeString(true, u"ordinal", -1))) {
467 pluralType = UPLURAL_TYPE_ORDINAL;
468 }
469 else {
470 status = U_ILLEGAL_ARGUMENT_ERROR;
471 return;
472 }
473 rulePatternFormat = formatter->createPluralFormat(pluralType,
474 fRuleText.tempSubString(endType + 1, pluralRuleEnd - endType - 1), status);
475 }
476}
477
478/**
479* Searches the rule's rule text for the first substitution token,
480* creates a substitution based on it, and removes the token from
481* the rule's rule text.
482* @param owner The rule set containing this rule
483* @param predecessor The rule preceding this one in the rule set's
484* rule list
485* @param ownersOwner The RuleBasedNumberFormat that owns this rule
486* @return The newly-created substitution. This is never null; if
487* the rule text doesn't contain any substitution tokens, this will
488* be a NullSubstitution.
489*/
490NFSubstitution *
491NFRule::extractSubstitution(const NFRuleSet* ruleSet,
492 const NFRule* predecessor,
493 UErrorCode& status)
494{
495 NFSubstitution* result = nullptr;
496
497 // search the rule's rule text for the first two characters of
498 // a substitution token
499 int32_t subStart = indexOfAnyRulePrefix();
500 int32_t subEnd = subStart;
Value stored to 'subEnd' during its initialization is never read
501
502 // if we didn't find one, create a null substitution positioned
503 // at the end of the rule text
504 if (subStart == -1) {
505 return nullptr;
506 }
507
508 // special-case the ">>>" token, since searching for the > at the
509 // end will actually find the > in the middle
510 if (fRuleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) {
511 subEnd = subStart + 2;
512
513 // otherwise the substitution token ends with the same character
514 // it began with
515 } else {
516 char16_t c = fRuleText.charAt(subStart);
517 subEnd = fRuleText.indexOf(c, subStart + 1);
518 // special case for '<%foo<<'
519 if (c == gLessThan && subEnd != -1 && subEnd < fRuleText.length() - 1 && fRuleText.charAt(subEnd+1) == c) {
520 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle
521 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack
522 // to get around this. Having the duplicate at the front would cause problems with
523 // rules like "<<%" to format, say, percents...
524 ++subEnd;
525 }
526 }
527
528 // if we don't find the end of the token (i.e., if we're on a single,
529 // unmatched token character), create a null substitution positioned
530 // at the end of the rule
531 if (subEnd == -1) {
532 return nullptr;
533 }
534
535 // if we get here, we have a real substitution token (or at least
536 // some text bounded by substitution token characters). Use
537 // makeSubstitution() to create the right kind of substitution
538 UnicodeString subToken;
539 subToken.setTo(fRuleText, subStart, subEnd + 1 - subStart);
540 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet,
541 this->formatter, subToken, status);
542
543 // remove the substitution from the rule text
544 fRuleText.removeBetween(subStart, subEnd+1);
545
546 return result;
547}
548
549/**
550 * Sets the rule's base value, and causes the radix and exponent
551 * to be recalculated. This is used during construction when we
552 * don't know the rule's base value until after it's been
553 * constructed. It should be used at any other time.
554 * @param The new base value for the rule.
555 */
556void
557NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status)
558{
559 // set the base value
560 baseValue = newBaseValue;
561 radix = 10;
562
563 // if this isn't a special rule, recalculate the radix and exponent
564 // (the radix always defaults to 10; if it's supposed to be something
565 // else, it's cleaned up by the caller and the exponent is
566 // recalculated again-- the only function that does this is
567 // NFRule.parseRuleDescriptor() )
568 if (baseValue >= 1) {
569 exponent = expectedExponent();
570
571 // this function gets called on a fully-constructed rule whose
572 // description didn't specify a base value. This means it
573 // has substitutions, and some substitutions hold on to copies
574 // of the rule's divisor. Fix their copies of the divisor.
575 if (sub1 != nullptr) {
576 sub1->setDivisor(radix, exponent, status);
577 }
578 if (sub2 != nullptr) {
579 sub2->setDivisor(radix, exponent, status);
580 }
581
582 // if this is a special rule, its radix and exponent are basically
583 // ignored. Set them to "safe" default values
584 } else {
585 exponent = 0;
586 }
587}
588
589/**
590* This calculates the rule's exponent based on its radix and base
591* value. This will be the highest power the radix can be raised to
592* and still produce a result less than or equal to the base value.
593*/
594int16_t
595NFRule::expectedExponent() const
596{
597 // since the log of 0, or the log base 0 of something, causes an
598 // error, declare the exponent in these cases to be 0 (we also
599 // deal with the special-rule identifiers here)
600 if (radix == 0 || baseValue < 1) {
601 return 0;
602 }
603
604 // we get rounding error in some cases-- for example, log 1000 / log 10
605 // gives us 1.9999999996 instead of 2. The extra logic here is to take
606 // that into account
607 int16_t tempResult = static_cast<int16_t>(uprv_loguprv_log_77(static_cast<double>(baseValue)) /
608 uprv_loguprv_log_77(static_cast<double>(radix)));
609 int64_t temp = util64_pow(radix, tempResult + 1);
610 if (temp <= baseValue) {
611 tempResult += 1;
612 }
613 return tempResult;
614}
615
616/**
617 * Searches the rule's rule text for any of the specified strings.
618 * @return The index of the first match in the rule's rule text
619 * (i.e., the first substring in the rule's rule text that matches
620 * _any_ of the strings in "strings"). If none of the strings in
621 * "strings" is found in the rule's rule text, returns -1.
622 */
623int32_t
624NFRule::indexOfAnyRulePrefix() const
625{
626 int result = -1;
627 for (int i = 0; RULE_PREFIXES[i]; i++) {
628 int32_t pos = fRuleText.indexOf(*RULE_PREFIXES[i]);
629 if (pos != -1 && (result == -1 || pos < result)) {
630 result = pos;
631 }
632 }
633 return result;
634}
635
636//-----------------------------------------------------------------------
637// boilerplate
638//-----------------------------------------------------------------------
639
640static UBool
641util_equalSubstitutions(const NFSubstitution* sub1, const NFSubstitution* sub2)
642{
643 if (sub1) {
644 if (sub2) {
645 return *sub1 == *sub2;
646 }
647 } else if (!sub2) {
648 return true;
649 }
650 return false;
651}
652
653/**
654* Tests two rules for equality.
655* @param that The rule to compare this one against
656* @return True is the two rules are functionally equivalent
657*/
658bool
659NFRule::operator==(const NFRule& rhs) const
660{
661 return baseValue == rhs.baseValue
662 && radix == rhs.radix
663 && exponent == rhs.exponent
664 && fRuleText == rhs.fRuleText
665 && util_equalSubstitutions(sub1, rhs.sub1)
666 && util_equalSubstitutions(sub2, rhs.sub2);
667}
668
669/**
670* Returns a textual representation of the rule. This won't
671* necessarily be the same as the description that this rule
672* was created with, but it will produce the same result.
673* @return A textual description of the rule
674*/
675static void util_append64(UnicodeString& result, int64_t n)
676{
677 char16_t buffer[256];
678 int32_t len = util64_tou(n, buffer, sizeof(buffer));
679 UnicodeString temp(buffer, len);
680 result.append(temp);
681}
682
683void
684NFRule::_appendRuleText(UnicodeString& result) const
685{
686 switch (getType()) {
687 case kNegativeNumberRule: result.append(gMinusX, 2); break;
688 case kImproperFractionRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
689 case kProperFractionRule: result.append(gZero).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
690 case kDefaultRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gZero); break;
691 case kInfinityRule: result.append(gInf, 3); break;
692 case kNaNRule: result.append(gNaN, 3); break;
693 default:
694 // for a normal rule, write out its base value, and if the radix is
695 // something other than 10, write out the radix (with the preceding
696 // slash, of course). Then calculate the expected exponent and if
697 // if isn't the same as the actual exponent, write an appropriate
698 // number of > signs. Finally, terminate the whole thing with
699 // a colon.
700 util_append64(result, baseValue);
701 if (radix != 10) {
702 result.append(gSlash);
703 util_append64(result, radix);
704 }
705 int numCarets = expectedExponent() - exponent;
706 for (int i = 0; i < numCarets; i++) {
707 result.append(gGreaterThan);
708 }
709 break;
710 }
711 result.append(gColon);
712 result.append(gSpace);
713
714 // if the rule text begins with a space, write an apostrophe
715 // (whitespace after the rule descriptor is ignored; the
716 // apostrophe is used to make the whitespace significant)
717 if (fRuleText.charAt(0) == gSpace && (sub1 == nullptr || sub1->getPos() != 0)) {
718 result.append(gTick);
719 }
720
721 // now, write the rule's rule text, inserting appropriate
722 // substitution tokens in the appropriate places
723 UnicodeString ruleTextCopy;
724 ruleTextCopy.setTo(fRuleText);
725
726 UnicodeString temp;
727 if (sub2 != nullptr) {
728 sub2->toString(temp);
729 ruleTextCopy.insert(sub2->getPos(), temp);
730 }
731 if (sub1 != nullptr) {
732 sub1->toString(temp);
733 ruleTextCopy.insert(sub1->getPos(), temp);
734 }
735
736 result.append(ruleTextCopy);
737
738 // and finally, top the whole thing off with a semicolon and
739 // return the result
740 result.append(gSemicolon);
741}
742
743int64_t NFRule::getDivisor() const
744{
745 return util64_pow(radix, exponent);
746}
747
748/**
749 * Internal function to facilitate numerical rounding. See the explanation in MultiplierSubstitution::transformNumber().
750 */
751bool NFRule::hasModulusSubstitution() const
752{
753 return (sub1 != nullptr && sub1->isModulusSubstitution()) || (sub2 != nullptr && sub2->isModulusSubstitution());
754}
755
756
757//-----------------------------------------------------------------------
758// formatting
759//-----------------------------------------------------------------------
760
761/**
762* Formats the number, and inserts the resulting text into
763* toInsertInto.
764* @param number The number being formatted
765* @param toInsertInto The string where the resultant text should
766* be inserted
767* @param pos The position in toInsertInto where the resultant text
768* should be inserted
769*/
770void
771NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
772{
773 // first, insert the rule's rule text into toInsertInto at the
774 // specified position, then insert the results of the substitutions
775 // into the right places in toInsertInto (notice we do the
776 // substitutions in reverse order so that the offsets don't get
777 // messed up)
778 int32_t pluralRuleStart = fRuleText.length();
779 int32_t lengthOffset = 0;
780 if (!rulePatternFormat) {
781 toInsertInto.insert(pos, fRuleText);
782 }
783 else {
784 pluralRuleStart = fRuleText.indexOf(gDollarOpenParenthesis, -1, 0);
785 int pluralRuleEnd = fRuleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
786 int initialLength = toInsertInto.length();
787 if (pluralRuleEnd < fRuleText.length() - 1) {
788 toInsertInto.insert(pos, fRuleText.tempSubString(pluralRuleEnd + 2));
789 }
790 toInsertInto.insert(pos,
791 rulePatternFormat->format(static_cast<int32_t>(number / util64_pow(radix, exponent)), status));
792 if (pluralRuleStart > 0) {
793 toInsertInto.insert(pos, fRuleText.tempSubString(0, pluralRuleStart));
794 }
795 lengthOffset = fRuleText.length() - (toInsertInto.length() - initialLength);
796 }
797
798 if (sub2 != nullptr) {
799 sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
800 }
801 if (sub1 != nullptr) {
802 sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
803 }
804}
805
806/**
807* Formats the number, and inserts the resulting text into
808* toInsertInto.
809* @param number The number being formatted
810* @param toInsertInto The string where the resultant text should
811* be inserted
812* @param pos The position in toInsertInto where the resultant text
813* should be inserted
814*/
815void
816NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
817{
818 // first, insert the rule's rule text into toInsertInto at the
819 // specified position, then insert the results of the substitutions
820 // into the right places in toInsertInto
821 // [again, we have two copies of this routine that do the same thing
822 // so that we don't sacrifice precision in a long by casting it
823 // to a double]
824 int32_t pluralRuleStart = fRuleText.length();
825 int32_t lengthOffset = 0;
826 if (!rulePatternFormat) {
827 toInsertInto.insert(pos, fRuleText);
828 }
829 else {
830 pluralRuleStart = fRuleText.indexOf(gDollarOpenParenthesis, -1, 0);
831 int pluralRuleEnd = fRuleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
832 int initialLength = toInsertInto.length();
833 if (pluralRuleEnd < fRuleText.length() - 1) {
834 toInsertInto.insert(pos, fRuleText.tempSubString(pluralRuleEnd + 2));
835 }
836 double pluralVal = number;
837 if (0 <= pluralVal && pluralVal < 1) {
838 // We're in a fractional rule, and we have to match the NumeratorSubstitution behavior.
839 // 2.3 can become 0.2999999999999998 for the fraction due to rounding errors.
840 pluralVal = uprv_rounduprv_round_77(pluralVal * util64_pow(radix, exponent));
841 }
842 else {
843 pluralVal = pluralVal / util64_pow(radix, exponent);
844 }
845 toInsertInto.insert(pos, rulePatternFormat->format(static_cast<int32_t>(pluralVal), status));
846 if (pluralRuleStart > 0) {
847 toInsertInto.insert(pos, fRuleText.tempSubString(0, pluralRuleStart));
848 }
849 lengthOffset = fRuleText.length() - (toInsertInto.length() - initialLength);
850 }
851
852 if (sub2 != nullptr) {
853 sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
854 }
855 if (sub1 != nullptr) {
856 sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
857 }
858}
859
860/**
861* Used by the owning rule set to determine whether to invoke the
862* rollback rule (i.e., whether this rule or the one that precedes
863* it in the rule set's list should be used to format the number)
864* @param The number being formatted
865* @return True if the rule set should use the rule that precedes
866* this one in its list; false if it should use this rule
867*/
868UBool
869NFRule::shouldRollBack(int64_t number) const
870{
871 // we roll back if the rule contains a modulus substitution,
872 // the number being formatted is an even multiple of the rule's
873 // divisor, and the rule's base value is NOT an even multiple
874 // of its divisor
875 // In other words, if the original description had
876 // 100: << hundred[ >>];
877 // that expands into
878 // 100: << hundred;
879 // 101: << hundred >>;
880 // internally. But when we're formatting 200, if we use the rule
881 // at 101, which would normally apply, we get "two hundred zero".
882 // To prevent this, we roll back and use the rule at 100 instead.
883 // This is the logic that makes this happen: the rule at 101 has
884 // a modulus substitution, its base value isn't an even multiple
885 // of 100, and the value we're trying to format _is_ an even
886 // multiple of 100. This is called the "rollback rule."
887 if ((sub1 != nullptr && sub1->isModulusSubstitution()) || (sub2 != nullptr && sub2->isModulusSubstitution())) {
888 int64_t re = util64_pow(radix, exponent);
889 return (number % re) == 0 && (baseValue % re) != 0;
890 }
891 return false;
892}
893
894//-----------------------------------------------------------------------
895// parsing
896//-----------------------------------------------------------------------
897
/**
* Attempts to parse the string with this rule.
* @param text The string being parsed
* @param parsePosition On entry, the value is ignored and assumed to
* be 0. On exit, this has been updated with the position of the first
* character not consumed by matching the text against this rule
* (if this rule doesn't match the text at all, the parse position
* is left unchanged (presumably at 0) and resVal is set to 0).
* @param isFractionRule True if this rule is contained within a
* fraction rule set. This is only used if the rule has no
* substitutions.
* @param resVal Receives the parse result. If this rule matched the
* text, this is the rule's base value combined appropriately with the
* results of parsing the substitutions. If nothing matched, this is 0
* and the parse position is left unchanged. The value is stored as a
* double, except for the long 0 stored when the rule's leading text
* does not match.
* @return Always true in the implementation below; success or failure
* is reported through parsePosition and resVal.
*/
#ifdef RBNF_DEBUG
#include <stdio.h>

/**
 * Debug-only helper: converts a UnicodeString to chars with
 * UnicodeString::extract() and writes the result to the given stream.
 * Nothing is written if the buffer cannot be allocated.
 * @param f  The stream to write to
 * @param us The string to dump
 */
static void dumpUS(FILE* f, const UnicodeString& us) {
    const int32_t len = us.length();

    // Preflight: with a zero-sized target, extract() writes nothing and
    // returns the number of chars the converted text needs. That can exceed
    // us.length(), so sizing the buffer from the UTF-16 length (as before)
    // could overflow it.
    const int32_t needed = us.extract(0, len, static_cast<char*>(nullptr), static_cast<uint32_t>(0));
    if (needed < 0) {
        return;
    }

    const uint32_t capacity = static_cast<uint32_t>(needed) + 1;
    char* buf = static_cast<char*>(uprv_malloc(capacity));
    if (buf != nullptr) {
        // bounded extract: never writes more than "capacity" chars
        us.extract(0, len, buf, capacity);
        buf[needed] = 0;
        fprintf(f, "%s", buf);
        uprv_free(buf);
    }
}
#endif
930UBool
931NFRule::doParse(const UnicodeString& text,
932 ParsePosition& parsePosition,
933 UBool isFractionRule,
934 double upperBound,
935 uint32_t nonNumericalExecutedRuleMask,
936 int32_t recursionCount,
937 Formattable& resVal) const
938{
939 // internally we operate on a copy of the string being parsed
940 // (because we're going to change it) and use our own ParsePosition
941 ParsePosition pp;
942 UnicodeString workText(text);
943
944 int32_t sub1Pos = sub1 != nullptr ? sub1->getPos() : fRuleText.length();
945 int32_t sub2Pos = sub2 != nullptr ? sub2->getPos() : fRuleText.length();
946
947 // check to see whether the text before the first substitution
948 // matches the text at the beginning of the string being
949 // parsed. If it does, strip that off the front of workText;
950 // otherwise, dump out with a mismatch
951 UnicodeString prefix;
952 prefix.setTo(fRuleText, 0, sub1Pos);
953
954#ifdef RBNF_DEBUG
955 fprintf(stderrstderr, "doParse %p ", this);
956 {
957 UnicodeString rt;
958 _appendRuleText(rt);
959 dumpUS(stderrstderr, rt);
960 }
961
962 fprintf(stderrstderr, " text: '");
963 dumpUS(stderrstderr, text);
964 fprintf(stderrstderr, "' prefix: '");
965 dumpUS(stderrstderr, prefix);
966#endif
967 stripPrefix(workText, prefix, pp);
968 int32_t prefixLength = text.length() - workText.length();
969
970#ifdef RBNF_DEBUG
971 fprintf(stderrstderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1Pos);
972#endif
973
974 if (pp.getIndex() == 0 && sub1Pos != 0) {
975 // commented out because ParsePosition doesn't have error index in 1.1.x
976 // restored for ICU4C port
977 parsePosition.setErrorIndex(pp.getErrorIndex());
978 resVal.setLong(0);
979 return true;
980 }
981 if (baseValue == kInfinityRule) {
982 // If you match this, don't try to perform any calculations on it.
983 parsePosition.setIndex(pp.getIndex());
984 resVal.setDouble(uprv_getInfinityuprv_getInfinity_77());
985 return true;
986 }
987 if (baseValue == kNaNRule) {
988 // If you match this, don't try to perform any calculations on it.
989 parsePosition.setIndex(pp.getIndex());
990 resVal.setDouble(uprv_getNaNuprv_getNaN_77());
991 return true;
992 }
993
994 // this is the fun part. The basic guts of the rule-matching
995 // logic is matchToDelimiter(), which is called twice. The first
996 // time it searches the input string for the rule text BETWEEN
997 // the substitutions and tries to match the intervening text
998 // in the input string with the first substitution. If that
999 // succeeds, it then calls it again, this time to look for the
1000 // rule text after the second substitution and to match the
1001 // intervening input text against the second substitution.
1002 //
1003 // For example, say we have a rule that looks like this:
1004 // first << middle >> last;
1005 // and input text that looks like this:
1006 // first one middle two last
1007 // First we use stripPrefix() to match "first " in both places and
1008 // strip it off the front, leaving
1009 // one middle two last
1010 // Then we use matchToDelimiter() to match " middle " and try to
1011 // match "one" against a substitution. If it's successful, we now
1012 // have
1013 // two last
1014 // We use matchToDelimiter() a second time to match " last" and
1015 // try to match "two" against a substitution. If "two" matches
1016 // the substitution, we have a successful parse.
1017 //
1018 // Since it's possible in many cases to find multiple instances
1019 // of each of these pieces of rule text in the input string,
1020 // we need to try all the possible combinations of these
1021 // locations. This prevents us from prematurely declaring a mismatch,
1022 // and makes sure we match as much input text as we can.
1023 int highWaterMark = 0;
1024 double result = 0;
1025 int start = 0;
1026 double tempBaseValue = static_cast<double>(baseValue <= 0 ? 0 : baseValue);
1027
1028 UnicodeString temp;
1029 do {
1030 // our partial parse result starts out as this rule's base
1031 // value. If it finds a successful match, matchToDelimiter()
1032 // will compose this in some way with what it gets back from
1033 // the substitution, giving us a new partial parse result
1034 pp.setIndex(0);
1035
1036 temp.setTo(fRuleText, sub1Pos, sub2Pos - sub1Pos);
1037 double partialResult = matchToDelimiter(workText, start, tempBaseValue,
1038 temp, pp, sub1,
1039 nonNumericalExecutedRuleMask,
1040 recursionCount,
1041 upperBound);
1042
1043 // if we got a successful match (or were trying to match a
1044 // null substitution), pp is now pointing at the first unmatched
1045 // character. Take note of that, and try matchToDelimiter()
1046 // on the input text again
1047 if (pp.getIndex() != 0 || sub1 == nullptr) {
1048 start = pp.getIndex();
1049
1050 UnicodeString workText2;
1051 workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
1052 ParsePosition pp2;
1053
1054 // the second matchToDelimiter() will compose our previous
1055 // partial result with whatever it gets back from its
1056 // substitution if there's a successful match, giving us
1057 // a real result
1058 temp.setTo(fRuleText, sub2Pos, fRuleText.length() - sub2Pos);
1059 partialResult = matchToDelimiter(workText2, 0, partialResult,
1060 temp, pp2, sub2,
1061 nonNumericalExecutedRuleMask,
1062 recursionCount,
1063 upperBound);
1064
1065 // if we got a successful match on this second
1066 // matchToDelimiter() call, update the high-water mark
1067 // and result (if necessary)
1068 if (pp2.getIndex() != 0 || sub2 == nullptr) {
1069 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
1070 highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
1071 result = partialResult;
1072 }
1073 }
1074 else {
1075 // commented out because ParsePosition doesn't have error index in 1.1.x
1076 // restored for ICU4C port
1077 int32_t i_temp = pp2.getErrorIndex() + sub1Pos + pp.getIndex();
1078 if (i_temp> parsePosition.getErrorIndex()) {
1079 parsePosition.setErrorIndex(i_temp);
1080 }
1081 }
1082 }
1083 else {
1084 // commented out because ParsePosition doesn't have error index in 1.1.x
1085 // restored for ICU4C port
1086 int32_t i_temp = sub1Pos + pp.getErrorIndex();
1087 if (i_temp > parsePosition.getErrorIndex()) {
1088 parsePosition.setErrorIndex(i_temp);
1089 }
1090 }
1091 // keep trying to match things until the outer matchToDelimiter()
1092 // call fails to make a match (each time, it picks up where it
1093 // left off the previous time)
1094 } while (sub1Pos != sub2Pos
1095 && pp.getIndex() > 0
1096 && pp.getIndex() < workText.length()
1097 && pp.getIndex() != start);
1098
1099 // update the caller's ParsePosition with our high-water mark
1100 // (i.e., it now points at the first character this function
1101 // didn't match-- the ParsePosition is therefore unchanged if
1102 // we didn't match anything)
1103 parsePosition.setIndex(highWaterMark);
1104 // commented out because ParsePosition doesn't have error index in 1.1.x
1105 // restored for ICU4C port
1106 if (highWaterMark > 0) {
1107 parsePosition.setErrorIndex(0);
1108 }
1109
1110 // this is a hack for one unusual condition: Normally, whether this
1111 // rule belong to a fraction rule set or not is handled by its
1112 // substitutions. But if that rule HAS NO substitutions, then
1113 // we have to account for it here. By definition, if the matching
1114 // rule in a fraction rule set has no substitutions, its numerator
1115 // is 1, and so the result is the reciprocal of its base value.
1116 if (isFractionRule && highWaterMark > 0 && sub1 == nullptr) {
1117 result = 1 / result;
1118 }
1119
1120 resVal.setDouble(result);
1121 return true; // ??? do we need to worry if it is a long or a double?
1122}
1123
1124/**
1125* This function is used by parse() to match the text being parsed
1126* against a possible prefix string. This function
1127* matches characters from the beginning of the string being parsed
1128* to characters from the prospective prefix. If they match, pp is
1129* updated to the first character not matched, and the result is
1130* the unparsed part of the string. If they don't match, the whole
1131* string is returned, and pp is left unchanged.
1132* @param text The string being parsed
1133* @param prefix The text to match against
1134* @param pp On entry, ignored and assumed to be 0. On exit, points
1135* to the first unmatched character (assuming the whole prefix matched),
1136* or is unchanged (if the whole prefix didn't match).
1137* @return If things match, this is the unparsed part of "text";
1138* if they didn't match, this is "text".
1139*/
1140void
1141NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const
1142{
1143 // if the prefix text is empty, dump out without doing anything
1144 if (prefix.length() != 0) {
1145 UErrorCode status = U_ZERO_ERROR;
1146 // use prefixLength() to match the beginning of
1147 // "text" against "prefix". This function returns the
1148 // number of characters from "text" that matched (or 0 if
1149 // we didn't match the whole prefix)
1150 int32_t pfl = prefixLength(text, prefix, status);
1151 if (U_FAILURE(status)) { // Memory allocation error.
1152 return;
1153 }
1154 if (pfl != 0) {
1155 // if we got a successful match, update the parse position
1156 // and strip the prefix off of "text"
1157 pp.setIndex(pp.getIndex() + pfl);
1158 text.remove(0, pfl);
1159 }
1160 }
1161}
1162
1163/**
1164* Used by parse() to match a substitution and any following text.
1165* "text" is searched for instances of "delimiter". For each instance
1166* of delimiter, the intervening text is tested to see whether it
1167* matches the substitution. The longest match wins.
1168* @param text The string being parsed
1169* @param startPos The position in "text" where we should start looking
1170* for "delimiter".
1171* @param baseValue A partial parse result (often the rule's base value),
1172* which is combined with the result from matching the substitution
1173* @param delimiter The string to search "text" for.
1174* @param pp Ignored and presumed to be 0 on entry. If there's a match,
1175* on exit this will point to the first unmatched character.
1176* @param sub If we find "delimiter" in "text", this substitution is used
1177* to match the text between the beginning of the string and the
1178* position of "delimiter." (If "delimiter" is the empty string, then
1179* this function just matches against this substitution and updates
1180* everything accordingly.)
1181* @param upperBound When matching the substitution, it will only
1182* consider rules with base values lower than this value.
1183* @return If there's a match, this is the result of composing
1184* baseValue with the result of matching the substitution. Otherwise,
1185* this is new Long(0). It's never null. If the result is an integer,
1186* this will be an instance of Long; otherwise, it's an instance of
1187* Double.
1188*
1189* !!! note {dlf} in point of fact, in the java code the caller always converts
1190* the result to a double, so we might as well return one.
1191*/
1192double
1193NFRule::matchToDelimiter(const UnicodeString& text,
1194 int32_t startPos,
1195 double _baseValue,
1196 const UnicodeString& delimiter,
1197 ParsePosition& pp,
1198 const NFSubstitution* sub,
1199 uint32_t nonNumericalExecutedRuleMask,
1200 int32_t recursionCount,
1201 double upperBound) const
1202{
1203 UErrorCode status = U_ZERO_ERROR;
1204 // if "delimiter" contains real (i.e., non-ignorable) text, search
1205 // it for "delimiter" beginning at "start". If that succeeds, then
1206 // use "sub"'s doParse() method to match the text before the
1207 // instance of "delimiter" we just found.
1208 if (!allIgnorable(delimiter, status)) {
1209 if (U_FAILURE(status)) { //Memory allocation error.
1210 return 0;
1211 }
1212 ParsePosition tempPP;
1213 Formattable result;
1214
1215 // use findText() to search for "delimiter". It returns a two-
1216 // element array: element 0 is the position of the match, and
1217 // element 1 is the number of characters that matched
1218 // "delimiter".
1219 int32_t dLen;
1220 int32_t dPos = findText(text, delimiter, startPos, &dLen);
1221
1222 // if findText() succeeded, isolate the text preceding the
1223 // match, and use "sub" to match that text
1224 while (dPos >= 0) {
1225 UnicodeString subText;
1226 subText.setTo(text, 0, dPos);
1227 if (subText.length() > 0) {
1228 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound,
1229#if UCONFIG_NO_COLLATION0
1230 false,
1231#else
1232 formatter->isLenient(),
1233#endif
1234 nonNumericalExecutedRuleMask,
1235 recursionCount,
1236 result);
1237
1238 // if the substitution could match all the text up to
1239 // where we found "delimiter", then this function has
1240 // a successful match. Bump the caller's parse position
1241 // to point to the first character after the text
1242 // that matches "delimiter", and return the result
1243 // we got from parsing the substitution.
1244 if (success && tempPP.getIndex() == dPos) {
1245 pp.setIndex(dPos + dLen);
1246 return result.getDouble();
1247 }
1248 else {
1249 // commented out because ParsePosition doesn't have error index in 1.1.x
1250 // restored for ICU4C port
1251 if (tempPP.getErrorIndex() > 0) {
1252 pp.setErrorIndex(tempPP.getErrorIndex());
1253 } else {
1254 pp.setErrorIndex(tempPP.getIndex());
1255 }
1256 }
1257 }
1258
1259 // if we didn't match the substitution, search for another
1260 // copy of "delimiter" in "text" and repeat the loop if
1261 // we find it
1262 tempPP.setIndex(0);
1263 dPos = findText(text, delimiter, dPos + dLen, &dLen);
1264 }
1265 // if we make it here, this was an unsuccessful match, and we
1266 // leave pp unchanged and return 0
1267 pp.setIndex(0);
1268 return 0;
1269
1270 // if "delimiter" is empty, or consists only of ignorable characters
1271 // (i.e., is semantically empty), thwe we obviously can't search
1272 // for "delimiter". Instead, just use "sub" to parse as much of
1273 // "text" as possible.
1274 }
1275 else if (sub == nullptr) {
1276 return _baseValue;
1277 }
1278 else {
1279 ParsePosition tempPP;
1280 Formattable result;
1281
1282 // try to match the whole string against the substitution
1283 UBool success = sub->doParse(text, tempPP, _baseValue, upperBound,
1284#if UCONFIG_NO_COLLATION0
1285 false,
1286#else
1287 formatter->isLenient(),
1288#endif
1289 nonNumericalExecutedRuleMask,
1290 recursionCount,
1291 result);
1292 if (success && (tempPP.getIndex() != 0)) {
1293 // if there's a successful match (or it's a null
1294 // substitution), update pp to point to the first
1295 // character we didn't match, and pass the result from
1296 // sub.doParse() on through to the caller
1297 pp.setIndex(tempPP.getIndex());
1298 return result.getDouble();
1299 }
1300 else {
1301 // commented out because ParsePosition doesn't have error index in 1.1.x
1302 // restored for ICU4C port
1303 pp.setErrorIndex(tempPP.getErrorIndex());
1304 }
1305
1306 // and if we get to here, then nothing matched, so we return
1307 // 0 and leave pp alone
1308 return 0;
1309 }
1310}
1311
1312/**
1313* Used by stripPrefix() to match characters. If lenient parse mode
1314* is off, this just calls startsWith(). If lenient parse mode is on,
1315* this function uses CollationElementIterators to match characters in
1316* the strings (only primary-order differences are significant in
1317* determining whether there's a match).
1318* @param str The string being tested
1319* @param prefix The text we're hoping to see at the beginning
1320* of "str"
1321* @return If "prefix" is found at the beginning of "str", this
1322* is the number of characters in "str" that were matched (this
1323* isn't necessarily the same as the length of "prefix" when matching
1324* text with a collator). If there's no match, this is 0.
1325*/
1326int32_t
1327NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const
1328{
1329 // if we're looking for an empty prefix, it obviously matches
1330 // zero characters. Just go ahead and return 0.
1331 if (prefix.length() == 0) {
1332 return 0;
1333 }
1334
1335#if !UCONFIG_NO_COLLATION0
1336 // go through all this grief if we're in lenient-parse mode
1337 if (formatter->isLenient()) {
1338 // Check if non-lenient rule finds the text before call lenient parsing
1339 if (str.startsWith(prefix)) {
1340 return prefix.length();
1341 }
1342 // get the formatter's collator and use it to create two
1343 // collation element iterators, one over the target string
1344 // and another over the prefix (right now, we'll throw an
1345 // exception if the collator we get back from the formatter
1346 // isn't a RuleBasedCollator, because RuleBasedCollator defines
1347 // the CollationElementIterator protocol. Hopefully, this
1348 // will change someday.)
1349 const RuleBasedCollator* collator = formatter->getCollator();
1350 if (collator == nullptr) {
1351 status = U_MEMORY_ALLOCATION_ERROR;
1352 return 0;
1353 }
1354 LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str));
1355 LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix));
1356 // Check for memory allocation error.
1357 if (strIter.isNull() || prefixIter.isNull()) {
1358 status = U_MEMORY_ALLOCATION_ERROR;
1359 return 0;
1360 }
1361
1362 UErrorCode err = U_ZERO_ERROR;
1363
1364 // The original code was problematic. Consider this match:
1365 // prefix = "fifty-"
1366 // string = " fifty-7"
1367 // The intent is to match string up to the '7', by matching 'fifty-' at position 1
1368 // in the string. Unfortunately, we were getting a match, and then computing where
1369 // the match terminated by rematching the string. The rematch code was using as an
1370 // initial guess the substring of string between 0 and prefix.length. Because of
1371 // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving
1372 // the position before the hyphen in the string. Recursing down, we then parsed the
1373 // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7).
1374 // This was not pretty, especially since the string "fifty-7" parsed just fine.
1375 //
1376 // We have newer APIs now, so we can use calls on the iterator to determine what we
1377 // matched up to. If we terminate because we hit the last element in the string,
1378 // our match terminates at this length. If we terminate because we hit the last element
1379 // in the target, our match terminates at one before the element iterator position.
1380
1381 // match collation elements between the strings
1382 int32_t oStr = strIter->next(err);
1383 int32_t oPrefix = prefixIter->next(err);
1384
1385 while (oPrefix != CollationElementIterator::NULLORDER) {
1386 // skip over ignorable characters in the target string
1387 while (CollationElementIterator::primaryOrder(oStr) == 0
1388 && oStr != CollationElementIterator::NULLORDER) {
1389 oStr = strIter->next(err);
1390 }
1391
1392 // skip over ignorable characters in the prefix
1393 while (CollationElementIterator::primaryOrder(oPrefix) == 0
1394 && oPrefix != CollationElementIterator::NULLORDER) {
1395 oPrefix = prefixIter->next(err);
1396 }
1397
1398 // dlf: move this above following test, if we consume the
1399 // entire target, aren't we ok even if the source was also
1400 // entirely consumed?
1401
1402 // if skipping over ignorables brought to the end of
1403 // the prefix, we DID match: drop out of the loop
1404 if (oPrefix == CollationElementIterator::NULLORDER) {
1405 break;
1406 }
1407
1408 // if skipping over ignorables brought us to the end
1409 // of the target string, we didn't match and return 0
1410 if (oStr == CollationElementIterator::NULLORDER) {
1411 return 0;
1412 }
1413
1414 // match collation elements from the two strings
1415 // (considering only primary differences). If we
1416 // get a mismatch, dump out and return 0
1417 if (CollationElementIterator::primaryOrder(oStr)
1418 != CollationElementIterator::primaryOrder(oPrefix)) {
1419 return 0;
1420
1421 // otherwise, advance to the next character in each string
1422 // and loop (we drop out of the loop when we exhaust
1423 // collation elements in the prefix)
1424 } else {
1425 oStr = strIter->next(err);
1426 oPrefix = prefixIter->next(err);
1427 }
1428 }
1429
1430 int32_t result = strIter->getOffset();
1431 if (oStr != CollationElementIterator::NULLORDER) {
1432 --result; // back over character that we don't want to consume;
1433 }
1434
1435#ifdef RBNF_DEBUG
1436 fprintf(stderrstderr, "prefix length: %d\n", result);
1437#endif
1438 return result;
1439#if 0
1440 //----------------------------------------------------------------
1441 // JDK 1.2-specific API call
1442 // return strIter.getOffset();
1443 //----------------------------------------------------------------
1444 // JDK 1.1 HACK (take out for 1.2-specific code)
1445
1446 // if we make it to here, we have a successful match. Now we
1447 // have to find out HOW MANY characters from the target string
1448 // matched the prefix (there isn't necessarily a one-to-one
1449 // mapping between collation elements and characters).
1450 // In JDK 1.2, there's a simple getOffset() call we can use.
1451 // In JDK 1.1, on the other hand, we have to go through some
1452 // ugly contortions. First, use the collator to compare the
1453 // same number of characters from the prefix and target string.
1454 // If they're equal, we're done.
1455 collator->setStrength(Collator::PRIMARY);
1456 if (str.length() >= prefix.length()) {
1457 UnicodeString temp;
1458 temp.setTo(str, 0, prefix.length());
1459 if (collator->equals(temp, prefix)) {
1460#ifdef RBNF_DEBUG
1461 fprintf(stderrstderr, "returning: %d\n", prefix.length());
1462#endif
1463 return prefix.length();
1464 }
1465 }
1466
1467 // if they're not equal, then we have to compare successively
1468 // larger and larger substrings of the target string until we
1469 // get to one that matches the prefix. At that point, we know
1470 // how many characters matched the prefix, and we can return.
1471 int32_t p = 1;
1472 while (p <= str.length()) {
1473 UnicodeString temp;
1474 temp.setTo(str, 0, p);
1475 if (collator->equals(temp, prefix)) {
1476 return p;
1477 } else {
1478 ++p;
1479 }
1480 }
1481
1482 // SHOULD NEVER GET HERE!!!
1483 return 0;
1484 //----------------------------------------------------------------
1485#endif
1486
1487 // If lenient parsing is turned off, forget all that crap above.
1488 // Just use String.startsWith() and be done with it.
1489 } else
1490#endif
1491 {
1492 if (str.startsWith(prefix)) {
1493 return prefix.length();
1494 } else {
1495 return 0;
1496 }
1497 }
1498}
1499
1500/**
1501* Searches a string for another string. If lenient parsing is off,
1502* this just calls indexOf(). If lenient parsing is on, this function
1503* uses CollationElementIterator to match characters, and only
1504* primary-order differences are significant in determining whether
1505* there's a match.
1506* @param str The string to search
1507* @param key The string to search "str" for
1508* @param startingAt The index into "str" where the search is to
1509* begin
1510* @return A two-element array of ints. Element 0 is the position
1511* of the match, or -1 if there was no match. Element 1 is the
1512* number of characters in "str" that matched (which isn't necessarily
1513* the same as the length of "key")
1514*/
1515int32_t
1516NFRule::findText(const UnicodeString& str,
1517 const UnicodeString& key,
1518 int32_t startingAt,
1519 int32_t* length) const
1520{
1521 if (rulePatternFormat) {
1522 Formattable result;
1523 FieldPosition position(UNUM_INTEGER_FIELD);
1524 position.setBeginIndex(startingAt);
1525 rulePatternFormat->parseType(str, this, result, position);
1526 int start = position.getBeginIndex();
1527 if (start >= 0) {
1528 int32_t pluralRuleStart = fRuleText.indexOf(gDollarOpenParenthesis, -1, 0);
1529 int32_t pluralRuleSuffix = fRuleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) + 2;
1530 int32_t matchLen = position.getEndIndex() - start;
1531 UnicodeString prefix(fRuleText.tempSubString(0, pluralRuleStart));
1532 UnicodeString suffix(fRuleText.tempSubString(pluralRuleSuffix));
1533 if (str.compare(start - prefix.length(), prefix.length(), prefix, 0, prefix.length()) == 0
1534 && str.compare(start + matchLen, suffix.length(), suffix, 0, suffix.length()) == 0)
1535 {
1536 *length = matchLen + prefix.length() + suffix.length();
1537 return start - prefix.length();
1538 }
1539 }
1540 *length = 0;
1541 return -1;
1542 }
1543 if (!formatter->isLenient()) {
1544 // if lenient parsing is turned off, this is easy: just call
1545 // String.indexOf() and we're done
1546 *length = key.length();
1547 return str.indexOf(key, startingAt);
1548 }
1549 else {
1550 // Check if non-lenient rule finds the text before call lenient parsing
1551 *length = key.length();
1552 int32_t pos = str.indexOf(key, startingAt);
1553 if(pos >= 0) {
1554 return pos;
1555 } else {
1556 // but if lenient parsing is turned ON, we've got some work ahead of us
1557 return findTextLenient(str, key, startingAt, length);
1558 }
1559 }
1560}
1561
1562int32_t
1563NFRule::findTextLenient(const UnicodeString& str,
1564 const UnicodeString& key,
1565 int32_t startingAt,
1566 int32_t* length) const
1567{
1568 //----------------------------------------------------------------
1569 // JDK 1.1 HACK (take out of 1.2-specific code)
1570
1571 // in JDK 1.2, CollationElementIterator provides us with an
1572 // API to map between character offsets and collation elements
1573 // and we can do this by marching through the string comparing
1574 // collation elements. We can't do that in JDK 1.1. Instead,
1575 // we have to go through this horrible slow mess:
1576 int32_t p = startingAt;
1577 int32_t keyLen = 0;
1578
1579 // basically just isolate smaller and smaller substrings of
1580 // the target string (each running to the end of the string,
1581 // and with the first one running from startingAt to the end)
1582 // and then use prefixLength() to see if the search key is at
1583 // the beginning of each substring. This is excruciatingly
1584 // slow, but it will locate the key and tell use how long the
1585 // matching text was.
1586 UnicodeString temp;
1587 UErrorCode status = U_ZERO_ERROR;
1588 while (p < str.length() && keyLen == 0) {
1589 temp.setTo(str, p, str.length() - p);
1590 keyLen = prefixLength(temp, key, status);
1591 if (U_FAILURE(status)) {
1592 break;
1593 }
1594 if (keyLen != 0) {
1595 *length = keyLen;
1596 return p;
1597 }
1598 ++p;
1599 }
1600 // if we make it to here, we didn't find it. Return -1 for the
1601 // location. The length should be ignored, but set it to 0,
1602 // which should be "safe"
1603 *length = 0;
1604 return -1;
1605}
1606
1607/**
1608* Checks to see whether a string consists entirely of ignorable
1609* characters.
1610* @param str The string to test.
1611* @return true if the string is empty of consists entirely of
1612* characters that the number formatter's collator says are
1613* ignorable at the primary-order level. false otherwise.
1614*/
1615UBool
1616NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const
1617{
1618 // if the string is empty, we can just return true
1619 if (str.length() == 0) {
1620 return true;
1621 }
1622
1623#if !UCONFIG_NO_COLLATION0
1624 // if lenient parsing is turned on, walk through the string with
1625 // a collation element iterator and make sure each collation
1626 // element is 0 (ignorable) at the primary level
1627 if (formatter->isLenient()) {
1628 const RuleBasedCollator* collator = formatter->getCollator();
1629 if (collator == nullptr) {
1630 status = U_MEMORY_ALLOCATION_ERROR;
1631 return false;
1632 }
1633 LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str));
1634
1635 // Memory allocation error check.
1636 if (iter.isNull()) {
1637 status = U_MEMORY_ALLOCATION_ERROR;
1638 return false;
1639 }
1640
1641 UErrorCode err = U_ZERO_ERROR;
1642 int32_t o = iter->next(err);
1643 while (o != CollationElementIterator::NULLORDER
1644 && CollationElementIterator::primaryOrder(o) == 0) {
1645 o = iter->next(err);
1646 }
1647
1648 return o == CollationElementIterator::NULLORDER;
1649 }
1650#endif
1651
1652 // if lenient parsing is turned off, there is no such thing as
1653 // an ignorable character: return true only if the string is empty
1654 return false;
1655}
1656
1657void
1658NFRule::setDecimalFormatSymbols(const DecimalFormatSymbols& newSymbols, UErrorCode& status) {
1659 if (sub1 != nullptr) {
1660 sub1->setDecimalFormatSymbols(newSymbols, status);
1661 }
1662 if (sub2 != nullptr) {
1663 sub2->setDecimalFormatSymbols(newSymbols, status);
1664 }
1665}
1666
1667U_NAMESPACE_END}
1668
1669/* U_HAVE_RBNF */
1670#endif