Bug Summary

File:root/firefox-clang/extensions/spellcheck/hunspell/src/affixmgr.cxx
Warning:line 4114, column 13
Value stored to 'numdefcpd' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name Unified_cpp_hunspell_src0.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -ffp-contract=off -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/extensions/spellcheck/hunspell/src -fcoverage-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/extensions/spellcheck/hunspell/src -resource-dir /usr/lib/llvm-21/lib/clang/21 -include /root/firefox-clang/config/gcc_hidden.h -include /root/firefox-clang/obj-x86_64-pc-linux-gnu/mozilla-config.h -include hunspell_alloc_hooks.h -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/stl_wrappers -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/system_wrappers -U _FORTIFY_SOURCE -D _FORTIFY_SOURCE=2 -D _GLIBCXX_ASSERTIONS -D DEBUG=1 -D HUNSPELL_STATIC -D MOZ_HAS_MOZGLUE -D MOZILLA_INTERNAL_API -D IMPL_LIBXUL -D MOZ_SUPPORT_LEAKCHECKING -D STATIC_EXPORTABLE_JS_API -I /root/firefox-clang/extensions/spellcheck/hunspell/src -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/extensions/spellcheck/hunspell/src -I /root/firefox-clang/extensions/spellcheck/hunspell/glue -I 
/root/firefox-clang/obj-x86_64-pc-linux-gnu/ipc/ipdl/_ipdlheaders -I /root/firefox-clang/ipc/chromium/src -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nspr -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nss -D MOZILLA_CLIENT -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/x86_64-linux-gnu/c++/14 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../include/c++/14/backward -internal-isystem /usr/lib/llvm-21/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-error=pessimizing-move -Wno-error=large-by-value-copy=128 -Wno-error=implicit-int-float-conversion -Wno-error=thread-safety-analysis -Wno-error=tautological-type-limit-compare -Wno-invalid-offsetof -Wno-range-loop-analysis -Wno-deprecated-anon-enum-enum-conversion -Wno-deprecated-enum-enum-conversion -Wno-deprecated-this-capture -Wno-inline-new-delete -Wno-error=deprecated-declarations -Wno-error=array-bounds -Wno-error=free-nonheap-object -Wno-error=atomic-alignment -Wno-error=deprecated-builtins -Wno-psabi -Wno-error=builtin-macro-redefined -Wno-vla-cxx-extension -Wno-unknown-warning-option -Wno-implicit-fallthrough -fdeprecated-macro -ferror-limit 19 -fstrict-flex-arrays=1 -stack-protector 2 -fstack-clash-protection -ftrivial-auto-var-init=pattern -fno-rtti -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fno-sized-deallocation -fno-aligned-allocation -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2025-06-27-100320-3286336-1 -x c++ 
Unified_cpp_hunspell_src0.cpp
1/* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2022 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37/*
38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 * And Contributors. All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 *
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 *
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * 3. All modifications to the source code must be clearly marked as
53 * such. Binary redistributions based on modified source code
54 * must be clearly marked as modified versions in the documentation
55 * and/or other materials provided with the distribution.
56 *
57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71#include <stdlib.h>
72#include <string.h>
73#include <stdio.h>
74#include <ctype.h>
75#include <time.h>
76
77#include <algorithm>
78#include <limits>
79#include <string>
80#include <vector>
81
82#include "affixmgr.hxx"
83#include "affentry.hxx"
84#include "langnum.hxx"
85
86#include "csutil.hxx"
87
88AffixMgr::AffixMgr(const char* affpath,
89 const std::vector<HashMgr*>& ptr,
90 const char* key)
91 : alldic(ptr)
92 , pHMgr(ptr[0]) {
93
94 // register hash manager and load affix data from aff file
95 csconv = NULL__null;
96 utf8 = 0;
97 complexprefixes = 0;
98 parsedmaptable = false;
99 parsedbreaktable = false;
100 iconvtable = NULL__null;
101 oconvtable = NULL__null;
102 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
103 simplifiedcpd = 0;
104 parsedcheckcpd = false;
105 parseddefcpd = false;
106 phone = NULL__null;
107 compoundflag = FLAG_NULL0x00; // permits word in compound forms
108 compoundbegin = FLAG_NULL0x00; // may be first word in compound forms
109 compoundmiddle = FLAG_NULL0x00; // may be middle word in compound forms
110 compoundend = FLAG_NULL0x00; // may be last word in compound forms
111 compoundroot = FLAG_NULL0x00; // compound word signing flag
112 compoundpermitflag = FLAG_NULL0x00; // compound permitting flag for suffixed word
113 compoundforbidflag = FLAG_NULL0x00; // compound fordidden flag for suffixed word
114 compoundmoresuffixes = 0; // allow more suffixes within compound words
115 checkcompounddup = 0; // forbid double words in compounds
116 checkcompoundrep = 0; // forbid bad compounds (may be non-compound word with
117 // a REP substitution)
118 checkcompoundcase =
119 0; // forbid upper and lowercase combinations at word bounds
120 checkcompoundtriple = 0; // forbid compounds with triple letters
121 simplifiedtriple = 0; // allow simplified triple letters in compounds
122 // (Schiff+fahrt -> Schiffahrt)
123 forbiddenword = FORBIDDENWORD65510; // forbidden word signing flag
124 nosuggest = FLAG_NULL0x00; // don't suggest words signed with NOSUGGEST flag
125 nongramsuggest = FLAG_NULL0x00;
126 langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
127 needaffix = FLAG_NULL0x00; // forbidden root, allowed only with suffixes
128 cpdwordmax = -1; // default: unlimited wordcount in compound words
129 cpdmin = -1; // undefined
130 cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
131 pfxappnd = NULL__null; // previous prefix for counting syllables of the prefix BUG
132 sfxappnd = NULL__null; // previous suffix for counting syllables of the suffix BUG
133 sfxextra = 0; // modifier for syllable count of sfxappnd BUG
134 checknum = 0; // checking numbers, and word with numbers
135 havecontclass = 0; // flags of possible continuing classes (double affix)
136 // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
137 // in morhological description in dictionary file. It's often combined with
138 // PSEUDOROOT.
139 lemma_present = FLAG_NULL0x00;
140 circumfix = FLAG_NULL0x00;
141 onlyincompound = FLAG_NULL0x00;
142 maxngramsugs = -1; // undefined
143 maxdiff = -1; // undefined
144 onlymaxdiff = 0;
145 maxcpdsugs = -1; // undefined
146 nosplitsugs = 0;
147 sugswithdots = 0;
148 keepcase = 0;
149 forceucase = 0;
150 warn = 0;
151 forbidwarn = 0;
152 checksharps = 0;
153 substandard = FLAG_NULL0x00;
154 fullstrip = 0;
155
156 sfx = NULL__null;
157 pfx = NULL__null;
158
159 for (int i = 0; i < SETSIZE256; i++) {
160 pStart[i] = NULL__null;
161 sStart[i] = NULL__null;
162 pFlag[i] = NULL__null;
163 sFlag[i] = NULL__null;
164 }
165
166 for (int j = 0; j < CONTSIZE65536; j++) {
167 contclasses[j] = 0;
168 }
169
170 if (parse_file(affpath, key)) {
171 HUNSPELL_WARNING(stderrstderr, "Failure loading aff file %s\n", affpath);
172 }
173
174 if (cpdmin == -1)
175 cpdmin = MINCPDLEN3;
176}
177
178AffixMgr::~AffixMgr() {
179 // pass through linked prefix entries and clean up
180 for (int i = 0; i < SETSIZE256; i++) {
181 pFlag[i] = NULL__null;
182 PfxEntry* ptr = pStart[i];
183 PfxEntry* nptr = NULL__null;
184 while (ptr) {
185 nptr = ptr->getNext();
186 delete (ptr);
187 ptr = nptr;
188 nptr = NULL__null;
189 }
190 }
191
192 // pass through linked suffix entries and clean up
193 for (int j = 0; j < SETSIZE256; j++) {
194 sFlag[j] = NULL__null;
195 SfxEntry* ptr = sStart[j];
196 SfxEntry* nptr = NULL__null;
197 while (ptr) {
198 nptr = ptr->getNext();
199 delete (ptr);
200 ptr = nptr;
201 nptr = NULL__null;
202 }
203 sStart[j] = NULL__null;
204 }
205
206 delete iconvtable;
207 delete oconvtable;
208 delete phone;
209
210 FREE_FLAG(compoundflag)compoundflag = 0;
211 FREE_FLAG(compoundbegin)compoundbegin = 0;
212 FREE_FLAG(compoundmiddle)compoundmiddle = 0;
213 FREE_FLAG(compoundend)compoundend = 0;
214 FREE_FLAG(compoundpermitflag)compoundpermitflag = 0;
215 FREE_FLAG(compoundforbidflag)compoundforbidflag = 0;
216 FREE_FLAG(compoundroot)compoundroot = 0;
217 FREE_FLAG(forbiddenword)forbiddenword = 0;
218 FREE_FLAG(nosuggest)nosuggest = 0;
219 FREE_FLAG(nongramsuggest)nongramsuggest = 0;
220 FREE_FLAG(needaffix)needaffix = 0;
221 FREE_FLAG(lemma_present)lemma_present = 0;
222 FREE_FLAG(circumfix)circumfix = 0;
223 FREE_FLAG(onlyincompound)onlyincompound = 0;
224
225 cpdwordmax = 0;
226 pHMgr = NULL__null;
227 cpdmin = 0;
228 cpdmaxsyllable = 0;
229 free_utf_tbl();
230 checknum = 0;
231#ifdef MOZILLA_CLIENT1
232 delete[] csconv;
233#endif
234}
235
// Ends an affix-file parsing run: deletes the reader `afflst` (the pointer
// must not be used by the caller afterwards) and then converts the prefix
// and suffix trees into lists. parse_file() calls this on its error paths
// as well as after the last line has been read.
236void AffixMgr::finishFileMgr(FileMgr* afflst) {
237 delete afflst;
238
239 // convert affix trees to sorted list
240 process_pfx_tree_to_list();
241 process_sfx_tree_to_list();
242}
243
244// read in aff file and build up prefix and suffix entry objects
245int AffixMgr::parse_file(const char* affpath, const char* key) {
246
247 // checking flag duplication
248 char dupflags[CONTSIZE65536];
249 char dupflags_ini = 1;
250
251 // first line indicator for removing byte order mark
252 int firstline = 1;
253
254 // open the affix file
255 FileMgr* afflst = new FileMgr(affpath, key);
256 if (!afflst) {
257 HUNSPELL_WARNING(
258 stderrstderr, "error: could not open affix description file %s\n", affpath);
259 return 1;
260 }
261
262 // step one is to parse the affix file building up the internal
263 // affix data structures
264
265 // read in each line ignoring any that do not
266 // start with a known line type indicator
267 std::string line;
268 while (afflst->getline(line)) {
269 mychomp(line);
270
271 /* remove byte order mark */
272 if (firstline) {
273 firstline = 0;
274 // Affix file begins with byte order mark: possible incompatibility with
275 // old Hunspell versions
276 if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
277 line.erase(0, 3);
278 }
279 }
280
281 /* parse in the keyboard string */
282 if (line.compare(0, 3, "KEY", 3) == 0) {
283 if (!parse_string(line, keystring, afflst->getlinenum())) {
284 finishFileMgr(afflst);
285 return 1;
286 }
287 }
288
289 /* parse in the try string */
290 if (line.compare(0, 3, "TRY", 3) == 0) {
291 if (!parse_string(line, trystring, afflst->getlinenum())) {
292 finishFileMgr(afflst);
293 return 1;
294 }
295 }
296
297 /* parse in the name of the character set used by the .dict and .aff */
298 if (line.compare(0, 3, "SET", 3) == 0) {
299 if (!parse_string(line, encoding, afflst->getlinenum())) {
300 finishFileMgr(afflst);
301 return 1;
302 }
303 if (encoding == "UTF-8") {
304 utf8 = 1;
305#ifndef OPENOFFICEORG
306#ifndef MOZILLA_CLIENT1
307 initialize_utf_tbl();
308#endif
309#endif
310 }
311 }
312
313 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left
314 * writing system */
315 if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
316 complexprefixes = 1;
317
318 /* parse in the flag used by the controlled compound words */
319 if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) {
320 if (!parse_flag(line, &compoundflag, afflst)) {
321 finishFileMgr(afflst);
322 return 1;
323 }
324 }
325
326 /* parse in the flag used by compound words */
327 if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) {
328 if (complexprefixes) {
329 if (!parse_flag(line, &compoundend, afflst)) {
330 finishFileMgr(afflst);
331 return 1;
332 }
333 } else {
334 if (!parse_flag(line, &compoundbegin, afflst)) {
335 finishFileMgr(afflst);
336 return 1;
337 }
338 }
339 }
340
341 /* parse in the flag used by compound words */
342 if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) {
343 if (!parse_flag(line, &compoundmiddle, afflst)) {
344 finishFileMgr(afflst);
345 return 1;
346 }
347 }
348
349 /* parse in the flag used by compound words */
350 if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) {
351 if (complexprefixes) {
352 if (!parse_flag(line, &compoundbegin, afflst)) {
353 finishFileMgr(afflst);
354 return 1;
355 }
356 } else {
357 if (!parse_flag(line, &compoundend, afflst)) {
358 finishFileMgr(afflst);
359 return 1;
360 }
361 }
362 }
363
364 /* parse in the data used by compound_check() method */
365 if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) {
366 if (!parse_num(line, &cpdwordmax, afflst)) {
367 finishFileMgr(afflst);
368 return 1;
369 }
370 }
371
372 /* parse in the flag sign compounds in dictionary */
373 if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) {
374 if (!parse_flag(line, &compoundroot, afflst)) {
375 finishFileMgr(afflst);
376 return 1;
377 }
378 }
379
380 /* parse in the flag used by compound_check() method */
381 if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) {
382 if (!parse_flag(line, &compoundpermitflag, afflst)) {
383 finishFileMgr(afflst);
384 return 1;
385 }
386 }
387
388 /* parse in the flag used by compound_check() method */
389 if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) {
390 if (!parse_flag(line, &compoundforbidflag, afflst)) {
391 finishFileMgr(afflst);
392 return 1;
393 }
394 }
395
396 if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) {
397 compoundmoresuffixes = 1;
398 }
399
400 if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) {
401 checkcompounddup = 1;
402 }
403
404 if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) {
405 checkcompoundrep = 1;
406 }
407
408 if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) {
409 checkcompoundtriple = 1;
410 }
411
412 if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) {
413 simplifiedtriple = 1;
414 }
415
416 if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) {
417 checkcompoundcase = 1;
418 }
419
420 if (line.compare(0, 9, "NOSUGGEST", 9) == 0) {
421 if (!parse_flag(line, &nosuggest, afflst)) {
422 finishFileMgr(afflst);
423 return 1;
424 }
425 }
426
427 if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) {
428 if (!parse_flag(line, &nongramsuggest, afflst)) {
429 finishFileMgr(afflst);
430 return 1;
431 }
432 }
433
434 /* parse in the flag used by forbidden words */
435 if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
436 if (!parse_flag(line, &forbiddenword, afflst)) {
437 finishFileMgr(afflst);
438 return 1;
439 }
440 }
441
442 /* parse in the flag used by forbidden words (is deprecated) */
443 if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) {
444 if (!parse_flag(line, &lemma_present, afflst)) {
445 finishFileMgr(afflst);
446 return 1;
447 }
448 }
449
450 /* parse in the flag used by circumfixes */
451 if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) {
452 if (!parse_flag(line, &circumfix, afflst)) {
453 finishFileMgr(afflst);
454 return 1;
455 }
456 }
457
458 /* parse in the flag used by fogemorphemes */
459 if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) {
460 if (!parse_flag(line, &onlyincompound, afflst)) {
461 finishFileMgr(afflst);
462 return 1;
463 }
464 }
465
466 /* parse in the flag used by `needaffixs' (is deprecated) */
467 if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) {
468 if (!parse_flag(line, &needaffix, afflst)) {
469 finishFileMgr(afflst);
470 return 1;
471 }
472 }
473
474 /* parse in the flag used by `needaffixs' */
475 if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) {
476 if (!parse_flag(line, &needaffix, afflst)) {
477 finishFileMgr(afflst);
478 return 1;
479 }
480 }
481
482 /* parse in the minimal length for words in compounds */
483 if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) {
484 if (!parse_num(line, &cpdmin, afflst)) {
485 finishFileMgr(afflst);
486 return 1;
487 }
488 if (cpdmin < 1)
489 cpdmin = 1;
490 }
491
492 /* parse in the max. words and syllables in compounds */
493 if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) {
494 if (!parse_cpdsyllable(line, afflst)) {
495 finishFileMgr(afflst);
496 return 1;
497 }
498 }
499
500 /* parse in the flag used by compound_check() method */
501 if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) {
502 if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) {
503 finishFileMgr(afflst);
504 return 1;
505 }
506 }
507
508 /* parse in the flag used by the controlled compound words */
509 if (line.compare(0, 8, "CHECKNUM", 8) == 0) {
510 checknum = 1;
511 }
512
513 /* parse in the extra word characters */
514 if (line.compare(0, 9, "WORDCHARS", 9) == 0) {
515 if (!parse_array(line, wordchars, wordchars_utf16,
516 utf8, afflst->getlinenum())) {
517 finishFileMgr(afflst);
518 return 1;
519 }
520 }
521
522 /* parse in the ignored characters (for example, Arabic optional diacretics
523 * charachters */
524 if (line.compare(0, 6, "IGNORE", 6) == 0) {
525 if (!parse_array(line, ignorechars, ignorechars_utf16,
526 utf8, afflst->getlinenum())) {
527 finishFileMgr(afflst);
528 return 1;
529 }
530 }
531
532 /* parse in the input conversion table */
533 if (line.compare(0, 5, "ICONV", 5) == 0) {
534 if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
535 finishFileMgr(afflst);
536 return 1;
537 }
538 }
539
540 /* parse in the output conversion table */
541 if (line.compare(0, 5, "OCONV", 5) == 0) {
542 if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) {
543 finishFileMgr(afflst);
544 return 1;
545 }
546 }
547
548 /* parse in the phonetic translation table */
549 if (line.compare(0, 5, "PHONE", 5) == 0) {
550 if (!parse_phonetable(line, afflst)) {
551 finishFileMgr(afflst);
552 return 1;
553 }
554 }
555
556 /* parse in the checkcompoundpattern table */
557 if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) {
558 if (!parse_checkcpdtable(line, afflst)) {
559 finishFileMgr(afflst);
560 return 1;
561 }
562 }
563
564 /* parse in the defcompound table */
565 if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) {
566 if (!parse_defcpdtable(line, afflst)) {
567 finishFileMgr(afflst);
568 return 1;
569 }
570 }
571
572 /* parse in the related character map table */
573 if (line.compare(0, 3, "MAP", 3) == 0) {
574 if (!parse_maptable(line, afflst)) {
575 finishFileMgr(afflst);
576 return 1;
577 }
578 }
579
580 /* parse in the word breakpoints table */
581 if (line.compare(0, 5, "BREAK", 5) == 0) {
582 if (!parse_breaktable(line, afflst)) {
583 finishFileMgr(afflst);
584 return 1;
585 }
586 }
587
588 /* parse in the language for language specific codes */
589 if (line.compare(0, 4, "LANG", 4) == 0) {
590 if (!parse_string(line, lang, afflst->getlinenum())) {
591 finishFileMgr(afflst);
592 return 1;
593 }
594 langnum = get_lang_num(lang);
595 }
596
597 if (line.compare(0, 7, "VERSION", 7) == 0) {
598 size_t startpos = line.find_first_not_of(" \t", 7);
599 if (startpos != std::string::npos) {
600 version = line.substr(startpos);
601 }
602 }
603
604 if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) {
605 if (!parse_num(line, &maxngramsugs, afflst)) {
606 finishFileMgr(afflst);
607 return 1;
608 }
609 }
610
611 if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0)
612 onlymaxdiff = 1;
613
614 if (line.compare(0, 7, "MAXDIFF", 7) == 0) {
615 if (!parse_num(line, &maxdiff, afflst)) {
616 finishFileMgr(afflst);
617 return 1;
618 }
619 }
620
621 if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) {
622 if (!parse_num(line, &maxcpdsugs, afflst)) {
623 finishFileMgr(afflst);
624 return 1;
625 }
626 }
627
628 if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) {
629 nosplitsugs = 1;
630 }
631
632 if (line.compare(0, 9, "FULLSTRIP", 9) == 0) {
633 fullstrip = 1;
634 }
635
636 if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) {
637 sugswithdots = 1;
638 }
639
640 /* parse in the flag used by forbidden words */
641 if (line.compare(0, 8, "KEEPCASE", 8) == 0) {
642 if (!parse_flag(line, &keepcase, afflst)) {
643 finishFileMgr(afflst);
644 return 1;
645 }
646 }
647
648 /* parse in the flag used by `forceucase' */
649 if (line.compare(0, 10, "FORCEUCASE", 10) == 0) {
650 if (!parse_flag(line, &forceucase, afflst)) {
651 finishFileMgr(afflst);
652 return 1;
653 }
654 }
655
656 /* parse in the flag used by `warn' */
657 if (line.compare(0, 4, "WARN", 4) == 0) {
658 if (!parse_flag(line, &warn, afflst)) {
659 finishFileMgr(afflst);
660 return 1;
661 }
662 }
663
664 if (line.compare(0, 10, "FORBIDWARN", 10) == 0) {
665 forbidwarn = 1;
666 }
667
668 /* parse in the flag used by the affix generator */
669 if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) {
670 if (!parse_flag(line, &substandard, afflst)) {
671 finishFileMgr(afflst);
672 return 1;
673 }
674 }
675
676 if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) {
677 checksharps = 1;
678 }
679
680 /* parse this affix: P - prefix, S - suffix */
681 // affix type
682 char ft = ' ';
683 if (line.compare(0, 3, "PFX", 3) == 0)
684 ft = complexprefixes ? 'S' : 'P';
685 if (line.compare(0, 3, "SFX", 3) == 0)
686 ft = complexprefixes ? 'P' : 'S';
687 if (ft != ' ') {
688 if (dupflags_ini) {
689 memset(dupflags, 0, sizeof(dupflags));
690 dupflags_ini = 0;
691 }
692 if (!parse_affix(line, ft, afflst, dupflags)) {
693 finishFileMgr(afflst);
694 return 1;
695 }
696 }
697 }
698
699 finishFileMgr(afflst);
700 // affix trees are sorted now
701
702 // now we can speed up performance greatly taking advantage of the
703 // relationship between the affixes and the idea of "subsets".
704
705 // View each prefix as a potential leading subset of another and view
706 // each suffix (reversed) as a potential trailing subset of another.
707
708 // To illustrate this relationship if we know the prefix "ab" is found in the
709 // word to examine, only prefixes that "ab" is a leading subset of need be
710 // examined.
711 // Furthermore is "ab" is not present then none of the prefixes that "ab" is
712 // is a subset need be examined.
713 // The same argument goes for suffix string that are reversed.
714
715 // Then to top this off why not examine the first char of the word to quickly
716 // limit the set of prefixes to examine (i.e. the prefixes to examine must
717 // be leading supersets of the first character of the word (if they exist)
718
719 // To take advantage of this "subset" relationship, we need to add two links
720 // from entry. One to take next if the current prefix is found (call it
721 // nexteq)
722 // and one to take next if the current prefix is not found (call it nextne).
723
724 // Since we have built ordered lists, all that remains is to properly
725 // initialize
726 // the nextne and nexteq pointers that relate them
727
728 process_pfx_order();
729 process_sfx_order();
730
731 /* get encoding for CHECKCOMPOUNDCASE */
732 if (!utf8) {
733 csconv = get_current_cs(get_encoding());
734 for (int i = 0; i <= 255; i++) {
735 if ((csconv[i].cupper != csconv[i].clower) &&
736 (wordchars.find((char)i) == std::string::npos)) {
737 wordchars.push_back((char)i);
738 }
739 }
740
741 }
742
743 // default BREAK definition
744 if (!parsedbreaktable) {
745 breaktable.push_back("-");
746 breaktable.push_back("^-");
747 breaktable.push_back("-$");
748 parsedbreaktable = true;
749 }
750 return 0;
751}
752
753// we want to be able to quickly access prefix information
754// both by prefix flag, and sorted by prefix string itself
755// so we need to set up two indexes
756
757int AffixMgr::build_pfxtree(PfxEntry* pfxptr) {
758 PfxEntry* ptr;
759 PfxEntry* pptr;
760 PfxEntry* ep = pfxptr;
761
762 // get the right starting points
763 const char* key = ep->getKey();
764 const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
765
766 // first index by flag which must exist
767 ptr = pFlag[flg];
768 ep->setFlgNxt(ptr);
769 pFlag[flg] = ep;
770
771 // handle the special case of null affix string
772 if (strlen(key) == 0) {
773 // always inset them at head of list at element 0
774 ptr = pStart[0];
775 ep->setNext(ptr);
776 pStart[0] = ep;
777 return 0;
778 }
779
780 // now handle the normal case
781 ep->setNextEQ(NULL__null);
782 ep->setNextNE(NULL__null);
783
784 unsigned char sp = *((const unsigned char*)key);
785 ptr = pStart[sp];
786
787 // handle the first insert
788 if (!ptr) {
789 pStart[sp] = ep;
790 return 0;
791 }
792
793 // otherwise use binary tree insertion so that a sorted
794 // list can easily be generated later
795 pptr = NULL__null;
796 for (;;) {
797 pptr = ptr;
798 if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
799 ptr = ptr->getNextEQ();
800 if (!ptr) {
801 pptr->setNextEQ(ep);
802 break;
803 }
804 } else {
805 ptr = ptr->getNextNE();
806 if (!ptr) {
807 pptr->setNextNE(ep);
808 break;
809 }
810 }
811 }
812 return 0;
813}
814
815// we want to be able to quickly access suffix information
816// both by suffix flag, and sorted by the reverse of the
817// suffix string itself; so we need to set up two indexes
818int AffixMgr::build_sfxtree(SfxEntry* sfxptr) {
819
820 sfxptr->initReverseWord();
821
822 SfxEntry* ptr;
823 SfxEntry* pptr;
824 SfxEntry* ep = sfxptr;
825
826 /* get the right starting point */
827 const char* key = ep->getKey();
828 const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
829
830 // first index by flag which must exist
831 ptr = sFlag[flg];
832 ep->setFlgNxt(ptr);
833 sFlag[flg] = ep;
834
835 // next index by affix string
836
837 // handle the special case of null affix string
838 if (strlen(key) == 0) {
839 // always inset them at head of list at element 0
840 ptr = sStart[0];
841 ep->setNext(ptr);
842 sStart[0] = ep;
843 return 0;
844 }
845
846 // now handle the normal case
847 ep->setNextEQ(NULL__null);
848 ep->setNextNE(NULL__null);
849
850 unsigned char sp = *((const unsigned char*)key);
851 ptr = sStart[sp];
852
853 // handle the first insert
854 if (!ptr) {
855 sStart[sp] = ep;
856 return 0;
857 }
858
859 // otherwise use binary tree insertion so that a sorted
860 // list can easily be generated later
861 pptr = NULL__null;
862 for (;;) {
863 pptr = ptr;
864 if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
865 ptr = ptr->getNextEQ();
866 if (!ptr) {
867 pptr->setNextEQ(ep);
868 break;
869 }
870 } else {
871 ptr = ptr->getNextNE();
872 if (!ptr) {
873 pptr->setNextNE(ep);
874 break;
875 }
876 }
877 }
878 return 0;
879}
880
881// convert from binary tree to sorted list
882int AffixMgr::process_pfx_tree_to_list() {
883 for (int i = 1; i < SETSIZE256; i++) {
884 pStart[i] = process_pfx_in_order(pStart[i], NULL__null);
885 }
886 return 0;
887}
888
889PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) {
890 if (ptr) {
891 nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
892 ptr->setNext(nptr);
893 nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
894 }
895 return nptr;
896}
897
898// convert from binary tree to sorted list
899int AffixMgr::process_sfx_tree_to_list() {
900 for (int i = 1; i < SETSIZE256; i++) {
901 sStart[i] = process_sfx_in_order(sStart[i], NULL__null);
902 }
903 return 0;
904}
905
906SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) {
907 if (ptr) {
908 nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
909 ptr->setNext(nptr);
910 nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
911 }
912 return nptr;
913}
914
915// reinitialize the PfxEntry links NextEQ and NextNE to speed searching
916// using the idea of leading subsets this time
917int AffixMgr::process_pfx_order() {
918 PfxEntry* ptr;
919
920 // loop through each prefix list starting point
921 for (int i = 1; i < SETSIZE256; i++) {
922 ptr = pStart[i];
923
924 // look through the remainder of the list
925 // and find next entry with affix that
926 // the current one is not a subset of
927 // mark that as destination for NextNE
928 // use next in list that you are a subset
929 // of as NextEQ
930
931 for (; ptr != NULL__null; ptr = ptr->getNext()) {
932 PfxEntry* nptr = ptr->getNext();
933 for (; nptr != NULL__null; nptr = nptr->getNext()) {
934 if (!isSubset(ptr->getKey(), nptr->getKey()))
935 break;
936 }
937 ptr->setNextNE(nptr);
938 ptr->setNextEQ(NULL__null);
939 if ((ptr->getNext()) &&
940 isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
941 ptr->setNextEQ(ptr->getNext());
942 }
943
944 // now clean up by adding smart search termination strings:
945 // if you are already a superset of the previous prefix
946 // but not a subset of the next, search can end here
947 // so set NextNE properly
948
949 ptr = pStart[i];
950 for (; ptr != NULL__null; ptr = ptr->getNext()) {
951 PfxEntry* nptr = ptr->getNext();
952 PfxEntry* mptr = NULL__null;
953 for (; nptr != NULL__null; nptr = nptr->getNext()) {
954 if (!isSubset(ptr->getKey(), nptr->getKey()))
955 break;
956 mptr = nptr;
957 }
958 if (mptr)
959 mptr->setNextNE(NULL__null);
960 }
961 }
962 return 0;
963}
964
965// initialize the SfxEntry links NextEQ and NextNE to speed searching
966// using the idea of leading subsets this time
967int AffixMgr::process_sfx_order() {
968 SfxEntry* ptr;
969
970 // loop through each prefix list starting point
971 for (int i = 1; i < SETSIZE256; i++) {
972 ptr = sStart[i];
973
974 // look through the remainder of the list
975 // and find next entry with affix that
976 // the current one is not a subset of
977 // mark that as destination for NextNE
978 // use next in list that you are a subset
979 // of as NextEQ
980
981 for (; ptr != NULL__null; ptr = ptr->getNext()) {
982 SfxEntry* nptr = ptr->getNext();
983 for (; nptr != NULL__null; nptr = nptr->getNext()) {
984 if (!isSubset(ptr->getKey(), nptr->getKey()))
985 break;
986 }
987 ptr->setNextNE(nptr);
988 ptr->setNextEQ(NULL__null);
989 if ((ptr->getNext()) &&
990 isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
991 ptr->setNextEQ(ptr->getNext());
992 }
993
994 // now clean up by adding smart search termination strings:
995 // if you are already a superset of the previous suffix
996 // but not a subset of the next, search can end here
997 // so set NextNE properly
998
999 ptr = sStart[i];
1000 for (; ptr != NULL__null; ptr = ptr->getNext()) {
1001 SfxEntry* nptr = ptr->getNext();
1002 SfxEntry* mptr = NULL__null;
1003 for (; nptr != NULL__null; nptr = nptr->getNext()) {
1004 if (!isSubset(ptr->getKey(), nptr->getKey()))
1005 break;
1006 mptr = nptr;
1007 }
1008 if (mptr)
1009 mptr->setNextNE(NULL__null);
1010 }
1011 }
1012 return 0;
1013}
1014
1015// add flags to the result for dictionary debugging
1016std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
1017 char* st = encode_flag(flag);
1018 result.push_back(MSEP_FLD' ');
1019 result.append(MORPH_FLAG"fl:");
1020 if (st) {
1021 result.append(st);
1022 free(st)HunspellAllocator::CountingFree(st);
1023 }
1024 return result;
1025}
1026
1027// calculate the character length of the condition
1028int AffixMgr::condlen(const char* st) {
1029 int l = 0;
1030 bool group = false;
1031 for (; *st; st++) {
1032 if (*st == '[') {
1033 group = true;
1034 l++;
1035 } else if (*st == ']')
1036 group = false;
1037 else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80))))
1038 l++;
1039 }
1040 return l;
1041}
1042
1043int AffixMgr::encodeit(AffEntry& entry, const char* cs) {
1044 if (strcmp(cs, ".") != 0) {
1045 entry.numconds = (char)condlen(cs);
1046 const size_t cslen = strlen(cs);
1047 const size_t short_part = std::min<size_t>(MAXCONDLEN20, cslen);
1048 memcpy(entry.c.conds, cs, short_part);
1049 if (short_part < MAXCONDLEN20) {
1050 //blank out the remaining space
1051 memset(entry.c.conds + short_part, 0, MAXCONDLEN20 - short_part);
1052 } else if (cs[MAXCONDLEN20]) {
1053 //there is more conditions than fit in fixed space, so its
1054 //a long condition
1055 entry.opts |= aeLONGCOND(1 << 4);
1056 entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1(20 - sizeof(char*)));
1057 if (!entry.c.l.conds2)
1058 return 1;
1059 }
1060 } else {
1061 entry.numconds = 0;
1062 entry.c.conds[0] = '\0';
1063 }
1064 return 0;
1065}
1066
1067// return 1 if s1 is a leading subset of s2 (dots are for infixes)
1068inline int AffixMgr::isSubset(const char* s1, const char* s2) {
1069 while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1070 s1++;
1071 s2++;
1072 }
1073 return (*s1 == '\0');
1074}
1075
1076// check word for prefixes
1077struct hentry* AffixMgr::prefix_check(const char* word,
1078 int len,
1079 char in_compound,
1080 const FLAGunsigned short needflag) {
1081 struct hentry* rv = NULL__null;
1082
1083 pfx = NULL__null;
1084 pfxappnd = NULL__null;
1085 sfxappnd = NULL__null;
1086 sfxextra = 0;
1087
1088 // first handle the special case of 0 length prefixes
1089 PfxEntry* pe = pStart[0];
1090 while (pe) {
1091 if (
1092 // fogemorpheme
1093 ((in_compound != IN_CPD_NOT0) ||
1094 !(pe->getCont() &&
1095 (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())(std::binary_search(pe->getCont(), pe->getCont() + pe->
getContLen(), onlyincompound))
))) &&
1096 // permit prefixes in compounds
1097 ((in_compound != IN_CPD_END2) ||
1098 (pe->getCont() &&
1099 (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen())(std::binary_search(pe->getCont(), pe->getCont() + pe->
getContLen(), compoundpermitflag))
)))) {
1100 // check prefix
1101 rv = pe->checkword(word, len, in_compound, needflag);
1102 if (rv) {
1103 pfx = pe; // BUG: pfx not stateless
1104 return rv;
1105 }
1106 }
1107 pe = pe->getNext();
1108 }
1109
1110 // now handle the general case
1111 unsigned char sp = *((const unsigned char*)word);
1112 PfxEntry* pptr = pStart[sp];
1113
1114 while (pptr) {
1115 if (isSubset(pptr->getKey(), word)) {
1116 if (
1117 // fogemorpheme
1118 ((in_compound != IN_CPD_NOT0) ||
1119 !(pptr->getCont() &&
1120 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())(std::binary_search(pptr->getCont(), pptr->getCont() + pptr
->getContLen(), onlyincompound))
))) &&
1121 // permit prefixes in compounds
1122 ((in_compound != IN_CPD_END2) ||
1123 (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag,(std::binary_search(pptr->getCont(), pptr->getCont() + pptr
->getContLen(), compoundpermitflag))
1124 pptr->getContLen())(std::binary_search(pptr->getCont(), pptr->getCont() + pptr
->getContLen(), compoundpermitflag))
)))) {
1125 // check prefix
1126 rv = pptr->checkword(word, len, in_compound, needflag);
1127 if (rv) {
1128 pfx = pptr; // BUG: pfx not stateless
1129 return rv;
1130 }
1131 }
1132 pptr = pptr->getNextEQ();
1133 } else {
1134 pptr = pptr->getNextNE();
1135 }
1136 }
1137
1138 return NULL__null;
1139}
1140
1141// check word for prefixes and two-level suffixes
1142struct hentry* AffixMgr::prefix_check_twosfx(const char* word,
1143 int len,
1144 char in_compound,
1145 const FLAGunsigned short needflag) {
1146 struct hentry* rv = NULL__null;
1147
1148 pfx = NULL__null;
1149 sfxappnd = NULL__null;
1150 sfxextra = 0;
1151
1152 // first handle the special case of 0 length prefixes
1153 PfxEntry* pe = pStart[0];
1154
1155 while (pe) {
1156 rv = pe->check_twosfx(word, len, in_compound, needflag);
1157 if (rv)
1158 return rv;
1159 pe = pe->getNext();
1160 }
1161
1162 // now handle the general case
1163 unsigned char sp = *((const unsigned char*)word);
1164 PfxEntry* pptr = pStart[sp];
1165
1166 while (pptr) {
1167 if (isSubset(pptr->getKey(), word)) {
1168 rv = pptr->check_twosfx(word, len, in_compound, needflag);
1169 if (rv) {
1170 pfx = pptr;
1171 return rv;
1172 }
1173 pptr = pptr->getNextEQ();
1174 } else {
1175 pptr = pptr->getNextNE();
1176 }
1177 }
1178
1179 return NULL__null;
1180}
1181
1182// check word for prefixes and morph
1183std::string AffixMgr::prefix_check_morph(const char* word,
1184 int len,
1185 char in_compound,
1186 const FLAGunsigned short needflag) {
1187
1188 std::string result;
1189
1190 pfx = NULL__null;
1191 sfxappnd = NULL__null;
1192 sfxextra = 0;
1193
1194 // first handle the special case of 0 length prefixes
1195 PfxEntry* pe = pStart[0];
1196 while (pe) {
1197 std::string st = pe->check_morph(word, len, in_compound, needflag);
1198 if (!st.empty()) {
1199 result.append(st);
1200 }
1201 pe = pe->getNext();
1202 }
1203
1204 // now handle the general case
1205 unsigned char sp = *((const unsigned char*)word);
1206 PfxEntry* pptr = pStart[sp];
1207
1208 while (pptr) {
1209 if (isSubset(pptr->getKey(), word)) {
1210 std::string st = pptr->check_morph(word, len, in_compound, needflag);
1211 if (!st.empty()) {
1212 // fogemorpheme
1213 if ((in_compound != IN_CPD_NOT0) ||
1214 !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound,(std::binary_search(pptr->getCont(), pptr->getCont() + pptr
->getContLen(), onlyincompound))
1215 pptr->getContLen())(std::binary_search(pptr->getCont(), pptr->getCont() + pptr
->getContLen(), onlyincompound))
)))) {
1216 result.append(st);
1217 pfx = pptr;
1218 }
1219 }
1220 pptr = pptr->getNextEQ();
1221 } else {
1222 pptr = pptr->getNextNE();
1223 }
1224 }
1225
1226 return result;
1227}
1228
1229// check word for prefixes and morph and two-level suffixes
1230std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
1231 int len,
1232 char in_compound,
1233 const FLAGunsigned short needflag) {
1234 std::string result;
1235
1236 pfx = NULL__null;
1237 sfxappnd = NULL__null;
1238 sfxextra = 0;
1239
1240 // first handle the special case of 0 length prefixes
1241 PfxEntry* pe = pStart[0];
1242 while (pe) {
1243 std::string st = pe->check_twosfx_morph(word, len, in_compound, needflag);
1244 if (!st.empty()) {
1245 result.append(st);
1246 }
1247 pe = pe->getNext();
1248 }
1249
1250 // now handle the general case
1251 unsigned char sp = *((const unsigned char*)word);
1252 PfxEntry* pptr = pStart[sp];
1253
1254 while (pptr) {
1255 if (isSubset(pptr->getKey(), word)) {
1256 std::string st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1257 if (!st.empty()) {
1258 result.append(st);
1259 pfx = pptr;
1260 }
1261 pptr = pptr->getNextEQ();
1262 } else {
1263 pptr = pptr->getNextNE();
1264 }
1265 }
1266
1267 return result;
1268}
1269
1270// Is word a non-compound with a REP substitution (see checkcompoundrep)?
1271int AffixMgr::cpdrep_check(const char* word, int wl) {
1272
1273 if ((wl < 2) || get_reptable().empty())
1274 return 0;
1275
1276 for (size_t i = 0; i < get_reptable().size(); ++i) {
1277 // use only available mid patterns
1278 if (!get_reptable()[i].outstrings[0].empty()) {
1279 const char* r = word;
1280 const size_t lenp = get_reptable()[i].pattern.size();
1281 // search every occurence of the pattern in the word
1282 while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL__null) {
1283 std::string candidate(word);
1284 candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]);
1285 if (candidate_check(candidate.c_str(), candidate.size()))
1286 return 1;
1287 ++r; // search for the next letter
1288 }
1289 }
1290 }
1291
1292 return 0;
1293}
1294
1295// forbid compound words, if they are in the dictionary as a
1296// word pair separated by space
1297int AffixMgr::cpdwordpair_check(const char * word, int wl) {
1298 if (wl > 2) {
1299 std::string candidate(word);
1300 for (size_t i = 1; i < candidate.size(); i++) {
1301 // go to end of the UTF-8 character
1302 if (utf8 && ((word[i] & 0xc0) == 0x80))
1303 continue;
1304 candidate.insert(i, 1, ' ');
1305 if (candidate_check(candidate.c_str(), candidate.size()))
1306 return 1;
1307 candidate.erase(i, 1);
1308 }
1309 }
1310
1311 return 0;
1312}
1313
1314// forbid compoundings when there are special patterns at word bound
1315int AffixMgr::cpdpat_check(const char* word,
1316 int pos,
1317 hentry* r1,
1318 hentry* r2,
1319 const char /*affixed*/) {
1320 for (size_t i = 0; i < checkcpdtable.size(); ++i) {
1321 size_t len;
1322 if (isSubset(checkcpdtable[i].pattern2.c_str(), word + pos) &&
1323 (!r1 || !checkcpdtable[i].cond ||
1324 (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen)(std::binary_search(r1->astr, r1->astr + r1->alen, checkcpdtable
[i].cond))
)) &&
1325 (!r2 || !checkcpdtable[i].cond2 ||
1326 (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen)(std::binary_search(r2->astr, r2->astr + r2->alen, checkcpdtable
[i].cond2))
)) &&
1327 // zero length pattern => only TESTAFF
1328 // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1329 (checkcpdtable[i].pattern.empty() ||
1330 ((checkcpdtable[i].pattern[0] == '0' && r1->blen <= pos &&
1331 strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
1332 (checkcpdtable[i].pattern[0] != '0' &&
1333 ((len = checkcpdtable[i].pattern.size()) != 0) &&
1334 strncmp(word + pos - len, checkcpdtable[i].pattern.c_str(), len) == 0)))) {
1335 return 1;
1336 }
1337 }
1338 return 0;
1339}
1340
1341// forbid compounding with neighbouring upper and lower case characters at word
1342// bounds
1343int AffixMgr::cpdcase_check(const char* word, int pos) {
1344 if (utf8) {
1345 const char* p;
1346 for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--)
1347 ;
1348 std::string pair(p);
1349 std::vector<w_char> pair_u;
1350 u8_u16(pair_u, pair);
1351 unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0;
1352 unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0;
1353 if (((unicodetoupper(a, langnum) == a) ||
1354 (unicodetoupper(b, langnum) == b)) &&
1355 (a != '-') && (b != '-'))
1356 return 1;
1357 } else {
1358 unsigned char a = *(word + pos - 1);
1359 unsigned char b = *(word + pos);
1360 if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-'))
1361 return 1;
1362 }
1363 return 0;
1364}
1365
// One backtracking record for a '*' / '?' metacharacter met while matching
// a compound rule (see AffixMgr::defcpd_check()).
struct metachar_data {
  short btpp;  // pattern position to resume from (just after the metacharacter)
  short btwp;  // position in the word list where the metacharacter started
  int btnum;   // number of words currently consumed by the metacharacter
};
1371
1372// check compound patterns
1373int AffixMgr::defcpd_check(hentry*** words,
1374 short wnum,
1375 hentry* rv,
1376 hentry** def,
1377 char all) {
1378 int w = 0;
1379
1380 if (!*words) {
1381 w = 1;
1382 *words = def;
1383 }
1384
1385 if (!*words) {
1386 return 0;
1387 }
1388
1389 std::vector<metachar_data> btinfo(1);
1390
1391 short bt = 0;
1392
1393 (*words)[wnum] = rv;
1394
1395 // has the last word COMPOUNDRULE flag?
1396 if (rv->alen == 0) {
1397 (*words)[wnum] = NULL__null;
1398 if (w)
1399 *words = NULL__null;
1400 return 0;
1401 }
1402 int ok = 0;
1403 for (size_t i = 0; i < defcpdtable.size(); ++i) {
1404 for (size_t j = 0; j < defcpdtable[i].size(); ++j) {
1405 if (defcpdtable[i][j] != '*' && defcpdtable[i][j] != '?' &&
1406 TESTAFF(rv->astr, defcpdtable[i][j], rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, defcpdtable
[i][j]))
) {
1407 ok = 1;
1408 break;
1409 }
1410 }
1411 }
1412 if (ok == 0) {
1413 (*words)[wnum] = NULL__null;
1414 if (w)
1415 *words = NULL__null;
1416 return 0;
1417 }
1418
1419 for (size_t i = 0; i < defcpdtable.size(); ++i) {
1420 size_t pp = 0; // pattern position
1421 signed short wp = 0; // "words" position
1422 int ok2;
1423 ok = 1;
1424 ok2 = 1;
1425 do {
1426 while ((pp < defcpdtable[i].size()) && (wp <= wnum)) {
1427 if (((pp + 1) < defcpdtable[i].size()) &&
1428 ((defcpdtable[i][pp + 1] == '*') ||
1429 (defcpdtable[i][pp + 1] == '?'))) {
1430 int wend = (defcpdtable[i][pp + 1] == '?') ? wp : wnum;
1431 ok2 = 1;
1432 pp += 2;
1433 btinfo[bt].btpp = pp;
1434 btinfo[bt].btwp = wp;
1435 while (wp <= wend) {
1436 if (!(*words)[wp]->alen ||
1437 !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp - 2],(std::binary_search((*words)[wp]->astr, (*words)[wp]->astr
+ (*words)[wp]->alen, defcpdtable[i][pp - 2]))
1438 (*words)[wp]->alen)(std::binary_search((*words)[wp]->astr, (*words)[wp]->astr
+ (*words)[wp]->alen, defcpdtable[i][pp - 2]))
) {
1439 ok2 = 0;
1440 break;
1441 }
1442 wp++;
1443 }
1444 if (wp <= wnum)
1445 ok2 = 0;
1446 btinfo[bt].btnum = wp - btinfo[bt].btwp;
1447 if (btinfo[bt].btnum > 0) {
1448 ++bt;
1449 btinfo.resize(bt+1);
1450 }
1451 if (ok2)
1452 break;
1453 } else {
1454 ok2 = 1;
1455 if (!(*words)[wp] || !(*words)[wp]->alen ||
1456 !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp],(std::binary_search((*words)[wp]->astr, (*words)[wp]->astr
+ (*words)[wp]->alen, defcpdtable[i][pp]))
1457 (*words)[wp]->alen)(std::binary_search((*words)[wp]->astr, (*words)[wp]->astr
+ (*words)[wp]->alen, defcpdtable[i][pp]))
) {
1458 ok = 0;
1459 break;
1460 }
1461 pp++;
1462 wp++;
1463 if ((defcpdtable[i].size() == pp) && !(wp > wnum))
1464 ok = 0;
1465 }
1466 }
1467 if (ok && ok2) {
1468 size_t r = pp;
1469 while ((defcpdtable[i].size() > r) && ((r + 1) < defcpdtable[i].size()) &&
1470 ((defcpdtable[i][r + 1] == '*') ||
1471 (defcpdtable[i][r + 1] == '?')))
1472 r += 2;
1473 if (defcpdtable[i].size() <= r)
1474 return 1;
1475 }
1476 // backtrack
1477 if (bt)
1478 do {
1479 ok = 1;
1480 btinfo[bt - 1].btnum--;
1481 pp = btinfo[bt - 1].btpp;
1482 wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
1483 } while ((btinfo[bt - 1].btnum < 0) && --bt);
1484 } while (bt);
1485
1486 if (ok && ok2 && (!all || (defcpdtable[i].size() <= pp)))
1487 return 1;
1488
1489 // check zero ending
1490 while (ok && ok2 && (defcpdtable[i].size() > pp) &&
1491 ((pp + 1) < defcpdtable[i].size()) &&
1492 ((defcpdtable[i][pp + 1] == '*') ||
1493 (defcpdtable[i][pp + 1] == '?')))
1494 pp += 2;
1495 if (ok && ok2 && (defcpdtable[i].size() <= pp))
1496 return 1;
1497 }
1498 (*words)[wnum] = NULL__null;
1499 if (w)
1500 *words = NULL__null;
1501 return 0;
1502}
1503
1504inline int AffixMgr::candidate_check(const char* word, int len) {
1505
1506 struct hentry* rv = lookup(word);
1507 if (rv)
1508 return 1;
1509
1510 // rv = prefix_check(word,len,1);
1511 // if (rv) return 1;
1512
1513 rv = affix_check(word, len);
1514 if (rv)
1515 return 1;
1516 return 0;
1517}
1518
1519// calculate number of syllable for compound-checking
1520short AffixMgr::get_syllable(const std::string& word) {
1521 if (cpdmaxsyllable == 0)
1522 return 0;
1523
1524 short num = 0;
1525
1526 if (!utf8) {
1527 for (size_t i = 0; i < word.size(); ++i) {
1528 if (std::binary_search(cpdvowels.begin(), cpdvowels.end(),
1529 word[i])) {
1530 ++num;
1531 }
1532 }
1533 } else if (!cpdvowels_utf16.empty()) {
1534 std::vector<w_char> w;
1535 u8_u16(w, word);
1536 for (size_t i = 0; i < w.size(); ++i) {
1537 if (std::binary_search(cpdvowels_utf16.begin(),
1538 cpdvowels_utf16.end(),
1539 w[i])) {
1540 ++num;
1541 }
1542 }
1543 }
1544
1545 return num;
1546}
1547
1548void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) {
1549 if (utf8) {
1550 int i;
1551 for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
1552 for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
1553 ;
1554 }
1555 for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) {
1556 for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
1557 ;
1558 }
1559 } else {
1560 *cmin = cpdmin;
1561 *cmax = len - cpdmin + 1;
1562 }
1563}
1564
1565// check if compound word is correctly spelled
1566// hu_mov_rule = spec. Hungarian rule (XXX)
1567struct hentry* AffixMgr::compound_check(const std::string& word,
1568 short wordnum,
1569 short numsyllable,
1570 short maxwordnum,
1571 short wnum,
1572 hentry** words = NULL__null,
1573 hentry** rwords = NULL__null,
1574 char hu_mov_rule = 0,
1575 char is_sug = 0,
1576 int* info = NULL__null) {
1577 int i;
1578 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1579 struct hentry* rv = NULL__null;
1580 struct hentry* rv_first;
1581 std::string st;
1582 char ch = '\0';
1583 int cmin;
1584 int cmax;
1585 int striple = 0;
1586 size_t scpd = 0;
1587 int soldi = 0;
1588 int oldcmin = 0;
1589 int oldcmax = 0;
1590 int oldlen = 0;
1591 int checkedstriple = 0;
1592 char affixed = 0;
1593 hentry** oldwords = words;
1594 size_t len = word.size();
1595
1596 int checked_prefix;
1597
1598 // add a time limit to handle possible
1599 // combinatorical explosion of the overlapping words
1600
1601 HUNSPELL_THREAD_LOCALthread_local clock_t timelimit;
1602
1603 if (wordnum == 0) {
1604 // get the start time, seeing as we're reusing this set to 0
1605 // to flag timeout, use clock() + 1 to avoid start clock()
1606 // of 0 as being a timeout
1607 timelimit = clock() + 1;
1608 }
1609 else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT(((__clock_t) 1000000) / 20))) {
1610 timelimit = 0;
1611 }
1612
1613 setcminmax(&cmin, &cmax, word.c_str(), len);
1614
1615 st.assign(word);
1616
1617 for (i = cmin; i < cmax; i++) {
1618 // go to end of the UTF-8 character
1619 if (utf8) {
1620 for (; (st[i] & 0xc0) == 0x80; i++)
1621 ;
1622 if (i >= cmax)
1623 return NULL__null;
1624 }
1625
1626 words = oldwords;
1627 int onlycpdrule = (words) ? 1 : 0;
1628
1629 do { // onlycpdrule loop
1630
1631 oldnumsyllable = numsyllable;
1632 oldwordnum = wordnum;
1633 checked_prefix = 0;
1634
1635 do { // simplified checkcompoundpattern loop
1636
1637 if (timelimit == 0)
1638 return 0;
1639
1640 if (scpd > 0) {
1641 for (; scpd <= checkcpdtable.size() &&
1642 (checkcpdtable[scpd - 1].pattern3.empty() ||
1643 strncmp(word.c_str() + i, checkcpdtable[scpd - 1].pattern3.c_str(),
1644 checkcpdtable[scpd - 1].pattern3.size()) != 0);
1645 scpd++)
1646 ;
1647
1648 if (scpd > checkcpdtable.size())
1649 break; // break simplified checkcompoundpattern loop
1650 st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
1651 soldi = i;
1652 i += checkcpdtable[scpd - 1].pattern.size();
1653 st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
1654 st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos,
1655 word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size()));
1656
1657 oldlen = len;
1658 len += checkcpdtable[scpd - 1].pattern.size() +
1659 checkcpdtable[scpd - 1].pattern2.size() -
1660 checkcpdtable[scpd - 1].pattern3.size();
1661 oldcmin = cmin;
1662 oldcmax = cmax;
1663 setcminmax(&cmin, &cmax, st.c_str(), len);
1664
1665 cmax = len - cpdmin + 1;
1666 }
1667
1668 ch = st[i];
1669 st[i] = '\0';
1670
1671 sfx = NULL__null;
1672 pfx = NULL__null;
1673
1674 // FIRST WORD
1675
1676 affixed = 1;
1677 rv = lookup(st.c_str()); // perhaps without prefix
1678
1679 // forbid dictionary stems with COMPOUNDFORBIDFLAG in
1680 // compound words, overriding the effect of COMPOUNDPERMITFLAG
1681 if ((rv) && compoundforbidflag &&
1682 TESTAFF(rv->astr, compoundforbidflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundforbidflag
))
&& !hu_mov_rule)
1683 continue;
1684
1685 // search homonym with compound flag
1686 while ((rv) && !hu_mov_rule &&
1687 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, needaffix
))
) ||
1688 !((compoundflag && !words && !onlycpdrule &&
1689 TESTAFF(rv->astr, compoundflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundflag
))
) ||
1690 (compoundbegin && !wordnum && !onlycpdrule &&
1691 TESTAFF(rv->astr, compoundbegin, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundbegin
))
) ||
1692 (compoundmiddle && wordnum && !words && !onlycpdrule &&
1693 TESTAFF(rv->astr, compoundmiddle, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundmiddle
))
) ||
1694 (!defcpdtable.empty() && onlycpdrule &&
1695 ((!words && !wordnum &&
1696 defcpd_check(&words, wnum, rv, rwords, 0)) ||
1697 (words &&
1698 defcpd_check(&words, wnum, rv, rwords, 0))))) ||
1699 (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL0x00 &&
1700 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, checkcpdtable
[scpd - 1].cond))
))) {
1701 rv = rv->next_homonym;
1702 }
1703
1704 if (rv)
1705 affixed = 0;
1706
1707 if (!rv) {
1708 if (onlycpdrule)
1709 break;
1710 if (compoundflag &&
1711 !(rv = prefix_check(st.c_str(), i,
1712 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1,
1713 compoundflag))) {
1714 if (((rv = suffix_check(
1715 st.c_str(), i, 0, NULL__null, FLAG_NULL0x00, compoundflag,
1716 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1)) ||
1717 (compoundmoresuffixes &&
1718 (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL__null, compoundflag)))) &&
1719 !hu_mov_rule && sfx->getCont() &&
1720 ((compoundforbidflag &&
1721 TESTAFF(sfx->getCont(), compoundforbidflag,(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
1722 sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
) ||
1723 (compoundend &&
1724 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundend))
))) {
1725 rv = NULL__null;
1726 }
1727 }
1728
1729 if (rv ||
1730 (((wordnum == 0) && compoundbegin &&
1731 ((rv = suffix_check(
1732 st.c_str(), i, 0, NULL__null, FLAG_NULL0x00, compoundbegin,
1733 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1)) ||
1734 (compoundmoresuffixes &&
1735 (rv = suffix_check_twosfx(
1736 st.c_str(), i, 0, NULL__null,
1737 compoundbegin))) || // twofold suffixes + compound
1738 (rv = prefix_check(st.c_str(), i,
1739 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1,
1740 compoundbegin)))) ||
1741 ((wordnum > 0) && compoundmiddle &&
1742 ((rv = suffix_check(
1743 st.c_str(), i, 0, NULL__null, FLAG_NULL0x00, compoundmiddle,
1744 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1)) ||
1745 (compoundmoresuffixes &&
1746 (rv = suffix_check_twosfx(
1747 st.c_str(), i, 0, NULL__null,
1748 compoundmiddle))) || // twofold suffixes + compound
1749 (rv = prefix_check(st.c_str(), i,
1750 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1,
1751 compoundmiddle))))))
1752 checked_prefix = 1;
1753 // else check forbiddenwords and needaffix
1754 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forbiddenword
))
||
1755 TESTAFF(rv->astr, needaffix, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, needaffix
))
||
1756 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 65511
))
||
1757 (is_sug && nosuggest &&
1758 TESTAFF(rv->astr, nosuggest, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, nosuggest
))
))) {
1759 st[i] = ch;
1760 // continue;
1761 break;
1762 }
1763
1764 // check non_compound flag in suffix and prefix
1765 if ((rv) && !hu_mov_rule &&
1766 ((pfx && pfx->getCont() &&
1767 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundforbidflag))
) ||
1768 (sfx && sfx->getCont() &&
1769 TESTAFF(sfx->getCont(), compoundforbidflag,(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
1770 sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
))) {
1771 rv = NULL__null;
1772 }
1773
1774 // check compoundend flag in suffix and prefix
1775 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1776 ((pfx && pfx->getCont() &&
1777 TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundend))
) ||
1778 (sfx && sfx->getCont() &&
1779 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundend))
))) {
1780 rv = NULL__null;
1781 }
1782
1783 // check compoundmiddle flag in suffix and prefix
1784 if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
1785 !hu_mov_rule &&
1786 ((pfx && pfx->getCont() &&
1787 TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundmiddle))
) ||
1788 (sfx && sfx->getCont() &&
1789 TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundmiddle))
))) {
1790 rv = NULL__null;
1791 }
1792
1793 // check forbiddenwords
1794 if ((rv) && (rv->astr) &&
1795 (TESTAFF(rv->astr, forbiddenword, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forbiddenword
))
||
1796 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 65511
))
||
1797 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, nosuggest
))
))) {
1798 return NULL__null;
1799 }
1800
1801 // increment word number, if the second root has a compoundroot flag
1802 if ((rv) && compoundroot &&
1803 (TESTAFF(rv->astr, compoundroot, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundroot
))
)) {
1804 wordnum++;
1805 }
1806
1807 // first word is acceptable in compound words?
1808 if (((rv) &&
1809 (checked_prefix || (words && words[wnum]) ||
1810 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundflag
))
) ||
1811 ((oldwordnum == 0) && compoundbegin &&
1812 TESTAFF(rv->astr, compoundbegin, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundbegin
))
) ||
1813 ((oldwordnum > 0) && compoundmiddle &&
1814 TESTAFF(rv->astr, compoundmiddle, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundmiddle
))
)
1815
1816 // LANG_hu section: spec. Hungarian rule
1817 || ((langnum == LANG_hu) && hu_mov_rule &&
1818 (TESTAFF((std::binary_search(rv->astr, rv->astr + rv->alen, 'F'
))
1819 rv->astr, 'F',(std::binary_search(rv->astr, rv->astr + rv->alen, 'F'
))
1820 rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'F'
))
|| // XXX hardwired Hungarian dictionary codes
1821 TESTAFF(rv->astr, 'G', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'G'
))
||
1822 TESTAFF(rv->astr, 'H', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'H'
))
))
1823 // END of LANG_hu section
1824 ) &&
1825 (
1826 // test CHECKCOMPOUNDPATTERN conditions
1827 scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL0x00 ||
1828 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, checkcpdtable
[scpd - 1].cond))
) &&
1829 !((checkcompoundtriple && scpd == 0 &&
1830 !words && // test triple letters
1831 (word[i - 1] == word[i]) &&
1832 (((i > 1) && (word[i - 1] == word[i - 2])) ||
1833 ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0'
1834 )) ||
1835 (checkcompoundcase && scpd == 0 && !words &&
1836 cpdcase_check(word.c_str(), i))))
1837 // LANG_hu section: spec. Hungarian rule
1838 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
1839 (rv = affix_check(st.c_str(), i)) &&
1840 (sfx && sfx->getCont() &&
1841 ( // XXX hardwired Hungarian dic. codes
1842 TESTAFF(sfx->getCont(), (unsigned short)'x',(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), (unsigned short)'x'))
1843 sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), (unsigned short)'x'))
||
1844 TESTAFF((std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), (unsigned short)'%'))
1845 sfx->getCont(), (unsigned short)'%',(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), (unsigned short)'%'))
1846 sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), (unsigned short)'%'))
)))) { // first word is ok condition
1847
1848 // LANG_hu section: spec. Hungarian rule
1849 if (langnum == LANG_hu) {
1850 // calculate syllable number of the word
1851 numsyllable += get_syllable(st.substr(0, i));
1852 // + 1 word, if syllable number of the prefix > 1 (hungarian
1853 // convention)
1854 if (pfx && (get_syllable(pfx->getKey()) > 1))
1855 wordnum++;
1856 }
1857 // END of LANG_hu section
1858
1859 // NEXT WORD(S)
1860 rv_first = rv;
1861 st[i] = ch;
1862
1863 do { // striple loop
1864
1865 // check simplifiedtriple
1866 if (simplifiedtriple) {
1867 if (striple) {
1868 checkedstriple = 1;
1869 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
1870 } else if (i > 2 && word[i - 1] == word[i - 2])
1871 striple = 1;
1872 }
1873
1874 rv = lookup(st.c_str() + i); // perhaps without prefix
1875
1876 // search homonym with compound flag
1877 while ((rv) &&
1878 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, needaffix
))
) ||
1879 !((compoundflag && !words &&
1880 TESTAFF(rv->astr, compoundflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundflag
))
) ||
1881 (compoundend && !words &&
1882 TESTAFF(rv->astr, compoundend, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundend
))
) ||
1883 (!defcpdtable.empty() && words &&
1884 defcpd_check(&words, wnum + 1, rv, NULL__null, 1))) ||
1885 (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL0x00 &&
1886 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2,(std::binary_search(rv->astr, rv->astr + rv->alen, checkcpdtable
[scpd - 1].cond2))
1887 rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, checkcpdtable
[scpd - 1].cond2))
))) {
1888 rv = rv->next_homonym;
1889 }
1890
1891 // check FORCEUCASE
1892 if (rv && forceucase && (rv) &&
1893 (TESTAFF(rv->astr, forceucase, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forceucase
))
) &&
1894 !(info && *info & SPELL_ORIGCAP(1 << 5)))
1895 rv = NULL__null;
1896
1897 if (rv && words && words[wnum + 1])
1898 return rv_first;
1899
1900 oldnumsyllable2 = numsyllable;
1901 oldwordnum2 = wordnum;
1902
1903 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary
1904 // code
1905 if ((rv) && (langnum == LANG_hu) &&
1906 (TESTAFF(rv->astr, 'I', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'I'
))
) &&
1907 !(TESTAFF(rv->astr, 'J', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'J'
))
)) {
1908 numsyllable--;
1909 }
1910 // END of LANG_hu section
1911
1912 // increment word number, if the second root has a compoundroot flag
1913 if ((rv) && (compoundroot) &&
1914 (TESTAFF(rv->astr, compoundroot, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundroot
))
)) {
1915 wordnum++;
1916 }
1917
1918 // check forbiddenwords
1919 if ((rv) && (rv->astr) &&
1920 (TESTAFF(rv->astr, forbiddenword, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forbiddenword
))
||
1921 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 65511
))
||
1922 (is_sug && nosuggest &&
1923 TESTAFF(rv->astr, nosuggest, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, nosuggest
))
)))
1924 return NULL__null;
1925
1926 // second word is acceptable, as a root?
1927 // hungarian conventions: compounding is acceptable,
1928 // when compound forms consist of 2 words, or if more,
1929 // then the syllable number of root words must be 6, or lesser.
1930
1931 if ((rv) &&
1932 ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundflag
))
) ||
1933 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundend
))
)) &&
1934 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1935 ((cpdmaxsyllable != 0) &&
1936 (numsyllable + get_syllable(std::string(HENTRY_WORD(rv)&(rv->word[0]), rv->blen)) <=
1937 cpdmaxsyllable))) &&
1938 (
1939 // test CHECKCOMPOUNDPATTERN
1940 checkcpdtable.empty() || scpd != 0 ||
1941 !cpdpat_check(word.c_str(), i, rv_first, rv, 0)) &&
1942 ((!checkcompounddup || (rv != rv_first)))
1943 // test CHECKCOMPOUNDPATTERN conditions
1944 &&
1945 (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL0x00 ||
1946 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, checkcpdtable
[scpd - 1].cond2))
)) {
1947 // forbid compound word, if it is a non-compound word with typical
1948 // fault
1949 if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
1950 cpdwordpair_check(word.c_str(), len))
1951 return NULL__null;
1952 return rv_first;
1953 }
1954
1955 numsyllable = oldnumsyllable2;
1956 wordnum = oldwordnum2;
1957
1958 // perhaps second word has prefix or/and suffix
1959 sfx = NULL__null;
1960 sfxflag = FLAG_NULL0x00;
1961 rv = (compoundflag && !onlycpdrule)
1962 ? affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundflag,
1963 IN_CPD_END2)
1964 : NULL__null;
1965 if (!rv && compoundend && !onlycpdrule) {
1966 sfx = NULL__null;
1967 pfx = NULL__null;
1968 rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundend,
1969 IN_CPD_END2);
1970 }
1971
1972 if (!rv && !defcpdtable.empty() && words) {
1973 rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), 0, IN_CPD_END2);
1974 if (rv && defcpd_check(&words, wnum + 1, rv, NULL__null, 1))
1975 return rv_first;
1976 rv = NULL__null;
1977 }
1978
1979 // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1980 if (rv &&
1981 !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL0x00 ||
1982 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, checkcpdtable
[scpd - 1].cond2))
))
1983 rv = NULL__null;
1984
1985 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1986 if (rv && !checkcpdtable.empty() && scpd == 0 &&
1987 cpdpat_check(word.c_str(), i, rv_first, rv, affixed))
1988 rv = NULL__null;
1989
1990 // check non_compound flag in suffix and prefix
1991 if ((rv) && ((pfx && pfx->getCont() &&
1992 TESTAFF(pfx->getCont(), compoundforbidflag,(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundforbidflag))
1993 pfx->getContLen())(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundforbidflag))
) ||
1994 (sfx && sfx->getCont() &&
1995 TESTAFF(sfx->getCont(), compoundforbidflag,(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
1996 sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
))) {
1997 rv = NULL__null;
1998 }
1999
2000 // check FORCEUCASE
2001 if (rv && forceucase && (rv) &&
2002 (TESTAFF(rv->astr, forceucase, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forceucase
))
) &&
2003 !(info && *info & SPELL_ORIGCAP(1 << 5)))
2004 rv = NULL__null;
2005
2006 // check forbiddenwords
2007 if ((rv) && (rv->astr) &&
2008 (TESTAFF(rv->astr, forbiddenword, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forbiddenword
))
||
2009 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 65511
))
||
2010 (is_sug && nosuggest &&
2011 TESTAFF(rv->astr, nosuggest, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, nosuggest
))
)))
2012 return NULL__null;
2013
2014 // pfxappnd = prefix of word+i, or NULL
2015 // calculate syllable number of prefix.
2016 // hungarian convention: when syllable number of prefix is more,
2017 // than 1, the prefix+word counts as two words.
2018
2019 if (langnum == LANG_hu) {
2020 // calculate syllable number of the word
2021 numsyllable += get_syllable(word.c_str() + i);
2022
2023 // - affix syllable num.
2024 // XXX only second suffix (inflections, not derivations)
2025 if (sfxappnd) {
2026 std::string tmp(sfxappnd);
2027 reverseword(tmp);
2028 numsyllable -= short(get_syllable(tmp) + sfxextra);
2029 } else {
2030 numsyllable -= short(sfxextra);
2031 }
2032
2033 // + 1 word, if syllable number of the prefix > 1 (hungarian
2034 // convention)
2035 if (pfx && (get_syllable(pfx->getKey()) > 1))
2036 wordnum++;
2037
2038 // increment syllable num, if last word has a SYLLABLENUM flag
2039 // and the suffix is beginning `s'
2040
2041 if (!cpdsyllablenum.empty()) {
2042 switch (sfxflag) {
2043 case 'c': {
2044 numsyllable += 2;
2045 break;
2046 }
2047 case 'J': {
2048 numsyllable += 1;
2049 break;
2050 }
2051 case 'I': {
2052 if (rv && TESTAFF(rv->astr, 'J', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'J'
))
)
2053 numsyllable += 1;
2054 break;
2055 }
2056 }
2057 }
2058 }
2059
2060 // increment word number, if the second word has a compoundroot flag
2061 if ((rv) && (compoundroot) &&
2062 (TESTAFF(rv->astr, compoundroot, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundroot
))
)) {
2063 wordnum++;
2064 }
2065 // second word is acceptable, as a word with prefix or/and suffix?
2066 // hungarian conventions: compounding is acceptable,
2067 // when compound forms consist 2 word, otherwise
2068 // the syllable number of root words is 6, or lesser.
2069 if ((rv) &&
2070 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2071 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2072 ((!checkcompounddup || (rv != rv_first)))) {
2073 // forbid compound word, if it is a non-compound word with typical
2074 // fault
2075 if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
2076 cpdwordpair_check(word.c_str(), len))
2077 return NULL__null;
2078 return rv_first;
2079 }
2080
2081 numsyllable = oldnumsyllable2;
2082 wordnum = oldwordnum2;
2083
2084 // perhaps second word is a compound word (recursive call)
2085 if (wordnum + 2 < maxwordnum) {
2086 rv = compound_check(st.substr(i), wordnum + 1,
2087 numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2088 is_sug, info);
2089
2090 if (rv && !checkcpdtable.empty() &&
2091 ((scpd == 0 &&
2092 cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) ||
2093 (scpd != 0 &&
2094 !cpdpat_check(word.c_str(), i, rv_first, rv, affixed))))
2095 rv = NULL__null;
2096 } else {
2097 rv = NULL__null;
2098 }
2099 if (rv) {
2100 // forbid compound word, if it is a non-compound word with typical
2101 // fault, or a dictionary word pair
2102
2103 if (cpdwordpair_check(word.c_str(), len))
2104 return NULL__null;
2105
2106 if (checkcompoundrep || forbiddenword) {
2107
2108 if (checkcompoundrep && cpdrep_check(word.c_str(), len))
2109 return NULL__null;
2110
2111 // check first part
2112 if (strncmp(rv->word, word.c_str() + i, rv->blen) == 0) {
2113 char r = st[i + rv->blen];
2114 st[i + rv->blen] = '\0';
2115
2116 if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) ||
2117 cpdwordpair_check(st.c_str(), i + rv->blen)) {
2118 st[ + i + rv->blen] = r;
2119 continue;
2120 }
2121
2122 if (forbiddenword) {
2123 struct hentry* rv2 = lookup(word.c_str());
2124 if (!rv2)
2125 rv2 = affix_check(word.c_str(), len);
2126 if (rv2 && rv2->astr &&
2127 TESTAFF(rv2->astr, forbiddenword, rv2->alen)(std::binary_search(rv2->astr, rv2->astr + rv2->alen
, forbiddenword))
&&
2128 (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
2129 return NULL__null;
2130 }
2131 }
2132 st[i + rv->blen] = r;
2133 }
2134 }
2135 return rv_first;
2136 }
2137 } while (striple && !checkedstriple); // end of striple loop
2138
2139 if (checkedstriple) {
2140 i++;
2141 checkedstriple = 0;
2142 striple = 0;
2143 }
2144
2145 } // first word is ok condition
2146
2147 if (soldi != 0) {
2148 i = soldi;
2149 soldi = 0;
2150 len = oldlen;
2151 cmin = oldcmin;
2152 cmax = oldcmax;
2153 }
2154 scpd++;
2155
2156 } while (!onlycpdrule && simplifiedcpd &&
2157 scpd <= checkcpdtable.size()); // end of simplifiedcpd loop
2158
2159 scpd = 0;
2160 wordnum = oldwordnum;
2161 numsyllable = oldnumsyllable;
2162
2163 if (soldi != 0) {
2164 i = soldi;
2165 st.assign(word); // XXX add more optim.
2166 soldi = 0;
2167 } else
2168 st[i] = ch;
2169
2170 } while (!defcpdtable.empty() && oldwordnum == 0 &&
2171 onlycpdrule++ < 1); // end of onlycpd loop
2172 }
2173
2174 return NULL__null;
2175}
2176
2177// check if compound word is correctly spelled
2178// hu_mov_rule = spec. Hungarian rule (XXX)
2179int AffixMgr::compound_check_morph(const char* word,
2180 int len,
2181 short wordnum,
2182 short numsyllable,
2183 short maxwordnum,
2184 short wnum,
2185 hentry** words,
2186 hentry** rwords,
2187 char hu_mov_rule,
2188 std::string& result,
2189 const std::string* partresult) {
2190 int i;
2191 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
2192 int ok = 0;
2193
2194 struct hentry* rv = NULL__null;
2195 struct hentry* rv_first;
2196 std::string st;
2197 char ch;
2198
2199 int checked_prefix;
2200 std::string presult;
2201
2202 int cmin;
2203 int cmax;
2204
2205 char affixed = 0;
2206 hentry** oldwords = words;
2207
2208 // add a time limit to handle possible
2209 // combinatorical explosion of the overlapping words
2210
2211 HUNSPELL_THREAD_LOCALthread_local clock_t timelimit;
2212
2213 if (wordnum == 0) {
2214 // get the start time, seeing as we're reusing this set to 0
2215 // to flag timeout, use clock() + 1 to avoid start clock()
2216 // of 0 as being a timeout
2217 timelimit = clock() + 1;
2218 }
2219 else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT(((__clock_t) 1000000) / 20))) {
2220 timelimit = 0;
2221 }
2222
2223 setcminmax(&cmin, &cmax, word, len);
2224
2225 st.assign(word);
2226
2227 for (i = cmin; i < cmax; i++) {
2228 // go to end of the UTF-8 character
2229 if (utf8) {
2230 for (; (st[i] & 0xc0) == 0x80; i++)
2231 ;
2232 if (i >= cmax)
2233 return 0;
2234 }
2235
2236 words = oldwords;
2237 int onlycpdrule = (words) ? 1 : 0;
2238
2239 do { // onlycpdrule loop
2240
2241 if (timelimit == 0)
2242 return 0;
2243
2244 oldnumsyllable = numsyllable;
2245 oldwordnum = wordnum;
2246 checked_prefix = 0;
2247
2248 ch = st[i];
2249 st[i] = '\0';
2250 sfx = NULL__null;
2251
2252 // FIRST WORD
2253
2254 affixed = 1;
2255
2256 presult.clear();
2257 if (partresult)
2258 presult.append(*partresult);
2259
2260 rv = lookup(st.c_str()); // perhaps without prefix
2261
2262 // forbid dictionary stems with COMPOUNDFORBIDFLAG in
2263 // compound words, overriding the effect of COMPOUNDPERMITFLAG
2264 if ((rv) && compoundforbidflag &&
2265 TESTAFF(rv->astr, compoundforbidflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundforbidflag
))
&& !hu_mov_rule)
2266 continue;
2267
2268 // search homonym with compound flag
2269 while ((rv) && !hu_mov_rule &&
2270 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, needaffix
))
) ||
2271 !((compoundflag && !words && !onlycpdrule &&
2272 TESTAFF(rv->astr, compoundflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundflag
))
) ||
2273 (compoundbegin && !wordnum && !onlycpdrule &&
2274 TESTAFF(rv->astr, compoundbegin, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundbegin
))
) ||
2275 (compoundmiddle && wordnum && !words && !onlycpdrule &&
2276 TESTAFF(rv->astr, compoundmiddle, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundmiddle
))
) ||
2277 (!defcpdtable.empty() && onlycpdrule &&
2278 ((!words && !wordnum &&
2279 defcpd_check(&words, wnum, rv, rwords, 0)) ||
2280 (words &&
2281 defcpd_check(&words, wnum, rv, rwords, 0))))))) {
2282 rv = rv->next_homonym;
2283 }
2284
2285 if (timelimit == 0)
2286 return 0;
2287
2288 if (rv)
2289 affixed = 0;
2290
2291 if (rv) {
2292 presult.push_back(MSEP_FLD' ');
2293 presult.append(MORPH_PART"pa:");
2294 presult.append(st.c_str());
2295 if (!HENTRY_FIND(rv, MORPH_STEM"st:")) {
2296 presult.push_back(MSEP_FLD' ');
2297 presult.append(MORPH_STEM"st:");
2298 presult.append(st.c_str());
2299 }
2300 if (HENTRY_DATA(rv)) {
2301 presult.push_back(MSEP_FLD' ');
2302 presult.append(HENTRY_DATA2(rv));
2303 }
2304 }
2305
2306 if (!rv) {
2307 if (compoundflag &&
2308 !(rv =
2309 prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1,
2310 compoundflag))) {
2311 if (((rv = suffix_check(st.c_str(), i, 0, NULL__null, FLAG_NULL0x00,
2312 compoundflag,
2313 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1)) ||
2314 (compoundmoresuffixes &&
2315 (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL__null, compoundflag)))) &&
2316 !hu_mov_rule && sfx->getCont() &&
2317 ((compoundforbidflag &&
2318 TESTAFF(sfx->getCont(), compoundforbidflag,(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
2319 sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
) ||
2320 (compoundend &&
2321 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundend))
))) {
2322 rv = NULL__null;
2323 }
2324 }
2325
2326 if (rv ||
2327 (((wordnum == 0) && compoundbegin &&
2328 ((rv = suffix_check(st.c_str(), i, 0, NULL__null, FLAG_NULL0x00,
2329 compoundbegin,
2330 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1)) ||
2331 (compoundmoresuffixes &&
2332 (rv = suffix_check_twosfx(
2333 st.c_str(), i, 0, NULL__null,
2334 compoundbegin))) || // twofold suffix+compound
2335 (rv = prefix_check(st.c_str(), i,
2336 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1,
2337 compoundbegin)))) ||
2338 ((wordnum > 0) && compoundmiddle &&
2339 ((rv = suffix_check(st.c_str(), i, 0, NULL__null, FLAG_NULL0x00,
2340 compoundmiddle,
2341 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1)) ||
2342 (compoundmoresuffixes &&
2343 (rv = suffix_check_twosfx(
2344 st.c_str(), i, 0, NULL__null,
2345 compoundmiddle))) || // twofold suffix+compound
2346 (rv = prefix_check(st.c_str(), i,
2347 hu_mov_rule ? IN_CPD_OTHER3 : IN_CPD_BEGIN1,
2348 compoundmiddle)))))) {
2349 std::string p;
2350 if (compoundflag)
2351 p = affix_check_morph(st.c_str(), i, compoundflag);
2352 if (p.empty()) {
2353 if ((wordnum == 0) && compoundbegin) {
2354 p = affix_check_morph(st.c_str(), i, compoundbegin);
2355 } else if ((wordnum > 0) && compoundmiddle) {
2356 p = affix_check_morph(st.c_str(), i, compoundmiddle);
2357 }
2358 }
2359 if (!p.empty()) {
2360 presult.push_back(MSEP_FLD' ');
2361 presult.append(MORPH_PART"pa:");
2362 presult.append(st.c_str());
2363 line_uniq_app(p, MSEP_REC'\n');
2364 presult.append(p);
2365 }
2366 checked_prefix = 1;
2367 }
2368 // else check forbiddenwords
2369 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forbiddenword
))
||
2370 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 65511
))
||
2371 TESTAFF(rv->astr, needaffix, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, needaffix
))
)) {
2372 st[i] = ch;
2373 continue;
2374 }
2375
2376 // check non_compound flag in suffix and prefix
2377 if ((rv) && !hu_mov_rule &&
2378 ((pfx && pfx->getCont() &&
2379 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundforbidflag))
) ||
2380 (sfx && sfx->getCont() &&
2381 TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
))) {
2382 continue;
2383 }
2384
2385 // check compoundend flag in suffix and prefix
2386 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2387 ((pfx && pfx->getCont() &&
2388 TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundend))
) ||
2389 (sfx && sfx->getCont() &&
2390 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundend))
))) {
2391 continue;
2392 }
2393
2394 // check compoundmiddle flag in suffix and prefix
2395 if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
2396 !hu_mov_rule &&
2397 ((pfx && pfx->getCont() &&
2398 TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundmiddle))
) ||
2399 (sfx && sfx->getCont() &&
2400 TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundmiddle))
))) {
2401 rv = NULL__null;
2402 }
2403
2404 // check forbiddenwords
2405 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forbiddenword
))
||
2406 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 65511
))
))
2407 continue;
2408
2409 // increment word number, if the second root has a compoundroot flag
2410 if ((rv) && (compoundroot) &&
2411 (TESTAFF(rv->astr, compoundroot, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundroot
))
)) {
2412 wordnum++;
2413 }
2414
2415 // first word is acceptable in compound words?
2416 if (((rv) &&
2417 (checked_prefix || (words && words[wnum]) ||
2418 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundflag
))
) ||
2419 ((oldwordnum == 0) && compoundbegin &&
2420 TESTAFF(rv->astr, compoundbegin, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundbegin
))
) ||
2421 ((oldwordnum > 0) && compoundmiddle &&
2422 TESTAFF(rv->astr, compoundmiddle, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundmiddle
))
)
2423 // LANG_hu section: spec. Hungarian rule
2424 || ((langnum == LANG_hu) && // hu_mov_rule
2425 hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'F'
))
||
2426 TESTAFF(rv->astr, 'G', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'G'
))
||
2427 TESTAFF(rv->astr, 'H', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'H'
))
))
2428 // END of LANG_hu section
2429 ) &&
2430 !((checkcompoundtriple && !words && // test triple letters
2431 (word[i - 1] == word[i]) &&
2432 (((i > 1) && (word[i - 1] == word[i - 2])) ||
2433 ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0'
2434 )) ||
2435 (
2436 // test CHECKCOMPOUNDPATTERN
2437 !checkcpdtable.empty() && !words &&
2438 cpdpat_check(word, i, rv, NULL__null, affixed)) ||
2439 (checkcompoundcase && !words && cpdcase_check(word, i))))
2440 // LANG_hu section: spec. Hungarian rule
2441 ||
2442 ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
2443 (rv = affix_check(st.c_str(), i)) &&
2444 (sfx && sfx->getCont() &&
2445 (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), (unsigned short)'x'))
||
2446 TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), (unsigned short)'%'))
)))
2447 // END of LANG_hu section
2448 ) {
2449 // LANG_hu section: spec. Hungarian rule
2450 if (langnum == LANG_hu) {
2451 // calculate syllable number of the word
2452 numsyllable += get_syllable(st.substr(0, i));
2453
2454 // + 1 word, if syllable number of the prefix > 1 (hungarian
2455 // convention)
2456 if (pfx && (get_syllable(pfx->getKey()) > 1))
2457 wordnum++;
2458 }
2459 // END of LANG_hu section
2460
2461 // NEXT WORD(S)
2462 rv_first = rv;
2463 rv = lookup((word + i)); // perhaps without prefix
2464
2465 // search homonym with compound flag
2466 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, needaffix
))
) ||
2467 !((compoundflag && !words &&
2468 TESTAFF(rv->astr, compoundflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundflag
))
) ||
2469 (compoundend && !words &&
2470 TESTAFF(rv->astr, compoundend, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundend
))
) ||
2471 (!defcpdtable.empty() && words &&
2472 defcpd_check(&words, wnum + 1, rv, NULL__null, 1))))) {
2473 rv = rv->next_homonym;
2474 }
2475
2476 if (rv && words && words[wnum + 1]) {
2477 result.append(presult);
2478 result.push_back(MSEP_FLD' ');
2479 result.append(MORPH_PART"pa:");
2480 result.append(word + i);
2481 if (complexprefixes && HENTRY_DATA(rv))
2482 result.append(HENTRY_DATA2(rv));
2483 if (!HENTRY_FIND(rv, MORPH_STEM"st:")) {
2484 result.push_back(MSEP_FLD' ');
2485 result.append(MORPH_STEM"st:");
2486 result.append(HENTRY_WORD(rv)&(rv->word[0]));
2487 }
2488 // store the pointer of the hash entry
2489 if (!complexprefixes && HENTRY_DATA(rv)) {
2490 result.push_back(MSEP_FLD' ');
2491 result.append(HENTRY_DATA2(rv));
2492 }
2493 result.push_back(MSEP_REC'\n');
2494 return 0;
2495 }
2496
2497 oldnumsyllable2 = numsyllable;
2498 oldwordnum2 = wordnum;
2499
2500 // LANG_hu section: spec. Hungarian rule
2501 if ((rv) && (langnum == LANG_hu) &&
2502 (TESTAFF(rv->astr, 'I', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'I'
))
) &&
2503 !(TESTAFF(rv->astr, 'J', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'J'
))
)) {
2504 numsyllable--;
2505 }
2506 // END of LANG_hu section
2507 // increment word number, if the second root has a compoundroot flag
2508 if ((rv) && (compoundroot) &&
2509 (TESTAFF(rv->astr, compoundroot, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundroot
))
)) {
2510 wordnum++;
2511 }
2512
2513 // check forbiddenwords
2514 if ((rv) && (rv->astr) &&
2515 (TESTAFF(rv->astr, forbiddenword, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forbiddenword
))
||
2516 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 65511
))
)) {
2517 st[i] = ch;
2518 continue;
2519 }
2520
2521 // second word is acceptable, as a root?
2522 // hungarian conventions: compounding is acceptable,
2523 // when compound forms consist of 2 words, or if more,
2524 // then the syllable number of root words must be 6, or lesser.
2525 if ((rv) &&
2526 ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundflag
))
) ||
2527 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundend
))
)) &&
2528 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2529 ((cpdmaxsyllable != 0) &&
2530 (numsyllable + get_syllable(std::string(HENTRY_WORD(rv)&(rv->word[0]), rv->blen)) <=
2531 cpdmaxsyllable))) &&
2532 ((!checkcompounddup || (rv != rv_first)))) {
2533 // bad compound word
2534 result.append(presult);
2535 result.push_back(MSEP_FLD' ');
2536 result.append(MORPH_PART"pa:");
2537 result.append(word + i);
2538
2539 if (HENTRY_DATA(rv)) {
2540 if (complexprefixes)
2541 result.append(HENTRY_DATA2(rv));
2542 if (!HENTRY_FIND(rv, MORPH_STEM"st:")) {
2543 result.push_back(MSEP_FLD' ');
2544 result.append(MORPH_STEM"st:");
2545 result.append(HENTRY_WORD(rv)&(rv->word[0]));
2546 }
2547 // store the pointer of the hash entry
2548 if (!complexprefixes) {
2549 result.push_back(MSEP_FLD' ');
2550 result.append(HENTRY_DATA2(rv));
2551 }
2552 }
2553 result.push_back(MSEP_REC'\n');
2554 ok = 1;
2555 }
2556
2557 numsyllable = oldnumsyllable2;
2558 wordnum = oldwordnum2;
2559
2560 // perhaps second word has prefix or/and suffix
2561 sfx = NULL__null;
2562 sfxflag = FLAG_NULL0x00;
2563
2564 if (compoundflag && !onlycpdrule)
2565 rv = affix_check((word + i), strlen(word + i), compoundflag);
2566 else
2567 rv = NULL__null;
2568
2569 if (!rv && compoundend && !onlycpdrule) {
2570 sfx = NULL__null;
2571 pfx = NULL__null;
2572 rv = affix_check((word + i), strlen(word + i), compoundend);
2573 }
2574
2575 if (!rv && !defcpdtable.empty() && words) {
2576 rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END2);
2577 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL__null, 1)) {
2578 std::string m;
2579 if (compoundflag)
2580 m = affix_check_morph((word + i), strlen(word + i), compoundflag);
2581 if (m.empty() && compoundend) {
2582 m = affix_check_morph((word + i), strlen(word + i), compoundend);
2583 }
2584 result.append(presult);
2585 if (!m.empty()) {
2586 result.push_back(MSEP_FLD' ');
2587 result.append(MORPH_PART"pa:");
2588 result.append(word + i);
2589 line_uniq_app(m, MSEP_REC'\n');
2590 result.append(m);
2591 }
2592 result.push_back(MSEP_REC'\n');
2593 ok = 1;
2594 }
2595 }
2596
2597 // check non_compound flag in suffix and prefix
2598 if ((rv) &&
2599 ((pfx && pfx->getCont() &&
2600 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())(std::binary_search(pfx->getCont(), pfx->getCont() + pfx
->getContLen(), compoundforbidflag))
) ||
2601 (sfx && sfx->getCont() &&
2602 TESTAFF(sfx->getCont(), compoundforbidflag,(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
2603 sfx->getContLen())(std::binary_search(sfx->getCont(), sfx->getCont() + sfx
->getContLen(), compoundforbidflag))
))) {
2604 rv = NULL__null;
2605 }
2606
2607 // check forbiddenwords
2608 if ((rv) && (rv->astr) &&
2609 (TESTAFF(rv->astr, forbiddenword, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, forbiddenword
))
||
2610 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 65511
))
) &&
2611 (!TESTAFF(rv->astr, needaffix, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, needaffix
))
)) {
2612 st[i] = ch;
2613 continue;
2614 }
2615
2616 if (langnum == LANG_hu) {
2617 // calculate syllable number of the word
2618 numsyllable += get_syllable(word + i);
2619
2620 // - affix syllable num.
2621 // XXX only second suffix (inflections, not derivations)
2622 if (sfxappnd) {
2623 std::string tmp(sfxappnd);
2624 reverseword(tmp);
2625 numsyllable -= short(get_syllable(tmp) + sfxextra);
2626 } else {
2627 numsyllable -= short(sfxextra);
2628 }
2629
2630 // + 1 word, if syllable number of the prefix > 1 (hungarian
2631 // convention)
2632 if (pfx && (get_syllable(pfx->getKey()) > 1))
2633 wordnum++;
2634
2635 // increment syllable num, if last word has a SYLLABLENUM flag
2636 // and the suffix is beginning `s'
2637
2638 if (!cpdsyllablenum.empty()) {
2639 switch (sfxflag) {
2640 case 'c': {
2641 numsyllable += 2;
2642 break;
2643 }
2644 case 'J': {
2645 numsyllable += 1;
2646 break;
2647 }
2648 case 'I': {
2649 if (rv && TESTAFF(rv->astr, 'J', rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, 'J'
))
)
2650 numsyllable += 1;
2651 break;
2652 }
2653 }
2654 }
2655 }
2656
2657 // increment word number, if the second word has a compoundroot flag
2658 if ((rv) && (compoundroot) &&
2659 (TESTAFF(rv->astr, compoundroot, rv->alen)(std::binary_search(rv->astr, rv->astr + rv->alen, compoundroot
))
)) {
2660 wordnum++;
2661 }
2662 // second word is acceptable, as a word with prefix or/and suffix?
2663 // hungarian conventions: compounding is acceptable,
2664 // when compound forms consist 2 word, otherwise
2665 // the syllable number of root words is 6, or lesser.
2666 if ((rv) &&
2667 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2668 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2669 ((!checkcompounddup || (rv != rv_first)))) {
2670 std::string m;
2671 if (compoundflag)
2672 m = affix_check_morph((word + i), strlen(word + i), compoundflag);
2673 if (m.empty() && compoundend) {
2674 m = affix_check_morph((word + i), strlen(word + i), compoundend);
2675 }
2676 result.append(presult);
2677 if (!m.empty()) {
2678 result.push_back(MSEP_FLD' ');
2679 result.append(MORPH_PART"pa:");
2680 result.append(word + i);
2681 line_uniq_app(m, MSEP_REC'\n');
2682 result.push_back(MSEP_FLD' ');
2683 result.append(m);
2684 }
2685 result.push_back(MSEP_REC'\n');
2686 ok = 1;
2687 }
2688
2689 numsyllable = oldnumsyllable2;
2690 wordnum = oldwordnum2;
2691
2692 // perhaps second word is a compound word (recursive call)
2693 if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
2694 compound_check_morph((word + i), strlen(word + i), wordnum + 1,
2695 numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2696 result, &presult);
2697 } else {
2698 rv = NULL__null;
2699 }
2700 }
2701 st[i] = ch;
2702 wordnum = oldwordnum;
2703 numsyllable = oldnumsyllable;
2704
2705 } while (!defcpdtable.empty() && oldwordnum == 0 &&
2706 onlycpdrule++ < 1); // end of onlycpd loop
2707 }
2708 return 0;
2709}
2710
2711
2712inline int AffixMgr::isRevSubset(const char* s1,
2713 const char* end_of_s2,
2714 int len) {
2715 while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2716 s1++;
2717 end_of_s2--;
2718 len--;
2719 }
2720 return (*s1 == '\0');
2721}
2722
2723// check word for suffixes
2724struct hentry* AffixMgr::suffix_check(const char* word,
2725 int len,
2726 int sfxopts,
2727 PfxEntry* ppfx,
2728 const FLAGunsigned short cclass,
2729 const FLAGunsigned short needflag,
2730 char in_compound) {
2731 struct hentry* rv = NULL__null;
2732 PfxEntry* ep = ppfx;
2733
2734 // first handle the special case of 0 length suffixes
2735 SfxEntry* se = sStart[0];
2736
2737 while (se) {
2738 if (!cclass || se->getCont()) {
2739 // suffixes are not allowed in beginning of compounds
2740 if ((((in_compound != IN_CPD_BEGIN1)) || // && !cclass
2741 // except when signed with compoundpermitflag flag
2742 (se->getCont() && compoundpermitflag &&
2743 TESTAFF(se->getCont(), compoundpermitflag, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), compoundpermitflag))
)) &&
2744 (!circumfix ||
2745 // no circumfix flag in prefix and suffix
2746 ((!ppfx || !(ep->getCont()) ||
2747 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), circumfix))
) &&
2748 (!se->getCont() ||
2749 !(TESTAFF(se->getCont(), circumfix, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), circumfix))
))) ||
2750 // circumfix flag in prefix AND suffix
2751 ((ppfx && (ep->getCont()) &&
2752 TESTAFF(ep->getCont(), circumfix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), circumfix))
) &&
2753 (se->getCont() &&
2754 (TESTAFF(se->getCont(), circumfix, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), circumfix))
)))) &&
2755 // fogemorpheme
2756 (in_compound ||
2757 !(se->getCont() &&
2758 (TESTAFF(se->getCont(), onlyincompound, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), onlyincompound))
))) &&
2759 // needaffix on prefix or first suffix
2760 (cclass ||
2761 !(se->getCont() &&
2762 TESTAFF(se->getCont(), needaffix, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), needaffix))
) ||
2763 (ppfx &&
2764 !((ep->getCont()) &&
2765 TESTAFF(ep->getCont(), needaffix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), needaffix))
)))) {
2766 rv = se->checkword(word, len, sfxopts, ppfx,
2767 (FLAGunsigned short)cclass, needflag,
2768 (in_compound ? 0 : onlyincompound));
2769 if (rv) {
2770 sfx = se; // BUG: sfx not stateless
2771 return rv;
2772 }
2773 }
2774 }
2775 se = se->getNext();
2776 }
2777
2778 // now handle the general case
2779 if (len == 0)
2780 return NULL__null; // FULLSTRIP
2781 unsigned char sp = *((const unsigned char*)(word + len - 1));
2782 SfxEntry* sptr = sStart[sp];
2783
2784 while (sptr) {
2785 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2786 // suffixes are not allowed in beginning of compounds
2787 if ((((in_compound != IN_CPD_BEGIN1)) || // && !cclass
2788 // except when signed with compoundpermitflag flag
2789 (sptr->getCont() && compoundpermitflag &&
2790 TESTAFF(sptr->getCont(), compoundpermitflag,(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), compoundpermitflag))
2791 sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), compoundpermitflag))
)) &&
2792 (!circumfix ||
2793 // no circumfix flag in prefix and suffix
2794 ((!ppfx || !(ep->getCont()) ||
2795 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), circumfix))
) &&
2796 (!sptr->getCont() ||
2797 !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), circumfix))
))) ||
2798 // circumfix flag in prefix AND suffix
2799 ((ppfx && (ep->getCont()) &&
2800 TESTAFF(ep->getCont(), circumfix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), circumfix))
) &&
2801 (sptr->getCont() &&
2802 (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), circumfix))
)))) &&
2803 // fogemorpheme
2804 (in_compound ||
2805 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), onlyincompound))
2806 sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), onlyincompound))
)))) &&
2807 // needaffix on prefix or first suffix
2808 (cclass ||
2809 !(sptr->getCont() &&
2810 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), needaffix))
) ||
2811 (ppfx &&
2812 !((ep->getCont()) &&
2813 TESTAFF(ep->getCont(), needaffix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), needaffix))
))))
2814 if (in_compound != IN_CPD_END2 || ppfx ||
2815 !(sptr->getCont() &&
2816 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), onlyincompound))
)) {
2817 rv = sptr->checkword(word, len, sfxopts, ppfx,
2818 cclass, needflag,
2819 (in_compound ? 0 : onlyincompound));
2820 if (rv) {
2821 sfx = sptr; // BUG: sfx not stateless
2822 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2823 if (!sptr->getCont())
2824 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2825 // LANG_hu section: spec. Hungarian rule
2826 else if (langnum == LANG_hu && sptr->getKeyLen() &&
2827 sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' &&
2828 sptr->getKey()[1] != 't') {
2829 sfxextra = 1;
2830 }
2831 // END of LANG_hu section
2832 return rv;
2833 }
2834 }
2835 sptr = sptr->getNextEQ();
2836 } else {
2837 sptr = sptr->getNextNE();
2838 }
2839 }
2840
2841 return NULL__null;
2842}
2843
2844// check word for two-level suffixes
2845struct hentry* AffixMgr::suffix_check_twosfx(const char* word,
2846 int len,
2847 int sfxopts,
2848 PfxEntry* ppfx,
2849 const FLAGunsigned short needflag) {
2850 struct hentry* rv = NULL__null;
2851
2852 // first handle the special case of 0 length suffixes
2853 SfxEntry* se = sStart[0];
2854 while (se) {
2855 if (contclasses[se->getFlag()]) {
2856 rv = se->check_twosfx(word, len, sfxopts, ppfx, needflag);
2857 if (rv)
2858 return rv;
2859 }
2860 se = se->getNext();
2861 }
2862
2863 // now handle the general case
2864 if (len == 0)
2865 return NULL__null; // FULLSTRIP
2866 unsigned char sp = *((const unsigned char*)(word + len - 1));
2867 SfxEntry* sptr = sStart[sp];
2868
2869 while (sptr) {
2870 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2871 if (contclasses[sptr->getFlag()]) {
2872 rv = sptr->check_twosfx(word, len, sfxopts, ppfx, needflag);
2873 if (rv) {
2874 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2875 if (!sptr->getCont())
2876 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2877 return rv;
2878 }
2879 }
2880 sptr = sptr->getNextEQ();
2881 } else {
2882 sptr = sptr->getNextNE();
2883 }
2884 }
2885
2886 return NULL__null;
2887}
2888
2889// check word for two-level suffixes and morph
2890std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
2891 int len,
2892 int sfxopts,
2893 PfxEntry* ppfx,
2894 const FLAGunsigned short needflag) {
2895 std::string result;
2896 std::string result2;
2897 std::string result3;
2898
2899 // first handle the special case of 0 length suffixes
2900 SfxEntry* se = sStart[0];
2901 while (se) {
2902 if (contclasses[se->getFlag()]) {
2903 std::string st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2904 if (!st.empty()) {
2905 if (ppfx) {
2906 if (ppfx->getMorph()) {
2907 result.append(ppfx->getMorph());
2908 result.push_back(MSEP_FLD' ');
2909 } else
2910 debugflag(result, ppfx->getFlag());
2911 }
2912 result.append(st);
2913 if (se->getMorph()) {
2914 result.push_back(MSEP_FLD' ');
2915 result.append(se->getMorph());
2916 } else
2917 debugflag(result, se->getFlag());
2918 result.push_back(MSEP_REC'\n');
2919 }
2920 }
2921 se = se->getNext();
2922 }
2923
2924 // now handle the general case
2925 if (len == 0)
2926 return std::string(); // FULLSTRIP
2927 unsigned char sp = *((const unsigned char*)(word + len - 1));
2928 SfxEntry* sptr = sStart[sp];
2929
2930 while (sptr) {
2931 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2932 if (contclasses[sptr->getFlag()]) {
2933 std::string st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2934 if (!st.empty()) {
2935 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2936 if (!sptr->getCont())
2937 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2938 result2.assign(st);
2939
2940 result3.clear();
2941
2942 if (sptr->getMorph()) {
2943 result3.push_back(MSEP_FLD' ');
2944 result3.append(sptr->getMorph());
2945 } else
2946 debugflag(result3, sptr->getFlag());
2947 strlinecat(result2, result3);
2948 result2.push_back(MSEP_REC'\n');
2949 result.append(result2);
2950 }
2951 }
2952 sptr = sptr->getNextEQ();
2953 } else {
2954 sptr = sptr->getNextNE();
2955 }
2956 }
2957
2958 return result;
2959}
2960
2961std::string AffixMgr::suffix_check_morph(const char* word,
2962 int len,
2963 int sfxopts,
2964 PfxEntry* ppfx,
2965 const FLAGunsigned short cclass,
2966 const FLAGunsigned short needflag,
2967 char in_compound) {
2968 std::string result;
2969
2970 struct hentry* rv = NULL__null;
2971
2972 PfxEntry* ep = ppfx;
2973
2974 // first handle the special case of 0 length suffixes
2975 SfxEntry* se = sStart[0];
2976 while (se) {
2977 if (!cclass || se->getCont()) {
2978 // suffixes are not allowed in beginning of compounds
2979 if (((((in_compound != IN_CPD_BEGIN1)) || // && !cclass
2980 // except when signed with compoundpermitflag flag
2981 (se->getCont() && compoundpermitflag &&
2982 TESTAFF(se->getCont(), compoundpermitflag, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), compoundpermitflag))
)) &&
2983 (!circumfix ||
2984 // no circumfix flag in prefix and suffix
2985 ((!ppfx || !(ep->getCont()) ||
2986 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), circumfix))
) &&
2987 (!se->getCont() ||
2988 !(TESTAFF(se->getCont(), circumfix, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), circumfix))
))) ||
2989 // circumfix flag in prefix AND suffix
2990 ((ppfx && (ep->getCont()) &&
2991 TESTAFF(ep->getCont(), circumfix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), circumfix))
) &&
2992 (se->getCont() &&
2993 (TESTAFF(se->getCont(), circumfix, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), circumfix))
)))) &&
2994 // fogemorpheme
2995 (in_compound ||
2996 !((se->getCont() &&
2997 (TESTAFF(se->getCont(), onlyincompound, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), onlyincompound))
)))) &&
2998 // needaffix on prefix or first suffix
2999 (cclass ||
3000 !(se->getCont() &&
3001 TESTAFF(se->getCont(), needaffix, se->getContLen())(std::binary_search(se->getCont(), se->getCont() + se->
getContLen(), needaffix))
) ||
3002 (ppfx &&
3003 !((ep->getCont()) &&
3004 TESTAFF(ep->getCont(), needaffix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), needaffix))
)))))
3005 rv = se->checkword(word, len, sfxopts, ppfx, cclass,
3006 needflag, FLAG_NULL0x00);
3007 while (rv) {
3008 if (ppfx) {
3009 if (ppfx->getMorph()) {
3010 result.append(ppfx->getMorph());
3011 result.push_back(MSEP_FLD' ');
3012 } else
3013 debugflag(result, ppfx->getFlag());
3014 }
3015 if (complexprefixes && HENTRY_DATA(rv))
3016 result.append(HENTRY_DATA2(rv));
3017 if (!HENTRY_FIND(rv, MORPH_STEM"st:")) {
3018 result.push_back(MSEP_FLD' ');
3019 result.append(MORPH_STEM"st:");
3020 result.append(HENTRY_WORD(rv)&(rv->word[0]));
3021 }
3022
3023 if (!complexprefixes && HENTRY_DATA(rv)) {
3024 result.push_back(MSEP_FLD' ');
3025 result.append(HENTRY_DATA2(rv));
3026 }
3027 if (se->getMorph()) {
3028 result.push_back(MSEP_FLD' ');
3029 result.append(se->getMorph());
3030 } else
3031 debugflag(result, se->getFlag());
3032 result.push_back(MSEP_REC'\n');
3033 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3034 }
3035 }
3036 se = se->getNext();
3037 }
3038
3039 // now handle the general case
3040 if (len == 0)
3041 return std::string(); // FULLSTRIP
3042 unsigned char sp = *((const unsigned char*)(word + len - 1));
3043 SfxEntry* sptr = sStart[sp];
3044
3045 while (sptr) {
3046 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
3047 // suffixes are not allowed in beginning of compounds
3048 if (((((in_compound != IN_CPD_BEGIN1)) || // && !cclass
3049 // except when signed with compoundpermitflag flag
3050 (sptr->getCont() && compoundpermitflag &&
3051 TESTAFF(sptr->getCont(), compoundpermitflag,(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), compoundpermitflag))
3052 sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), compoundpermitflag))
)) &&
3053 (!circumfix ||
3054 // no circumfix flag in prefix and suffix
3055 ((!ppfx || !(ep->getCont()) ||
3056 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), circumfix))
) &&
3057 (!sptr->getCont() ||
3058 !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), circumfix))
))) ||
3059 // circumfix flag in prefix AND suffix
3060 ((ppfx && (ep->getCont()) &&
3061 TESTAFF(ep->getCont(), circumfix, ep->getContLen())(std::binary_search(ep->getCont(), ep->getCont() + ep->
getContLen(), circumfix))
) &&
3062 (sptr->getCont() &&
3063 (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), circumfix))
)))) &&
3064 // fogemorpheme
3065 (in_compound ||
3066 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), onlyincompound))
3067 sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), onlyincompound))
)))) &&
3068 // needaffix on first suffix
3069 (cclass ||
3070 !(sptr->getCont() &&
3071 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), needaffix))
))))
3072 rv = sptr->checkword(word, len, sfxopts, ppfx, cclass,
3073 needflag, FLAG_NULL0x00);
3074 while (rv) {
3075 if (ppfx) {
3076 if (ppfx->getMorph()) {
3077 result.append(ppfx->getMorph());
3078 result.push_back(MSEP_FLD' ');
3079 } else
3080 debugflag(result, ppfx->getFlag());
3081 }
3082 if (complexprefixes && HENTRY_DATA(rv))
3083 result.append(HENTRY_DATA2(rv));
3084 if (!HENTRY_FIND(rv, MORPH_STEM"st:")) {
3085 result.push_back(MSEP_FLD' ');
3086 result.append(MORPH_STEM"st:");
3087 result.append(HENTRY_WORD(rv)&(rv->word[0]));
3088 }
3089
3090 if (!complexprefixes && HENTRY_DATA(rv)) {
3091 result.push_back(MSEP_FLD' ');
3092 result.append(HENTRY_DATA2(rv));
3093 }
3094
3095 if (sptr->getMorph()) {
3096 result.push_back(MSEP_FLD' ');
3097 result.append(sptr->getMorph());
3098 } else
3099 debugflag(result, sptr->getFlag());
3100 result.push_back(MSEP_REC'\n');
3101 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3102 }
3103 sptr = sptr->getNextEQ();
3104 } else {
3105 sptr = sptr->getNextNE();
3106 }
3107 }
3108
3109 return result;
3110}
3111
3112// check if word with affixes is correctly spelled
3113struct hentry* AffixMgr::affix_check(const char* word,
3114 int len,
3115 const FLAGunsigned short needflag,
3116 char in_compound) {
3117
3118 // check all prefixes (also crossed with suffixes if allowed)
3119 struct hentry* rv = prefix_check(word, len, in_compound, needflag);
3120 if (rv)
3121 return rv;
3122
3123 // if still not found check all suffixes
3124 rv = suffix_check(word, len, 0, NULL__null, FLAG_NULL0x00, needflag, in_compound);
3125
3126 if (havecontclass) {
3127 sfx = NULL__null;
3128 pfx = NULL__null;
3129
3130 if (rv)
3131 return rv;
3132 // if still not found check all two-level suffixes
3133 rv = suffix_check_twosfx(word, len, 0, NULL__null, needflag);
3134
3135 if (rv)
3136 return rv;
3137 // if still not found check all two-level suffixes
3138 rv = prefix_check_twosfx(word, len, IN_CPD_NOT0, needflag);
3139 }
3140
3141 return rv;
3142}
3143
3144// check if word with affixes is correctly spelled
3145std::string AffixMgr::affix_check_morph(const char* word,
3146 int len,
3147 const FLAGunsigned short needflag,
3148 char in_compound) {
3149 std::string result;
3150
3151 // check all prefixes (also crossed with suffixes if allowed)
3152 std::string st = prefix_check_morph(word, len, in_compound);
3153 if (!st.empty()) {
3154 result.append(st);
3155 }
3156
3157 // if still not found check all suffixes
3158 st = suffix_check_morph(word, len, 0, NULL__null, '\0', needflag, in_compound);
3159 if (!st.empty()) {
3160 result.append(st);
3161 }
3162
3163 if (havecontclass) {
3164 sfx = NULL__null;
3165 pfx = NULL__null;
3166 // if still not found check all two-level suffixes
3167 st = suffix_check_twosfx_morph(word, len, 0, NULL__null, needflag);
3168 if (!st.empty()) {
3169 result.append(st);
3170 }
3171
3172 // if still not found check all two-level suffixes
3173 st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT0, needflag);
3174 if (!st.empty()) {
3175 result.append(st);
3176 }
3177 }
3178
3179 return result;
3180}
3181
3182// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
3183// in the first line of the inputs
3184// return 0, if inputs equal
3185// return 1, if inputs may equal with a secondary suffix
3186// otherwise return -1
3187static int morphcmp(const char* s, const char* t) {
3188 int se = 0;
3189 int te = 0;
3190 const char* sl;
3191 const char* tl;
3192 const char* olds;
3193 const char* oldt;
3194 if (!s || !t)
3195 return 1;
3196 olds = s;
3197 sl = strchr(s, '\n');
3198 s = strstr(s, MORPH_DERI_SFX"ds:");
3199 if (!s || (sl && sl < s))
3200 s = strstr(olds, MORPH_INFL_SFX"is:");
3201 if (!s || (sl && sl < s)) {
3202 s = strstr(olds, MORPH_TERM_SFX"ts:");
3203 olds = NULL__null;
3204 }
3205 oldt = t;
3206 tl = strchr(t, '\n');
3207 t = strstr(t, MORPH_DERI_SFX"ds:");
3208 if (!t || (tl && tl < t))
3209 t = strstr(oldt, MORPH_INFL_SFX"is:");
3210 if (!t || (tl && tl < t)) {
3211 t = strstr(oldt, MORPH_TERM_SFX"ts:");
3212 oldt = NULL__null;
3213 }
3214 while (s && t && (!sl || sl > s) && (!tl || tl > t)) {
3215 s += MORPH_TAG_LENstrlen("st:");
3216 t += MORPH_TAG_LENstrlen("st:");
3217 se = 0;
3218 te = 0;
3219 while ((*s == *t) && !se && !te) {
3220 s++;
3221 t++;
3222 switch (*s) {
3223 case ' ':
3224 case '\n':
3225 case '\t':
3226 case '\0':
3227 se = 1;
3228 }
3229 switch (*t) {
3230 case ' ':
3231 case '\n':
3232 case '\t':
3233 case '\0':
3234 te = 1;
3235 }
3236 }
3237 if (!se || !te) {
3238 // not terminal suffix difference
3239 if (olds)
3240 return -1;
3241 return 1;
3242 }
3243 olds = s;
3244 s = strstr(s, MORPH_DERI_SFX"ds:");
3245 if (!s || (sl && sl < s))
3246 s = strstr(olds, MORPH_INFL_SFX"is:");
3247 if (!s || (sl && sl < s)) {
3248 s = strstr(olds, MORPH_TERM_SFX"ts:");
3249 olds = NULL__null;
3250 }
3251 oldt = t;
3252 t = strstr(t, MORPH_DERI_SFX"ds:");
3253 if (!t || (tl && tl < t))
3254 t = strstr(oldt, MORPH_INFL_SFX"is:");
3255 if (!t || (tl && tl < t)) {
3256 t = strstr(oldt, MORPH_TERM_SFX"ts:");
3257 oldt = NULL__null;
3258 }
3259 }
3260 if (!s && !t && se && te)
3261 return 0;
3262 return 1;
3263}
3264
3265std::string AffixMgr::morphgen(const char* ts,
3266 int wl,
3267 const unsigned short* ap,
3268 unsigned short al,
3269 const char* morph,
3270 const char* targetmorph,
3271 int level) {
3272 // handle suffixes
3273 if (!morph)
3274 return std::string();
3275
3276 // check substandard flag
3277 if (TESTAFF(ap, substandard, al)(std::binary_search(ap, ap + al, substandard)))
3278 return std::string();
3279
3280 if (morphcmp(morph, targetmorph) == 0)
3281 return ts;
3282
3283 size_t stemmorphcatpos;
3284 std::string mymorph;
3285
3286 // use input suffix fields, if exist
3287 if (strstr(morph, MORPH_INFL_SFX"is:") || strstr(morph, MORPH_DERI_SFX"ds:")) {
3288 mymorph.assign(morph);
3289 mymorph.push_back(MSEP_FLD' ');
3290 stemmorphcatpos = mymorph.size();
3291 } else {
3292 stemmorphcatpos = std::string::npos;
3293 }
3294
3295 for (int i = 0; i < al; i++) {
3296 const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3297 SfxEntry* sptr = sFlag[c];
3298 while (sptr) {
3299 if (sptr->getFlag() == ap[i] && sptr->getMorph() &&
3300 ((sptr->getContLen() == 0) ||
3301 // don't generate forms with substandard affixes
3302 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), substandard))
)) {
3303 const char* stemmorph;
3304 if (stemmorphcatpos != std::string::npos) {
3305 mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph());
3306 stemmorph = mymorph.c_str();
3307 } else {
3308 stemmorph = sptr->getMorph();
3309 }
3310
3311 int cmp = morphcmp(stemmorph, targetmorph);
3312
3313 if (cmp == 0) {
3314 std::string newword = sptr->add(ts, wl);
3315 if (!newword.empty()) {
3316 hentry* check = pHMgr->lookup(newword.c_str()); // XXX extra dic
3317 if (!check || !check->astr ||
3318 !(TESTAFF(check->astr, forbiddenword, check->alen)(std::binary_search(check->astr, check->astr + check->
alen, forbiddenword))
||
3319 TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen)(std::binary_search(check->astr, check->astr + check->
alen, 65511))
)) {
3320 return newword;
3321 }
3322 }
3323 }
3324
3325 // recursive call for secondary suffixes
3326 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3327 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), substandard))
) {
3328 std::string newword = sptr->add(ts, wl);
3329 if (!newword.empty()) {
3330 std::string newword2 =
3331 morphgen(newword.c_str(), newword.size(), sptr->getCont(),
3332 sptr->getContLen(), stemmorph, targetmorph, 1);
3333
3334 if (!newword2.empty()) {
3335 return newword2;
3336 }
3337 }
3338 }
3339 }
3340 sptr = sptr->getFlgNxt();
3341 }
3342 }
3343 return std::string();
3344}
3345
3346int AffixMgr::expand_rootword(struct guessword* wlst,
3347 int maxn,
3348 const char* ts,
3349 int wl,
3350 const unsigned short* ap,
3351 unsigned short al,
3352 const char* bad,
3353 int badl,
3354 const char* phon) {
3355 int nh = 0;
3356 // first add root word to list
3357 if ((nh < maxn) &&
3358 !(al && ((needaffix && TESTAFF(ap, needaffix, al)(std::binary_search(ap, ap + al, needaffix))) ||
3359 (onlyincompound && TESTAFF(ap, onlyincompound, al)(std::binary_search(ap, ap + al, onlyincompound)))))) {
3360 wlst[nh].word = mystrdup(ts);
3361 if (!wlst[nh].word)
3362 return 0;
3363 wlst[nh].allow = false;
3364 wlst[nh].orig = NULL__null;
3365 nh++;
3366 // add special phonetic version
3367 if (phon && (nh < maxn)) {
3368 wlst[nh].word = mystrdup(phon);
3369 if (!wlst[nh].word)
3370 return nh - 1;
3371 wlst[nh].allow = false;
3372 wlst[nh].orig = mystrdup(ts);
3373 if (!wlst[nh].orig)
3374 return nh - 1;
3375 nh++;
3376 }
3377 }
3378
3379 // handle suffixes
3380 for (int i = 0; i < al; i++) {
3381 const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3382 SfxEntry* sptr = sFlag[c];
3383 while (sptr) {
3384 if ((sptr->getFlag() == ap[i]) &&
3385 (!sptr->getKeyLen() ||
3386 ((badl > sptr->getKeyLen()) &&
3387 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3388 // check needaffix flag
3389 !(sptr->getCont() &&
3390 ((needaffix &&
3391 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), needaffix))
) ||
3392 (circumfix &&
3393 TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), circumfix))
) ||
3394 (onlyincompound &&
3395 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen())(std::binary_search(sptr->getCont(), sptr->getCont() + sptr
->getContLen(), onlyincompound))
)))) {
3396 std::string newword = sptr->add(ts, wl);
3397 if (!newword.empty()) {
3398 if (nh < maxn) {
3399 wlst[nh].word = mystrdup(newword.c_str());
3400 wlst[nh].allow = sptr->allowCross();
3401 wlst[nh].orig = NULL__null;
3402 nh++;
3403 // add special phonetic version
3404 if (phon && (nh < maxn)) {
3405 std::string prefix(phon);
3406 std::string key(sptr->getKey());
3407 reverseword(key);
3408 prefix.append(key);
3409 wlst[nh].word = mystrdup(prefix.c_str());
3410 if (!wlst[nh].word)
3411 return nh - 1;
3412 wlst[nh].allow = false;
3413 wlst[nh].orig = mystrdup(newword.c_str());
3414 if (!wlst[nh].orig)
3415 return nh - 1;
3416 nh++;
3417 }
3418 }
3419 }
3420 }
3421 sptr = sptr->getFlgNxt();
3422 }
3423 }
3424
3425 int n = nh;
3426
3427 // handle cross products of prefixes and suffixes
3428 for (int j = 1; j < n; j++)
3429 if (wlst[j].allow) {
3430 for (int k = 0; k < al; k++) {
3431 const unsigned char c = (unsigned char)(ap[k] & 0x00FF);
3432 PfxEntry* cptr = pFlag[c];
3433 while (cptr) {
3434 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() &&
3435 (!cptr->getKeyLen() ||
3436 ((badl > cptr->getKeyLen()) &&
3437 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3438 int l1 = strlen(wlst[j].word);
3439 std::string newword = cptr->add(wlst[j].word, l1);
3440 if (!newword.empty()) {
3441 if (nh < maxn) {
3442 wlst[nh].word = mystrdup(newword.c_str());
3443 wlst[nh].allow = cptr->allowCross();
3444 wlst[nh].orig = NULL__null;
3445 nh++;
3446 }
3447 }
3448 }
3449 cptr = cptr->getFlgNxt();
3450 }
3451 }
3452 }
3453
3454 // now handle pure prefixes
3455 for (int m = 0; m < al; m++) {
3456 const unsigned char c = (unsigned char)(ap[m] & 0x00FF);
3457 PfxEntry* ptr = pFlag[c];
3458 while (ptr) {
3459 if ((ptr->getFlag() == ap[m]) &&
3460 (!ptr->getKeyLen() ||
3461 ((badl > ptr->getKeyLen()) &&
3462 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3463 // check needaffix flag
3464 !(ptr->getCont() &&
3465 ((needaffix &&
3466 TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())(std::binary_search(ptr->getCont(), ptr->getCont() + ptr
->getContLen(), needaffix))
) ||
3467 (circumfix &&
3468 TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())(std::binary_search(ptr->getCont(), ptr->getCont() + ptr
->getContLen(), circumfix))
) ||
3469 (onlyincompound &&
3470 TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen())(std::binary_search(ptr->getCont(), ptr->getCont() + ptr
->getContLen(), onlyincompound))
)))) {
3471 std::string newword = ptr->add(ts, wl);
3472 if (!newword.empty()) {
3473 if (nh < maxn) {
3474 wlst[nh].word = mystrdup(newword.c_str());
3475 wlst[nh].allow = ptr->allowCross();
3476 wlst[nh].orig = NULL__null;
3477 nh++;
3478 }
3479 }
3480 }
3481 ptr = ptr->getFlgNxt();
3482 }
3483 }
3484
3485 return nh;
3486}
3487
3488// return replacing table
3489const std::vector<replentry>& AffixMgr::get_reptable() const {
3490 return pHMgr->get_reptable();
3491}
3492
3493// return iconv table
3494RepList* AffixMgr::get_iconvtable() const {
3495 if (!iconvtable)
3496 return NULL__null;
3497 return iconvtable;
3498}
3499
3500// return oconv table
3501RepList* AffixMgr::get_oconvtable() const {
3502 if (!oconvtable)
3503 return NULL__null;
3504 return oconvtable;
3505}
3506
3507// return replacing table
3508struct phonetable* AffixMgr::get_phonetable() const {
3509 if (!phone)
3510 return NULL__null;
3511 return phone;
3512}
3513
3514// return character map table
3515const std::vector<mapentry>& AffixMgr::get_maptable() const {
3516 return maptable;
3517}
3518
3519// return character map table
3520const std::vector<std::string>& AffixMgr::get_breaktable() const {
3521 return breaktable;
3522}
3523
3524// return text encoding of dictionary
3525const std::string& AffixMgr::get_encoding() {
3526 if (encoding.empty())
3527 encoding = SPELL_ENCODING"ISO8859-1";
3528 return encoding;
3529}
3530
3531// return text encoding of dictionary
3532int AffixMgr::get_langnum() const {
3533 return langnum;
3534}
3535
3536// return double prefix option
3537int AffixMgr::get_complexprefixes() const {
3538 return complexprefixes;
3539}
3540
3541// return FULLSTRIP option
3542int AffixMgr::get_fullstrip() const {
3543 return fullstrip;
3544}
3545
3546FLAGunsigned short AffixMgr::get_keepcase() const {
3547 return keepcase;
3548}
3549
3550FLAGunsigned short AffixMgr::get_forceucase() const {
3551 return forceucase;
3552}
3553
3554FLAGunsigned short AffixMgr::get_warn() const {
3555 return warn;
3556}
3557
3558int AffixMgr::get_forbidwarn() const {
3559 return forbidwarn;
3560}
3561
3562int AffixMgr::get_checksharps() const {
3563 return checksharps;
3564}
3565
3566char* AffixMgr::encode_flag(unsigned short aflag) const {
3567 return pHMgr->encode_flag(aflag);
3568}
3569
3570// return the preferred ignore string for suggestions
3571const char* AffixMgr::get_ignore() const {
3572 if (ignorechars.empty())
3573 return NULL__null;
3574 return ignorechars.c_str();
3575}
3576
3577// return the preferred ignore string for suggestions
3578const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
3579 return ignorechars_utf16;
3580}
3581
3582// return the keyboard string for suggestions
3583char* AffixMgr::get_key_string() {
3584 if (keystring.empty())
3585 keystring = SPELL_KEYSTRING"qwertyuiop|asdfghjkl|zxcvbnm";
3586 return mystrdup(keystring.c_str());
3587}
3588
3589// return the preferred try string for suggestions
3590char* AffixMgr::get_try_string() const {
3591 if (trystring.empty())
3592 return NULL__null;
3593 return mystrdup(trystring.c_str());
3594}
3595
3596// return the preferred try string for suggestions
3597const std::string& AffixMgr::get_wordchars() const {
3598 return wordchars;
3599}
3600
3601const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
3602 return wordchars_utf16;
3603}
3604
3605// is there compounding?
3606int AffixMgr::get_compound() const {
3607 return compoundflag || compoundbegin || !defcpdtable.empty();
3608}
3609
3610// return the compound words control flag
3611FLAGunsigned short AffixMgr::get_compoundflag() const {
3612 return compoundflag;
3613}
3614
3615// return the forbidden words control flag
3616FLAGunsigned short AffixMgr::get_forbiddenword() const {
3617 return forbiddenword;
3618}
3619
3620// return the forbidden words control flag
3621FLAGunsigned short AffixMgr::get_nosuggest() const {
3622 return nosuggest;
3623}
3624
3625// return the forbidden words control flag
3626FLAGunsigned short AffixMgr::get_nongramsuggest() const {
3627 return nongramsuggest;
3628}
3629
3630// return the substandard root/affix control flag
3631FLAGunsigned short AffixMgr::get_substandard() const {
3632 return substandard;
3633}
3634
3635// return the forbidden words flag modify flag
3636FLAGunsigned short AffixMgr::get_needaffix() const {
3637 return needaffix;
3638}
3639
3640// return the onlyincompound flag
3641FLAGunsigned short AffixMgr::get_onlyincompound() const {
3642 return onlyincompound;
3643}
3644
3645// return the value of suffix
3646const std::string& AffixMgr::get_version() const {
3647 return version;
3648}
3649
3650// utility method to look up root words in hash table
3651struct hentry* AffixMgr::lookup(const char* word) {
3652 struct hentry* he = NULL__null;
3653 for (size_t i = 0; i < alldic.size() && !he; ++i) {
3654 he = alldic[i]->lookup(word);
3655 }
3656 return he;
3657}
3658
3659// return the value of suffix
3660int AffixMgr::have_contclass() const {
3661 return havecontclass;
3662}
3663
3664// return utf8
3665int AffixMgr::get_utf8() const {
3666 return utf8;
3667}
3668
3669int AffixMgr::get_maxngramsugs(void) const {
3670 return maxngramsugs;
3671}
3672
3673int AffixMgr::get_maxcpdsugs(void) const {
3674 return maxcpdsugs;
3675}
3676
3677int AffixMgr::get_maxdiff(void) const {
3678 return maxdiff;
3679}
3680
3681int AffixMgr::get_onlymaxdiff(void) const {
3682 return onlymaxdiff;
3683}
3684
3685// return nosplitsugs
3686int AffixMgr::get_nosplitsugs(void) const {
3687 return nosplitsugs;
3688}
3689
3690// return sugswithdots
3691int AffixMgr::get_sugswithdots(void) const {
3692 return sugswithdots;
3693}
3694
3695/* parse flag */
3696bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) {
3697 if (*out != FLAG_NULL0x00 && !(*out >= DEFAULTFLAGS65510)) {
3698 HUNSPELL_WARNING(
3699 stderrstderr,
3700 "error: line %d: multiple definitions of an affix file parameter\n",
3701 af->getlinenum());
3702 return false;
3703 }
3704 std::string s;
3705 if (!parse_string(line, s, af->getlinenum()))
3706 return false;
3707 *out = pHMgr->decode_flag(s.c_str());
3708 return true;
3709}
3710
3711/* parse num */
3712bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) {
3713 if (*out != -1) {
3714 HUNSPELL_WARNING(
3715 stderrstderr,
3716 "error: line %d: multiple definitions of an affix file parameter\n",
3717 af->getlinenum());
3718 return false;
3719 }
3720 std::string s;
3721 if (!parse_string(line, s, af->getlinenum()))
3722 return false;
3723 *out = atoi(s.c_str());
3724 return true;
3725}
3726
3727/* parse in the max syllablecount of compound words and */
3728bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
3729 int i = 0;
3730 int np = 0;
3731 std::string::const_iterator iter = line.begin();
3732 std::string::const_iterator start_piece = mystrsep(line, iter);
3733 while (start_piece != line.end()) {
3734 switch (i) {
3735 case 0: {
3736 np++;
3737 break;
3738 }
3739 case 1: {
3740 cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str());
3741 np++;
3742 break;
3743 }
3744 case 2: {
3745 if (!utf8) {
3746 cpdvowels.assign(start_piece, iter);
3747 std::sort(cpdvowels.begin(), cpdvowels.end());
3748 } else {
3749 std::string piece(start_piece, iter);
3750 u8_u16(cpdvowels_utf16, piece);
3751 std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end());
3752 }
3753 np++;
3754 break;
3755 }
3756 default:
3757 break;
3758 }
3759 ++i;
3760 start_piece = mystrsep(line, iter);
3761 }
3762 if (np < 2) {
3763 HUNSPELL_WARNING(stderrstderr,
3764 "error: line %d: missing compoundsyllable information\n",
3765 af->getlinenum());
3766 return false;
3767 }
3768 if (np == 2)
3769 cpdvowels = "AEIOUaeiou";
3770 return true;
3771}
3772
3773bool AffixMgr::parse_convtable(const std::string& line,
3774 FileMgr* af,
3775 RepList** rl,
3776 const std::string& keyword) {
3777 if (*rl) {
3778 HUNSPELL_WARNING(stderrstderr, "error: line %d: multiple table definitions\n",
3779 af->getlinenum());
3780 return false;
3781 }
3782 int i = 0;
3783 int np = 0;
3784 int numrl = 0;
3785 std::string::const_iterator iter = line.begin();
3786 std::string::const_iterator start_piece = mystrsep(line, iter);
3787 while (start_piece != line.end()) {
3788 switch (i) {
3789 case 0: {
3790 np++;
3791 break;
3792 }
3793 case 1: {
3794 numrl = atoi(std::string(start_piece, iter).c_str());
3795 if (numrl < 1) {
3796 HUNSPELL_WARNING(stderrstderr, "error: line %d: incorrect entry number\n",
3797 af->getlinenum());
3798 return false;
3799 }
3800 *rl = new RepList(numrl);
3801 if (!*rl)
3802 return false;
3803 np++;
3804 break;
3805 }
3806 default:
3807 break;
3808 }
3809 ++i;
3810 start_piece = mystrsep(line, iter);
3811 }
3812 if (np != 2) {
3813 HUNSPELL_WARNING(stderrstderr, "error: line %d: missing data\n",
3814 af->getlinenum());
3815 return false;
3816 }
3817
3818 /* now parse the num lines to read in the remainder of the table */
3819 for (int j = 0; j < numrl; j++) {
3820 std::string nl;
3821 if (!af->getline(nl))
3822 return false;
3823 mychomp(nl);
3824 i = 0;
3825 std::string pattern;
3826 std::string pattern2;
3827 iter = nl.begin();
3828 start_piece = mystrsep(nl, iter);
3829 while (start_piece != nl.end()) {
3830 {
3831 switch (i) {
3832 case 0: {
3833 if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) {
3834 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
3835 af->getlinenum());
3836 delete *rl;
3837 *rl = NULL__null;
3838 return false;
3839 }
3840 break;
3841 }
3842 case 1: {
3843 pattern.assign(start_piece, iter);
3844 break;
3845 }
3846 case 2: {
3847 pattern2.assign(start_piece, iter);
3848 break;
3849 }
3850 default:
3851 break;
3852 }
3853 ++i;
3854 }
3855 start_piece = mystrsep(nl, iter);
3856 }
3857 if (pattern.empty() || pattern2.empty()) {
3858 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
3859 af->getlinenum());
3860 return false;
3861 }
3862 (*rl)->add(pattern, pattern2);
3863 }
3864 return true;
3865}
3866
3867/* parse in the typical fault correcting table */
3868bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) {
3869 if (phone) {
3870 HUNSPELL_WARNING(stderrstderr, "error: line %d: multiple table definitions\n",
3871 af->getlinenum());
3872 return false;
3873 }
3874 int num = -1;
3875 int i = 0;
3876 int np = 0;
3877 std::string::const_iterator iter = line.begin();
3878 std::string::const_iterator start_piece = mystrsep(line, iter);
3879 while (start_piece != line.end()) {
3880 switch (i) {
3881 case 0: {
3882 np++;
3883 break;
3884 }
3885 case 1: {
3886 num = atoi(std::string(start_piece, iter).c_str());
3887 if (num < 1) {
3888 HUNSPELL_WARNING(stderrstderr, "error: line %d: bad entry number\n",
3889 af->getlinenum());
3890 return false;
3891 }
3892 phone = new phonetable;
3893 phone->utf8 = (char)utf8;
3894 np++;
3895 break;
3896 }
3897 default:
3898 break;
3899 }
3900 ++i;
3901 start_piece = mystrsep(line, iter);
3902 }
3903 if (np != 2) {
3904 HUNSPELL_WARNING(stderrstderr, "error: line %d: missing data\n",
3905 af->getlinenum());
3906 return false;
3907 }
3908
3909 /* now parse the phone->num lines to read in the remainder of the table */
3910 for (int j = 0; j < num; ++j) {
3911 std::string nl;
3912 if (!af->getline(nl))
3913 return false;
3914 mychomp(nl);
3915 i = 0;
3916 const size_t old_size = phone->rules.size();
3917 iter = nl.begin();
3918 start_piece = mystrsep(nl, iter);
3919 while (start_piece != nl.end()) {
3920 {
3921 switch (i) {
3922 case 0: {
3923 if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) {
3924 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
3925 af->getlinenum());
3926 return false;
3927 }
3928 break;
3929 }
3930 case 1: {
3931 phone->rules.push_back(std::string(start_piece, iter));
3932 break;
3933 }
3934 case 2: {
3935 phone->rules.push_back(std::string(start_piece, iter));
3936 mystrrep(phone->rules.back(), "_", "");
3937 break;
3938 }
3939 default:
3940 break;
3941 }
3942 ++i;
3943 }
3944 start_piece = mystrsep(nl, iter);
3945 }
3946 if (phone->rules.size() != old_size + 2) {
3947 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
3948 af->getlinenum());
3949 phone->rules.clear();
3950 return false;
3951 }
3952 }
3953 phone->rules.push_back("");
3954 phone->rules.push_back("");
3955 init_phonet_hash(*phone);
3956 return true;
3957}
3958
3959/* parse in the checkcompoundpattern table */
3960bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) {
3961 if (parsedcheckcpd) {
3962 HUNSPELL_WARNING(stderrstderr, "error: line %d: multiple table definitions\n",
3963 af->getlinenum());
3964 return false;
3965 }
3966 parsedcheckcpd = true;
3967 int numcheckcpd = -1;
3968 int i = 0;
3969 int np = 0;
3970 std::string::const_iterator iter = line.begin();
3971 std::string::const_iterator start_piece = mystrsep(line, iter);
3972 while (start_piece != line.end()) {
3973 switch (i) {
3974 case 0: {
3975 np++;
3976 break;
3977 }
3978 case 1: {
3979 numcheckcpd = atoi(std::string(start_piece, iter).c_str());
3980 if (numcheckcpd < 1) {
3981 HUNSPELL_WARNING(stderrstderr, "error: line %d: bad entry number\n",
3982 af->getlinenum());
3983 return false;
3984 }
3985 checkcpdtable.reserve(numcheckcpd);
3986 np++;
3987 break;
3988 }
3989 default:
3990 break;
3991 }
3992 ++i;
3993 start_piece = mystrsep(line, iter);
3994 }
3995 if (np != 2) {
3996 HUNSPELL_WARNING(stderrstderr, "error: line %d: missing data\n",
3997 af->getlinenum());
3998 return false;
3999 }
4000
4001 /* now parse the numcheckcpd lines to read in the remainder of the table */
4002 for (int j = 0; j < numcheckcpd; ++j) {
4003 std::string nl;
4004 if (!af->getline(nl))
4005 return false;
4006 mychomp(nl);
4007 i = 0;
4008 checkcpdtable.push_back(patentry());
4009 iter = nl.begin();
4010 start_piece = mystrsep(nl, iter);
4011 while (start_piece != nl.end()) {
4012 switch (i) {
4013 case 0: {
4014 if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) {
4015 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
4016 af->getlinenum());
4017 return false;
4018 }
4019 break;
4020 }
4021 case 1: {
4022 checkcpdtable.back().pattern.assign(start_piece, iter);
4023 size_t slash_pos = checkcpdtable.back().pattern.find('/');
4024 if (slash_pos != std::string::npos) {
4025 std::string chunk(checkcpdtable.back().pattern, slash_pos + 1);
4026 checkcpdtable.back().pattern.resize(slash_pos);
4027 checkcpdtable.back().cond = pHMgr->decode_flag(chunk.c_str());
4028 }
4029 break;
4030 }
4031 case 2: {
4032 checkcpdtable.back().pattern2.assign(start_piece, iter);
4033 size_t slash_pos = checkcpdtable.back().pattern2.find('/');
4034 if (slash_pos != std::string::npos) {
4035 std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1);
4036 checkcpdtable.back().pattern2.resize(slash_pos);
4037 checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk.c_str());
4038 }
4039 break;
4040 }
4041 case 3: {
4042 checkcpdtable.back().pattern3.assign(start_piece, iter);
4043 simplifiedcpd = 1;
4044 break;
4045 }
4046 default:
4047 break;
4048 }
4049 i++;
4050 start_piece = mystrsep(nl, iter);
4051 }
4052 }
4053 return true;
4054}
4055
4056/* parse in the compound rule table */
4057bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) {
4058 if (parseddefcpd) {
4059 HUNSPELL_WARNING(stderrstderr, "error: line %d: multiple table definitions\n",
4060 af->getlinenum());
4061 return false;
4062 }
4063 parseddefcpd = true;
4064 int numdefcpd = -1;
4065 int i = 0;
4066 int np = 0;
4067 std::string::const_iterator iter = line.begin();
4068 std::string::const_iterator start_piece = mystrsep(line, iter);
4069 while (start_piece != line.end()) {
4070 switch (i) {
4071 case 0: {
4072 np++;
4073 break;
4074 }
4075 case 1: {
4076 numdefcpd = atoi(std::string(start_piece, iter).c_str());
4077 if (numdefcpd < 1) {
4078 HUNSPELL_WARNING(stderrstderr, "error: line %d: bad entry number\n",
4079 af->getlinenum());
4080 return false;
4081 }
4082 defcpdtable.reserve(numdefcpd);
4083 np++;
4084 break;
4085 }
4086 default:
4087 break;
4088 }
4089 ++i;
4090 start_piece = mystrsep(line, iter);
4091 }
4092 if (np != 2) {
4093 HUNSPELL_WARNING(stderrstderr, "error: line %d: missing data\n",
4094 af->getlinenum());
4095 return false;
4096 }
4097
4098 /* now parse the numdefcpd lines to read in the remainder of the table */
4099 for (int j = 0; j < numdefcpd; ++j) {
4100 std::string nl;
4101 if (!af->getline(nl))
4102 return false;
4103 mychomp(nl);
4104 i = 0;
4105 defcpdtable.push_back(flagentry());
4106 iter = nl.begin();
4107 start_piece = mystrsep(nl, iter);
4108 while (start_piece != nl.end()) {
4109 switch (i) {
4110 case 0: {
4111 if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) {
4112 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
4113 af->getlinenum());
4114 numdefcpd = 0;
Value stored to 'numdefcpd' is never read
4115 return false;
4116 }
4117 break;
4118 }
4119 case 1: { // handle parenthesized flags
4120 if (std::find(start_piece, iter, '(') != iter) {
4121 for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4122 std::string::const_iterator chb = k;
4123 std::string::const_iterator che = k + 1;
4124 if (*k == '(') {
4125 std::string::const_iterator parpos = std::find(k, iter, ')');
4126 if (parpos != iter) {
4127 chb = k + 1;
4128 che = parpos;
4129 k = parpos;
4130 }
4131 }
4132
4133 if (*chb == '*' || *chb == '?') {
4134 defcpdtable.back().push_back((FLAGunsigned short)*chb);
4135 } else {
4136 pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af);
4137 }
4138 }
4139 } else {
4140 pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af);
4141 }
4142 break;
4143 }
4144 default:
4145 break;
4146 }
4147 ++i;
4148 start_piece = mystrsep(nl, iter);
4149 }
4150 if (defcpdtable.back().empty()) {
4151 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
4152 af->getlinenum());
4153 return false;
4154 }
4155 }
4156 return true;
4157}
4158
4159/* parse in the character map table */
4160bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) {
4161 if (parsedmaptable) {
4162 HUNSPELL_WARNING(stderrstderr, "error: line %d: multiple table definitions\n",
4163 af->getlinenum());
4164 return false;
4165 }
4166 parsedmaptable = true;
4167 int nummap = -1;
4168 int i = 0;
4169 int np = 0;
4170 std::string::const_iterator iter = line.begin();
4171 std::string::const_iterator start_piece = mystrsep(line, iter);
4172 while (start_piece != line.end()) {
4173 switch (i) {
4174 case 0: {
4175 np++;
4176 break;
4177 }
4178 case 1: {
4179 nummap = atoi(std::string(start_piece, iter).c_str());
4180 if (nummap < 1) {
4181 HUNSPELL_WARNING(stderrstderr, "error: line %d: bad entry number\n",
4182 af->getlinenum());
4183 return false;
4184 }
4185 maptable.reserve(nummap);
4186 np++;
4187 break;
4188 }
4189 default:
4190 break;
4191 }
4192 ++i;
4193 start_piece = mystrsep(line, iter);
4194 }
4195 if (np != 2) {
4196 HUNSPELL_WARNING(stderrstderr, "error: line %d: missing data\n",
4197 af->getlinenum());
4198 return false;
4199 }
4200
4201 /* now parse the nummap lines to read in the remainder of the table */
4202 for (int j = 0; j < nummap; ++j) {
4203 std::string nl;
4204 if (!af->getline(nl))
4205 return false;
4206 mychomp(nl);
4207 i = 0;
4208 maptable.push_back(mapentry());
4209 iter = nl.begin();
4210 start_piece = mystrsep(nl, iter);
4211 while (start_piece != nl.end()) {
4212 switch (i) {
4213 case 0: {
4214 if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) {
4215 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
4216 af->getlinenum());
4217 nummap = 0;
4218 return false;
4219 }
4220 break;
4221 }
4222 case 1: {
4223 for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4224 std::string::const_iterator chb = k;
4225 std::string::const_iterator che = k + 1;
4226 if (*k == '(') {
4227 std::string::const_iterator parpos = std::find(k, iter, ')');
4228 if (parpos != iter) {
4229 chb = k + 1;
4230 che = parpos;
4231 k = parpos;
4232 }
4233 } else {
4234 if (utf8 && (*k & 0xc0) == 0xc0) {
4235 ++k;
4236 while (k != iter && (*k & 0xc0) == 0x80)
4237 ++k;
4238 che = k;
4239 --k;
4240 }
4241 }
4242 maptable.back().push_back(std::string(chb, che));
4243 }
4244 break;
4245 }
4246 default:
4247 break;
4248 }
4249 ++i;
4250 start_piece = mystrsep(nl, iter);
4251 }
4252 if (maptable.back().empty()) {
4253 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
4254 af->getlinenum());
4255 return false;
4256 }
4257 }
4258 return true;
4259}
4260
4261/* parse in the word breakpoint table */
4262bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) {
4263 if (parsedbreaktable) {
4264 HUNSPELL_WARNING(stderrstderr, "error: line %d: multiple table definitions\n",
4265 af->getlinenum());
4266 return false;
4267 }
4268 parsedbreaktable = true;
4269 int numbreak = -1;
4270 int i = 0;
4271 int np = 0;
4272 std::string::const_iterator iter = line.begin();
4273 std::string::const_iterator start_piece = mystrsep(line, iter);
4274 while (start_piece != line.end()) {
4275 switch (i) {
4276 case 0: {
4277 np++;
4278 break;
4279 }
4280 case 1: {
4281 numbreak = atoi(std::string(start_piece, iter).c_str());
4282 if (numbreak < 0) {
4283 HUNSPELL_WARNING(stderrstderr, "error: line %d: bad entry number\n",
4284 af->getlinenum());
4285 return false;
4286 }
4287 if (numbreak == 0)
4288 return true;
4289 breaktable.reserve(numbreak);
4290 np++;
4291 break;
4292 }
4293 default:
4294 break;
4295 }
4296 ++i;
4297 start_piece = mystrsep(line, iter);
4298 }
4299 if (np != 2) {
4300 HUNSPELL_WARNING(stderrstderr, "error: line %d: missing data\n",
4301 af->getlinenum());
4302 return false;
4303 }
4304
4305 /* now parse the numbreak lines to read in the remainder of the table */
4306 for (int j = 0; j < numbreak; ++j) {
4307 std::string nl;
4308 if (!af->getline(nl))
4309 return false;
4310 mychomp(nl);
4311 i = 0;
4312 iter = nl.begin();
4313 start_piece = mystrsep(nl, iter);
4314 while (start_piece != nl.end()) {
4315 switch (i) {
4316 case 0: {
4317 if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) {
4318 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
4319 af->getlinenum());
4320 numbreak = 0;
4321 return false;
4322 }
4323 break;
4324 }
4325 case 1: {
4326 breaktable.push_back(std::string(start_piece, iter));
4327 break;
4328 }
4329 default:
4330 break;
4331 }
4332 ++i;
4333 start_piece = mystrsep(nl, iter);
4334 }
4335 }
4336
4337 if (breaktable.size() != static_cast<size_t>(numbreak)) {
4338 HUNSPELL_WARNING(stderrstderr, "error: line %d: table is corrupt\n",
4339 af->getlinenum());
4340 return false;
4341 }
4342
4343 return true;
4344}
4345
4346void AffixMgr::reverse_condition(std::string& piece) {
4347 if (piece.empty())
4348 return;
4349
4350 int neg = 0;
4351 for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) {
4352 switch (*k) {
4353 case '[': {
4354 if (neg)
4355 *(k - 1) = '[';
4356 else
4357 *k = ']';
4358 break;
4359 }
4360 case ']': {
4361 *k = '[';
4362 if (neg)
4363 *(k - 1) = '^';
4364 neg = 0;
4365 break;
4366 }
4367 case '^': {
4368 if (*(k - 1) == ']')
4369 neg = 1;
4370 else if (neg)
4371 *(k - 1) = *k;
4372 break;
4373 }
4374 default: {
4375 if (neg)
4376 *(k - 1) = *k;
4377 }
4378 }
4379 }
4380}
4381
4382class entries_container {
4383 std::vector<AffEntry*> entries;
4384 AffixMgr* m_mgr;
4385 char m_at;
4386public:
4387 entries_container(char at, AffixMgr* mgr)
4388 : m_mgr(mgr)
4389 , m_at(at) {
4390 }
4391 void release() {
4392 entries.clear();
4393 }
4394 void initialize(int numents,
4395 char opts, unsigned short aflag) {
4396 entries.reserve(numents);
4397
4398 if (m_at == 'P') {
4399 entries.push_back(new PfxEntry(m_mgr));
4400 } else {
4401 entries.push_back(new SfxEntry(m_mgr));
4402 }
4403
4404 entries.back()->opts = opts;
4405 entries.back()->aflag = aflag;
4406 }
4407
4408 AffEntry* add_entry(char opts) {
4409 if (m_at == 'P') {
4410 entries.push_back(new PfxEntry(m_mgr));
4411 } else {
4412 entries.push_back(new SfxEntry(m_mgr));
4413 }
4414 AffEntry* ret = entries.back();
4415 ret->opts = entries[0]->opts & opts;
4416 return ret;
4417 }
4418
4419 AffEntry* first_entry() {
4420 return entries.empty() ? NULL__null : entries[0];
4421 }
4422
4423 ~entries_container() {
4424 for (size_t i = 0; i < entries.size(); ++i) {
4425 delete entries[i];
4426 }
4427 }
4428
4429 std::vector<AffEntry*>::iterator begin() { return entries.begin(); }
4430 std::vector<AffEntry*>::iterator end() { return entries.end(); }
4431};
4432
4433bool AffixMgr::parse_affix(const std::string& line,
4434 const char at,
4435 FileMgr* af,
4436 char* dupflags) {
4437 int numents = 0; // number of AffEntry structures to parse
4438
4439 unsigned short aflag = 0; // affix char identifier
4440
4441 char ff = 0;
4442 entries_container affentries(at, this);
4443
4444 int i = 0;
4445
4446// checking lines with bad syntax
4447#ifdef DEBUG1
4448 int basefieldnum = 0;
4449#endif
4450
4451 // split affix header line into pieces
4452
4453 int np = 0;
4454 std::string::const_iterator iter = line.begin();
4455 std::string::const_iterator start_piece = mystrsep(line, iter);
4456 while (start_piece != line.end()) {
4457 switch (i) {
4458 // piece 1 - is type of affix
4459 case 0: {
4460 np++;
4461 break;
4462 }
4463
4464 // piece 2 - is affix char
4465 case 1: {
4466 np++;
4467 aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str());
4468 if (((at == 'S') && (dupflags[aflag] & dupSFX(1 << 0))) ||
4469 ((at == 'P') && (dupflags[aflag] & dupPFX(1 << 1)))) {
4470 HUNSPELL_WARNING(
4471 stderrstderr,
4472 "error: line %d: multiple definitions of an affix flag\n",
4473 af->getlinenum());
4474 }
4475 dupflags[aflag] += (char)((at == 'S') ? dupSFX(1 << 0) : dupPFX(1 << 1));
4476 break;
4477 }
4478 // piece 3 - is cross product indicator
4479 case 2: {
4480 np++;
4481 if (*start_piece == 'Y')
4482 ff = aeXPRODUCT(1 << 0);
4483 break;
4484 }
4485
4486 // piece 4 - is number of affentries
4487 case 3: {
4488 np++;
4489 numents = atoi(std::string(start_piece, iter).c_str());
4490 if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
4491 sizeof(AffEntry)) < static_cast<size_t>(numents))) {
4492 char* err = pHMgr->encode_flag(aflag);
4493 if (err) {
4494 HUNSPELL_WARNING(stderrstderr, "error: line %d: bad entry number\n",
4495 af->getlinenum());
4496 free(err)HunspellAllocator::CountingFree(err);
4497 }
4498 return false;
4499 }
4500
4501 char opts = ff;
4502 if (utf8)
4503 opts |= aeUTF8(1 << 1);
4504 if (pHMgr->is_aliasf())
4505 opts |= aeALIASF(1 << 2);
4506 if (pHMgr->is_aliasm())
4507 opts |= aeALIASM(1 << 3);
4508 affentries.initialize(numents, opts, aflag);
4509 }
4510
4511 default:
4512 break;
4513 }
4514 ++i;
4515 start_piece = mystrsep(line, iter);
4516 }
4517 // check to make sure we parsed enough pieces
4518 if (np != 4) {
4519 char* err = pHMgr->encode_flag(aflag);
4520 if (err) {
4521 HUNSPELL_WARNING(stderrstderr, "error: line %d: missing data\n",
4522 af->getlinenum());
4523 free(err)HunspellAllocator::CountingFree(err);
4524 }
4525 return false;
4526 }
4527
4528 // now parse numents affentries for this affix
4529 AffEntry* entry = affentries.first_entry();
4530 for (int ent = 0; ent < numents; ++ent) {
4531 std::string nl;
4532 if (!af->getline(nl))
4533 return false;
4534 mychomp(nl);
4535
4536 iter = nl.begin();
4537 i = 0;
4538 np = 0;
4539
4540 // split line into pieces
4541 start_piece = mystrsep(nl, iter);
4542 while (start_piece != nl.end()) {
4543 switch (i) {
4544 // piece 1 - is type
4545 case 0: {
4546 np++;
4547 if (ent != 0)
4548 entry = affentries.add_entry((char)(aeXPRODUCT(1 << 0) + aeUTF8(1 << 1) + aeALIASF(1 << 2) + aeALIASM(1 << 3)));
4549 break;
4550 }
4551
4552 // piece 2 - is affix char
4553 case 1: {
4554 np++;
4555 std::string chunk(start_piece, iter);
4556 if (pHMgr->decode_flag(chunk.c_str()) != aflag) {
4557 char* err = pHMgr->encode_flag(aflag);
4558 if (err) {
4559 HUNSPELL_WARNING(stderrstderr,
4560 "error: line %d: affix %s is corrupt\n",
4561 af->getlinenum(), err);
4562 free(err)HunspellAllocator::CountingFree(err);
4563 }
4564 return false;
4565 }
4566
4567 if (ent != 0) {
4568 AffEntry* start_entry = affentries.first_entry();
4569 entry->aflag = start_entry->aflag;
4570 }
4571 break;
4572 }
4573
4574 // piece 3 - is string to strip or 0 for null
4575 case 2: {
4576 np++;
4577 entry->strip = std::string(start_piece, iter);
4578 if (complexprefixes) {
4579 if (utf8)
4580 reverseword_utf(entry->strip);
4581 else
4582 reverseword(entry->strip);
4583 }
4584 if (entry->strip.compare("0") == 0) {
4585 entry->strip.clear();
4586 }
4587 break;
4588 }
4589
4590 // piece 4 - is affix string or 0 for null
4591 case 3: {
4592 entry->morphcode = NULL__null;
4593 entry->contclass = NULL__null;
4594 entry->contclasslen = 0;
4595 np++;
4596 std::string::const_iterator dash = std::find(start_piece, iter, '/');
4597 if (dash != iter) {
4598 entry->appnd = std::string(start_piece, dash);
4599 std::string dash_str(dash + 1, iter);
4600
4601 if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
4602 if (utf8) {
4603 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4604 } else {
4605 remove_ignored_chars(entry->appnd, ignorechars);
4606 }
4607 }
4608
4609 if (complexprefixes) {
4610 if (utf8)
4611 reverseword_utf(entry->appnd);
4612 else
4613 reverseword(entry->appnd);
4614 }
4615
4616 if (pHMgr->is_aliasf()) {
4617 int index = atoi(dash_str.c_str());
4618 entry->contclasslen = (unsigned short)pHMgr->get_aliasf(
4619 index, &(entry->contclass), af);
4620 if (!entry->contclasslen)
4621 HUNSPELL_WARNING(stderrstderr,
4622 "error: bad affix flag alias: \"%s\"\n",
4623 dash_str.c_str());
4624 } else {
4625 entry->contclasslen = (unsigned short)pHMgr->decode_flags(
4626 &(entry->contclass), dash_str.c_str(), af);
4627 std::sort(entry->contclass, entry->contclass + entry->contclasslen);
4628 }
4629
4630 havecontclass = 1;
4631 for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
4632 contclasses[(entry->contclass)[_i]] = 1;
4633 }
4634 } else {
4635 entry->appnd = std::string(start_piece, iter);
4636
4637 if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
4638 if (utf8) {
4639 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4640 } else {
4641 remove_ignored_chars(entry->appnd, ignorechars);
4642 }
4643 }
4644
4645 if (complexprefixes) {
4646 if (utf8)
4647 reverseword_utf(entry->appnd);
4648 else
4649 reverseword(entry->appnd);
4650 }
4651 }
4652
4653 if (entry->appnd.compare("0") == 0) {
4654 entry->appnd.clear();
4655 }
4656 break;
4657 }
4658
4659 // piece 5 - is the conditions descriptions
4660 case 4: {
4661 std::string chunk(start_piece, iter);
4662 np++;
4663 if (complexprefixes) {
4664 if (utf8)
4665 reverseword_utf(chunk);
4666 else
4667 reverseword(chunk);
4668 reverse_condition(chunk);
4669 }
4670 if (!entry->strip.empty() && chunk != "." &&
4671 redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(),
4672 af->getlinenum()))
4673 chunk = ".";
4674 if (at == 'S') {
4675 reverseword(chunk);
4676 reverse_condition(chunk);
4677 }
4678 if (encodeit(*entry, chunk.c_str()))
4679 return false;
4680 break;
4681 }
4682
4683 case 5: {
4684 std::string chunk(start_piece, iter);
4685 np++;
4686 if (pHMgr->is_aliasm()) {
4687 int index = atoi(chunk.c_str());
4688 entry->morphcode = pHMgr->get_aliasm(index);
4689 } else {
4690 if (complexprefixes) { // XXX - fix me for morph. gen.
4691 if (utf8)
4692 reverseword_utf(chunk);
4693 else
4694 reverseword(chunk);
4695 }
4696 // add the remaining of the line
4697 std::string::const_iterator end = nl.end();
4698 if (iter != end) {
4699 chunk.append(iter, end);
4700 }
4701 entry->morphcode = mystrdup(chunk.c_str());
4702 if (!entry->morphcode)
4703 return false;
4704 }
4705 break;
4706 }
4707 default:
4708 break;
4709 }
4710 i++;
4711 start_piece = mystrsep(nl, iter);
4712 }
4713 // check to make sure we parsed enough pieces
4714 if (np < 4) {
4715 char* err = pHMgr->encode_flag(aflag);
4716 if (err) {
4717 HUNSPELL_WARNING(stderrstderr, "error: line %d: affix %s is corrupt\n",
4718 af->getlinenum(), err);
4719 free(err)HunspellAllocator::CountingFree(err);
4720 }
4721 return false;
4722 }
4723
4724#ifdef DEBUG1
4725 // detect unnecessary fields, excepting comments
4726 if (basefieldnum) {
4727 int fieldnum =
4728 !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4729 if (fieldnum != basefieldnum)
4730 HUNSPELL_WARNING(stderrstderr, "warning: line %d: bad field number\n",
4731 af->getlinenum());
4732 } else {
4733 basefieldnum =
4734 !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4735 }
4736#endif
4737 }
4738
4739 // now create SfxEntry or PfxEntry objects and use links to
4740 // build an ordered (sorted by affix string) list
4741 std::vector<AffEntry*>::iterator start = affentries.begin();
4742 std::vector<AffEntry*>::iterator end = affentries.end();
4743 for (std::vector<AffEntry*>::iterator affentry = start; affentry != end; ++affentry) {
4744 if (at == 'P') {
4745 build_pfxtree(static_cast<PfxEntry*>(*affentry));
4746 } else {
4747 build_sfxtree(static_cast<SfxEntry*>(*affentry));
4748 }
4749 }
4750
4751 //contents belong to AffixMgr now
4752 affentries.release();
4753
4754 return true;
4755}
4756
4757int AffixMgr::redundant_condition(char ft,
4758 const char* strip,
4759 int stripl,
4760 const char* cond,
4761 int linenum) {
4762 int condl = strlen(cond);
4763 int i;
4764 int j;
4765 int neg;
4766 int in;
4767 if (ft == 'P') { // prefix
4768 if (strncmp(strip, cond, condl) == 0)
4769 return 1;
4770 if (utf8) {
4771 } else {
4772 for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4773 if (cond[j] != '[') {
4774 if (cond[j] != strip[i]) {
4775 HUNSPELL_WARNING(stderrstderr,
4776 "warning: line %d: incompatible stripping "
4777 "characters and condition\n",
4778 linenum);
4779 return 0;
4780 }
4781 } else {
4782 neg = (cond[j + 1] == '^') ? 1 : 0;
4783 in = 0;
4784 do {
4785 j++;
4786 if (strip[i] == cond[j])
4787 in = 1;
4788 } while ((j < (condl - 1)) && (cond[j] != ']'));
4789 if (j == (condl - 1) && (cond[j] != ']')) {
4790 HUNSPELL_WARNING(stderrstderr,
4791 "error: line %d: missing ] in condition:\n%s\n",
4792 linenum, cond);
4793 return 0;
4794 }
4795 if ((!neg && !in) || (neg && in)) {
4796 HUNSPELL_WARNING(stderrstderr,
4797 "warning: line %d: incompatible stripping "
4798 "characters and condition\n",
4799 linenum);
4800 return 0;
4801 }
4802 }
4803 }
4804 if (j >= condl)
4805 return 1;
4806 }
4807 } else { // suffix
4808 if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0)
4809 return 1;
4810 if (utf8) {
4811 } else {
4812 for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4813 if (cond[j] != ']') {
4814 if (cond[j] != strip[i]) {
4815 HUNSPELL_WARNING(stderrstderr,
4816 "warning: line %d: incompatible stripping "
4817 "characters and condition\n",
4818 linenum);
4819 return 0;
4820 }
4821 } else {
4822 in = 0;
4823 do {
4824 j--;
4825 if (strip[i] == cond[j])
4826 in = 1;
4827 } while ((j > 0) && (cond[j] != '['));
4828 if ((j == 0) && (cond[j] != '[')) {
4829 HUNSPELL_WARNING(stderrstderr,
4830 "error: line: %d: missing ] in condition:\n%s\n",
4831 linenum, cond);
4832 return 0;
4833 }
4834 neg = (cond[j + 1] == '^') ? 1 : 0;
4835 if ((!neg && !in) || (neg && in)) {
4836 HUNSPELL_WARNING(stderrstderr,
4837 "warning: line %d: incompatible stripping "
4838 "characters and condition\n",
4839 linenum);
4840 return 0;
4841 }
4842 }
4843 }
4844 if (j < 0)
4845 return 1;
4846 }
4847 }
4848 return 0;
4849}
4850
4851std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff,
4852 int len,
4853 const char* root_word) {
4854 std::vector<std::string> slst;
4855 short unsigned* start_ptr = suff;
4856 for (int j = 0; j < SETSIZE256; j++) {
4857 SfxEntry* ptr = sStart[j];
4858 while (ptr) {
4859 suff = start_ptr;
4860 for (int i = 0; i < len; i++) {
4861 if ((*suff) == ptr->getFlag()) {
4862 std::string nw(root_word);
4863 nw.append(ptr->getAffix());
4864 hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL__null, 0, 0, 0);
4865 if (ht) {
4866 slst.push_back(nw);
4867 }
4868 }
4869 suff++;
4870 }
4871 ptr = ptr->getNext();
4872 }
4873 }
4874 return slst;
4875}