Bug Summary

File:out/../deps/icu-small/source/i18n/usearch.cpp
Warning:line 2337, column 22
Although the value stored to 'maxLimit' is used in the enclosing expression, the value is never actually read from 'maxLimit'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name usearch.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_I18N_IMPLEMENTATION=1 -D U_IO_IMPLEMENTATION=1 -D U_TOOLUTIL_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -I ../deps/icu-small/source/i18n -I ../deps/icu-small/source/tools/toolutil -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem 
/usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/i18n/usearch.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 2001-2015 IBM and others. All rights reserved.
6**********************************************************************
7* Date Name Description
8* 07/02/2001 synwee Creation.
9**********************************************************************
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION0 && !UCONFIG_NO_BREAK_ITERATION0
15
16#include "unicode/usearch.h"
17#include "unicode/ustring.h"
18#include "unicode/uchar.h"
19#include "unicode/utf16.h"
20#include "normalizer2impl.h"
21#include "usrchimp.h"
22#include "cmemory.h"
23#include "ucln_in.h"
24#include "uassert.h"
25#include "ustr_imp.h"
26
27U_NAMESPACE_USEusing namespace icu_71;
28
29// internal definition ---------------------------------------------------
30
31#define LAST_BYTE_MASK_0xFF 0xFF
32#define SECOND_LAST_BYTE_SHIFT_8 8
33#define SUPPLEMENTARY_MIN_VALUE_0x10000 0x10000
34
35static const Normalizer2Impl *g_nfcImpl = nullptr;
36
37// internal methods -------------------------------------------------
38
39/**
40* Fast collation element iterator setOffset.
41* This function does not check for bounds.
42* @param elems collation element iterator
43* @param offset to set
44*/
45static
46inline void setColEIterOffset(UCollationElements *elems,
47 int32_t offset,
48 UErrorCode &status)
49{
50 // Note: Not "fast" any more after the 2013 collation rewrite.
51 // We do not want to expose more internals than necessary.
52 ucol_setOffsetucol_setOffset_71(elems, offset, &status);
53}
54
55/**
56* Getting the mask for collation strength
57* @param strength collation strength
58* @return collation element mask
59*/
60static
61inline uint32_t getMask(UCollationStrength strength)
62{
63 switch (strength)
64 {
65 case UCOL_PRIMARY:
66 return UCOL_PRIMARYORDERMASK0xffff0000;
67 case UCOL_SECONDARY:
68 return UCOL_SECONDARYORDERMASK0x0000ff00 | UCOL_PRIMARYORDERMASK0xffff0000;
69 default:
70 return UCOL_TERTIARYORDERMASK0x000000ff | UCOL_SECONDARYORDERMASK0x0000ff00 |
71 UCOL_PRIMARYORDERMASK0xffff0000;
72 }
73}
74
75U_CDECL_BEGINextern "C" {
76static UBool U_CALLCONV
77usearch_cleanup(void) {
78 g_nfcImpl = nullptr;
79 return TRUE1;
80}
81U_CDECL_END}
82
83/**
84* Initializing the fcd tables.
85* Internal method, status assumed to be a success.
* Fetches the NFC implementation into the file-level g_nfcImpl the first time
* it is found to be nullptr, and registers usearch_cleanup so the pointer is
* reset at library cleanup.
* NOTE(review): the cleanup hook is registered without checking *status after
* getNFCImpl, so it is registered even when that call reported a failure.
* NOTE(review): no locking is visible in this function; confirm whether callers
* serialize the first call.
86* @param status output error if any, caller to check status before calling
87* method, status assumed to be success when passed in.
88*/
89static
90inline void initializeFCD(UErrorCode *status)
91{
92 if (g_nfcImpl == nullptr) {
93 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
94 ucln_i18n_registerCleanupucln_i18n_registerCleanup_71(UCLN_I18N_USEARCH, usearch_cleanup);
95 }
96}
97
98/**
99* Gets the fcd value for a character at the argument index.
100* This method takes supplementary characters into account.
101* @param str UTF16 string where character for fcd retrieval resides
102* @param offset position of the character whose fcd is to be retrieved, to be
103* overwritten with the next character position, taking
104* surrogate characters into consideration.
105* @param strlength length of the argument string
106* @return fcd value
107*/
108static
109uint16_t getFCD(const UChar *str, int32_t *offset,
110 int32_t strlength)
111{
112 const UChar *temp = str + *offset;
113 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
114 *offset = (int32_t)(temp - str);
115 return result;
116}
117
118/**
119* Getting the modified collation elements taking into account the collation
120* attributes
121* @param strsrch string search data
122* @param sourcece collation element to be masked and adjusted
123* @return the modified collation element
124*/
125static
126inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
127{
128 // note for tertiary we can't use the collator->tertiaryMask, that
129 // is a preprocessed mask that takes into account case options. since
130 // we are only concerned with exact matches, we don't need that.
131 sourcece &= strsrch->ceMask;
132
133 if (strsrch->toShift) {
134 // alternate handling here, since only the 16 most significant bits
135 // are used, we can safely do a compare without masking
136 // if the ce is a variable, we mask and get only the primary values
137 // no shifting to quaternary is required since all primary values
138 // less than variabletop will need to be masked off anyway.
139 if (strsrch->variableTop > sourcece) {
140 if (strsrch->strength >= UCOL_QUATERNARY) {
141 sourcece &= UCOL_PRIMARYORDERMASK0xffff0000;
142 }
143 else {
144 sourcece = UCOL_IGNORABLE0;
145 }
146 }
147 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE0) {
148 sourcece = 0xFFFF;
149 }
150
151 return sourcece;
152}
153
154/**
155* Allocates memory and returns nullptr if the allocation failed.
156* Internal method, status assumed to be a success.
157* @param size to allocate
158* @param status output error if any, caller to check status before calling
159* method, status assumed to be success when passed in.
160* @return newly allocated array, nullptr otherwise
161*/
162static
163inline void * allocateMemory(uint32_t size, UErrorCode *status)
164{
165 uint32_t *result = (uint32_t *)uprv_mallocuprv_malloc_71(size);
166 if (result == nullptr) {
167 *status = U_MEMORY_ALLOCATION_ERROR;
168 }
169 return result;
170}
171
172/**
173* Adds a uint32_t value to a destination array.
174* Creates a new array if we run out of space. The caller will have to
175* manually deallocate the newly allocated array.
176* Internal method, status assumed to be success, caller has to check status
177* before calling this method. destination not to be nullptr and has at least
178* size destinationlength.
179* @param destination target array
180* @param offset destination offset to add value
181* @param destinationlength target array size, return value for the new size
182* @param value to be added
183* @param increments incremental size expected
184* @param status output error if any, caller to check status before calling
185* method, status assumed to be success when passed in.
186* @return new destination array, destination if there was no new allocation
187*/
188static
189inline int32_t * addTouint32_tArray(int32_t *destination,
190 uint32_t offset,
191 uint32_t *destinationlength,
192 uint32_t value,
193 uint32_t increments,
194 UErrorCode *status)
195{
196 uint32_t newlength = *destinationlength;
197 if (offset + 1 == newlength) {
198 newlength += increments;
199 int32_t *temp = (int32_t *)allocateMemory(
200 sizeof(int32_t) * newlength, status);
201 if (U_FAILURE(*status)) {
202 return nullptr;
203 }
204 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset)do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(temp, destination
, sizeof(int32_t) * (size_t)offset); } while (false)
;
205 *destinationlength = newlength;
206 destination = temp;
207 }
208 destination[offset] = value;
209 return destination;
210}
211
212/**
213* Adds a uint64_t value to a destination array.
214* Creates a new array if we run out of space. The caller will have to
215* manually deallocate the newly allocated array.
216* Internal method, status assumed to be success, caller has to check status
217* before calling this method. destination not to be nullptr and has at least
218* size destinationlength.
219* @param destination target array
220* @param offset destination offset to add value
221* @param destinationlength target array size, return value for the new size
222* @param value to be added
223* @param increments incremental size expected
224* @param status output error if any, caller to check status before calling
225* method, status assumed to be success when passed in.
226* @return new destination array, destination if there was no new allocation
227*/
228static
229inline int64_t * addTouint64_tArray(int64_t *destination,
230 uint32_t offset,
231 uint32_t *destinationlength,
232 uint64_t value,
233 uint32_t increments,
234 UErrorCode *status)
235{
236 uint32_t newlength = *destinationlength;
237 if (offset + 1 == newlength) {
238 newlength += increments;
239 int64_t *temp = (int64_t *)allocateMemory(
240 sizeof(int64_t) * newlength, status);
241
242 if (U_FAILURE(*status)) {
243 return nullptr;
244 }
245
246 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset)do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(temp, destination
, sizeof(int64_t) * (size_t)offset); } while (false)
;
247 *destinationlength = newlength;
248 destination = temp;
249 }
250
251 destination[offset] = value;
252
253 return destination;
254}
255
256/**
257* Initializing the ce table for a pattern.
258* Stores non-ignorable collation keys.
259* Table size will be estimated by the size of the pattern text. Table
260* expansion will be performed as we go along. Adding 1 to ensure that the table
261* size definitely increases.
262* Internal method, status assumed to be a success.
* On success pattern->ces points at the table (either pattern->cesBuffer or a
* heap array), the entry after the last stored element is 0, and
* pattern->cesLength is the number of stored elements.
* NOTE(review): if addTouint32_tArray fails, this function returns without
* assigning pattern->ces or pattern->cesLength. A heap-allocated previous
* pattern->ces has already been freed by then, so the field keeps the stale
* pointer; a table grown in an earlier loop iteration is also not released on
* that path. usearch_close frees pattern.ces whenever it is non-null and not
* cesBuffer, so confirm callers never reach it in that state.
263* @param strsrch string search data
264* @param status output error if any, caller to check status before calling
265* method, status assumed to be success when passed in.
266*/
267static
268inline void initializePatternCETable(UStringSearch *strsrch, UErrorCode *status)
269{
270 UPattern *pattern = &(strsrch->pattern);
271 uint32_t cetablesize = INITIAL_ARRAY_SIZE_256;
272 int32_t *cetable = pattern->cesBuffer;
273 uint32_t patternlength = pattern->textLength;
274 UCollationElements *coleiter = strsrch->utilIter;
275
276 if (coleiter == nullptr) {
277 coleiter = ucol_openElementsucol_openElements_71(strsrch->collator, pattern->text,
278 patternlength, status);
279 // status is checked right after this if/else block; on failure this
280 // function returns at that check, so ucol_next(..) is never called
281 // after a failure here.
282 strsrch->utilIter = coleiter;
283 }
284 else {
285 ucol_setTextucol_setText_71(coleiter, pattern->text, pattern->textLength, status);
286 }
287 if(U_FAILURE(*status)) {
288 return;
289 }
290
// Release a heap table left over from a previous pattern; the field itself is
// only reassigned at the end of this function.
291 if (pattern->ces != cetable && pattern->ces) {
292 uprv_freeuprv_free_71(pattern->ces);
293 }
294
295 uint32_t offset = 0;
296 int32_t ce;
297
298 while ((ce = ucol_nextucol_next_71(coleiter, status)) != UCOL_NULLORDER((int32_t)0xFFFFFFFF) &&
299 U_SUCCESS(*status)) {
300 uint32_t newce = getCE(strsrch, ce);
// Elements that getCE reduces to 0 are not stored.
301 if (newce) {
302 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
303 newce,
304 patternlength - ucol_getOffsetucol_getOffset_71(coleiter) + 1,
305 status);
306 if (U_FAILURE(*status)) {
307 return;
308 }
309 offset ++;
// addTouint32_tArray does not free the array it outgrew; do it here unless
// the old array is the embedded cesBuffer.
310 if (cetable != temp && cetable != pattern->cesBuffer) {
311 uprv_freeuprv_free_71(cetable);
312 }
313 cetable = temp;
314 }
315 }
316
317 cetable[offset] = 0;
318 pattern->ces = cetable;
319 pattern->cesLength = offset;
320}
321
322/**
323* Initializing the pce table for a pattern.
324* Stores non-ignorable collation keys.
325* Table size will be estimated by the size of the pattern text. Table
326* expansion will be performed as we go along. Adding 1 to ensure that the table
327* size definitely increases.
328* Internal method, status assumed to be a success.
* On success pattern->pces points at the table (either pattern->pcesBuffer or a
* heap array), the entry after the last stored element is 0, and
* pattern->pcesLength is the number of stored elements.
* NOTE(review): if addTouint64_tArray fails, this function returns without
* assigning pattern->pces or pattern->pcesLength. A heap-allocated previous
* pattern->pces has already been freed by then, so the field keeps the stale
* pointer; a table grown in an earlier loop iteration is also not released on
* that path. usearch_close frees pattern.pces whenever it is non-null and not
* pcesBuffer, so confirm callers never reach it in that state.
329* @param strsrch string search data
330* @param status output error if any, caller to check status before calling
331* method, status assumed to be success when passed in.
332*/
333static
334inline void initializePatternPCETable(UStringSearch *strsrch,
335 UErrorCode *status)
336{
337 UPattern *pattern = &(strsrch->pattern);
338 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_256;
339 int64_t *pcetable = pattern->pcesBuffer;
340 uint32_t patternlength = pattern->textLength;
341 UCollationElements *coleiter = strsrch->utilIter;
342
343 if (coleiter == nullptr) {
344 coleiter = ucol_openElementsucol_openElements_71(strsrch->collator, pattern->text,
345 patternlength, status);
346 // status will be checked in nextProcessed(..) later and if it is an error
347 // then UCOL_PROCESSED_NULLORDER is returned by nextProcessed(..), so 0 will be
348 // returned.
349 strsrch->utilIter = coleiter;
350 } else {
351 ucol_setTextucol_setText_71(coleiter, pattern->text, pattern->textLength, status);
352 }
353 if(U_FAILURE(*status)) {
354 return;
355 }
356
// Release a heap table left over from a previous pattern; the field itself is
// only reassigned at the end of this function.
357 if (pattern->pces != pcetable && pattern->pces != nullptr) {
358 uprv_freeuprv_free_71(pattern->pces);
359 }
360
361 uint32_t offset = 0;
362 int64_t pce;
363
364 icu::UCollationPCE iter(coleiter);
365
366 // ** Should processed CEs be signed or unsigned?
367 // ** (the rest of the code in this file seems to play fast-and-loose with
368 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
369 while ((pce = iter.nextProcessed(nullptr, nullptr, status)) != UCOL_PROCESSED_NULLORDER((int64_t)((int64_t)(9223372036854775807L))) &&
370 U_SUCCESS(*status)) {
371 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
372 pce,
373 patternlength - ucol_getOffsetucol_getOffset_71(coleiter) + 1,
374 status);
375
376 if (U_FAILURE(*status)) {
377 return;
378 }
379
380 offset += 1;
381
// addTouint64_tArray does not free the array it outgrew; do it here unless
// the old array is the embedded pcesBuffer.
382 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
383 uprv_freeuprv_free_71(pcetable);
384 }
385
386 pcetable = temp;
387 }
388
389 pcetable[offset] = 0;
390 pattern->pces = pcetable;
391 pattern->pcesLength = offset;
392}
393
394/**
395* Initializes the pattern struct.
396* @param strsrch UStringSearch data storage
397* @param status output error if any, caller to check status before calling
398* method, status assumed to be success when passed in.
399*/
400static
401inline void initializePattern(UStringSearch *strsrch, UErrorCode *status)
402{
403 if (U_FAILURE(*status)) { return; }
404
405 UPattern *pattern = &(strsrch->pattern);
406 const UChar *patterntext = pattern->text;
407 int32_t length = pattern->textLength;
408 int32_t index = 0;
409
410 // Since the strength is primary, accents are ignored in the pattern.
411 if (strsrch->strength == UCOL_PRIMARY) {
412 pattern->hasPrefixAccents = 0;
413 pattern->hasSuffixAccents = 0;
414 } else {
415 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
416 SECOND_LAST_BYTE_SHIFT_8;
417 index = length;
418 U16_BACK_1(patterntext, 0, index)do { if(((((patterntext)[--(index)])&0xfffffc00)==0xdc00)
&& (index)>(0) && ((((patterntext)[(index
)-1])&0xfffffc00)==0xd800)) { --(index); } } while (false
)
;
419 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
420 LAST_BYTE_MASK_0xFF;
421 }
422
423 // ** HACK **
424 if (strsrch->pattern.pces != nullptr) {
425 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
426 uprv_freeuprv_free_71(strsrch->pattern.pces);
427 }
428
429 strsrch->pattern.pces = nullptr;
430 }
431
432 initializePatternCETable(strsrch, status);
433}
434
435/**
436* Initializes the pattern struct and builds the pattern collation element table.
437* @param strsrch UStringSearch data storage
438* @param status for output errors if it occurs, status is assumed to be a
439* success when it is passed in.
440*/
441static
442inline void initialize(UStringSearch *strsrch, UErrorCode *status)
443{
444 initializePattern(strsrch, status);
445}
446
447#if !UCONFIG_NO_BREAK_ITERATION0
448// If the caller provided a character breakiterator we'll return that,
449// otherwise we lazily create the internal break iterator.
450static UBreakIterator* getBreakIterator(UStringSearch *strsrch, UErrorCode &status)
451{
452 if (U_FAILURE(status)) {
453 return nullptr;
454 }
455
456 if (strsrch->search->breakIter != nullptr) {
457 return strsrch->search->breakIter;
458 }
459
460 if (strsrch->search->internalBreakIter != nullptr) {
461 return strsrch->search->internalBreakIter;
462 }
463
464 // Need to create the internal break iterator.
465 strsrch->search->internalBreakIter = ubrk_openubrk_open_71(UBRK_CHARACTER,
466 ucol_getLocaleByTypeucol_getLocaleByType_71(strsrch->collator, ULOC_VALID_LOCALE, &status),
467 strsrch->search->text, strsrch->search->textLength, &status);
468
469 return strsrch->search->internalBreakIter;
470}
471#endif
472
473/**
474* Sets the match result to "not found", regardless of the incoming error status.
475* If an error occurs while setting the result, it is reported back.
476*
477* @param strsrch string search data
478* @param status for output errors, if they occur.
479*/
480static
481inline void setMatchNotFound(UStringSearch *strsrch, UErrorCode &status)
482{
483 UErrorCode localStatus = U_ZERO_ERROR;
484
485 strsrch->search->matchedIndex = USEARCH_DONE-1;
486 strsrch->search->matchedLength = 0;
487 if (strsrch->search->isForwardSearching) {
488 setColEIterOffset(strsrch->textIter, strsrch->search->textLength, localStatus);
489 }
490 else {
491 setColEIterOffset(strsrch->textIter, 0, localStatus);
492 }
493
494 // If an error occurred while setting the result to not found (ex: OOM),
495 // then we want to report that error back to the caller.
496 if (U_SUCCESS(status) && U_FAILURE(localStatus)) {
497 status = localStatus;
498 }
499}
500
501/**
502* Checks if the offset runs out of the text string
503* @param offset offset to check
504* @param textlength of the text string
505* @return TRUE if offset is out of bounds, FALSE otherwise
506*/
507static
508inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
509{
510 return offset < 0 || offset > textlength;
511}
512
513/**
514* Checks for identical match
515* @param strsrch string search data
516* @param start offset of possible match
517* @param end offset of possible match
518* @return TRUE if identical match is found
519*/
520static
521inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, int32_t end)
522{
523 if (strsrch->strength != UCOL_IDENTICAL) {
524 return TRUE1;
525 }
526
527 // Note: We could use Normalizer::compare() or similar, but for short strings
528 // which may not be in FCD it might be faster to just NFD them.
529 UErrorCode status = U_ZERO_ERROR;
530 UnicodeString t2, p2;
531 strsrch->nfd->normalize(
532 UnicodeString(FALSE0, strsrch->search->text + start, end - start), t2, status);
533 strsrch->nfd->normalize(
534 UnicodeString(FALSE0, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
535 // return FALSE if NFD failed
536 return U_SUCCESS(status) && t2 == p2;
537}
538
539// constructors and destructor -------------------------------------------
540
541U_CAPIextern "C" UStringSearch * U_EXPORT2 usearch_openusearch_open_71(const UChar *pattern,
542 int32_t patternlength,
543 const UChar *text,
544 int32_t textlength,
545 const char *locale,
546 UBreakIterator *breakiter,
547 UErrorCode *status)
548{
549 if (U_FAILURE(*status)) {
550 return nullptr;
551 }
552#if UCONFIG_NO_BREAK_ITERATION0
553 if (breakiter != nullptr) {
554 *status = U_UNSUPPORTED_ERROR;
555 return nullptr;
556 }
557#endif
558 if (locale) {
559 // ucol_open internally checks for status
560 UCollator *collator = ucol_openucol_open_71(locale, status);
561 // pattern, text checks are done in usearch_openFromCollator
562 UStringSearch *result = usearch_openFromCollatorusearch_openFromCollator_71(pattern,
563 patternlength, text, textlength,
564 collator, breakiter, status);
565
566 if (result == nullptr || U_FAILURE(*status)) {
567 if (collator) {
568 ucol_closeucol_close_71(collator);
569 }
570 return nullptr;
571 }
572 else {
573 result->ownCollator = TRUE1;
574 }
575 return result;
576 }
577 *status = U_ILLEGAL_ARGUMENT_ERROR;
578 return nullptr;
579}
580
581U_CAPIextern "C" UStringSearch * U_EXPORT2 usearch_openFromCollatorusearch_openFromCollator_71(
582 const UChar *pattern,
583 int32_t patternlength,
584 const UChar *text,
585 int32_t textlength,
586 const UCollator *collator,
587 UBreakIterator *breakiter,
588 UErrorCode *status)
589{
590 if (U_FAILURE(*status)) {
591 return nullptr;
592 }
593#if UCONFIG_NO_BREAK_ITERATION0
594 if (breakiter != nullptr) {
595 *status = U_UNSUPPORTED_ERROR;
596 return nullptr;
597 }
598#endif
599 if (pattern == nullptr || text == nullptr || collator == nullptr) {
600 *status = U_ILLEGAL_ARGUMENT_ERROR;
601 return nullptr;
602 }
603
604 // string search does not really work when numeric collation is turned on
605 if(ucol_getAttributeucol_getAttribute_71(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
606 *status = U_UNSUPPORTED_ERROR;
607 return nullptr;
608 }
609
610 if (U_SUCCESS(*status)) {
611 initializeFCD(status);
612 if (U_FAILURE(*status)) {
613 return nullptr;
614 }
615
616 UStringSearch *result;
617 if (textlength == -1) {
618 textlength = u_strlenu_strlen_71(text);
619 }
620 if (patternlength == -1) {
621 patternlength = u_strlenu_strlen_71(pattern);
622 }
623 if (textlength <= 0 || patternlength <= 0) {
624 *status = U_ILLEGAL_ARGUMENT_ERROR;
625 return nullptr;
626 }
627
628 result = (UStringSearch *)uprv_mallocuprv_malloc_71(sizeof(UStringSearch));
629 if (result == nullptr) {
630 *status = U_MEMORY_ALLOCATION_ERROR;
631 return nullptr;
632 }
633
634 result->collator = collator;
635 result->strength = ucol_getStrengthucol_getStrength_71(collator);
636 result->ceMask = getMask(result->strength);
637 result->toShift =
638 ucol_getAttributeucol_getAttribute_71(collator, UCOL_ALTERNATE_HANDLING, status) ==
639 UCOL_SHIFTED;
640 result->variableTop = ucol_getVariableTopucol_getVariableTop_71(collator, status);
641
642 result->nfd = Normalizer2::getNFDInstance(*status);
643
644 if (U_FAILURE(*status)) {
645 uprv_freeuprv_free_71(result);
646 return nullptr;
647 }
648
649 result->search = (USearch *)uprv_mallocuprv_malloc_71(sizeof(USearch));
650 if (result->search == nullptr) {
651 *status = U_MEMORY_ALLOCATION_ERROR;
652 uprv_freeuprv_free_71(result);
653 return nullptr;
654 }
655
656 result->search->text = text;
657 result->search->textLength = textlength;
658
659 result->pattern.text = pattern;
660 result->pattern.textLength = patternlength;
661 result->pattern.ces = nullptr;
662 result->pattern.pces = nullptr;
663
664 result->search->breakIter = breakiter;
665#if !UCONFIG_NO_BREAK_ITERATION0
666 result->search->internalBreakIter = nullptr; // Lazily created.
667 if (breakiter) {
668 ubrk_setTextubrk_setText_71(breakiter, text, textlength, status);
669 }
670#endif
671
672 result->ownCollator = FALSE0;
673 result->search->matchedLength = 0;
674 result->search->matchedIndex = USEARCH_DONE-1;
675 result->utilIter = nullptr;
676 result->textIter = ucol_openElementsucol_openElements_71(collator, text,
677 textlength, status);
678 result->textProcessedIter = nullptr;
679 if (U_FAILURE(*status)) {
680 usearch_closeusearch_close_71(result);
681 return nullptr;
682 }
683
684 result->search->isOverlap = FALSE0;
685 result->search->isCanonicalMatch = FALSE0;
686 result->search->elementComparisonType = 0;
687 result->search->isForwardSearching = TRUE1;
688 result->search->reset = TRUE1;
689
690 initialize(result, status);
691
692 if (U_FAILURE(*status)) {
693 usearch_closeusearch_close_71(result);
694 return nullptr;
695 }
696
697 return result;
698 }
699 return nullptr;
700}
701
702U_CAPIextern "C" void U_EXPORT2 usearch_closeusearch_close_71(UStringSearch *strsrch)
703{
704 if (strsrch) {
705 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
706 strsrch->pattern.ces) {
707 uprv_freeuprv_free_71(strsrch->pattern.ces);
708 }
709
710 if (strsrch->pattern.pces != nullptr &&
711 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
712 uprv_freeuprv_free_71(strsrch->pattern.pces);
713 }
714
715 delete strsrch->textProcessedIter;
716 ucol_closeElementsucol_closeElements_71(strsrch->textIter);
717 ucol_closeElementsucol_closeElements_71(strsrch->utilIter);
718
719 if (strsrch->ownCollator && strsrch->collator) {
720 ucol_closeucol_close_71((UCollator *)strsrch->collator);
721 }
722
723#if !UCONFIG_NO_BREAK_ITERATION0
724 if (strsrch->search->internalBreakIter != nullptr) {
725 ubrk_closeubrk_close_71(strsrch->search->internalBreakIter);
726 }
727#endif
728
729 uprv_freeuprv_free_71(strsrch->search);
730 uprv_freeuprv_free_71(strsrch);
731 }
732}
733
734namespace {
735
736UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
737 if (U_FAILURE(*status)) { return FALSE0; }
738 if (strsrch->textProcessedIter == nullptr) {
739 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
740 if (strsrch->textProcessedIter == nullptr) {
741 *status = U_MEMORY_ALLOCATION_ERROR;
742 return FALSE0;
743 }
744 } else {
745 strsrch->textProcessedIter->init(strsrch->textIter);
746 }
747 return TRUE1;
748}
749
750}
751
752// set and get methods --------------------------------------------------
753
// Moves the text collation element iterator to `position` and clears the
// current match. Does nothing when *status is already a failure or strsrch is
// null. Sets *status to U_INDEX_OUTOFBOUNDS_ERROR when isOutOfBounds rejects
// the position.
754U_CAPIextern "C" void U_EXPORT2 usearch_setOffsetusearch_setOffset_71(UStringSearch *strsrch,
755 int32_t position,
756 UErrorCode *status)
757{
758 if (U_SUCCESS(*status) && strsrch) {
759 if (isOutOfBounds(strsrch->search->textLength, position)) {
760 *status = U_INDEX_OUTOFBOUNDS_ERROR;
761 }
762 else {
763 setColEIterOffset(strsrch->textIter, position, *status);
764 }
// NOTE(review): the match state and the reset flag are cleared on both
// branches, i.e. also when the position was rejected and the iterator was not
// moved. Confirm this is intended.
765 strsrch->search->matchedIndex = USEARCH_DONE-1;
766 strsrch->search->matchedLength = 0;
767 strsrch->search->reset = FALSE0;
768 }
769}
770
771U_CAPIextern "C" int32_t U_EXPORT2 usearch_getOffsetusearch_getOffset_71(const UStringSearch *strsrch)
772{
773 if (strsrch) {
774 int32_t result = ucol_getOffsetucol_getOffset_71(strsrch->textIter);
775 if (isOutOfBounds(strsrch->search->textLength, result)) {
776 return USEARCH_DONE-1;
777 }
778 return result;
779 }
780 return USEARCH_DONE-1;
781}
782
// Stores one search attribute on strsrch->search. USEARCH_OVERLAP and
// USEARCH_CANONICAL_MATCH are stored as booleans (on only for USEARCH_ON).
// USEARCH_ELEMENT_COMPARISON keeps the two wildcard values and stores 0 for
// anything else. Any other attribute sets *status to
// U_ILLEGAL_ARGUMENT_ERROR.
783U_CAPIextern "C" void U_EXPORT2 usearch_setAttributeusearch_setAttribute_71(UStringSearch *strsrch,
784 USearchAttribute attribute,
785 USearchAttributeValue value,
786 UErrorCode *status)
787{
788 if (U_SUCCESS(*status) && strsrch) {
789 switch (attribute)
790 {
791 case USEARCH_OVERLAP :
792 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE1 : FALSE0);
793 break;
794 case USEARCH_CANONICAL_MATCH :
795 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE1 :
796 FALSE0);
797 break;
798 case USEARCH_ELEMENT_COMPARISON :
799 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
800 strsrch->search->elementComparisonType = (int16_t)value;
801 } else {
802 strsrch->search->elementComparisonType = 0;
803 }
804 break;
805 case USEARCH_ATTRIBUTE_COUNT :
806 default:
807 *status = U_ILLEGAL_ARGUMENT_ERROR;
808 }
809 }
// NOTE(review): this check sits outside the success/null guard above. It
// therefore runs after the attribute has already been stored, and it replaces
// any failure code already in *status when value is
// USEARCH_ATTRIBUTE_VALUE_COUNT.
810 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
811 *status = U_ILLEGAL_ARGUMENT_ERROR;
812 }
813}
814
815U_CAPIextern "C" USearchAttributeValue U_EXPORT2 usearch_getAttributeusearch_getAttribute_71(
816 const UStringSearch *strsrch,
817 USearchAttribute attribute)
818{
819 if (strsrch) {
820 switch (attribute) {
821 case USEARCH_OVERLAP :
822 return (strsrch->search->isOverlap == TRUE1 ? USEARCH_ON :
823 USEARCH_OFF);
824 case USEARCH_CANONICAL_MATCH :
825 return (strsrch->search->isCanonicalMatch == TRUE1 ? USEARCH_ON :
826 USEARCH_OFF);
827 case USEARCH_ELEMENT_COMPARISON :
828 {
829 int16_t value = strsrch->search->elementComparisonType;
830 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
831 return (USearchAttributeValue)value;
832 } else {
833 return USEARCH_STANDARD_ELEMENT_COMPARISON;
834 }
835 }
836 case USEARCH_ATTRIBUTE_COUNT :
837 return USEARCH_DEFAULT;
838 }
839 }
840 return USEARCH_DEFAULT;
841}
842
843U_CAPIextern "C" int32_t U_EXPORT2 usearch_getMatchedStartusearch_getMatchedStart_71(
844 const UStringSearch *strsrch)
845{
846 if (strsrch == nullptr) {
847 return USEARCH_DONE-1;
848 }
849 return strsrch->search->matchedIndex;
850}
851
852
853U_CAPIextern "C" int32_t U_EXPORT2 usearch_getMatchedTextusearch_getMatchedText_71(const UStringSearch *strsrch,
854 UChar *result,
855 int32_t resultCapacity,
856 UErrorCode *status)
857{
858 if (U_FAILURE(*status)) {
859 return USEARCH_DONE-1;
860 }
861 if (strsrch == nullptr || resultCapacity < 0 || (resultCapacity > 0 &&
862 result == nullptr)) {
863 *status = U_ILLEGAL_ARGUMENT_ERROR;
864 return USEARCH_DONE-1;
865 }
866
867 int32_t copylength = strsrch->search->matchedLength;
868 int32_t copyindex = strsrch->search->matchedIndex;
869 if (copyindex == USEARCH_DONE-1) {
870 u_terminateUCharsu_terminateUChars_71(result, resultCapacity, 0, status);
871 return USEARCH_DONE-1;
872 }
873
874 if (resultCapacity < copylength) {
875 copylength = resultCapacity;
876 }
877 if (copylength > 0) {
878 uprv_memcpy(result, strsrch->search->text + copyindex,do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(result, strsrch
->search->text + copyindex, copylength * sizeof(UChar))
; } while (false)
879 copylength * sizeof(UChar))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(result, strsrch
->search->text + copyindex, copylength * sizeof(UChar))
; } while (false)
;
880 }
881 return u_terminateUCharsu_terminateUChars_71(result, resultCapacity,
882 strsrch->search->matchedLength, status);
883}
884
885U_CAPIextern "C" int32_t U_EXPORT2 usearch_getMatchedLengthusearch_getMatchedLength_71(
886 const UStringSearch *strsrch)
887{
888 if (strsrch) {
889 return strsrch->search->matchedLength;
890 }
891 return USEARCH_DONE-1;
892}
893
894#if !UCONFIG_NO_BREAK_ITERATION0
895
896U_CAPIextern "C" void U_EXPORT2 usearch_setBreakIteratorusearch_setBreakIterator_71(UStringSearch *strsrch,
897 UBreakIterator *breakiter,
898 UErrorCode *status)
899{
900 if (U_SUCCESS(*status) && strsrch) {
901 strsrch->search->breakIter = breakiter;
902 if (breakiter) {
903 ubrk_setTextubrk_setText_71(breakiter, strsrch->search->text,
904 strsrch->search->textLength, status);
905 }
906 }
907}
908
909U_CAPIextern "C" const UBreakIterator* U_EXPORT2
910usearch_getBreakIteratorusearch_getBreakIterator_71(const UStringSearch *strsrch)
911{
912 if (strsrch) {
913 return strsrch->search->breakIter;
914 }
915 return nullptr;
916}
917
918#endif
919
920U_CAPIextern "C" void U_EXPORT2 usearch_setTextusearch_setText_71( UStringSearch *strsrch,
921 const UChar *text,
922 int32_t textlength,
923 UErrorCode *status)
924{
925 if (U_SUCCESS(*status)) {
926 if (strsrch == nullptr || text == nullptr || textlength < -1 ||
927 textlength == 0) {
928 *status = U_ILLEGAL_ARGUMENT_ERROR;
929 }
930 else {
931 if (textlength == -1) {
932 textlength = u_strlenu_strlen_71(text);
933 }
934 strsrch->search->text = text;
935 strsrch->search->textLength = textlength;
936 ucol_setTextucol_setText_71(strsrch->textIter, text, textlength, status);
937 strsrch->search->matchedIndex = USEARCH_DONE-1;
938 strsrch->search->matchedLength = 0;
939 strsrch->search->reset = TRUE1;
940#if !UCONFIG_NO_BREAK_ITERATION0
941 if (strsrch->search->breakIter != nullptr) {
942 ubrk_setTextubrk_setText_71(strsrch->search->breakIter, text,
943 textlength, status);
944 }
945 if (strsrch->search->internalBreakIter != nullptr) {
946 ubrk_setTextubrk_setText_71(strsrch->search->internalBreakIter, text, textlength, status);
947 }
948#endif
949 }
950 }
951}
952
953U_CAPIextern "C" const UChar * U_EXPORT2 usearch_getTextusearch_getText_71(const UStringSearch *strsrch,
954 int32_t *length)
955{
956 if (strsrch) {
957 *length = strsrch->search->textLength;
958 return strsrch->search->text;
959 }
960 return nullptr;
961}
962
963U_CAPIextern "C" void U_EXPORT2 usearch_setCollatorusearch_setCollator_71( UStringSearch *strsrch,
964 const UCollator *collator,
965 UErrorCode *status)
966{
967 if (U_SUCCESS(*status)) {
968 if (collator == nullptr) {
969 *status = U_ILLEGAL_ARGUMENT_ERROR;
970 return;
971 }
972
973 if (strsrch) {
974 delete strsrch->textProcessedIter;
975 strsrch->textProcessedIter = nullptr;
976 ucol_closeElementsucol_closeElements_71(strsrch->textIter);
977 ucol_closeElementsucol_closeElements_71(strsrch->utilIter);
978 strsrch->textIter = strsrch->utilIter = nullptr;
979 if (strsrch->ownCollator && (strsrch->collator != collator)) {
980 ucol_closeucol_close_71((UCollator *)strsrch->collator);
981 strsrch->ownCollator = FALSE0;
982 }
983 strsrch->collator = collator;
984 strsrch->strength = ucol_getStrengthucol_getStrength_71(collator);
985 strsrch->ceMask = getMask(strsrch->strength);
986#if !UCONFIG_NO_BREAK_ITERATION0
987 if (strsrch->search->internalBreakIter != nullptr) {
988 ubrk_closeubrk_close_71(strsrch->search->internalBreakIter);
989 strsrch->search->internalBreakIter = nullptr; // Lazily created.
990 }
991#endif
992 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
993 strsrch->toShift =
994 ucol_getAttributeucol_getAttribute_71(collator, UCOL_ALTERNATE_HANDLING, status) ==
995 UCOL_SHIFTED;
996 // if status is a failure, ucol_getVariableTop returns 0
997 strsrch->variableTop = ucol_getVariableTopucol_getVariableTop_71(collator, status);
998 strsrch->textIter = ucol_openElementsucol_openElements_71(collator,
999 strsrch->search->text,
1000 strsrch->search->textLength,
1001 status);
1002 strsrch->utilIter = ucol_openElementsucol_openElements_71(
1003 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
1004 // initialize() _after_ setting the iterators for the new collator.
1005 initialize(strsrch, status);
1006 }
1007
1008 // **** are these calls needed?
1009 // **** we call uprv_init_pce in initializePatternPCETable
1010 // **** and the CEIBuffer constructor...
1011#if 0
1012 uprv_init_pce(strsrch->textIter);
1013 uprv_init_pce(strsrch->utilIter);
1014#endif
1015 }
1016}
1017
1018U_CAPIextern "C" UCollator * U_EXPORT2 usearch_getCollatorusearch_getCollator_71(const UStringSearch *strsrch)
1019{
1020 if (strsrch) {
1021 return (UCollator *)strsrch->collator;
1022 }
1023 return nullptr;
1024}
1025
1026U_CAPIextern "C" void U_EXPORT2 usearch_setPatternusearch_setPattern_71( UStringSearch *strsrch,
1027 const UChar *pattern,
1028 int32_t patternlength,
1029 UErrorCode *status)
1030{
1031 if (U_SUCCESS(*status)) {
1032 if (strsrch == nullptr || pattern == nullptr) {
1033 *status = U_ILLEGAL_ARGUMENT_ERROR;
1034 }
1035 else {
1036 if (patternlength == -1) {
1037 patternlength = u_strlenu_strlen_71(pattern);
1038 }
1039 if (patternlength == 0) {
1040 *status = U_ILLEGAL_ARGUMENT_ERROR;
1041 return;
1042 }
1043 strsrch->pattern.text = pattern;
1044 strsrch->pattern.textLength = patternlength;
1045 initialize(strsrch, status);
1046 }
1047 }
1048}
1049
1050U_CAPIextern "C" const UChar* U_EXPORT2
1051usearch_getPatternusearch_getPattern_71(const UStringSearch *strsrch,
1052 int32_t *length)
1053{
1054 if (strsrch) {
1055 *length = strsrch->pattern.textLength;
1056 return strsrch->pattern.text;
1057 }
1058 return nullptr;
1059}
1060
1061// miscellaneous methods --------------------------------------------------
1062
1063U_CAPIextern "C" int32_t U_EXPORT2 usearch_firstusearch_first_71(UStringSearch *strsrch,
1064 UErrorCode *status)
1065{
1066 if (strsrch && U_SUCCESS(*status)) {
1067 strsrch->search->isForwardSearching = TRUE1;
1068 usearch_setOffsetusearch_setOffset_71(strsrch, 0, status);
1069 if (U_SUCCESS(*status)) {
1070 return usearch_nextusearch_next_71(strsrch, status);
1071 }
1072 }
1073 return USEARCH_DONE-1;
1074}
1075
1076U_CAPIextern "C" int32_t U_EXPORT2 usearch_followingusearch_following_71(UStringSearch *strsrch,
1077 int32_t position,
1078 UErrorCode *status)
1079{
1080 if (strsrch && U_SUCCESS(*status)) {
1081 strsrch->search->isForwardSearching = TRUE1;
1082 // position checked in usearch_setOffset
1083 usearch_setOffsetusearch_setOffset_71(strsrch, position, status);
1084 if (U_SUCCESS(*status)) {
1085 return usearch_nextusearch_next_71(strsrch, status);
1086 }
1087 }
1088 return USEARCH_DONE-1;
1089}
1090
1091U_CAPIextern "C" int32_t U_EXPORT2 usearch_lastusearch_last_71(UStringSearch *strsrch,
1092 UErrorCode *status)
1093{
1094 if (strsrch && U_SUCCESS(*status)) {
1095 strsrch->search->isForwardSearching = FALSE0;
1096 usearch_setOffsetusearch_setOffset_71(strsrch, strsrch->search->textLength, status);
1097 if (U_SUCCESS(*status)) {
1098 return usearch_previoususearch_previous_71(strsrch, status);
1099 }
1100 }
1101 return USEARCH_DONE-1;
1102}
1103
1104U_CAPIextern "C" int32_t U_EXPORT2 usearch_precedingusearch_preceding_71(UStringSearch *strsrch,
1105 int32_t position,
1106 UErrorCode *status)
1107{
1108 if (strsrch && U_SUCCESS(*status)) {
1109 strsrch->search->isForwardSearching = FALSE0;
1110 // position checked in usearch_setOffset
1111 usearch_setOffsetusearch_setOffset_71(strsrch, position, status);
1112 if (U_SUCCESS(*status)) {
1113 return usearch_previoususearch_previous_71(strsrch, status);
1114 }
1115 }
1116 return USEARCH_DONE-1;
1117}
1118
1119/**
1120* If a direction switch is required, we'll count the number of ces till the
1121* beginning of the collation element iterator and iterate forwards that
1122* number of times. This is so that we get to the correct point within the
1123* string to continue the search in. Imagine when we are in the middle of the
1124* normalization buffer when the change in direction is request. arrrgghh....
1125* After searching the offset within the collation element iterator will be
1126* shifted to the start of the match. If a match is not found, the offset would
1127* have been set to the end of the text string in the collation element
1128* iterator.
1129* Okay, here's my take on normalization buffer. The only time when there can
1130* be 2 matches within the same normalization is when the pattern is consists
1131* of all accents. But since the offset returned is from the text string, we
1132* should not confuse the caller by returning the second match within the
1133* same normalization buffer. If we do, the 2 results will have the same match
1134* offsets, and that'll be confusing. I'll return the next match that doesn't
1135* fall within the same normalization buffer. Note this does not affect the
1136* results of matches spanning the text and the normalization buffer.
1137* The position to start searching is taken from the collation element
1138* iterator. Callers of this API would have to set the offset in the collation
1139* element iterator before using this method.
1140*/
1141U_CAPIextern "C" int32_t U_EXPORT2 usearch_nextusearch_next_71(UStringSearch *strsrch,
1142 UErrorCode *status)
1143{
1144 if (U_SUCCESS(*status) && strsrch) {
1145 // note offset is either equivalent to the start of the previous match
1146 // or is set by the user
1147 int32_t offset = usearch_getOffsetusearch_getOffset_71(strsrch);
1148 USearch *search = strsrch->search;
1149 search->reset = FALSE0;
1150 int32_t textlength = search->textLength;
1151 if (search->isForwardSearching) {
1152 if (offset == textlength ||
1153 (! search->isOverlap &&
1154 (search->matchedIndex != USEARCH_DONE-1 &&
1155 offset + search->matchedLength > textlength))) {
1156 // not enough characters to match
1157 setMatchNotFound(strsrch, *status);
1158 return USEARCH_DONE-1;
1159 }
1160 }
1161 else {
1162 // switching direction.
1163 // if matchedIndex == USEARCH_DONE, it means that either a
1164 // setOffset has been called or that previous ran off the text
1165 // string. the iterator would have been set to offset 0 if a
1166 // match is not found.
1167 search->isForwardSearching = TRUE1;
1168 if (search->matchedIndex != USEARCH_DONE-1) {
1169 // there's no need to set the collation element iterator
1170 // the next call to next will set the offset.
1171 return search->matchedIndex;
1172 }
1173 }
1174
1175 if (U_SUCCESS(*status)) {
1176 if (strsrch->pattern.cesLength == 0) {
1177 if (search->matchedIndex == USEARCH_DONE-1) {
1178 search->matchedIndex = offset;
1179 }
1180 else { // moves by codepoints
1181 U16_FWD_1(search->text, search->matchedIndex, textlength)do { if(((((search->text)[(search->matchedIndex)++])&
0xfffffc00)==0xd800) && (search->matchedIndex)!=(textlength
) && ((((search->text)[search->matchedIndex])&
0xfffffc00)==0xdc00)) { ++(search->matchedIndex); } } while
(false)
;
1182 }
1183
1184 search->matchedLength = 0;
1185 setColEIterOffset(strsrch->textIter, search->matchedIndex, *status);
1186 // status checked below
1187 if (search->matchedIndex == textlength) {
1188 search->matchedIndex = USEARCH_DONE-1;
1189 }
1190 }
1191 else {
1192 if (search->matchedLength > 0) {
1193 // if matchlength is 0 we are at the start of the iteration
1194 if (search->isOverlap) {
1195 ucol_setOffsetucol_setOffset_71(strsrch->textIter, offset + 1, status);
1196 }
1197 else {
1198 ucol_setOffsetucol_setOffset_71(strsrch->textIter,
1199 offset + search->matchedLength, status);
1200 }
1201 }
1202 else {
1203 // for boundary check purposes. this will ensure that the
1204 // next match will not precede the current offset
1205 // note search->matchedIndex will always be set to something
1206 // in the code
1207 search->matchedIndex = offset - 1;
1208 }
1209
1210 if (search->isCanonicalMatch) {
1211 // can't use exact here since extra accents are allowed.
1212 usearch_handleNextCanonicalusearch_handleNextCanonical_71(strsrch, status);
1213 }
1214 else {
1215 usearch_handleNextExactusearch_handleNextExact_71(strsrch, status);
1216 }
1217 }
1218
1219 if (U_FAILURE(*status)) {
1220 return USEARCH_DONE-1;
1221 }
1222
1223 if (search->matchedIndex == USEARCH_DONE-1) {
1224 ucol_setOffsetucol_setOffset_71(strsrch->textIter, search->textLength, status);
1225 } else {
1226 ucol_setOffsetucol_setOffset_71(strsrch->textIter, search->matchedIndex, status);
1227 }
1228
1229 return search->matchedIndex;
1230 }
1231 }
1232 return USEARCH_DONE-1;
1233}
1234
1235U_CAPIextern "C" int32_t U_EXPORT2 usearch_previoususearch_previous_71(UStringSearch *strsrch,
1236 UErrorCode *status)
1237{
1238 if (U_SUCCESS(*status) && strsrch) {
1239 int32_t offset;
1240 USearch *search = strsrch->search;
1241 if (search->reset) {
1242 offset = search->textLength;
1243 search->isForwardSearching = FALSE0;
1244 search->reset = FALSE0;
1245 setColEIterOffset(strsrch->textIter, offset, *status);
1246 }
1247 else {
1248 offset = usearch_getOffsetusearch_getOffset_71(strsrch);
1249 }
1250
1251 int32_t matchedindex = search->matchedIndex;
1252 if (search->isForwardSearching == TRUE1) {
1253 // switching direction.
1254 // if matchedIndex == USEARCH_DONE, it means that either a
1255 // setOffset has been called or that next ran off the text
1256 // string. the iterator would have been set to offset textLength if
1257 // a match is not found.
1258 search->isForwardSearching = FALSE0;
1259 if (matchedindex != USEARCH_DONE-1) {
1260 return matchedindex;
1261 }
1262 }
1263 else {
1264
1265 // Could check pattern length, but the
1266 // linear search will do the right thing
1267 if (offset == 0 || matchedindex == 0) {
1268 setMatchNotFound(strsrch, *status);
1269 return USEARCH_DONE-1;
1270 }
1271 }
1272
1273 if (U_SUCCESS(*status)) {
1274 if (strsrch->pattern.cesLength == 0) {
1275 search->matchedIndex =
1276 (matchedindex == USEARCH_DONE-1 ? offset : matchedindex);
1277 if (search->matchedIndex == 0) {
1278 setMatchNotFound(strsrch, *status);
1279 // status checked below
1280 }
1281 else { // move by codepoints
1282 U16_BACK_1(search->text, 0, search->matchedIndex)do { if(((((search->text)[--(search->matchedIndex)])&
0xfffffc00)==0xdc00) && (search->matchedIndex)>
(0) && ((((search->text)[(search->matchedIndex)
-1])&0xfffffc00)==0xd800)) { --(search->matchedIndex);
} } while (false)
;
1283 setColEIterOffset(strsrch->textIter, search->matchedIndex, *status);
1284 // status checked below
1285 search->matchedLength = 0;
1286 }
1287 }
1288 else {
1289 if (strsrch->search->isCanonicalMatch) {
1290 // can't use exact here since extra accents are allowed.
1291 usearch_handlePreviousCanonicalusearch_handlePreviousCanonical_71(strsrch, status);
1292 // status checked below
1293 }
1294 else {
1295 usearch_handlePreviousExactusearch_handlePreviousExact_71(strsrch, status);
1296 // status checked below
1297 }
1298 }
1299
1300 if (U_FAILURE(*status)) {
1301 return USEARCH_DONE-1;
1302 }
1303
1304 return search->matchedIndex;
1305 }
1306 }
1307 return USEARCH_DONE-1;
1308}
1309
1310
1311
1312U_CAPIextern "C" void U_EXPORT2 usearch_resetusearch_reset_71(UStringSearch *strsrch)
1313{
1314 /*
1315 reset is setting the attributes that are already in
1316 string search, hence all attributes in the collator should
1317 be retrieved without any problems
1318 */
1319 if (strsrch) {
1320 UErrorCode status = U_ZERO_ERROR;
1321 UBool sameCollAttribute = TRUE1;
1322 uint32_t ceMask;
1323 UBool shift;
1324 uint32_t varTop;
1325
1326 // **** hack to deal w/ how processed CEs encode quaternary ****
1327 UCollationStrength newStrength = ucol_getStrengthucol_getStrength_71(strsrch->collator);
1328 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
1329 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
1330 sameCollAttribute = FALSE0;
1331 }
1332
1333 strsrch->strength = ucol_getStrengthucol_getStrength_71(strsrch->collator);
1334 ceMask = getMask(strsrch->strength);
1335 if (strsrch->ceMask != ceMask) {
1336 strsrch->ceMask = ceMask;
1337 sameCollAttribute = FALSE0;
1338 }
1339
1340 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
1341 shift = ucol_getAttributeucol_getAttribute_71(strsrch->collator, UCOL_ALTERNATE_HANDLING,
1342 &status) == UCOL_SHIFTED;
1343 if (strsrch->toShift != shift) {
1344 strsrch->toShift = shift;
1345 sameCollAttribute = FALSE0;
1346 }
1347
1348 // if status is a failure, ucol_getVariableTop returns 0
1349 varTop = ucol_getVariableTopucol_getVariableTop_71(strsrch->collator, &status);
1350 if (strsrch->variableTop != varTop) {
1351 strsrch->variableTop = varTop;
1352 sameCollAttribute = FALSE0;
1353 }
1354 if (!sameCollAttribute) {
1355 initialize(strsrch, &status);
1356 }
1357 ucol_setTextucol_setText_71(strsrch->textIter, strsrch->search->text,
1358 strsrch->search->textLength,
1359 &status);
1360 strsrch->search->matchedLength = 0;
1361 strsrch->search->matchedIndex = USEARCH_DONE-1;
1362 strsrch->search->isOverlap = FALSE0;
1363 strsrch->search->isCanonicalMatch = FALSE0;
1364 strsrch->search->elementComparisonType = 0;
1365 strsrch->search->isForwardSearching = TRUE1;
1366 strsrch->search->reset = TRUE1;
1367 }
1368}
1369
//
//  CEI  Collation Element + source text index.
//       These structs are kept in the circular buffer (CEIBuffer).
//
struct CEI {
    int64_t ce;         // processed collation element value
    int32_t lowIndex;   // filled in together with ce by nextProcessed()/previousProcessed()
    int32_t highIndex;  // NOTE(review): presumably the source-text range of the CE -- confirm
};
1379
1380U_NAMESPACE_BEGINnamespace icu_71 {
1381
1382namespace {
1383//
1384// CEIBuffer A circular buffer of CEs-with-index from the text being searched.
1385//
// Number of entries in the buffer embedded in CEIBuffer itself; larger
// buffers are heap-allocated.
#define   DEFAULT_CEBUFFER_SIZE 96
// Extra entries added on top of the pattern's processed-CE count.
#define   CEBUFFER_EXTRA 32
// Some typical max values to make buffer size more reasonable for asymmetric search.
// #8694 is for a better long-term solution to allocation of this buffer.
#define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
#define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
// True when c lies in one of the ranges U+1100..U+115E, U+3131..U+314E or
// U+3165..U+3186. The argument is parenthesized at every use so that compound
// expressions (e.g. a conditional expression) are tested as a whole; note
// that it is still evaluated more than once, so it must not have side effects.
#define   MIGHT_BE_JAMO_L(c) \
    (((c) >= 0x1100 && (c) <= 0x115E) || \
     ((c) >= 0x3131 && (c) <= 0x314E) || \
     ((c) >= 0x3165 && (c) <= 0x3186))
1393struct CEIBuffer {
1394 CEI defBuf[DEFAULT_CEBUFFER_SIZE96];
1395 CEI *buf;
1396 int32_t bufSize;
1397 int32_t firstIx;
1398 int32_t limitIx;
1399 UCollationElements *ceIter;
1400 UStringSearch *strSearch;
1401
1402
1403
1404 CEIBuffer(UStringSearch *ss, UErrorCode *status);
1405 ~CEIBuffer();
1406 const CEI *get(int32_t index);
1407 const CEI *getPrevious(int32_t index);
1408};
1409
1410
1411CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
1412 buf = defBuf;
1413 strSearch = ss;
1414 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA32;
1415 if (ss->search->elementComparisonType != 0) {
1416 const UChar * patText = ss->pattern.text;
1417 if (patText) {
1418 const UChar * patTextLimit = patText + ss->pattern.textLength;
1419 while ( patText < patTextLimit ) {
1420 UChar c = *patText++;
1421 if (MIGHT_BE_JAMO_L(c)((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131
&& c <= 0x314E) || (c >= 0x3165 && c <=
0x3186))
) {
1422 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L8;
1423 } else {
1424 // No check for surrogates, we might allocate slightly more buffer than necessary.
1425 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER3;
1426 }
1427 }
1428 }
1429 }
1430 ceIter = ss->textIter;
1431 firstIx = 0;
1432 limitIx = 0;
1433
1434 if (!initTextProcessedIter(ss, status)) { return; }
1435
1436 if (bufSize>DEFAULT_CEBUFFER_SIZE96) {
1437 buf = (CEI *)uprv_mallocuprv_malloc_71(bufSize * sizeof(CEI));
1438 if (buf == nullptr) {
1439 *status = U_MEMORY_ALLOCATION_ERROR;
1440 }
1441 }
1442}
1443
1444// TODO: add a reset or init function so that allocated
1445// buffers can be retained & reused.
1446
1447CEIBuffer::~CEIBuffer() {
1448 if (buf != defBuf) {
1449 uprv_freeuprv_free_71(buf);
1450 }
1451}
1452
1453
1454// Get the CE with the specified index.
1455// Index must be in the range
1456// n-history_size < index < n+1
1457// where n is the largest index to have been fetched by some previous call to this function.
1458// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1459//
1460const CEI *CEIBuffer::get(int32_t index) {
1461 int i = index % bufSize;
1462
1463 if (index>=firstIx && index<limitIx) {
1464 // The request was for an entry already in our buffer.
1465 // Just return it.
1466 return &buf[i];
1467 }
1468
1469 // Caller is requesting a new, never accessed before, CE.
1470 // Verify that it is the next one in sequence, which is all
1471 // that is allowed.
1472 if (index != limitIx) {
1473 UPRV_UNREACHABLE_ASSERT(void)0;
1474 // TODO: In ICU 64 the above was changed from U_ASSERT to UPRV_UNREACHABLE,
1475 // which unconditionally called abort(). However, there were cases in which it
1476 // was being hit, so it was changed back to U_ASSERT per ICU-20680. In ICU 70,
1477 // we now use the new UPRV_UNREACHABLE_ASSERT to better indicate the situation.
1478 // ICU-20792 tracks the follow-up work/further investigation on this.
1479 return nullptr;
1480 }
1481
1482 // Manage the circular CE buffer indexing
1483 limitIx++;
1484
1485 if (limitIx - firstIx >= bufSize) {
1486 // The buffer is full, knock out the lowest-indexed entry.
1487 firstIx++;
1488 }
1489
1490 UErrorCode status = U_ZERO_ERROR;
1491
1492 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
1493
1494 return &buf[i];
1495}
1496
1497// Get the CE with the specified index.
1498// Index must be in the range
1499// n-history_size < index < n+1
1500// where n is the largest index to have been fetched by some previous call to this function.
1501// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1502//
1503const CEI *CEIBuffer::getPrevious(int32_t index) {
1504 int i = index % bufSize;
1505
1506 if (index>=firstIx && index<limitIx) {
1507 // The request was for an entry already in our buffer.
1508 // Just return it.
1509 return &buf[i];
1510 }
1511
1512 // Caller is requesting a new, never accessed before, CE.
1513 // Verify that it is the next one in sequence, which is all
1514 // that is allowed.
1515 if (index != limitIx) {
1516 UPRV_UNREACHABLE_ASSERT(void)0;
1517 // TODO: In ICU 64 the above was changed from U_ASSERT to UPRV_UNREACHABLE,
1518 // which unconditionally called abort(). However, there were cases in which it
1519 // was being hit, so it was changed back to U_ASSERT per ICU-20680. In ICU 70,
1520 // we now use the new UPRV_UNREACHABLE_ASSERT to better indicate the situation.
1521 // ICU-20792 tracks the follow-up work/further investigation on this.
1522 return nullptr;
1523 }
1524
1525 // Manage the circular CE buffer indexing
1526 limitIx++;
1527
1528 if (limitIx - firstIx >= bufSize) {
1529 // The buffer is full, knock out the lowest-indexed entry.
1530 firstIx++;
1531 }
1532
1533 UErrorCode status = U_ZERO_ERROR;
1534
1535 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
1536
1537 return &buf[i];
1538}
1539
1540}
1541
1542U_NAMESPACE_END}
1543
1544
1545// #define USEARCH_DEBUG
1546
1547#ifdef USEARCH_DEBUG
1548#include <stdio.h>
1549#include <stdlib.h>
1550#endif
1551
1552/*
1553 * Find the next break boundary after startIndex. If the UStringSearch object
1554 * has an external break iterator, use that. Otherwise use the internal character
1555 * break iterator.
1556 */
1557static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex, UErrorCode &status) {
1558 if (U_FAILURE(status)) {
1559 return startIndex;
1560 }
1561#if 0
1562 const UChar *text = strsrch->search->text;
1563 int32_t textLen = strsrch->search->textLength;
1564
1565 U_ASSERT(startIndex>=0)(void)0;
1566 U_ASSERT(startIndex<=textLen)(void)0;
1567
1568 if (startIndex >= textLen) {
1569 return startIndex;
1570 }
1571
1572 UChar32 c;
1573 int32_t i = startIndex;
1574 U16_NEXT(text, i, textLen, c)do { (c)=(text)[(i)++]; if((((c)&0xfffffc00)==0xd800)) { uint16_t
__c2; if((i)!=(textLen) && (((__c2=(text)[(i)])&
0xfffffc00)==0xdc00)) { ++(i); (c)=(((UChar32)((c))<<10UL
)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); } }
} while (false)
;
1575
1576 // If we are on a control character, stop without looking for combining marks.
1577 // Control characters do not combine.
1578 int32_t gcProperty = u_getIntPropertyValueu_getIntPropertyValue_71(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1579 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
1580 return i;
1581 }
1582
1583 // The initial character was not a control, and can thus accept trailing
1584 // combining characters. Advance over however many of them there are.
1585 int32_t indexOfLastCharChecked;
1586 for (;;) {
1587 indexOfLastCharChecked = i;
1588 if (i>=textLen) {
1589 break;
1590 }
1591 U16_NEXT(text, i, textLen, c)do { (c)=(text)[(i)++]; if((((c)&0xfffffc00)==0xd800)) { uint16_t
__c2; if((i)!=(textLen) && (((__c2=(text)[(i)])&
0xfffffc00)==0xdc00)) { ++(i); (c)=(((UChar32)((c))<<10UL
)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00-0x10000)); } }
} while (false)
;
1592 gcProperty = u_getIntPropertyValueu_getIntPropertyValue_71(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1593 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1594 break;
1595 }
1596 }
1597 return indexOfLastCharChecked;
1598#elif !UCONFIG_NO_BREAK_ITERATION0
1599 UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
1600 if (U_FAILURE(status)) {
1601 return startIndex;
1602 }
1603
1604 return ubrk_followingubrk_following_71(breakiterator, startIndex);
1605#else
1606 // **** or should we use the original code? ****
1607 return startIndex;
1608#endif
1609
1610}
1611
1612/*
1613 * Returns TRUE if index is on a break boundary. If the UStringSearch
1614 * has an external break iterator, test using that, otherwise test
1615 * using the internal character break iterator.
1616 */
1617static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index, UErrorCode &status) {
1618 if (U_FAILURE(status)) {
1619 return TRUE1;
1620 }
1621#if 0
1622 const UChar *text = strsrch->search->text;
1623 int32_t textLen = strsrch->search->textLength;
1624
1625 U_ASSERT(index>=0)(void)0;
1626 U_ASSERT(index<=textLen)(void)0;
1627
1628 if (index>=textLen || index<=0) {
1629 return TRUE1;
1630 }
1631
1632 // If the character at the current index is not a GRAPHEME_EXTEND
1633 // then we can not be within a combining sequence.
1634 UChar32 c;
1635 U16_GET(text, 0, index, textLen, c)do { (c)=(text)[index]; if((((c)&0xfffff800)==0xd800)) { uint16_t
__c2; if((((c)&0x400)==0)) { if((index)+1!=(textLen) &&
(((__c2=(text)[(index)+1])&0xfffffc00)==0xdc00)) { (c)=(
((UChar32)((c))<<10UL)+(UChar32)(__c2)-((0xd800<<
10UL)+0xdc00-0x10000)); } } else { if((index)>(0) &&
(((__c2=(text)[(index)-1])&0xfffffc00)==0xd800)) { (c)=(
((UChar32)(__c2)<<10UL)+(UChar32)((c))-((0xd800<<
10UL)+0xdc00-0x10000)); } } } } while (false)
;
1636 int32_t gcProperty = u_getIntPropertyValueu_getIntPropertyValue_71(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1637 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1638 return TRUE1;
1639 }
1640
1641 // We are at a combining mark. If the preceding character is anything
1642 // except a CONTROL, CR or LF, we are in a combining sequence.
1643 U16_PREV(text, 0, index, c)do { (c)=(text)[--(index)]; if((((c)&0xfffffc00)==0xdc00)
) { uint16_t __c2; if((index)>(0) && (((__c2=(text
)[(index)-1])&0xfffffc00)==0xd800)) { --(index); (c)=(((UChar32
)(__c2)<<10UL)+(UChar32)((c))-((0xd800<<10UL)+0xdc00
-0x10000)); } } } while (false)
;
1644 gcProperty = u_getIntPropertyValueu_getIntPropertyValue_71(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1645 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1646 return !combining;
1647#elif !UCONFIG_NO_BREAK_ITERATION0
1648 UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
1649 if (U_FAILURE(status)) {
1650 return TRUE1;
1651 }
1652
1653 return ubrk_isBoundaryubrk_isBoundary_71(breakiterator, index);
1654#else
1655 // **** or use the original code? ****
1656 return TRUE1;
1657#endif
1658}
1659
#if 0
// Disabled helper (compiled out): reports whether both ends of [start, end]
// fall on break boundaries. Returns TRUE on a failing status or when break
// iteration is unavailable.
static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return TRUE;
    }

#if !UCONFIG_NO_BREAK_ITERATION
    UBreakIterator *breakiterator = getBreakIterator(strsrch, status);
    if (U_SUCCESS(status)) {
        const int32_t firstBoundary = ubrk_first(breakiterator);
        const int32_t lastBoundary  = ubrk_last(breakiterator);

        // out-of-range indexes are never boundary positions
        const UBool startOutOfRange = start < firstBoundary || start > lastBoundary;
        const UBool endOutOfRange   = end < firstBoundary || end > lastBoundary;
        if (startOutOfRange || endOutOfRange) {
            return FALSE;
        }

        return ubrk_isBoundary(breakiterator, start) &&
               ubrk_isBoundary(breakiterator, end);
    }
#endif

    return TRUE;
}
#endif
1687
// Outcome of comparing one target CE against one pattern CE (compareCE64s).
typedef enum {
    U_CE_MATCH     = -1,  // the two CEs are considered equal
    U_CE_NO_MATCH  =  0,  // the two CEs differ
    U_CE_SKIP_TARG =  1,  // target weight is 0 at the differing level
    U_CE_SKIP_PATN =  2   // pattern weight is 0 at the differing level (any-base-weight wildcard mode)
} UCompareCEsResult;

// Weights that compareCE64s treats as wildcards at the second and third
// comparison levels.
#define U_CE_LEVEL2_BASE 0x00000005
#define U_CE_LEVEL3_BASE 0x00050000
1696
1697static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
1698 if (targCE == patCE) {
1699 return U_CE_MATCH;
1700 }
1701 if (compareType == 0) {
1702 return U_CE_NO_MATCH;
1703 }
1704
1705 int64_t targCEshifted = targCE >> 32;
1706 int64_t patCEshifted = patCE >> 32;
1707 int64_t mask;
1708
1709 mask = 0xFFFF0000;
1710 int32_t targLev1 = (int32_t)(targCEshifted & mask);
1711 int32_t patLev1 = (int32_t)(patCEshifted & mask);
1712 if ( targLev1 != patLev1 ) {
1713 if ( targLev1 == 0 ) {
1714 return U_CE_SKIP_TARG;
1715 }
1716 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
1717 return U_CE_SKIP_PATN;
1718 }
1719 return U_CE_NO_MATCH;
1720 }
1721
1722 mask = 0x0000FFFF;
1723 int32_t targLev2 = (int32_t)(targCEshifted & mask);
1724 int32_t patLev2 = (int32_t)(patCEshifted & mask);
1725 if ( targLev2 != patLev2 ) {
1726 if ( targLev2 == 0 ) {
1727 return U_CE_SKIP_TARG;
1728 }
1729 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
1730 return U_CE_SKIP_PATN;
1731 }
1732 return (patLev2 == U_CE_LEVEL2_BASE0x00000005 || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE0x00000005) )?
1733 U_CE_MATCH: U_CE_NO_MATCH;
1734 }
1735
1736 mask = 0xFFFF0000;
1737 int32_t targLev3 = (int32_t)(targCE & mask);
1738 int32_t patLev3 = (int32_t)(patCE & mask);
1739 if ( targLev3 != patLev3 ) {
1740 return (patLev3 == U_CE_LEVEL3_BASE0x00050000 || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE0x00050000) )?
1741 U_CE_MATCH: U_CE_NO_MATCH;
1742 }
1743
1744 return U_CE_MATCH;
1745}
1746
1747namespace {
1748
1749UChar32 codePointAt(const USearch &search, int32_t index) {
1750 if (index < search.textLength) {
1751 UChar32 c;
1752 U16_NEXT(search.text, index, search.textLength, c)do { (c)=(search.text)[(index)++]; if((((c)&0xfffffc00)==
0xd800)) { uint16_t __c2; if((index)!=(search.textLength) &&
(((__c2=(search.text)[(index)])&0xfffffc00)==0xdc00)) { ++
(index); (c)=(((UChar32)((c))<<10UL)+(UChar32)(__c2)-((
0xd800<<10UL)+0xdc00-0x10000)); } } } while (false)
;
1753 return c;
1754 }
1755 return U_SENTINEL(-1);
1756}
1757
1758UChar32 codePointBefore(const USearch &search, int32_t index) {
1759 if (0 < index) {
1760 UChar32 c;
1761 U16_PREV(search.text, 0, index, c)do { (c)=(search.text)[--(index)]; if((((c)&0xfffffc00)==
0xdc00)) { uint16_t __c2; if((index)>(0) && (((__c2
=(search.text)[(index)-1])&0xfffffc00)==0xd800)) { --(index
); (c)=(((UChar32)(__c2)<<10UL)+(UChar32)((c))-((0xd800
<<10UL)+0xdc00-0x10000)); } } } while (false)
;
1762 return c;
1763 }
1764 return U_SENTINEL(-1);
1765}
1766
1767} // namespace
1768
// Forward search: looks for the pattern held in strsrch, starting at text index startIdx.
//
// The text iterator is positioned at startIdx, the pattern's processed-CE table is built
// if it does not exist yet, and candidate matches are tried at successive target CE
// positions. A candidate that matches in CE space must additionally pass the boundary
// and identical-level checks below before it is accepted.
//
// Parameters:
//   strsrch    - search object; its pattern CE tables, text iterator and search settings are read.
//   startIdx   - text index to start from; must be within [0, textLength].
//   matchStart - if non-null, receives the start index of the match, or -1 if there is none.
//   matchLimit - if non-null, receives the limit index of the match, or -1 if there is none.
//   status     - in/out error code. The function does nothing if it already indicates failure.
//                Set to U_ILLEGAL_ARGUMENT_ERROR for an empty/missing pattern CE list or an
//                out-of-range startIdx, and to U_INTERNAL_PROGRAM_ERROR if the CE buffer
//                cannot supply the first CE of a candidate position.
//
// Returns TRUE if a match was found, FALSE otherwise. On the early-return paths (incoming
// failure, argument check, failure during setup) matchStart/matchLimit are not written.
1769U_CAPIextern "C" UBool U_EXPORT2 usearch_searchusearch_search_71(UStringSearch *strsrch,
1770 int32_t startIdx,
1771 int32_t *matchStart,
1772 int32_t *matchLimit,
1773 UErrorCode *status)
1774{
1775 if (U_FAILURE(*status)) {
1776 return FALSE0;
1777 }
1778
1779 // TODO: reject search patterns beginning with a combining char.
1780
1781#ifdef USEARCH_DEBUG
1782 if (getenv("USEARCH_DEBUG") != nullptr) {
1783 printf("Pattern CEs\n");
1784 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
1785 printf(" %8x", strsrch->pattern.ces[ii]);
1786 }
1787 printf("\n");
1788 }
1789
1790#endif
1791 // Input parameter sanity check.
1792 // TODO: should input indices clip to the text length
1793 // in the same way that UText does.
1794 if(strsrch->pattern.cesLength == 0 ||
1795 startIdx < 0 ||
1796 startIdx > strsrch->search->textLength ||
1797 strsrch->pattern.ces == nullptr) {
1798 *status = U_ILLEGAL_ARGUMENT_ERROR;
1799 return FALSE0;
1800 }
1801
// Build the processed-CE (pces) table for the pattern on first use.
1802 if (strsrch->pattern.pces == nullptr) {
1803 initializePatternPCETable(strsrch, status);
1804 }
1805
1806 ucol_setOffsetucol_setOffset_71(strsrch->textIter, startIdx, status);
1807 CEIBuffer ceb(strsrch, status);
1808
1809 // An out-of-memory (OOM) failure can occur in the initializePatternPCETable function
1810 // or CEIBuffer constructor above, so we need to check the status.
1811 if (U_FAILURE(*status)) {
1812 return FALSE0;
1813 }
1814
1815 int32_t targetIx = 0;
1816 const CEI *targetCEI = nullptr;
1817 int32_t patIx;
1818 UBool found;
1819
// mStart/mLimit: candidate match bounds in text index space (-1 = no match).
// minLimit: lowIndex of the last matched CE; maxLimit: lowIndex of the CE that follows the match.
// Both are assigned inside the loop once a CE-space match has been found.
1820 int32_t mStart = -1;
1821 int32_t mLimit = -1;
1822 int32_t minLimit;
1823 int32_t maxLimit;
1824
1825
1826
1827 // Outer loop moves over match starting positions in the
1828 // target CE space.
1829 // Here we see the target as a sequence of collation elements, resulting from the following:
1830 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
1831 // (for example, digraphs such as IJ may be broken into two characters).
1832 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
1833 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
1834 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
1835 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strength is primary),
1836 // then the CE is deleted, so the following code sees only CEs that are relevant.
1837 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
1838 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
1839 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
1840 //
1841 for(targetIx=0; ; targetIx++)
1842 {
1843 found = TRUE1;
1844 // Inner loop checks for a match beginning at each
1845 // position from the outer loop.
1846 int32_t targetIxOffset = 0;
1847 int64_t patCE = 0;
1848 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
1849 // (compared to the last CE fetched for the previous targetIx value) as we need to go
1850 // for this targetIx value, so if it is non-nullptr then other ceb.get calls should be OK.
1851 const CEI *firstCEI = ceb.get(targetIx);
1852 if (firstCEI == nullptr) {
1853 *status = U_INTERNAL_PROGRAM_ERROR;
1854 found = FALSE0;
1855 break;
1856 }
1857
1858 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
1859 patCE = strsrch->pattern.pces[patIx];
1860 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
1861 // Compare CE from target string with CE from the pattern.
1862 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
1863 // which will fail the compare, below.
1864 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
1865 if ( ceMatch == U_CE_NO_MATCH ) {
1866 found = FALSE0;
1867 break;
1868 } else if ( ceMatch > U_CE_NO_MATCH ) {
// The skip results adjust patIx/targetIxOffset so that, after the loop's patIx++,
// only one of the two cursors (target or pattern) has effectively advanced.
1869 if ( ceMatch == U_CE_SKIP_TARG ) {
1870 // redo with same patCE, next targCE
1871 patIx--;
1872 targetIxOffset++;
1873 } else { // ceMatch == U_CE_SKIP_PATN
1874 // redo with same targCE, next patCE
1875 targetIxOffset--;
1876 }
1877 }
1878 }
1879 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
1880
1881 if (!found && ((targetCEI == nullptr) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER((int64_t)((int64_t)(9223372036854775807L)))))) {
1882 // No match at this targetIx. Try again at the next.
1883 continue;
1884 }
1885
1886 if (!found) {
1887 // No match at all, we have run off the end of the target text.
1888 break;
1889 }
1890
1891
1892 // We have found a match in CE space.
1893 // Now determine the bounds in string index space.
1894 // There still is a chance of match failure if the CE range does not correspond to
1895 // an acceptable character range.
1896 //
1897 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
1898
1899 mStart = firstCEI->lowIndex;
1900 minLimit = lastCEI->lowIndex;
1901
1902 // Look at the CE following the match. If it is UCOL_NULLORDER the match
1903 // extended to the end of input, and the match is good.
1904
1905 // Look at the high and low indices of the CE following the match. If
1906 // they are the same it means one of two things:
1907 // 1. The match extended to the last CE from the target text, which is OK, or
1908 // 2. The last CE that was part of the match is in an expansion that extends
1909 // to the first CE after the match. In this case, we reject the match.
1910 const CEI *nextCEI = 0;
// NOTE(review): elementComparisonType == 0 is presumably the standard (non-wildcard)
// comparison mode; confirm against the enum definition, which is not visible here.
1911 if (strsrch->search->elementComparisonType == 0) {
1912 nextCEI = ceb.get(targetIx + targetIxOffset);
1913 maxLimit = nextCEI->lowIndex;
1914 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER((int64_t)((int64_t)(9223372036854775807L)))) {
1915 found = FALSE0;
1916 }
1917 } else {
// Other comparison modes: keep consuming following target CEs whose primary weight is 0,
// as long as each of them is still compatible with the last pattern CE.
1918 for ( ; ; ++targetIxOffset ) {
1919 nextCEI = ceb.get(targetIx + targetIxOffset);
1920 maxLimit = nextCEI->lowIndex;
1921 // If we are at the end of the target too, match succeeds
1922 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER((int64_t)((int64_t)(9223372036854775807L))) ) {
1923 break;
1924 }
1925 // As long as the next CE has primary weight of 0,
1926 // it is part of the last target element matched by the pattern;
1927 // make sure it can be part of a match with the last patCE
1928 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
1929 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
1930 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
1931 found = FALSE0;
1932 break;
1933 }
1934 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
1935 // target element, but it has non-zero primary weight => match fails
1936 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
1937 found = false;
1938 break;
1939 // Else the target CE is not part of an expansion of the last matched element, match succeeds
1940 } else {
1941 break;
1942 }
1943 }
1944 }
1945
1946
1947 // Check for the start of the match being within a combining sequence.
1948 // This can happen if the pattern itself begins with a combining char, and
1949 // the match found combining marks in the target text that were attached
1950 // to something else.
1951 // This type of match should be rejected for not completely consuming a
1952 // combining sequence.
1953 if (!isBreakBoundary(strsrch, mStart, *status)) {
1954 found = FALSE0;
1955 }
1956 if (U_FAILURE(*status)) {
1957 break;
1958 }
1959
1960 // Check for the start of the match being within a Collation Element Expansion,
1961 // meaning that the first char of the match is only partially matched.
1962 // With expansions, the first CE will report the index of the source
1963 // character, and all subsequent (expansions) CEs will report the source index of the
1964 // _following_ character.
1965 int32_t secondIx = firstCEI->highIndex;
1966 if (mStart == secondIx) {
1967 found = FALSE0;
1968 }
1969
1970 // Allow matches to end in the middle of a grapheme cluster if the following
1971 // conditions are met; this is needed to make prefix search work properly in
1972 // Indic, see #11750
1973 // * the default breakIter is being used
1974 // * the next collation element after this combining sequence
1975 // - has non-zero primary weight
1976 // - corresponds to a separate character following the one at end of the current match
1977 // (the second of these conditions, and perhaps both, may be redundant given the
1978 // subsequent check for normalization boundary; however they are likely much faster
1979 // tests in any case)
1980 // * the match limit is a normalization boundary
1981 UBool allowMidclusterMatch = FALSE0;
1982 if (strsrch->search->text != nullptr && strsrch->search->textLength > maxLimit) {
1983 allowMidclusterMatch =
1984 strsrch->search->breakIter == nullptr &&
1985 nextCEI != nullptr && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
1986 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
1987 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
1988 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
1989 }
1990 // If those conditions are met, then:
1991 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
1992 // the match limit may be backed off to a previous break boundary. This handles
1993 // cases in which mLimit includes target characters that are ignorable with current
1994 // settings (such as space) and which extend beyond the pattern match.
1995 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
1996 // * do NOT require that match limit be on a breakIter boundary
1997
1998 // Advance the match end position to the first acceptable match boundary.
1999 // This advances the index over any combining characters.
2000 mLimit = maxLimit;
2001 if (minLimit < maxLimit) {
2002 // When the last CE's low index is the same as its high index, the CE is likely
2003 // a part of an expansion. In this case, the index is located just after the
2004 // character corresponding to the CEs compared above. If the index is right
2005 // at the break boundary, moving the position to the next boundary will result in
2006 // an incorrect match length when ignorable characters exist between
2007 // the position and the next character that produces CE(s). See ticket#8482.
2008 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit, *status)) {
2009 mLimit = minLimit;
2010 } else {
2011 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
2012 // Note that we can have nba < maxLimit && nba >= minLimit, in which
2013 // case we want to set mLimit to nba regardless of allowMidclusterMatch
2014 // (i.e. we back off mLimit to the previous breakIterator boundary).
2015 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
2016 mLimit = nba;
2017 }
2018 }
2019 }
2020
2021 if (U_FAILURE(*status)) {
2022 break;
2023 }
2024
2025 #ifdef USEARCH_DEBUG
2026 if (getenv("USEARCH_DEBUG") != nullptr) {
2027 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
2028 }
2029 #endif
2030
2031 if (!allowMidclusterMatch) {
2032 // If advancing to the end of a combining sequence in character indexing space
2033 // advanced us beyond the end of the match in CE space, reject this match.
2034 if (mLimit > maxLimit) {
2035 found = FALSE0;
2036 }
2037
2038 if (!isBreakBoundary(strsrch, mLimit, *status)) {
2039 found = FALSE0;
2040 }
2041 if (U_FAILURE(*status)) {
2042 break;
2043 }
2044 }
2045
2046 if (! checkIdentical(strsrch, mStart, mLimit)) {
2047 found = FALSE0;
2048 }
2049
// A candidate that survived every check ends the search; otherwise try the next targetIx.
2050 if (found) {
2051 break;
2052 }
2053 }
2054
2055 #ifdef USEARCH_DEBUG
2056 if (getenv("USEARCH_DEBUG") != nullptr) {
2057 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
2058 int32_t lastToPrint = ceb.limitIx+2;
2059 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
2060 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
2061 }
2062 printf("\n%s\n", found? "match found" : "no match");
2063 }
2064 #endif
2065
2066 // All Done. Store back the match bounds to the caller.
2067 //
2068
2069 if (U_FAILURE(*status)) {
2070 found = FALSE0; // No match if a failure occurred.
2071 }
2072
2073 if (found==FALSE0) {
2074 mLimit = -1;
2075 mStart = -1;
2076 }
2077
2078 if (matchStart != nullptr) {
2079 *matchStart= mStart;
2080 }
2081
2082 if (matchLimit != nullptr) {
2083 *matchLimit = mLimit;
2084 }
2085
2086 return found;
2087}
2088
// Backward search: looks for the pattern held in strsrch, walking the target CEs in
// reverse order via CEIBuffer::getPrevious.
//
// When startIdx is inside the text, the text iterator is first positioned at the break
// boundary following startIdx and the CE buffer is pre-loaded until a CE whose lowIndex
// is below startIdx is reached; that CE position is where matching begins.
//
// Parameters:
//   strsrch    - search object; its pattern CE tables, text iterator and search settings are read.
//   startIdx   - text index to search backwards from; must be within [0, textLength].
//   matchStart - if non-null, receives the start index of the match, or -1 if there is none.
//   matchLimit - if non-null, receives the limit index of the match, or -1 if there is none.
//   status     - in/out error code. The function does nothing if it already indicates failure.
//                Set to U_ILLEGAL_ARGUMENT_ERROR for an empty/missing pattern CE list or an
//                out-of-range startIdx, and to U_INTERNAL_PROGRAM_ERROR if the CE buffer
//                cannot supply the last CE of a candidate position.
//
// Returns TRUE if a match was found, FALSE otherwise. On the early-return paths (incoming
// failure, argument check, failure during setup) matchStart/matchLimit are not written.
2089U_CAPIextern "C" UBool U_EXPORT2 usearch_searchBackwardsusearch_searchBackwards_71(UStringSearch *strsrch,
2090 int32_t startIdx,
2091 int32_t *matchStart,
2092 int32_t *matchLimit,
2093 UErrorCode *status)
2094{
2095 if (U_FAILURE(*status)) {
2096 return FALSE0;
2097 }
2098
2099 // TODO: reject search patterns beginning with a combining char.
2100
2101#ifdef USEARCH_DEBUG
2102 if (getenv("USEARCH_DEBUG") != nullptr) {
2103 printf("Pattern CEs\n");
2104 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
2105 printf(" %8x", strsrch->pattern.ces[ii]);
2106 }
2107 printf("\n");
2108 }
2109
2110#endif
2111 // Input parameter sanity check.
2112 // TODO: should input indices clip to the text length
2113 // in the same way that UText does.
2114 if(strsrch->pattern.cesLength == 0 ||
2115 startIdx < 0 ||
2116 startIdx > strsrch->search->textLength ||
2117 strsrch->pattern.ces == nullptr) {
2118 *status = U_ILLEGAL_ARGUMENT_ERROR;
2119 return FALSE0;
2120 }
2121
// Build the processed-CE (pces) table for the pattern on first use.
2122 if (strsrch->pattern.pces == nullptr) {
2123 initializePatternPCETable(strsrch, status);
2124 }
2125
2126 CEIBuffer ceb(strsrch, status);
2127 int32_t targetIx = 0;
2128
2129 /*
2130 * Pre-load the buffer with the CE's for the grapheme
2131 * after our starting position so that we're sure that
2132 * we can look at the CE following the match when we
2133 * check the match boundaries.
2134 *
2135 * This will also pre-fetch the first CE that we'll
2136 * consider for the match.
2137 */
2138 if (startIdx < strsrch->search->textLength) {
2139 UBreakIterator *breakiterator = getBreakIterator(strsrch, *status);
// This check also catches a failure left in *status by the setup calls above.
2140 if (U_FAILURE(*status)) {
2141 return FALSE0;
2142 }
2143 int32_t next = ubrk_followingubrk_following_71(breakiterator, startIdx);
2144
2145 ucol_setOffsetucol_setOffset_71(strsrch->textIter, next, status);
2146
// NOTE(review): the result of getPrevious() is dereferenced here without the nullptr
// check that the main loop below applies to the same call; confirm it cannot be null here.
2147 for (targetIx = 0; ; targetIx += 1) {
2148 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
2149 break;
2150 }
2151 }
2152 } else {
2153 ucol_setOffsetucol_setOffset_71(strsrch->textIter, startIdx, status);
2154 }
2155
2156 // An out-of-memory (OOM) failure can occur above, so we need to check the status.
2157 if (U_FAILURE(*status)) {
2158 return FALSE0;
2159 }
2160
2161 const CEI *targetCEI = nullptr;
2162 int32_t patIx;
2163 UBool found;
2164
// limitIx: the CE position reached by the pre-load step; matching starts there.
// mStart/mLimit: candidate match bounds in text index space (-1 = no match).
// minLimit/maxLimit are assigned inside the loop once a CE-space match has been found.
2165 int32_t limitIx = targetIx;
2166 int32_t mStart = -1;
2167 int32_t mLimit = -1;
2168 int32_t minLimit;
2169 int32_t maxLimit;
2170
2171
2172
2173 // Outer loop moves over match starting positions in the
2174 // target CE space.
2175 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
2176 // But patIx is 0 at the beginning of the pattern and increases toward the end.
2177 // So this loop performs a comparison starting with the end of pattern, and proceeds toward the beginning of the pattern
2178 // and the beginning of the base text.
2179 for(targetIx = limitIx; ; targetIx += 1)
2180 {
2181 found = TRUE1;
2182 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
2183 // (compared to the last CE fetched for the previous targetIx value) as we need to go
2184 // for this targetIx value, so if it is non-nullptr then other ceb.getPrevious calls should be OK.
2185 const CEI *lastCEI = ceb.getPrevious(targetIx);
2186 if (lastCEI == nullptr) {
2187 *status = U_INTERNAL_PROGRAM_ERROR;
2188 found = FALSE0;
2189 break;
2190 }
2191 // Inner loop checks for a match beginning at each
2192 // position from the outer loop.
2193 int32_t targetIxOffset = 0;
2194 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
2195 int64_t patCE = strsrch->pattern.pces[patIx];
2196
2197 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
2198 // Compare CE from target string with CE from the pattern.
2199 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
2200 // which will fail the compare, below.
2201 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
2202 if ( ceMatch == U_CE_NO_MATCH ) {
2203 found = FALSE0;
2204 break;
2205 } else if ( ceMatch > U_CE_NO_MATCH ) {
// The skip results adjust patIx/targetIxOffset so that, after the loop's patIx -= 1,
// only one of the two cursors (target or pattern) has effectively advanced.
2206 if ( ceMatch == U_CE_SKIP_TARG ) {
2207 // redo with same patCE, next targCE
2208 patIx++;
2209 targetIxOffset++;
2210 } else { // ceMatch == U_CE_SKIP_PATN
2211 // redo with same targCE, next patCE
2212 targetIxOffset--;
2213 }
2214 }
2215 }
2216
2217 if (!found && ((targetCEI == nullptr) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER((int64_t)((int64_t)(9223372036854775807L)))))) {
2218 // No match at this targetIx. Try again at the next.
2219 continue;
2220 }
2221
2222 if (!found) {
2223 // No match at all, we have run off the end of the target text.
2224 break;
2225 }
2226
2227
2228 // We have found a match in CE space.
2229 // Now determine the bounds in string index space.
2230 // There still is a chance of match failure if the CE range does not correspond to
2231 // an acceptable character range.
2232 //
2233 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
2234 mStart = firstCEI->lowIndex;
2235
2236 // Check for the start of the match being within a combining sequence.
2237 // This can happen if the pattern itself begins with a combining char, and
2238 // the match found combining marks in the target text that were attached
2239 // to something else.
2240 // This type of match should be rejected for not completely consuming a
2241 // combining sequence.
2242 if (!isBreakBoundary(strsrch, mStart, *status)) {
2243 found = FALSE0;
2244 }
2245 if (U_FAILURE(*status)) {
2246 break;
2247 }
2248
2249 // Look at the high index of the first CE in the match. If it's the same as the
2250 // low index, the first CE in the match is in the middle of an expansion.
2251 if (mStart == firstCEI->highIndex) {
2252 found = FALSE0;
2253 }
2254
2255
2256 minLimit = lastCEI->lowIndex;
2257
// Because CEs are fetched in reverse, position targetIx - 1 holds the CE that follows
// the match in text order; it exists only when targetIx > 0.
2258 if (targetIx > 0) {
2259 // Look at the CE following the match. If it is UCOL_NULLORDER the match
2260 // extended to the end of input, and the match is good.
2261
2262 // Look at the high and low indices of the CE following the match. If
2263 // they are the same it means one of two things:
2264 // 1. The match extended to the last CE from the target text, which is OK, or
2265 // 2. The last CE that was part of the match is in an expansion that extends
2266 // to the first CE after the match. In this case, we reject the match.
2267 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
2268
2269 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER((int64_t)((int64_t)(9223372036854775807L)))) {
2270 found = FALSE0;
2271 }
2272
2273 mLimit = maxLimit = nextCEI->lowIndex;
2274
2275 // Allow matches to end in the middle of a grapheme cluster if the following
2276 // conditions are met; this is needed to make prefix search work properly in
2277 // Indic, see #11750
2278 // * the default breakIter is being used
2279 // * the next collation element after this combining sequence
2280 // - has non-zero primary weight
2281 // - corresponds to a separate character following the one at end of the current match
2282 // (the second of these conditions, and perhaps both, may be redundant given the
2283 // subsequent check for normalization boundary; however they are likely much faster
2284 // tests in any case)
2285 // * the match limit is a normalization boundary
2286 UBool allowMidclusterMatch = FALSE0;
2287 if (strsrch->search->text != nullptr && strsrch->search->textLength > maxLimit) {
2288 allowMidclusterMatch =
2289 strsrch->search->breakIter == nullptr &&
2290 nextCEI != nullptr && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
2291 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
2292 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
2293 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
2294 }
2295 // If those conditions are met, then:
2296 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
2297 // the match limit may be backed off to a previous break boundary. This handles
2298 // cases in which mLimit includes target characters that are ignorable with current
2299 // settings (such as space) and which extend beyond the pattern match.
2300 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
2301 // * do NOT require that match limit be on a breakIter boundary
2302
2303 // Advance the match end position to the first acceptable match boundary.
2304 // This advances the index over any combining characters.
2305 if (minLimit < maxLimit) {
2306 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
2307 // Note that we can have nba < maxLimit && nba >= minLimit, in which
2308 // case we want to set mLimit to nba regardless of allowMidclusterMatch
2309 // (i.e. we back off mLimit to the previous breakIterator boundary).
2310 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
2311 mLimit = nba;
2312 }
2313 }
2314
2315 if (!allowMidclusterMatch) {
2316 // If advancing to the end of a combining sequence in character indexing space
2317 // advanced us beyond the end of the match in CE space, reject this match.
2318 if (mLimit > maxLimit) {
2319 found = FALSE0;
2320 }
2321
2322 // Make sure the end of the match is on a break boundary
2323 if (!isBreakBoundary(strsrch, mLimit, *status)) {
2324 found = FALSE0;
2325 }
2326 if (U_FAILURE(*status)) {
2327 break;
2328 }
2329 }
2330
2331 } else {
2332 // No non-ignorable CEs after this point.
2333 // The maximum position is detected by boundary after
2334 // the last non-ignorable CE. Combining sequence
2335 // across the start index will be truncated.
2336 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status);
// maxLimit is stored here only so that the USEARCH_DEBUG trace just below prints a
// defined value; outside that trace nothing after this branch reads maxLimit, which is
// why a static analyzer reports the store as dead in non-debug builds.
2337 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
Although the value stored to 'maxLimit' is used in the enclosing expression, the value is never actually read from 'maxLimit'
2338 }
2339
2340 #ifdef USEARCH_DEBUG
2341 if (getenv("USEARCH_DEBUG") != nullptr) {
2342 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
2343 }
2344 #endif
2345
2346
2347 if (! checkIdentical(strsrch, mStart, mLimit)) {
2348 found = FALSE0;
2349 }
2350
// A candidate that survived every check ends the search; otherwise try the next targetIx.
2351 if (found) {
2352 break;
2353 }
2354 }
2355
2356 #ifdef USEARCH_DEBUG
2357 if (getenv("USEARCH_DEBUG") != nullptr) {
2358 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
2359 int32_t lastToPrint = ceb.limitIx+2;
2360 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
2361 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
2362 }
2363 printf("\n%s\n", found? "match found" : "no match");
2364 }
2365 #endif
2366
2367 // All Done. Store back the match bounds to the caller.
2368 //
2369
2370 if (U_FAILURE(*status)) {
2371 found = FALSE0; // No match if a failure occurred.
2372 }
2373
2374 if (found==FALSE0) {
2375 mLimit = -1;
2376 mStart = -1;
2377 }
2378
2379 if (matchStart != nullptr) {
2380 *matchStart= mStart;
2381 }
2382
2383 if (matchLimit != nullptr) {
2384 *matchLimit = mLimit;
2385 }
2386
2387 return found;
2388}
2389
2390// internal use methods declared in usrchimp.h -----------------------------
2391
2392UBool usearch_handleNextExactusearch_handleNextExact_71(UStringSearch *strsrch, UErrorCode *status)
2393{
2394 if (U_FAILURE(*status)) {
2395 setMatchNotFound(strsrch, *status);
2396 return FALSE0;
2397 }
2398
2399 int32_t textOffset = ucol_getOffsetucol_getOffset_71(strsrch->textIter);
2400 int32_t start = -1;
2401 int32_t end = -1;
2402
2403 if (usearch_searchusearch_search_71(strsrch, textOffset, &start, &end, status)) {
2404 strsrch->search->matchedIndex = start;
2405 strsrch->search->matchedLength = end - start;
2406 return TRUE1;
2407 } else {
2408 setMatchNotFound(strsrch, *status);
2409 return FALSE0;
2410 }
2411}
2412
2413UBool usearch_handleNextCanonicalusearch_handleNextCanonical_71(UStringSearch *strsrch, UErrorCode *status)
2414{
2415 if (U_FAILURE(*status)) {
2416 setMatchNotFound(strsrch, *status);
2417 return FALSE0;
2418 }
2419
2420 int32_t textOffset = ucol_getOffsetucol_getOffset_71(strsrch->textIter);
2421 int32_t start = -1;
2422 int32_t end = -1;
2423
2424 if (usearch_searchusearch_search_71(strsrch, textOffset, &start, &end, status)) {
2425 strsrch->search->matchedIndex = start;
2426 strsrch->search->matchedLength = end - start;
2427 return TRUE1;
2428 } else {
2429 setMatchNotFound(strsrch, *status);
2430 return FALSE0;
2431 }
2432}
2433
2434UBool usearch_handlePreviousExactusearch_handlePreviousExact_71(UStringSearch *strsrch, UErrorCode *status)
2435{
2436 if (U_FAILURE(*status)) {
2437 setMatchNotFound(strsrch, *status);
2438 return FALSE0;
2439 }
2440
2441 int32_t textOffset;
2442
2443 if (strsrch->search->isOverlap) {
2444 if (strsrch->search->matchedIndex != USEARCH_DONE-1) {
2445 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
2446 } else {
2447 // move the start position at the end of possible match
2448 initializePatternPCETable(strsrch, status);
2449 if (!initTextProcessedIter(strsrch, status)) {
2450 setMatchNotFound(strsrch, *status);
2451 return FALSE0;
2452 }
2453 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
2454 int64_t pce = strsrch->textProcessedIter->nextProcessed(nullptr, nullptr, status);
2455 if (pce == UCOL_PROCESSED_NULLORDER((int64_t)((int64_t)(9223372036854775807L)))) {
2456 // at the end of the text
2457 break;
2458 }
2459 }
2460 if (U_FAILURE(*status)) {
2461 setMatchNotFound(strsrch, *status);
2462 return FALSE0;
2463 }
2464 textOffset = ucol_getOffsetucol_getOffset_71(strsrch->textIter);
2465 }
2466 } else {
2467 textOffset = ucol_getOffsetucol_getOffset_71(strsrch->textIter);
2468 }
2469
2470 int32_t start = -1;
2471 int32_t end = -1;
2472
2473 if (usearch_searchBackwardsusearch_searchBackwards_71(strsrch, textOffset, &start, &end, status)) {
2474 strsrch->search->matchedIndex = start;
2475 strsrch->search->matchedLength = end - start;
2476 return TRUE1;
2477 } else {
2478 setMatchNotFound(strsrch, *status);
2479 return FALSE0;
2480 }
2481}
2482
2483UBool usearch_handlePreviousCanonicalusearch_handlePreviousCanonical_71(UStringSearch *strsrch,
2484 UErrorCode *status)
2485{
2486 if (U_FAILURE(*status)) {
2487 setMatchNotFound(strsrch, *status);
2488 return FALSE0;
2489 }
2490
2491 int32_t textOffset;
2492
2493 if (strsrch->search->isOverlap) {
2494 if (strsrch->search->matchedIndex != USEARCH_DONE-1) {
2495 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
2496 } else {
2497 // move the start position at the end of possible match
2498 initializePatternPCETable(strsrch, status);
2499 if (!initTextProcessedIter(strsrch, status)) {
2500 setMatchNotFound(strsrch, *status);
2501 return FALSE0;
2502 }
2503 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
2504 int64_t pce = strsrch->textProcessedIter->nextProcessed(nullptr, nullptr, status);
2505 if (pce == UCOL_PROCESSED_NULLORDER((int64_t)((int64_t)(9223372036854775807L)))) {
2506 // at the end of the text
2507 break;
2508 }
2509 }
2510 if (U_FAILURE(*status)) {
2511 setMatchNotFound(strsrch, *status);
2512 return FALSE0;
2513 }
2514 textOffset = ucol_getOffsetucol_getOffset_71(strsrch->textIter);
2515 }
2516 } else {
2517 textOffset = ucol_getOffsetucol_getOffset_71(strsrch->textIter);
2518 }
2519
2520 int32_t start = -1;
2521 int32_t end = -1;
2522
2523 if (usearch_searchBackwardsusearch_searchBackwards_71(strsrch, textOffset, &start, &end, status)) {
2524 strsrch->search->matchedIndex = start;
2525 strsrch->search->matchedLength = end - start;
2526 return TRUE1;
2527 } else {
2528 setMatchNotFound(strsrch, *status);
2529 return FALSE0;
2530 }
2531}
2532
2533#endif /* #if !UCONFIG_NO_COLLATION */