Bug Summary

File:out/../deps/icu-small/source/common/unisetspan.cpp
Warning:line 154, column 16
Although the value stored to 'result' is used in the enclosing expression, the value is never actually read from 'result'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name unisetspan.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem /usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/common/unisetspan.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6* Copyright (C) 2007-2012, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9******************************************************************************
10* file name: unisetspan.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2007mar01
16* created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20#include "unicode/uniset.h"
21#include "unicode/ustring.h"
22#include "unicode/utf8.h"
23#include "unicode/utf16.h"
24#include "cmemory.h"
25#include "uvector.h"
26#include "unisetspan.h"
27
28U_NAMESPACE_BEGINnamespace icu_71 {
29
30/*
31 * List of offsets from the current position from where to try matching
32 * a code point or a string.
33 * Store offsets rather than indexes to simplify the code and use the same list
34 * for both increments (in span()) and decrements (in spanBack()).
35 *
36 * Assumption: The maximum offset is limited, and the offsets that are stored
37 * at any one time are relatively dense, that is, there are normally no gaps of
38 * hundreds or thousands of offset values.
39 *
40 * The implementation uses a circular buffer of byte flags,
41 * each indicating whether the corresponding offset is in the list.
42 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43 * physically moving part of the list.
44 *
45 * Note: In principle, the caller should setMaxLength() to the maximum of the
46 * max string length and U16_LENGTH/U8_LENGTH to account for
47 * "long" single code points.
48 * However, this implementation uses at least a staticList with more than
49 * U8_LENGTH entries anyway.
50 *
51 * Note: If maxLength were guaranteed to be no more than 32 or 64,
52 * the list could be stored as bit flags in a single integer.
53 * Rather than handling a circular buffer with a start list index,
54 * the integer would simply be shifted when lower offsets are removed.
55 * UnicodeSet does not have a limit on the lengths of strings.
56 */
57class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
58public:
59 OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60
61 ~OffsetList() {
62 if(list!=staticList) {
63 uprv_freeuprv_free_71(list);
64 }
65 }
66
67 // Call exactly once if the list is to be used.
68 void setMaxLength(int32_t maxLength) {
69 if(maxLength<=(int32_t)sizeof(staticList)) {
70 capacity=(int32_t)sizeof(staticList);
71 } else {
72 UBool *l=(UBool *)uprv_mallocuprv_malloc_71(maxLength);
73 if(l!=NULL__null) {
74 list=l;
75 capacity=maxLength;
76 }
77 }
78 uprv_memset(list, 0, capacity):: memset(list, 0, capacity);
79 }
80
81 void clear() {
82 uprv_memset(list, 0, capacity):: memset(list, 0, capacity);
83 start=length=0;
84 }
85
86 UBool isEmpty() const {
87 return (UBool)(length==0);
88 }
89
90 // Reduce all stored offsets by delta, used when the current position
91 // moves by delta.
92 // There must not be any offsets lower than delta.
93 // If there is an offset equal to delta, it is removed.
94 // delta=[1..maxLength]
95 void shift(int32_t delta) {
96 int32_t i=start+delta;
97 if(i>=capacity) {
98 i-=capacity;
99 }
100 if(list[i]) {
101 list[i]=FALSE0;
102 --length;
103 }
104 start=i;
105 }
106
107 // Add an offset. The list must not contain it yet.
108 // offset=[1..maxLength]
109 void addOffset(int32_t offset) {
110 int32_t i=start+offset;
111 if(i>=capacity) {
112 i-=capacity;
113 }
114 list[i]=TRUE1;
115 ++length;
116 }
117
118 // offset=[1..maxLength]
119 UBool containsOffset(int32_t offset) const {
120 int32_t i=start+offset;
121 if(i>=capacity) {
122 i-=capacity;
123 }
124 return list[i];
125 }
126
127 // Find the lowest stored offset from a non-empty list, remove it,
128 // and reduce all other offsets by this minimum.
129 // Returns [1..maxLength].
130 int32_t popMinimum() {
131 // Look for the next offset in list[start+1..capacity-1].
132 int32_t i=start, result;
133 while(++i<capacity) {
134 if(list[i]) {
135 list[i]=FALSE0;
136 --length;
137 result=i-start;
138 start=i;
139 return result;
140 }
141 }
142 // i==capacity
143
144 // Wrap around and look for the next offset in list[0..start].
145 // Since the list is not empty, there will be one.
146 result=capacity-start;
147 i=0;
148 while(!list[i]) {
149 ++i;
150 }
151 list[i]=FALSE0;
152 --length;
153 start=i;
154 return result+=i;
Although the value stored to 'result' is used in the enclosing expression, the value is never actually read from 'result'
155 }
156
157private:
158 UBool *list;
159 int32_t capacity;
160 int32_t length;
161 int32_t start;
162
163 UBool staticList[16];
164};
165
166// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167static int32_t
168getUTF8Length(const UChar *s, int32_t length) {
169 UErrorCode errorCode=U_ZERO_ERROR;
170 int32_t length8=0;
171 u_strToUTF8u_strToUTF8_71(NULL__null, 0, &length8, s, length, &errorCode);
172 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173 return length8;
174 } else {
175 // The string contains an unpaired surrogate.
176 // Ignore this string.
177 return 0;
178 }
179}
180
181// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182static int32_t
183appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
184 UErrorCode errorCode=U_ZERO_ERROR;
185 int32_t length8=0;
186 u_strToUTF8u_strToUTF8_71((char *)t, capacity, &length8, s, length, &errorCode);
187 if(U_SUCCESS(errorCode)) {
188 return length8;
189 } else {
190 // The string contains an unpaired surrogate.
191 // Ignore this string.
192 return 0;
193 }
194}
195
196static inline uint8_t
197makeSpanLengthByte(int32_t spanLength) {
198 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200}
201
202// Construct for all variants of span(), or only for any one variant.
203// Initialize as little as possible, for single use.
204UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205 const UVector &setStrings,
206 uint32_t which)
207 : spanSet(0, 0x10ffff), pSpanNotSet(NULL__null), strings(setStrings),
208 utf8Lengths(NULL__null), spanLengths(NULL__null), utf8(NULL__null),
209 utf8Length(0),
210 maxLength16(0), maxLength8(0),
211 all((UBool)(which==ALL)) {
212 spanSet.retainAll(set);
213 if(which&NOT_CONTAINED) {
214 // Default to the same sets.
215 // addToSpanNotSet() will create a separate set if necessary.
216 pSpanNotSet=&spanSet;
217 }
218
219 // Determine if the strings even need to be taken into account at all for span() etc.
220 // If any string is relevant, then all strings need to be used for
221 // span(longest match) but only the relevant ones for span(while contained).
222 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226 int32_t stringsLength=strings.size();
227
228 int32_t i, spanLength;
229 UBool someRelevant=FALSE0;
230 for(i=0; i<stringsLength; ++i) {
231 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232 const UChar *s16=string.getBuffer();
233 int32_t length16=string.length();
234 if (length16==0) {
235 continue; // skip the empty string
236 }
237 UBool thisRelevant;
238 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
239 if(spanLength<length16) { // Relevant string.
240 someRelevant=thisRelevant=TRUE1;
241 } else {
242 thisRelevant=FALSE0;
243 }
244 if((which&UTF16) && length16>maxLength16) {
245 maxLength16=length16;
246 }
247 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
248 int32_t length8=getUTF8Length(s16, length16);
249 utf8Length+=length8;
250 if(length8>maxLength8) {
251 maxLength8=length8;
252 }
253 }
254 }
255 if(!someRelevant) {
256 maxLength16=maxLength8=0;
257 return;
258 }
259
260 // Freeze after checking for the need to use strings at all because freezing
261 // a set takes some time and memory which are wasted if there are no relevant strings.
262 if(all) {
263 spanSet.freeze();
264 }
265
266 uint8_t *spanBackLengths;
267 uint8_t *spanUTF8Lengths;
268 uint8_t *spanBackUTF8Lengths;
269
270 // Allocate a block of meta data.
271 int32_t allocSize;
272 if(all) {
273 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
274 allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
275 } else {
276 allocSize=stringsLength; // One set of span lengths.
277 if(which&UTF8) {
278 // UTF-8 lengths and UTF-8 strings.
279 allocSize+=stringsLength*4+utf8Length;
280 }
281 }
282 if(allocSize<=(int32_t)sizeof(staticLengths)) {
283 utf8Lengths=staticLengths;
284 } else {
285 utf8Lengths=(int32_t *)uprv_mallocuprv_malloc_71(allocSize);
286 if(utf8Lengths==NULL__null) {
287 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
288 return; // Out of memory.
289 }
290 }
291
292 if(all) {
293 // Store span lengths for all span() variants.
294 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
295 spanBackLengths=spanLengths+stringsLength;
296 spanUTF8Lengths=spanBackLengths+stringsLength;
297 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
298 utf8=spanBackUTF8Lengths+stringsLength;
299 } else {
300 // Store span lengths for only one span() variant.
301 if(which&UTF8) {
302 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
303 utf8=spanLengths+stringsLength;
304 } else {
305 spanLengths=(uint8_t *)utf8Lengths;
306 }
307 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
308 }
309
310 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
311 int32_t utf8Count=0; // Count UTF-8 bytes written so far.
312
313 for(i=0; i<stringsLength; ++i) {
314 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
315 const UChar *s16=string.getBuffer();
316 int32_t length16=string.length();
317 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
318 if(spanLength<length16 && length16>0) { // Relevant string.
319 if(which&UTF16) {
320 if(which&CONTAINED) {
321 if(which&FWD) {
322 spanLengths[i]=makeSpanLengthByte(spanLength);
323 }
324 if(which&BACK) {
325 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
326 spanBackLengths[i]=makeSpanLengthByte(spanLength);
327 }
328 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
329 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
330 }
331 }
332 if(which&UTF8) {
333 uint8_t *s8=utf8+utf8Count;
334 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
335 utf8Count+=utf8Lengths[i]=length8;
336 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
337 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
338 } else { // Relevant for UTF-8.
339 if(which&CONTAINED) {
340 if(which&FWD) {
341 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
342 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343 }
344 if(which&BACK) {
345 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
346 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
347 }
348 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
349 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
350 }
351 }
352 }
353 if(which&NOT_CONTAINED) {
354 // Add string start and end code points to the spanNotSet so that
355 // a span(while not contained) stops before any string.
356 UChar32 c;
357 if(which&FWD) {
358 int32_t len=0;
359 U16_NEXT(s16, len, length16, c)do { (c)=(s16)[(len)++]; if((((c)&0xfffffc00)==0xd800)) {
uint16_t __c2; if((len)!=(length16) && (((__c2=(s16)
[(len)])&0xfffffc00)==0xdc00)) { ++(len); (c)=(((UChar32)
((c))<<10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00
-0x10000)); } } } while (false)
;
360 addToSpanNotSet(c);
361 }
362 if(which&BACK) {
363 int32_t len=length16;
364 U16_PREV(s16, 0, len, c)do { (c)=(s16)[--(len)]; if((((c)&0xfffffc00)==0xdc00)) {
uint16_t __c2; if((len)>(0) && (((__c2=(s16)[(len
)-1])&0xfffffc00)==0xd800)) { --(len); (c)=(((UChar32)(__c2
)<<10UL)+(UChar32)((c))-((0xd800<<10UL)+0xdc00-0x10000
)); } } } while (false)
;
365 addToSpanNotSet(c);
366 }
367 }
368 } else { // Irrelevant string. (Also the empty string.)
369 if(which&UTF8) {
370 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
371 uint8_t *s8=utf8+utf8Count;
372 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
373 utf8Count+=utf8Lengths[i]=length8;
374 } else {
375 utf8Lengths[i]=0;
376 }
377 }
378 if(all) {
379 spanLengths[i]=spanBackLengths[i]=
380 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
381 (uint8_t)ALL_CP_CONTAINED;
382 } else {
383 // All spanXYZLengths pointers contain the same address.
384 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
385 }
386 }
387 }
388
389 // Finish.
390 if(all) {
391 pSpanNotSet->freeze();
392 }
393}
394
395// Copy constructor. Assumes which==ALL for a frozen set.
396UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
397 const UVector &newParentSetStrings)
398 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL__null), strings(newParentSetStrings),
399 utf8Lengths(NULL__null), spanLengths(NULL__null), utf8(NULL__null),
400 utf8Length(otherStringSpan.utf8Length),
401 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
402 all(TRUE1) {
403 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
404 pSpanNotSet=&spanSet;
405 } else {
406 pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
407 }
408
409 // Allocate a block of meta data.
410 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
411 int32_t stringsLength=strings.size();
412 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
413 if(allocSize<=(int32_t)sizeof(staticLengths)) {
414 utf8Lengths=staticLengths;
415 } else {
416 utf8Lengths=(int32_t *)uprv_mallocuprv_malloc_71(allocSize);
417 if(utf8Lengths==NULL__null) {
418 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
419 return; // Out of memory.
420 }
421 }
422
423 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
424 utf8=spanLengths+stringsLength*4;
425 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize)do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(utf8Lengths
, otherStringSpan.utf8Lengths, allocSize); } while (false)
;
426}
427
428UnicodeSetStringSpan::~UnicodeSetStringSpan() {
429 if(pSpanNotSet!=NULL__null && pSpanNotSet!=&spanSet) {
430 delete pSpanNotSet;
431 }
432 if(utf8Lengths!=NULL__null && utf8Lengths!=staticLengths) {
433 uprv_freeuprv_free_71(utf8Lengths);
434 }
435}
436
437void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
438 if(pSpanNotSet==NULL__null || pSpanNotSet==&spanSet) {
439 if(spanSet.contains(c)) {
440 return; // Nothing to do.
441 }
442 UnicodeSet *newSet=spanSet.cloneAsThawed();
443 if(newSet==NULL__null) {
444 return; // Out of memory.
445 } else {
446 pSpanNotSet=newSet;
447 }
448 }
449 pSpanNotSet->add(c);
450}
451
452// Compare strings without any argument checks. Requires length>0.
453static inline UBool
454matches16(const UChar *s, const UChar *t, int32_t length) {
455 do {
456 if(*s++!=*t++) {
457 return FALSE0;
458 }
459 } while(--length>0);
460 return TRUE1;
461}
462
463static inline UBool
464matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
465 do {
466 if(*s++!=*t++) {
467 return FALSE0;
468 }
469 } while(--length>0);
470 return TRUE1;
471}
472
473// Compare 16-bit Unicode strings (which may be malformed UTF-16)
474// at code point boundaries.
475// That is, each edge of a match must not be in the middle of a surrogate pair.
476static inline UBool
477matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
478 s+=start;
479 limit-=start;
480 return matches16(s, t, length) &&
481 !(0<start && U16_IS_LEAD(s[-1])(((s[-1])&0xfffffc00)==0xd800) && U16_IS_TRAIL(s[0])(((s[0])&0xfffffc00)==0xdc00)) &&
482 !(length<limit && U16_IS_LEAD(s[length-1])(((s[length-1])&0xfffffc00)==0xd800) && U16_IS_TRAIL(s[length])(((s[length])&0xfffffc00)==0xdc00));
483}
484
485// Does the set contain the next code point?
486// If so, return its length; otherwise return its negative length.
487static inline int32_t
488spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
489 UChar c=*s, c2;
490 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])(((c2=s[1])&0xfffffc00)==0xdc00)) {
491 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
) ? 2 : -2;
492 }
493 return set.contains(c) ? 1 : -1;
494}
495
496static inline int32_t
497spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
498 UChar c=s[length-1], c2;
499 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])(((c2=s[length-2])&0xfffffc00)==0xd800)) {
500 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)(((UChar32)(c2)<<10UL)+(UChar32)(c)-((0xd800<<10UL
)+0xdc00-0x10000))
) ? 2 : -2;
501 }
502 return set.contains(c) ? 1 : -1;
503}
504
505static inline int32_t
506spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
507 UChar32 c=*s;
508 if(U8_IS_SINGLE(c)(((c)&0x80)==0)) {
509 return set.contains(c) ? 1 : -1;
510 }
511 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
512 int32_t i=0;
513 U8_NEXT_OR_FFFD(s, i, length, c)do { (c)=(uint8_t)(s)[(i)++]; if(!(((c)&0x80)==0)) { uint8_t
__t = 0; if((i)!=(length) && ((c)>=0xe0 ? ((c)<
0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) &&
(__t&=0x3f, 1) : ((c)-=0xf0)<=4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(__t=(s)[i])>>4]&(1<<(c)) && ((c)=((
c)<<6)|(__t&0x3f), ++(i)!=(length)) && (__t
=(s)[i]-0x80)<=0x3f) && ((c)=((c)<<6)|__t, ++
(i)!=(length)) : (c)>=0xc2 && ((c)&=0x1f, 1)) &&
(__t=(s)[i]-0x80)<=0x3f && ((c)=((c)<<6)|__t
, ++(i), 1)) { } else { (c)=(0xfffd); } } } while (false)
;
514 return set.contains(c) ? i : -i;
515}
516
517static inline int32_t
518spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
519 UChar32 c=s[length-1];
520 if(U8_IS_SINGLE(c)(((c)&0x80)==0)) {
521 return set.contains(c) ? 1 : -1;
522 }
523 int32_t i=length-1;
524 c=utf8_prevCharSafeBodyutf8_prevCharSafeBody_71(s, 0, &i, c, -3);
525 length-=i;
526 return set.contains(c) ? length : -length;
527}
528
529/*
530 * Note: In span() when spanLength==0 (after a string match, or at the beginning
531 * after an empty code point span) and in spanNot() and spanNotUTF8(),
532 * string matching could use a binary search
533 * because all string matches are done from the same start index.
534 *
535 * For UTF-8, this would require a comparison function that returns UTF-16 order.
536 *
537 * This optimization should not be necessary for normal UnicodeSets because
538 * most sets have no strings, and most sets with strings have
539 * very few very short strings.
540 * For cases with many strings, it might be better to use a different API
541 * and implementation with a DFA (state machine).
542 */
543
544/*
545 * Algorithm for span(USET_SPAN_CONTAINED)
546 *
547 * Theoretical algorithm:
548 * - Iterate through the string, and at each code point boundary:
549 * + If the code point there is in the set, then remember to continue after it.
550 * + If a set string matches at the current position, then remember to continue after it.
551 * + Either recursively span for each code point or string match,
552 * or recursively span for all but the shortest one and
553 * iteratively continue the span with the shortest local match.
554 * + Remember the longest recursive span (the farthest end point).
555 * + If there is no match at the current position, neither for the code point there
556 * nor for any set string, then stop and return the longest recursive span length.
557 *
558 * Optimized implementation:
559 *
560 * (We assume that most sets will have very few very short strings.
561 * A span using a string-less set is extremely fast.)
562 *
563 * Create and cache a spanSet which contains all of the single code points
564 * of the original set but none of its strings.
565 *
566 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
567 * - Loop:
568 * + Try to match each set string at the end of the spanLength.
569 * ~ Set strings that start with set-contained code points must be matched
570 * with a partial overlap because the recursive algorithm would have tried
571 * to match them at every position.
572 * ~ Set strings that entirely consist of set-contained code points
573 * are irrelevant for span(USET_SPAN_CONTAINED) because the
574 * recursive algorithm would continue after them anyway
575 * and find the longest recursive match from their end.
576 * ~ Rather than recursing, note each end point of a set string match.
577 * + If no set string matched after spanSet.span(), then return
578 * with where the spanSet.span() ended.
579 * + If at least one set string matched after spanSet.span(), then
580 * pop the shortest string match end point and continue
581 * the loop, trying to match all set strings from there.
582 * + If at least one more set string matched after a previous string match,
583 * then test if the code point after the previous string match is also
584 * contained in the set.
585 * Continue the loop with the shortest end point of either this code point
586 * or a matching set string.
587 * + If no more set string matched after a previous string match,
588 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
589 * Stop if spanLength==0, otherwise continue the loop.
590 *
591 * By noting each end point of a set string match,
592 * the function visits each string position at most once and finishes
593 * in linear time.
594 *
595 * The recursive algorithm may visit the same string position many times
596 * if multiple paths lead to it and finishes in exponential time.
597 */
598
599/*
600 * Algorithm for span(USET_SPAN_SIMPLE)
601 *
602 * Theoretical algorithm:
603 * - Iterate through the string, and at each code point boundary:
604 * + If the code point there is in the set, then remember to continue after it.
605 * + If a set string matches at the current position, then remember to continue after it.
606 * + Continue from the farthest match position and ignore all others.
607 * + If there is no match at the current position,
608 * then stop and return the current position.
609 *
610 * Optimized implementation:
611 *
612 * (Same assumption and spanSet as above.)
613 *
614 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
615 * - Loop:
616 * + Try to match each set string at the end of the spanLength.
617 * ~ Set strings that start with set-contained code points must be matched
618 * with a partial overlap because the standard algorithm would have tried
619 * to match them earlier.
620 * ~ Set strings that entirely consist of set-contained code points
621 * must be matched with a full overlap because the longest-match algorithm
622 * would hide set string matches that end earlier.
623 * Such set strings need not be matched earlier inside the code point span
624 * because the standard algorithm would then have continued after
625 * the set string match anyway.
626 * ~ Remember the longest set string match (farthest end point) from the earliest
627 * starting point.
628 * + If no set string matched after spanSet.span(), then return
629 * with where the spanSet.span() ended.
630 * + If at least one set string matched, then continue the loop after the
631 * longest match from the earliest position.
632 * + If no more set string matched after a previous string match,
633 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
634 * Stop if spanLength==0, otherwise continue the loop.
635 */
636
637int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
638 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
639 return spanNot(s, length);
640 }
641 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
642 if(spanLength==length) {
643 return length;
644 }
645
646 // Consider strings; they may overlap with the span.
647 OffsetList offsets;
648 if(spanCondition==USET_SPAN_CONTAINED) {
649 // Use offset list to try all possibilities.
650 offsets.setMaxLength(maxLength16);
651 }
652 int32_t pos=spanLength, rest=length-pos;
653 int32_t i, stringsLength=strings.size();
654 for(;;) {
655 if(spanCondition==USET_SPAN_CONTAINED) {
656 for(i=0; i<stringsLength; ++i) {
657 int32_t overlap=spanLengths[i];
658 if(overlap==ALL_CP_CONTAINED) {
659 continue; // Irrelevant string. (Also the empty string.)
660 }
661 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
662 const UChar *s16=string.getBuffer();
663 int32_t length16=string.length();
664 U_ASSERT(length>0)(void)0;
665
666 // Try to match this string at pos-overlap..pos.
667 if(overlap>=LONG_SPAN) {
668 overlap=length16;
669 // While contained: No point matching fully inside the code point span.
670 U16_BACK_1(s16, 0, overlap)do { if(((((s16)[--(overlap)])&0xfffffc00)==0xdc00) &&
(overlap)>(0) && ((((s16)[(overlap)-1])&0xfffffc00
)==0xd800)) { --(overlap); } } while (false)
; // Length of the string minus the last code point.
671 }
672 if(overlap>spanLength) {
673 overlap=spanLength;
674 }
675 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
676 for(;;) {
677 if(inc>rest) {
678 break;
679 }
680 // Try to match if the increment is not listed already.
681 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
682 if(inc==rest) {
683 return length; // Reached the end of the string.
684 }
685 offsets.addOffset(inc);
686 }
687 if(overlap==0) {
688 break;
689 }
690 --overlap;
691 ++inc;
692 }
693 }
694 } else /* USET_SPAN_SIMPLE */ {
695 int32_t maxInc=0, maxOverlap=0;
696 for(i=0; i<stringsLength; ++i) {
697 int32_t overlap=spanLengths[i];
698 // For longest match, we do need to try to match even an all-contained string
699 // to find the match from the earliest start.
700
701 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
702 const UChar *s16=string.getBuffer();
703 int32_t length16=string.length();
704 if (length16==0) {
705 continue; // skip the empty string
706 }
707
708 // Try to match this string at pos-overlap..pos.
709 if(overlap>=LONG_SPAN) {
710 overlap=length16;
711 // Longest match: Need to match fully inside the code point span
712 // to find the match from the earliest start.
713 }
714 if(overlap>spanLength) {
715 overlap=spanLength;
716 }
717 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
718 for(;;) {
719 if(inc>rest || overlap<maxOverlap) {
720 break;
721 }
722 // Try to match if the string is longer or starts earlier.
723 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
724 matches16CPB(s, pos-overlap, length, s16, length16)
725 ) {
726 maxInc=inc; // Longest match from earliest start.
727 maxOverlap=overlap;
728 break;
729 }
730 --overlap;
731 ++inc;
732 }
733 }
734
735 if(maxInc!=0 || maxOverlap!=0) {
736 // Longest-match algorithm, and there was a string match.
737 // Simply continue after it.
738 pos+=maxInc;
739 rest-=maxInc;
740 if(rest==0) {
741 return length; // Reached the end of the string.
742 }
743 spanLength=0; // Match strings from after a string match.
744 continue;
745 }
746 }
747 // Finished trying to match all strings at pos.
748
749 if(spanLength!=0 || pos==0) {
750 // The position is after an unlimited code point span (spanLength!=0),
751 // not after a string match.
752 // The only position where spanLength==0 after a span is pos==0.
753 // Otherwise, an unlimited code point span is only tried again when no
754 // strings match, and if such a non-initial span fails we stop.
755 if(offsets.isEmpty()) {
756 return pos; // No strings matched after a span.
757 }
758 // Match strings from after the next string match.
759 } else {
760 // The position is after a string match (or a single code point).
761 if(offsets.isEmpty()) {
762 // No more strings matched after a previous string match.
763 // Try another code point span from after the last string match.
764 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
765 if( spanLength==rest || // Reached the end of the string, or
766 spanLength==0 // neither strings nor span progressed.
767 ) {
768 return pos+spanLength;
769 }
770 pos+=spanLength;
771 rest-=spanLength;
772 continue; // spanLength>0: Match strings from after a span.
773 } else {
774 // Try to match only one code point from after a string match if some
775 // string matched beyond it, so that we try all possible positions
776 // and don't overshoot.
777 spanLength=spanOne(spanSet, s+pos, rest);
778 if(spanLength>0) {
779 if(spanLength==rest) {
780 return length; // Reached the end of the string.
781 }
782 // Match strings after this code point.
783 // There cannot be any increments below it because UnicodeSet strings
784 // contain multiple code points.
785 pos+=spanLength;
786 rest-=spanLength;
787 offsets.shift(spanLength);
788 spanLength=0;
789 continue; // Match strings from after a single code point.
790 }
791 // Match strings from after the next string match.
792 }
793 }
794 int32_t minOffset=offsets.popMinimum();
795 pos+=minOffset;
796 rest-=minOffset;
797 spanLength=0; // Match strings from after a string match.
798 }
799}
800
801int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
802 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
803 return spanNotBack(s, length);
804 }
805 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
806 if(pos==0) {
807 return 0;
808 }
809 int32_t spanLength=length-pos;
810
811 // Consider strings; they may overlap with the span.
812 OffsetList offsets;
813 if(spanCondition==USET_SPAN_CONTAINED) {
814 // Use offset list to try all possibilities.
815 offsets.setMaxLength(maxLength16);
816 }
817 int32_t i, stringsLength=strings.size();
818 uint8_t *spanBackLengths=spanLengths;
819 if(all) {
820 spanBackLengths+=stringsLength;
821 }
822 for(;;) {
823 if(spanCondition==USET_SPAN_CONTAINED) {
824 for(i=0; i<stringsLength; ++i) {
825 int32_t overlap=spanBackLengths[i];
826 if(overlap==ALL_CP_CONTAINED) {
827 continue; // Irrelevant string. (Also the empty string.)
828 }
829 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
830 const UChar *s16=string.getBuffer();
831 int32_t length16=string.length();
832 U_ASSERT(length>0)(void)0;
833
834 // Try to match this string at pos-(length16-overlap)..pos-length16.
835 if(overlap>=LONG_SPAN) {
836 overlap=length16;
837 // While contained: No point matching fully inside the code point span.
838 int32_t len1=0;
839 U16_FWD_1(s16, len1, overlap)do { if(((((s16)[(len1)++])&0xfffffc00)==0xd800) &&
(len1)!=(overlap) && ((((s16)[len1])&0xfffffc00)
==0xdc00)) { ++(len1); } } while (false)
;
840 overlap-=len1; // Length of the string minus the first code point.
841 }
842 if(overlap>spanLength) {
843 overlap=spanLength;
844 }
845 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
846 for(;;) {
847 if(dec>pos) {
848 break;
849 }
850 // Try to match if the decrement is not listed already.
851 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
852 if(dec==pos) {
853 return 0; // Reached the start of the string.
854 }
855 offsets.addOffset(dec);
856 }
857 if(overlap==0) {
858 break;
859 }
860 --overlap;
861 ++dec;
862 }
863 }
864 } else /* USET_SPAN_SIMPLE */ {
865 int32_t maxDec=0, maxOverlap=0;
866 for(i=0; i<stringsLength; ++i) {
867 int32_t overlap=spanBackLengths[i];
868 // For longest match, we do need to try to match even an all-contained string
869 // to find the match from the latest end.
870
871 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
872 const UChar *s16=string.getBuffer();
873 int32_t length16=string.length();
874 if (length16==0) {
875 continue; // skip the empty string
876 }
877
878 // Try to match this string at pos-(length16-overlap)..pos-length16.
879 if(overlap>=LONG_SPAN) {
880 overlap=length16;
881 // Longest match: Need to match fully inside the code point span
882 // to find the match from the latest end.
883 }
884 if(overlap>spanLength) {
885 overlap=spanLength;
886 }
887 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
888 for(;;) {
889 if(dec>pos || overlap<maxOverlap) {
890 break;
891 }
892 // Try to match if the string is longer or ends later.
893 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
894 matches16CPB(s, pos-dec, length, s16, length16)
895 ) {
896 maxDec=dec; // Longest match from latest end.
897 maxOverlap=overlap;
898 break;
899 }
900 --overlap;
901 ++dec;
902 }
903 }
904
905 if(maxDec!=0 || maxOverlap!=0) {
906 // Longest-match algorithm, and there was a string match.
907 // Simply continue before it.
908 pos-=maxDec;
909 if(pos==0) {
910 return 0; // Reached the start of the string.
911 }
912 spanLength=0; // Match strings from before a string match.
913 continue;
914 }
915 }
916 // Finished trying to match all strings at pos.
917
918 if(spanLength!=0 || pos==length) {
919 // The position is before an unlimited code point span (spanLength!=0),
920 // not before a string match.
921 // The only position where spanLength==0 before a span is pos==length.
922 // Otherwise, an unlimited code point span is only tried again when no
923 // strings match, and if such a non-initial span fails we stop.
924 if(offsets.isEmpty()) {
925 return pos; // No strings matched before a span.
926 }
927 // Match strings from before the next string match.
928 } else {
929 // The position is before a string match (or a single code point).
930 if(offsets.isEmpty()) {
931 // No more strings matched before a previous string match.
932 // Try another code point span from before the last string match.
933 int32_t oldPos=pos;
934 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
935 spanLength=oldPos-pos;
936 if( pos==0 || // Reached the start of the string, or
937 spanLength==0 // neither strings nor span progressed.
938 ) {
939 return pos;
940 }
941 continue; // spanLength>0: Match strings from before a span.
942 } else {
943 // Try to match only one code point from before a string match if some
944 // string matched beyond it, so that we try all possible positions
945 // and don't overshoot.
946 spanLength=spanOneBack(spanSet, s, pos);
947 if(spanLength>0) {
948 if(spanLength==pos) {
949 return 0; // Reached the start of the string.
950 }
951 // Match strings before this code point.
952 // There cannot be any decrements below it because UnicodeSet strings
953 // contain multiple code points.
954 pos-=spanLength;
955 offsets.shift(spanLength);
956 spanLength=0;
957 continue; // Match strings from before a single code point.
958 }
959 // Match strings from before the next string match.
960 }
961 }
962 pos-=offsets.popMinimum();
963 spanLength=0; // Match strings from before a string match.
964 }
965}
966
967int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
968 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
969 return spanNotUTF8(s, length);
970 }
971 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
972 if(spanLength==length) {
973 return length;
974 }
975
976 // Consider strings; they may overlap with the span.
977 OffsetList offsets;
978 if(spanCondition==USET_SPAN_CONTAINED) {
979 // Use offset list to try all possibilities.
980 offsets.setMaxLength(maxLength8);
981 }
982 int32_t pos=spanLength, rest=length-pos;
983 int32_t i, stringsLength=strings.size();
984 uint8_t *spanUTF8Lengths=spanLengths;
985 if(all) {
986 spanUTF8Lengths+=2*stringsLength;
987 }
988 for(;;) {
989 const uint8_t *s8=utf8;
990 int32_t length8;
991 if(spanCondition==USET_SPAN_CONTAINED) {
992 for(i=0; i<stringsLength; ++i) {
993 length8=utf8Lengths[i];
994 if(length8==0) {
995 continue; // String not representable in UTF-8.
996 }
997 int32_t overlap=spanUTF8Lengths[i];
998 if(overlap==ALL_CP_CONTAINED) {
999 s8+=length8;
1000 continue; // Irrelevant string.
1001 }
1002
1003 // Try to match this string at pos-overlap..pos.
1004 if(overlap>=LONG_SPAN) {
1005 overlap=length8;
1006 // While contained: No point matching fully inside the code point span.
1007 U8_BACK_1(s8, 0, overlap)do { if(((int8_t)((s8)[--(overlap)])<-0x40)) { (overlap)=utf8_back1SafeBody_71
(s8, 0, (overlap)); } } while (false)
; // Length of the string minus the last code point.
1008 }
1009 if(overlap>spanLength) {
1010 overlap=spanLength;
1011 }
1012 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1013 for(;;) {
1014 if(inc>rest) {
1015 break;
1016 }
1017 // Try to match if the increment is not listed already.
1018 // Match at code point boundaries. (The UTF-8 strings were converted
1019 // from UTF-16 and are guaranteed to be well-formed.)
1020 if(!U8_IS_TRAIL(s[pos-overlap])((int8_t)(s[pos-overlap])<-0x40) &&
1021 !offsets.containsOffset(inc) &&
1022 matches8(s+pos-overlap, s8, length8)) {
1023 if(inc==rest) {
1024 return length; // Reached the end of the string.
1025 }
1026 offsets.addOffset(inc);
1027 }
1028 if(overlap==0) {
1029 break;
1030 }
1031 --overlap;
1032 ++inc;
1033 }
1034 s8+=length8;
1035 }
1036 } else /* USET_SPAN_SIMPLE */ {
1037 int32_t maxInc=0, maxOverlap=0;
1038 for(i=0; i<stringsLength; ++i) {
1039 length8=utf8Lengths[i];
1040 if(length8==0) {
1041 continue; // String not representable in UTF-8.
1042 }
1043 int32_t overlap=spanUTF8Lengths[i];
1044 // For longest match, we do need to try to match even an all-contained string
1045 // to find the match from the earliest start.
1046
1047 // Try to match this string at pos-overlap..pos.
1048 if(overlap>=LONG_SPAN) {
1049 overlap=length8;
1050 // Longest match: Need to match fully inside the code point span
1051 // to find the match from the earliest start.
1052 }
1053 if(overlap>spanLength) {
1054 overlap=spanLength;
1055 }
1056 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1057 for(;;) {
1058 if(inc>rest || overlap<maxOverlap) {
1059 break;
1060 }
1061 // Try to match if the string is longer or starts earlier.
1062 // Match at code point boundaries. (The UTF-8 strings were converted
1063 // from UTF-16 and are guaranteed to be well-formed.)
1064 if(!U8_IS_TRAIL(s[pos-overlap])((int8_t)(s[pos-overlap])<-0x40) &&
1065 (overlap>maxOverlap ||
1066 /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1067 matches8(s+pos-overlap, s8, length8)) {
1068 maxInc=inc; // Longest match from earliest start.
1069 maxOverlap=overlap;
1070 break;
1071 }
1072 --overlap;
1073 ++inc;
1074 }
1075 s8+=length8;
1076 }
1077
1078 if(maxInc!=0 || maxOverlap!=0) {
1079 // Longest-match algorithm, and there was a string match.
1080 // Simply continue after it.
1081 pos+=maxInc;
1082 rest-=maxInc;
1083 if(rest==0) {
1084 return length; // Reached the end of the string.
1085 }
1086 spanLength=0; // Match strings from after a string match.
1087 continue;
1088 }
1089 }
1090 // Finished trying to match all strings at pos.
1091
1092 if(spanLength!=0 || pos==0) {
1093 // The position is after an unlimited code point span (spanLength!=0),
1094 // not after a string match.
1095 // The only position where spanLength==0 after a span is pos==0.
1096 // Otherwise, an unlimited code point span is only tried again when no
1097 // strings match, and if such a non-initial span fails we stop.
1098 if(offsets.isEmpty()) {
1099 return pos; // No strings matched after a span.
1100 }
1101 // Match strings from after the next string match.
1102 } else {
1103 // The position is after a string match (or a single code point).
1104 if(offsets.isEmpty()) {
1105 // No more strings matched after a previous string match.
1106 // Try another code point span from after the last string match.
1107 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1108 if( spanLength==rest || // Reached the end of the string, or
1109 spanLength==0 // neither strings nor span progressed.
1110 ) {
1111 return pos+spanLength;
1112 }
1113 pos+=spanLength;
1114 rest-=spanLength;
1115 continue; // spanLength>0: Match strings from after a span.
1116 } else {
1117 // Try to match only one code point from after a string match if some
1118 // string matched beyond it, so that we try all possible positions
1119 // and don't overshoot.
1120 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1121 if(spanLength>0) {
1122 if(spanLength==rest) {
1123 return length; // Reached the end of the string.
1124 }
1125 // Match strings after this code point.
1126 // There cannot be any increments below it because UnicodeSet strings
1127 // contain multiple code points.
1128 pos+=spanLength;
1129 rest-=spanLength;
1130 offsets.shift(spanLength);
1131 spanLength=0;
1132 continue; // Match strings from after a single code point.
1133 }
1134 // Match strings from after the next string match.
1135 }
1136 }
1137 int32_t minOffset=offsets.popMinimum();
1138 pos+=minOffset;
1139 rest-=minOffset;
1140 spanLength=0; // Match strings from after a string match.
1141 }
1142}
1143
1144int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1145 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1146 return spanNotBackUTF8(s, length);
1147 }
1148 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1149 if(pos==0) {
1150 return 0;
1151 }
1152 int32_t spanLength=length-pos;
1153
1154 // Consider strings; they may overlap with the span.
1155 OffsetList offsets;
1156 if(spanCondition==USET_SPAN_CONTAINED) {
1157 // Use offset list to try all possibilities.
1158 offsets.setMaxLength(maxLength8);
1159 }
1160 int32_t i, stringsLength=strings.size();
1161 uint8_t *spanBackUTF8Lengths=spanLengths;
1162 if(all) {
1163 spanBackUTF8Lengths+=3*stringsLength;
1164 }
1165 for(;;) {
1166 const uint8_t *s8=utf8;
1167 int32_t length8;
1168 if(spanCondition==USET_SPAN_CONTAINED) {
1169 for(i=0; i<stringsLength; ++i) {
1170 length8=utf8Lengths[i];
1171 if(length8==0) {
1172 continue; // String not representable in UTF-8.
1173 }
1174 int32_t overlap=spanBackUTF8Lengths[i];
1175 if(overlap==ALL_CP_CONTAINED) {
1176 s8+=length8;
1177 continue; // Irrelevant string.
1178 }
1179
1180 // Try to match this string at pos-(length8-overlap)..pos-length8.
1181 if(overlap>=LONG_SPAN) {
1182 overlap=length8;
1183 // While contained: No point matching fully inside the code point span.
1184 int32_t len1=0;
1185 U8_FWD_1(s8, len1, overlap)do { uint8_t __b=(s8)[(len1)++]; if(((uint8_t)((__b)-0xc2)<=
0x32) && (len1)!=(overlap)) { uint8_t __t1=(s8)[len1]
; if((0xe0<=__b && __b<0xf0)) { if(("\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
[(__b)&0xf]&(1<<((uint8_t)(__t1)>>5))) &&
++(len1)!=(overlap) && ((int8_t)((s8)[len1])<-0x40
)) { ++(len1); } } else if(__b<0xe0) { if(((int8_t)(__t1)<
-0x40)) { ++(len1); } } else { if(("\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
[(uint8_t)(__t1)>>4]&(1<<((__b)&7))) &&
++(len1)!=(overlap) && ((int8_t)((s8)[len1])<-0x40
) && ++(len1)!=(overlap) && ((int8_t)((s8)[len1
])<-0x40)) { ++(len1); } } } } while (false)
;
1186 overlap-=len1; // Length of the string minus the first code point.
1187 }
1188 if(overlap>spanLength) {
1189 overlap=spanLength;
1190 }
1191 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1192 for(;;) {
1193 if(dec>pos) {
1194 break;
1195 }
1196 // Try to match if the decrement is not listed already.
1197 // Match at code point boundaries. (The UTF-8 strings were converted
1198 // from UTF-16 and are guaranteed to be well-formed.)
1199 if( !U8_IS_TRAIL(s[pos-dec])((int8_t)(s[pos-dec])<-0x40) &&
1200 !offsets.containsOffset(dec) &&
1201 matches8(s+pos-dec, s8, length8)
1202 ) {
1203 if(dec==pos) {
1204 return 0; // Reached the start of the string.
1205 }
1206 offsets.addOffset(dec);
1207 }
1208 if(overlap==0) {
1209 break;
1210 }
1211 --overlap;
1212 ++dec;
1213 }
1214 s8+=length8;
1215 }
1216 } else /* USET_SPAN_SIMPLE */ {
1217 int32_t maxDec=0, maxOverlap=0;
1218 for(i=0; i<stringsLength; ++i) {
1219 length8=utf8Lengths[i];
1220 if(length8==0) {
1221 continue; // String not representable in UTF-8.
1222 }
1223 int32_t overlap=spanBackUTF8Lengths[i];
1224 // For longest match, we do need to try to match even an all-contained string
1225 // to find the match from the latest end.
1226
1227 // Try to match this string at pos-(length8-overlap)..pos-length8.
1228 if(overlap>=LONG_SPAN) {
1229 overlap=length8;
1230 // Longest match: Need to match fully inside the code point span
1231 // to find the match from the latest end.
1232 }
1233 if(overlap>spanLength) {
1234 overlap=spanLength;
1235 }
1236 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1237 for(;;) {
1238 if(dec>pos || overlap<maxOverlap) {
1239 break;
1240 }
1241 // Try to match if the string is longer or ends later.
1242 // Match at code point boundaries. (The UTF-8 strings were converted
1243 // from UTF-16 and are guaranteed to be well-formed.)
1244 if( !U8_IS_TRAIL(s[pos-dec])((int8_t)(s[pos-dec])<-0x40) &&
1245 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1246 matches8(s+pos-dec, s8, length8)
1247 ) {
1248 maxDec=dec; // Longest match from latest end.
1249 maxOverlap=overlap;
1250 break;
1251 }
1252 --overlap;
1253 ++dec;
1254 }
1255 s8+=length8;
1256 }
1257
1258 if(maxDec!=0 || maxOverlap!=0) {
1259 // Longest-match algorithm, and there was a string match.
1260 // Simply continue before it.
1261 pos-=maxDec;
1262 if(pos==0) {
1263 return 0; // Reached the start of the string.
1264 }
1265 spanLength=0; // Match strings from before a string match.
1266 continue;
1267 }
1268 }
1269 // Finished trying to match all strings at pos.
1270
1271 if(spanLength!=0 || pos==length) {
1272 // The position is before an unlimited code point span (spanLength!=0),
1273 // not before a string match.
1274 // The only position where spanLength==0 before a span is pos==length.
1275 // Otherwise, an unlimited code point span is only tried again when no
1276 // strings match, and if such a non-initial span fails we stop.
1277 if(offsets.isEmpty()) {
1278 return pos; // No strings matched before a span.
1279 }
1280 // Match strings from before the next string match.
1281 } else {
1282 // The position is before a string match (or a single code point).
1283 if(offsets.isEmpty()) {
1284 // No more strings matched before a previous string match.
1285 // Try another code point span from before the last string match.
1286 int32_t oldPos=pos;
1287 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1288 spanLength=oldPos-pos;
1289 if( pos==0 || // Reached the start of the string, or
1290 spanLength==0 // neither strings nor span progressed.
1291 ) {
1292 return pos;
1293 }
1294 continue; // spanLength>0: Match strings from before a span.
1295 } else {
1296 // Try to match only one code point from before a string match if some
1297 // string matched beyond it, so that we try all possible positions
1298 // and don't overshoot.
1299 spanLength=spanOneBackUTF8(spanSet, s, pos);
1300 if(spanLength>0) {
1301 if(spanLength==pos) {
1302 return 0; // Reached the start of the string.
1303 }
1304 // Match strings before this code point.
1305 // There cannot be any decrements below it because UnicodeSet strings
1306 // contain multiple code points.
1307 pos-=spanLength;
1308 offsets.shift(spanLength);
1309 spanLength=0;
1310 continue; // Match strings from before a single code point.
1311 }
1312 // Match strings from before the next string match.
1313 }
1314 }
1315 pos-=offsets.popMinimum();
1316 spanLength=0; // Match strings from before a string match.
1317 }
1318}
1319
1320/*
1321 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1322 *
1323 * Theoretical algorithm:
1324 * - Iterate through the string, and at each code point boundary:
1325 * + If the code point there is in the set, then return with the current position.
1326 * + If a set string matches at the current position, then return with the current position.
1327 *
1328 * Optimized implementation:
1329 *
1330 * (Same assumption as for span() above.)
1331 *
1332 * Create and cache a spanNotSet which contains all of the single code points
1333 * of the original set but none of its strings.
1334 * For each set string add its initial code point to the spanNotSet.
1335 * (Also add its final code point for spanNotBack().)
1336 *
1337 * - Loop:
1338 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1339 * + If the current code point is in the original set, then
1340 * return the current position.
1341 * + If any set string matches at the current position, then
1342 * return the current position.
1343 * + If there is no match at the current position, neither for the code point there
1344 * nor for any set string, then skip this code point and continue the loop.
1345 * This happens for set-string-initial code points that were added to spanNotSet
1346 * when there is not actually a match for such a set string.
1347 */
1348
1349int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1350 int32_t pos=0, rest=length;
1351 int32_t i, stringsLength=strings.size();
1352 do {
1353 // Span until we find a code point from the set,
1354 // or a code point that starts or ends some string.
1355 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1356 if(i==rest) {
1357 return length; // Reached the end of the string.
1358 }
1359 pos+=i;
1360 rest-=i;
1361
1362 // Check whether the current code point is in the original set,
1363 // without the string starts and ends.
1364 int32_t cpLength=spanOne(spanSet, s+pos, rest);
1365 if(cpLength>0) {
1366 return pos; // There is a set element at pos.
1367 }
1368
1369 // Try to match the strings at pos.
1370 for(i=0; i<stringsLength; ++i) {
1371 if(spanLengths[i]==ALL_CP_CONTAINED) {
1372 continue; // Irrelevant string. (Also the empty string.)
1373 }
1374 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1375 const UChar *s16=string.getBuffer();
1376 int32_t length16=string.length();
1377 U_ASSERT(length>0)(void)0;
1378 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1379 return pos; // There is a set element at pos.
1380 }
1381 }
1382
1383 // The span(while not contained) ended on a string start/end which is
1384 // not in the original set. Skip this code point and continue.
1385 // cpLength<0
1386 pos-=cpLength;
1387 rest+=cpLength;
1388 } while(rest!=0);
1389 return length; // Reached the end of the string.
1390}
1391
1392int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1393 int32_t pos=length;
1394 int32_t i, stringsLength=strings.size();
1395 do {
1396 // Span until we find a code point from the set,
1397 // or a code point that starts or ends some string.
1398 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1399 if(pos==0) {
1400 return 0; // Reached the start of the string.
1401 }
1402
1403 // Check whether the current code point is in the original set,
1404 // without the string starts and ends.
1405 int32_t cpLength=spanOneBack(spanSet, s, pos);
1406 if(cpLength>0) {
1407 return pos; // There is a set element at pos.
1408 }
1409
1410 // Try to match the strings at pos.
1411 for(i=0; i<stringsLength; ++i) {
1412 // Use spanLengths rather than a spanBackLengths pointer because
1413 // it is easier and we only need to know whether the string is irrelevant
1414 // which is the same in either array.
1415 if(spanLengths[i]==ALL_CP_CONTAINED) {
1416 continue; // Irrelevant string. (Also the empty string.)
1417 }
1418 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1419 const UChar *s16=string.getBuffer();
1420 int32_t length16=string.length();
1421 U_ASSERT(length>0)(void)0;
1422 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1423 return pos; // There is a set element at pos.
1424 }
1425 }
1426
1427 // The span(while not contained) ended on a string start/end which is
1428 // not in the original set. Skip this code point and continue.
1429 // cpLength<0
1430 pos+=cpLength;
1431 } while(pos!=0);
1432 return 0; // Reached the start of the string.
1433}
1434
1435int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1436 int32_t pos=0, rest=length;
1437 int32_t i, stringsLength=strings.size();
1438 uint8_t *spanUTF8Lengths=spanLengths;
1439 if(all) {
1440 spanUTF8Lengths+=2*stringsLength;
1441 }
1442 do {
1443 // Span until we find a code point from the set,
1444 // or a code point that starts or ends some string.
1445 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1446 if(i==rest) {
1447 return length; // Reached the end of the string.
1448 }
1449 pos+=i;
1450 rest-=i;
1451
1452 // Check whether the current code point is in the original set,
1453 // without the string starts and ends.
1454 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1455 if(cpLength>0) {
1456 return pos; // There is a set element at pos.
1457 }
1458
1459 // Try to match the strings at pos.
1460 const uint8_t *s8=utf8;
1461 int32_t length8;
1462 for(i=0; i<stringsLength; ++i) {
1463 length8=utf8Lengths[i];
1464 // ALL_CP_CONTAINED: Irrelevant string.
1465 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1466 return pos; // There is a set element at pos.
1467 }
1468 s8+=length8;
1469 }
1470
1471 // The span(while not contained) ended on a string start/end which is
1472 // not in the original set. Skip this code point and continue.
1473 // cpLength<0
1474 pos-=cpLength;
1475 rest+=cpLength;
1476 } while(rest!=0);
1477 return length; // Reached the end of the string.
1478}
1479
1480int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1481 int32_t pos=length;
1482 int32_t i, stringsLength=strings.size();
1483 uint8_t *spanBackUTF8Lengths=spanLengths;
1484 if(all) {
1485 spanBackUTF8Lengths+=3*stringsLength;
1486 }
1487 do {
1488 // Span until we find a code point from the set,
1489 // or a code point that starts or ends some string.
1490 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1491 if(pos==0) {
1492 return 0; // Reached the start of the string.
1493 }
1494
1495 // Check whether the current code point is in the original set,
1496 // without the string starts and ends.
1497 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1498 if(cpLength>0) {
1499 return pos; // There is a set element at pos.
1500 }
1501
1502 // Try to match the strings at pos.
1503 const uint8_t *s8=utf8;
1504 int32_t length8;
1505 for(i=0; i<stringsLength; ++i) {
1506 length8=utf8Lengths[i];
1507 // ALL_CP_CONTAINED: Irrelevant string.
1508 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1509 return pos; // There is a set element at pos.
1510 }
1511 s8+=length8;
1512 }
1513
1514 // The span(while not contained) ended on a string start/end which is
1515 // not in the original set. Skip this code point and continue.
1516 // cpLength<0
1517 pos+=cpLength;
1518 } while(pos!=0);
1519 return 0; // Reached the start of the string.
1520}
1521
1522U_NAMESPACE_END}